# Zadanie domowe - k-means
### Jędrzej Górski 148128
### Maciej Wieczorek 148141

In [64]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

In [65]:
np.random.seed(508346404)

In [66]:
class KMeansHW:
    def __init__(self, n_clusters):
        self.n_clusters = n_clusters
        self.labels_ = np.array([])
        self.cluster_centers_ = np.array([])
        self.max_iters = 100
        self.epsilon = 0.0001

    def label(self, data):
        distances = []
        for centroid in self.cluster_centers_:
            distances.append(np.linalg.norm(data - centroid, axis=1))
        distances = np.vstack(distances)
        self.labels_ = np.argmin(distances, axis=0)

    def fit(self, data):
        self.cluster_centers_ = data[np.random.choice(data.shape[0], size=self.n_clusters)] 
        self.label(data)
        for _ in range(self.max_iters):
            attr_max_changes = []
            for i in range(self.n_clusters):
                points = data[np.where(self.labels_ == i)]
                new_centroid = np.mean(points, axis=0)
                attr_max_changes.append(max(abs(self.cluster_centers_[i] - new_centroid)))
                self.cluster_centers_[i] = new_centroid
            self.label(data)

            if max(attr_max_changes) < self.epsilon:
                break
        
        return self

In [67]:
cereal_data = pd.read_csv('cereal.csv')
cereal_data.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843


In [68]:
cereal_data_prep = cereal_data.drop(columns=['name', 'mfr'])
cereal_data_prep['type'] = cereal_data_prep['type'].map({'C': 0, 'H' : 1})
cereal_data_prep.head()

Unnamed: 0,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,0,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,0,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,0,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,0,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912
4,0,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843


In [69]:
num_clusters = 3
cereal_data_prep_np = cereal_data_prep.to_numpy()
scaler = StandardScaler().fit(cereal_data_prep_np)
cereal_data_prep_np = scaler.transform(cereal_data_prep_np)
selector = VarianceThreshold(threshold=0.2).fit(cereal_data_prep_np)
print(cereal_data_prep_np.shape)
cereal_data_prep_np = selector.transform(cereal_data_prep_np)
print(cereal_data_prep_np.shape)
kmeans = KMeansHW(n_clusters=num_clusters)
kmeans.fit(cereal_data_prep_np)

(77, 14)
(77, 14)


<__main__.KMeansHW at 0x2290912bcd0>

In [70]:
centroids = scaler.inverse_transform(selector.inverse_transform(kmeans.cluster_centers_))
centroids_df = pd.DataFrame(data=centroids, columns=cereal_data_prep.columns)
centroids_df.head()

Unnamed: 0,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,0.04,114.8,3.36,1.68,154.8,4.148,12.68,7.8,169.2,29.0,2.84,1.138,0.634,44.324106
1,-1.387779e-17,110.909091,1.545455,1.045455,170.0,0.590909,12.5,11.318182,45.181818,25.0,1.818182,1.0,0.869545,28.921286
2,0.06666667,97.333333,2.6,0.433333,156.166667,1.633333,17.733333,2.966667,72.466667,30.0,1.966667,0.961,0.941333,51.362945


In [71]:
cereal_data['cluster'] = kmeans.labels_
cereal_data = cereal_data.sort_values('cluster')
cereal_data.groupby('cluster').head(3)

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,cluster
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973,0
28,Fruitful Bran,K,C,120,3,0,240,5.0,14.0,12,190,25,3,1.33,0.67,41.015492,0
27,Fruit & Fibre Dates; Walnuts; and Oats,P,C,120,3,2,160,5.0,12.0,10,200,25,3,1.25,0.67,40.917047,0
73,Trix,G,C,110,1,1,140,0.0,13.0,12,25,25,2,1.0,1.0,27.753301,1
48,Nut&Honey Crunch,K,C,120,2,1,190,0.0,15.0,9,40,25,2,1.0,0.67,29.924285,1
37,Honey-comb,P,C,110,1,0,180,0.0,14.0,11,35,25,1,1.0,1.33,28.742414,1
72,Triples,G,C,110,2,1,250,0.0,21.0,3,60,25,3,1.0,0.75,39.106174,2
61,Rice Chex,R,C,110,1,0,240,0.0,23.0,2,30,25,1,1.0,1.13,41.998933,2
62,Rice Krispies,K,C,110,2,0,290,0.0,22.0,3,35,25,1,1.0,1.0,40.560159,2


In [72]:
cereal_data['cluster'].value_counts()

2    30
0    25
1    22
Name: cluster, dtype: int64

# Raport

W preprocessingu użyliśmy StandardScaler oraz VarianceThreshold z progiem 0.2

Rozmiary grup to:

0.    25
1.    22
2.    30

Odnaleziony grupy skupiają płatki z podobną oceną oraz półką, a w składzie istotne są błonnik oraz cukier i potas. Grupę 2 można uznać za płatki bardziej dietetyczne, a grupę 0 za płatki którymi można się najeść. Grupa 1 to płatki których nikt za bardzo nie lubi.