In [35]:
import numpy as np
import pandas as pd

import os
np.random.seed(42)

# Loads dataset & processes it:
# - fills NA data
# - processes categorical data so that categories from both train&test are known
def load_dataset(dataset, drop_columns=None):
    df_train = pd.read_csv("./2019-npfl104-shared/data/"+dataset+"/train.txt.gz", header=None)
    df_test = pd.read_csv("./2019-npfl104-shared/data/"+dataset+"/test.txt.gz", header=None)

    train_size = len(df_train)
    df_tog = df_train.append(df_test)

    # Convert to categorical
    for col in df_tog.columns[np.where(df_tog.dtypes == 'object')]:
        df_tog[col] = pd.Categorical(df_tog[col])

        
    # Explicitely drop specified columns
    if drop_columns:
        df_tog = df_tog.drop(drop_columns, axis=1)


    df_train, df_test = df_tog[:train_size], df_tog[train_size:]
    
    df_train = df_train.fillna(df_train.mode().iloc[0])
    df_test = df_test.fillna(df_test.mode().iloc[0])
    
    return df_train, df_test

# Used to split dataframe to features & target (last column)
def get_X(df):
    return pd.get_dummies(df[df.columns[:-1]], dummy_na=True)
def get_Y(df):
    dfc = df[df.columns[-1]]
    return dfc.cat.codes if dfc.dtype.name == "category" else dfc


In [36]:
dftr, dfte = load_dataset("pamap-easy")    

In [19]:
k = 8
x = get_X(dftr).values


def init_centers():
    centers = np.zeros((k, ) + x[0].shape)
    for i in range(k):
        random_i = np.random.choice(len(x) - 1, 1)
        centers[i] = x[random_i]
                
    return centers
    
print(init_centers())

[[102.      33.3125]
 [133.      33.625 ]
 [ 87.      32.375 ]
 [ 82.      31.8125]
 [149.      30.9375]
 [100.      33.3125]
 [ 95.      32.5   ]
 [124.      32.625 ]]


In [20]:
def get_closest_center(centers, data, j):
    closest = -1
    closest_dist = 2 ** 31
    for i in range(len(centers)):
        dist = np.linalg.norm(data[j] - centers[i])
        if (dist < closest_dist): 
            closest_dist = dist
            closest = i
    return closest


def iteration(data, centers):
    aggr = np.zeros((k, ) + data[0].shape)
    aggr_n = np.zeros(k)
    
    for i in range(len(data)):
        clst_cluster = get_closest_center(centers, x, i)
    
        aggr[clst_cluster] += x[i]
        aggr_n[clst_cluster] += 1
        
    for i in range(len(aggr)):
        aggr[i] /= aggr_n[i]

    return aggr
   


In [21]:
centers = init_centers()
for i in range(10):
    centers = iteration(x, centers)
    
centers

array([[123.81978482,  32.76843619],
       [133.15526802,  32.74905653],
       [ 85.04688763,  31.75290521],
       [115.68898305,  33.20429025],
       [146.83487085,  32.48653713],
       [170.21771075,  31.23786736],
       [ 95.12788345,  32.55857952],
       [103.09825175,  32.85585664]])

In [22]:
def predict(centers, data):
    result = np.zeros(len(data))
    for i in range(len(data)):
        result[i] = get_closest_center(centers, data, i)
        
    return result
        
labels_myknn = predict(centers, x)
    

In [23]:
from sklearn import metrics
from time import time


In [24]:
class Object(object):
    pass




In [25]:
t0 = time()
estimator = Object()
estimator.inertia_ = 6
estimator.labels_ = labels_myknn

name="asdf"
data = get_X(dftr)
labels = get_Y(dftr)
sample_size=len(dftr)

print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)))


init		time	inertia	homo	compl	v-meas	ARI	AMI	silhouette




asdf     	0.01s	6	0.482	0.581	0.527	0.337	0.481	0.528


In [44]:
import numpy as np
from models.model import Model

class KMeansMy(Model):
    '''
    KMeans clustering model.
    '''
    def __init__(self, k, iters):
        self.k = k
        self.iters = iters

        self.inertia_ = 25

    def Build(self, inputs):
        centers = self.__init_centers(inputs)
        for _ in range(self.iters):
            centers = self.__iteration(inputs, centers)
        self.centers = centers


    def __init_centers(self, inputs):
        x = inputs
        k = self.k

        centers = np.zeros((k, ) + x[0].shape)
        for i in range(k):
            random_i = np.random.choice(len(x) - 1, 1)[0]
            centers[i] = x[random_i]
                
        return centers

    def __get_closest_center(self, centers, data, j):
        closest = -1
        closest_dist = 2 ** 31
        for i in range(len(centers)):
            dist = np.linalg.norm(data[j] - centers[i])
            if (dist < closest_dist): 
                closest_dist = dist
                closest = i
        return closest


    def __iteration(self, data, centers):
        k = self.k

        aggr = np.zeros((k, ) + data[0].shape)
        aggr_n = np.zeros(k)

        for i in range(len(data)):
            clst_cluster = self.__get_closest_center(centers, data, i)

            aggr[clst_cluster] += data[i]
            aggr_n[clst_cluster] += 1

        for i in range(len(aggr)):
            aggr[i] /= aggr_n[i]

        return aggr



    def fit(self, data):
        self.Build(data)
        self.labels_ = self.Predict(data)

    def Predict(self, input):
        centers = self.centers
        data = input

        result = np.zeros(len(data))
        for i in range(len(data)):
            result[i] = self.__get_closest_center(centers, data, i)
        
        return result


def bench_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)))


In [45]:
from sklearn import metrics
from time import time

from sklearn.cluster import KMeans
k = 10
model = KMeansMy(k, 5)

    
labels = get_Y(dftr).values
sample_size = len(dftr)

print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
bench_k_means(model, "asdf", data = get_X(dftr).values)
bench_k_means(KMeans(init='k-means++', n_clusters=k, n_init=2),
              name="k-means++", data=data)
bench_k_means(KMeans(init='random', n_clusters=k, n_init=2),
              name="random", data=data)

init		time	inertia	homo	compl	v-meas	ARI	AMI	silhouette




asdf     	11.90s	25	0.513	0.576	0.543	0.390	0.512	0.503




k-means++	0.09s	171580	0.510	0.561	0.534	0.357	0.509	0.542




random   	0.15s	188862	0.526	0.596	0.559	0.405	0.526	0.542
