In [1]:
import pandas as pd
import numpy as np
import random
from collections import defaultdict
import statistics

#### import data

In [2]:
df = pd.read_csv('cluster.csv', header = None)

# convert dataframe into numpy array
X = df.values

In [3]:
X.shape

(60, 50)

#### Standardize the data points

In [3]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_sc = sc.fit_transform(X)

 #### Define a function to calculate distances to centroids for all data points - can be further optimized (with the np.vectorize function etc.)

In [4]:
def distance(points1, points2):
    distances = np.zeros(shape=(points1.shape[0], points2.shape[0]))
    distanceToCentroids = lambda point: \
        np.array([np.linalg.norm((point - centroid), ord=1) for centroid in points2],dtype = float)

    # Loop over data points:
    for i in range(points1.shape[0]):
        distances[i,] = distanceToCentroids(points1[i, ])
    
    return(distances)

#### Define the function to iteratively get find the cluster centroids

In [51]:
def k_centroids(X, n_clusters, init='k-means++'
                , precompute_distances='auto', n_init=10
                , max_iter=1000, tol=1e-4
#                 , random_state=None, n_jobs=1
#                 ,algorithm="auto", return_n_iter=False
               ):
    # X : array-like matrix, already standardized
    
    n_samples = X.shape[0]
    n_features = X.shape[1]
    
    # initialize the centroids
    # random sampling, can use other methods like kmeans++
    centroids = X[random.sample(range(n_samples), n_clusters), ]
    
    # calculate the distances to clusters, clustering labels and the resulting inertia
    distances = distance(X, centroids)
    labels = np.argmin(distances, axis = 1)
    inertia = np.sum((X - centroids[labels])**2, dtype=np.float64)
    
    # initiate an array to store the silhouette scores for all data points
    silhouettes = np.zeros(shape=(n_samples, )) 
    
    for iteration in range(max_iter):
        
        n_samples_in_cluster = np.bincount(labels, minlength = n_clusters)

        # calculate the new centroids by taking the medians of data points in the same cluster
       
        centroids = np.zeros(shape = (n_clusters, n_features))
        
        for j in range(n_features):
            adic = defaultdict(list)
            for i in range(n_samples):
                adic[labels[i]].append(X[i, j])
            for each_label in np.unique(labels):
                centroids[each_label, j] = statistics.median(adic[each_label])

        distances = distance(X, centroids)
        labels = np.argmin(distances, axis = 1)
        inertia_previous = inertia
        inertia = np.sum((X - centroids[labels])**2, dtype=np.float64)
        
        # calculate the silhouette scores for each data point based on the new clusters
        for i in range(n_samples):
            a_i = np.average(np.ma.masked_equal(distance(X[labels == labels[i,], ], X[i:i+1,]),0))
            b_i = np.min(distance(X[labels != labels[i,], ], X[i:i+1,]))
            silhouettes[i,] = (b_i - a_i)/np.max([a_i, b_i])

        if (np.linalg.norm(inertia - inertia_previous) < tol):
            break
        
    return(centroids, inertia, labels, silhouettes)

In [52]:
centroids, inertia, labels, silhouettes = k_centroids(X_sc, 3)

In [53]:
from sklearn.metrics import silhouette_samples, silhouette_score
silhouette_avg = silhouette_score(X_sc, labels)
sample_silhouette_values = silhouette_samples(X_sc, labels)

In [54]:
silhouette_avg

0.84040314629599244

In [55]:
np.average(silhouettes)

0.87246024908159348

In [56]:
np.vstack((labels, silhouettes))

array([[ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         2.        ,  2.        ,  2.        ,  2.        ,  2.        ,
         2.        ,  2.        ,  2.        ,  2.        ,  2.        ,
         2.        ,  2.        ,  2.        ,  2.        ,  2.        ,
         2.        ,  2.        ,  2.        ,  2.        ,  2.        ],
       [ 0.88457031,  0.8854885 ,  0.89137686,  0.88966028,  0.88329108,
         0.88233324,  0.88372468,  0.89408677,  0.