In [1]:
import pandas as pd
import numpy as np
import random

#### import data

In [2]:
df = pd.read_csv('cluster.csv', header = None)

# convert dataframe into numpy array
X = df.values

#### Standardize the data points

In [3]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_sc = sc.fit_transform(X)

 #### Define a function to calculate distances to centroids for all data points - can be further optimized (with the np.vectorize function etc.)

In [80]:
def distance(points1, points2):
    distances = np.zeros(shape=(points1.shape[0], points2.shape[0]))
    distanceToCentroids = lambda point: \
        np.array([np.linalg.norm(point - centroid) for centroid in points2],dtype = float)

    # Loop over data points:
    for i in range(points1.shape[0]):
        distances[i,] = distanceToCentroids(points1[i, ])
    
    return(distances)

#### Define the function to iteratively get find the cluster centroids

In [168]:
def k_centroids(X, n_clusters, init='k-means++'
                , precompute_distances='auto', n_init=10
                , max_iter=300, tol=1e-4
#                 , random_state=None, n_jobs=1
#                 ,algorithm="auto", return_n_iter=False
               ):
    # X : array-like matrix, already standardized
    
    n_samples = X.shape[0]
    n_features = X.shape[1]
    
    # initialize the centroids
    # random sampling, can use other methods like kmeans++
    centroids = X[random.sample(range(n_features), n_clusters), ]
    
    # calculate the distances to clusters, clustering labels and the resulting inertia
    distances = distance(X, centroids)
    labels = np.argmin(distances, axis = 1)
    inertia = np.sum((X - centroids[labels])**2, dtype=np.float64)
    
    # initiate an array to store the silhouette scores for all data points
    silhouettes = np.zeros(shape=(n_samples, )) 
    
    for iteration in range(max_iter):
        
        n_samples_in_cluster = np.bincount(labels, minlength = n_clusters)

        # calculate the new centroids by taking the means of data points in the same cluster
        # this can be customized with other approaches like KMedoids 
        centroids = np.zeros(shape = (n_clusters, n_features))
        for i in range(n_samples):
            for j in range(n_features):
                centroids[labels[i], j] += X[i, j]

        centroids /= n_samples_in_cluster[:, np.newaxis]
        distances = distance(X, centroids)
        labels = np.argmin(distances, axis = 1)
        inertia_previous = inertia
        inertia = np.sum((X - centroids[labels])**2, dtype=np.float64)
        
        # calculate the silhouette scores for each data point based on the new clusters
        for i in range(n_samples):
            a_i = np.average(np.ma.masked_equal(distance(X[labels == labels[i,], ], X[i:i+1,]),0))
            b_i = np.min(distance(X[labels != labels[i,], ], X[i:i+1,]))
            silhouettes[i,] = (b_i - a_i)/np.max([a_i, b_i])

        if (np.linalg.norm(inertia - inertia_previous) < tol):
            break
        
        return(centroids, inertia, labels, silhouettes)

In [169]:
centroids, inertia, labels, silhouettes = k_centroids(X_sc, 3)

In [174]:
from sklearn.metrics import silhouette_samples, silhouette_score
silhouette_avg = silhouette_score(X_sc, labels)
sample_silhouette_values = silhouette_samples(X_sc, labels)

In [178]:
silhouette_avg

0.84040314629599244

In [177]:
np.average(silhouettes)

0.84472518571001864

In [170]:
np.vstack((labels, silhouettes))

array([[ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         2.        ,  2.        ,  2.        ,  2.        ,  2.        ,
         2.        ,  2.        ,  2.        ,  2.        ,  2.        ,
         2.        ,  2.        ,  2.        ,  2.        ,  2.        ,
         2.        ,  2.        ,  2.        ,  2.        ,  2.        ],
       [ 0.85842484,  0.86265703,  0.87066743,  0.86483653,  0.8591962 ,
         0.86005138,  0.85930927,  0.87219249,  0.