In [1]:
import pandas as pd
import numpy as np
import random

#### import data

In [2]:
df = pd.read_csv('cluster.csv', header = None)

# convert dataframe into numpy array
X = df.values

#### Standardize the data points

In [3]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_sc = sc.fit_transform(X)

 #### Define a function to calculate distances to centroids for all data points - can be further optimized (with the np.vectorize function etc.)

In [4]:
def distance(points1, points2):
    distances = np.zeros(shape=(points1.shape[0], points2.shape[0]))
    distanceToCentroids = lambda point: \
        np.array([np.linalg.norm(point - centroid) for centroid in points2],dtype = float)

    # Loop over data points:
    for i in range(points1.shape[0]):
        distances[i,] = distanceToCentroids(points1[i, ])
    
    return(distances)

#### Define the function to iteratively get find the cluster centroids

In [5]:
def khmeans(X, n_clusters, p=2, max_iter=100, tol=1e-4):
        
    # X : array-like matrix, already standardized
    n_samples = X.shape[0]
    n_features = X.shape[1]
        
    # initialize centroids randomly
    centroids = X[random.sample(range(n_samples), n_clusters), ]
    
    # initialize an array to store the memebership values for all data points
    membership = np.zeros(shape=(n_samples, n_clusters))
    
    # initialize an array to store values of weight for all data points
    weight = np.zeros(shape=(n_samples, ))
    
    # initialize upper and lower part of the new center location calculation formula
    c_upper = np.zeros(shape=(n_clusters, n_features))
    c_lower = np.zeros(shape=(n_clusters, ))
    
    
    for iteration in range(max_iter):
        
        # calculate the objective function values of the KHM algorithm
        distances = distance(X, centroids)
        reciprocal = np.reciprocal(distances**p)
        reciprocal[np.isinf(reciprocal)] = 0
        obj = np.sum(n_clusters / np.sum(reciprocal, axis=1))
        
        # calculate the the grade of membership value of each data point to each centroid
        d = distances**(-1)
        d[np.isinf(d)] = 0
        for i in range(n_samples): 
            membership[i,] = (d**(p+2))[i]/np.sum(d**(p+2), axis=1)[i]
            
        # calculate the weight of each data point
        for i in range(n_samples):
            weight[i] = np.sum(d**(p+2), axis=1)[i]/(np.sum(d**p, axis=1)[i])**2
        
        # calculate the new center location with the membership and weight of each point
        for i in range(n_samples):
            c_upper += np.matmul((membership[i]*weight[i]).reshape(n_clusters,1),X[i].reshape(1,n_features))
            c_lower += membership[i]*weight[i]
        
        for i in range(len(c_upper)):
            centroids[i] = c_upper[i]/c_lower[i]
            
        obj_previous = obj
        distances = distance(X, centroids)
        labels = np.argmin(distances, axis = 1)
        reciprocal = np.reciprocal(distances**p)
        reciprocal[np.isinf(reciprocal)] = 0
        obj = np.sum(n_clusters / np.sum(reciprocal, axis=1))
        
            
        # stop if the objective function does not change significantly    
        if (np.linalg.norm(obj - obj_previous) < tol):
            break  
        
    return(centroids, obj, labels)

In [6]:
# calculate the silhouette scores for each data point based on the new clusters

def silhouettes(labels, n_samples):
    
    # initiate an array to store the silhouette scores for all data points
    silhouette = np.zeros(shape=(n_samples, ))
    
    for i in range(n_samples):
        a_i = np.average(np.ma.masked_equal(distance(X[labels == labels[i,], ], X[i:i+1,]),0))
        b_i = np.min(distance(X[labels != labels[i,], ], X[i:i+1,]))
        silhouette[i,] = (b_i - a_i)/np.max([a_i, b_i])
    return(silhouette)

In [7]:
centroids, obj, labels = khmeans(X_sc, n_clusters=3, p=4)



In [8]:
from sklearn.metrics import silhouette_samples, silhouette_score
silhouette_avg = silhouette_score(X_sc, labels)
sample_silhouette_values = silhouette_samples(X_sc, labels)

In [9]:
silhouette_avg

0.84040314629599244

In [13]:
np.average(silhouettes(labels=labels, n_samples=X_sc.shape[0]))

  if __name__ == '__main__':


0.84465510417649192

In [14]:
np.vstack((labels, silhouettes(labels=labels, n_samples=X_sc.shape[0])))

  if __name__ == '__main__':


array([[ 2.        ,  2.        ,  2.        ,  2.        ,  2.        ,
         2.        ,  2.        ,  2.        ,  2.        ,  2.        ,
         2.        ,  2.        ,  2.        ,  2.        ,  2.        ,
         2.        ,  2.        ,  2.        ,  2.        ,  2.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  1.        ,  1.        ,  1.        ,  1.        ],
       [ 0.85870553,  0.86266847,  0.87048271,  0.86510725,  0.85929719,
         0.86006186,  0.85892299,  0.87218171,  0.