# Optimization project: clustering

### We import relevant packages

In [1]:
import numpy as np
from numpy import random
import pandas as pd
from sklearn.neighbors import DistanceMetric
import functools
import operator
import matplotlib.pyplot as plt
import random
from scipy.spatial import distance

__Given that we don't have the proper datafiles to run the code, we will use synthetic ones to still do the work, and
as soon as we have the real files we will just substitute them.__

In [2]:
#Simulate dataframe
def simulate_data_2(N):
    X1 = np.random.uniform(low=0, high=25, size=(N))
    X2 = np.random.uniform(low=0, high=25, size=(N))
    return X1, X2

In [3]:
X1,X2 = simulate_data_2(200)

In [4]:
df = pd.DataFrame({'X1': X1, 'X2': X2}, columns=['X1', 'X2'])

In [None]:
def simulate_data_5(N):
    X1 = np.random.uniform(low=0, high=25, size=(N))
    X2 = np.random.uniform(low=0, high=25, size=(N))
    X3 = np.random.uniform(low=0, high=25, size=(N))
    X4 = np.random.uniform(low=0, high=25, size=(N))
    X5 = np.random.uniform(low=0, high=25, size=(N))
    return X1, X2, X3, X4, X5 

# Minimum Spanning Tree Clustering


### Computing all the distances

In [5]:
from scipy.spatial import distance
def compute_dist(df):
    result = []
    for i in range(len(df)):
        for j in range(i+1,len(df)):
            # We use the package "distance" to compute the euclidean distance between
            # the different points in the dataset
            d = distance.euclidean(df.iloc[i].to_list(),df.iloc[j].to_list())
            result.append([i, j, d])

    ordered_result = sorted(result, key=lambda t: t[::-1])
    return ordered_result

In [6]:
ordered_result = compute_dist(df)

In [18]:
ordered_result

[[6, 19, 0.01912126936058454],
 [48, 119, 0.058788560775773345],
 [84, 121, 0.08428137886499092],
 [15, 126, 0.09415653794692111],
 [132, 176, 0.1600210694461165],
 [80, 83, 0.23764829828874387],
 [119, 176, 0.27766761088108394],
 [104, 188, 0.29126864852169704],
 [11, 74, 0.29246774058515157],
 [7, 163, 0.3151731526172911],
 [48, 176, 0.33561202325335965],
 [148, 172, 0.35076654082703823],
 [120, 180, 0.35765714515719677],
 [152, 154, 0.360160742772844],
 [35, 113, 0.3611177096443491],
 [67, 159, 0.36325509337443807],
 [100, 163, 0.36618395728234554],
 [41, 74, 0.37645888733188354],
 [119, 132, 0.3814466908387003],
 [72, 167, 0.38686957041180864],
 [38, 42, 0.4103629504958999],
 [48, 132, 0.4322966283509476],
 [64, 137, 0.4580427029971367],
 [145, 196, 0.4596581008738265],
 [4, 101, 0.4618967846831789],
 [10, 62, 0.47297229615437014],
 [29, 40, 0.4812637909122867],
 [91, 128, 0.4914432833285339],
 [38, 63, 0.5066983412776205],
 [98, 173, 0.5235084343306585],
 [86, 149, 0.5243869532162

To find the minimum spanning tree it makes sense to differentiate the following cases:

    case 1: None of them are in a cluster:
    case 2: Only one of them is already in a cluster:
    case 3: Both of them are already in a cluster:
        --> 3a: Both of them are in the same cluster: do nothing
        --> 3b: They are in different clusters: merge the 2 corresponding clusters


In [81]:
def MST_clustering(K, N, ordered_result):
    # We initialize the variables that we will need in the outter for loop
    cluster_dict = {}
    k = 0
    vertex = []
    counter = 0
    i = 0
    final_number_clusters = N-K
    for ite in ordered_result:
        
        # Here we set the condition to include as clusters the points that are left alone in the algorithm. 
        # At this point we will iterate through the rest of edges and we will add the points which have not been
        # visited before (if they have been visited, they already are in one component of the MST)
        
        if counter == final_number_clusters:
            for ite in ordered_result[i:]:
                if ite[0] not in vertex:
                    k += 1
                    cluster_dict[k] = [ite[0]]
                    vertex.append(ite[0])
                if ite[1] not in vertex:
                    k += 1
                    cluster_dict[k] = [ite[1]]  
                    vertex.append(ite[1])
            return cluster_dict
    
        # Here we want to know whether the vertices of the edge of this iteartion have already been included in any
        # of the components of the MST. We assign a key to these two vertices. We assign key = -1 to the vertices 
        # that have not been visited before, and if the vertex have already been assigned to a cluster then we assign
        # to him its component/cluster, so key = cluster.
        
        key_0 = -1
        key_1 = -1
        
        if ite[0] not in vertex:
            pass
        else:
            for cluster in cluster_dict:
                if ite[0] in cluster_dict[cluster]:
                    key_0 = cluster
                    
        if ite[1] not in vertex:
            pass
        else:
            for cluster in cluster_dict:
                if ite[1] in cluster_dict[cluster]:
                    key_1 = cluster

                    
        # Now, we have four different cases. (a) None of the vertices have been added to a cluster, (b) one of the
        # vertices has been added to a cluster, (c) both of them have been added to the same cluster (so we will not
        # do anything given that we would be creating a cycle) and (d) both of them have been added to a cluster, but
        # each of them is in a different cluster.
        
        # case (a)
        
        if (key_0 == -1) and (key_1 == -1):
            k += 1
            cluster_dict[k] = [ite[0]] 
            cluster_dict[k] += [ite[1]]
            counter +=1
            vertex.append(ite[0])
            vertex.append(ite[1])
            
        # case (b)
        
        elif (key_0 == -1) and (key_1 != -1):
            cluster_dict[key_1] += [ite[0]]
            counter +=1
            vertex.append(ite[0])
        elif (key_0 != -1) and (key_1 == -1):
            cluster_dict[key_0] += [ite[1]]
            counter +=1
            vertex.append(ite[1])
            
        # case (c) and (d)
        else:
            
            # case (c)
            if key_0 == key_1:
                pass
            
            # case (d)
            else:
                cluster_dict[key_0] += cluster_dict[key_1]
                del cluster_dict[key_1]
                counter +=1
        i +=1
    return cluster_dict

In [82]:
%%time
for k in range(2,201):
    cluster_dict = MST_clustering(k, 200, ordered_result)

CPU times: user 27.2 s, sys: 135 ms, total: 27.3 s
Wall time: 27.7 s


In [76]:
cluster_dict = MST_clustering(200, 200, ordered_result)

In [77]:
len(cluster_dict)

200

In [None]:
def get_df_2(cluster_dict):
    ind_list = []
    clust_list = []

    for k, v in cluster_dict.items():
        [clust_list.append(k) for i in v]
        [ind_list.append(val) for val in v]

    df_s = pd.DataFrame()
    df_s['index'] = ind_list
    df_s['Cluster'] = clust_list
    
    df_s = df_s.sort_values('index').set_index('index')
    
    df_s['X1'] = X1
    df_s['X2'] = X2
    return df_s

In [None]:
def get_df_5(cluster_dict):
    ind_list = []
    clust_list = []

    for k, v in cluster_dict.items():
        [clust_list.append(k) for i in v]
        [ind_list.append(val) for val in v]

    df_s = pd.DataFrame()
    df_s['index'] = ind_list
    df_s['Cluster'] = clust_list
    
    df_s = df_s.sort_values('index').set_index('index')
    
    df_s['X1'] = X1
    df_s['X2'] = X2
    df_s['X3'] = X3
    df_s['X4'] = X4
    df_s['X5'] = X5
    return df_s

# Davies Bouldin algorithm

In [None]:
def Davies_Bouldin(cluster_dict,df):
    clusters = []
    for n in cluster_dict:
        clusters.append(n)
    centroid_dict = {}
    for n in clusters:
        mask = (df.Cluster == n)
        nice = df[mask]
        centroid = nice.mean()
        centroid = centroid.to_list()
        list_ = []
        for el in range(0,len(list(df.columns[:-1]))):
            list_.append(centroid[el])
            centroid_dict[n] = list_
    S_i = {}
    for n in cluster_dict:
        sum_ = 0
        for el in cluster_dict[n]:
            sum_ += distance.euclidean(df.iloc[el][:-1],centroid_dict[n])
            average = sum_/len(cluster_dict[n])
            S_i[n] = average
    M_ij = {}
    centroids = list(centroid_dict.keys())
    for i in range(0,len(centroids)):
        for j in range(i+1,len(centroids)):
            d = distance.euclidean(centroid_dict[centroids[i]],centroid_dict[centroids[j]])
            M_ij[(centroids[i],centroids[j])] = d
    dispersion = list(S_i.keys())
    D_i = {}
    for i in range(0, len(dispersion)):
        D_i[dispersion[i]] = 0
        for j in range(0,len(dispersion)):
            if i!=j:
                try:
                    R_ij = (S_i[dispersion[i]]+S_i[dispersion[j]])/(M_ij[(dispersion[i],dispersion[j])])
                    if R_ij >= D_i[dispersion[i]]:
                        D_i[dispersion[i]] = R_ij
                except:
                    R_ij = (S_i[dispersion[i]]+S_i[dispersion[j]])/(M_ij[(dispersion[j],dispersion[i])])
                    if R_ij >= D_i[dispersion[i]]:
                        D_i[dispersion[i]] = R_ij
    count = 0
    for n in D_i:
        count += D_i[n]
    DB = count/len(D_i)
    return DB

In [None]:
#def MST_clustering_simon(K, ordered_result):
#    cluster_dict = {}
#    cluster_count = 1
#    edge_count = 0
#    visited = []
#    
#    
#    for ite in ordered_result:
#        # termination condition: if we are at K clusters, we just add the unvisited vertices as lone clusters
#        if edge_count == 200 - K:
#            for i in range(2):
#                if ite[i] not in visited:
#                    cluster_dict[cluster_count] = [ite[i]]
#                    visited.append(ite[i])
#                    cluster_count += 1
#        # case 1:
#        elif ite[0] not in visited and ite[1] not in visited:
#            cluster_dict[cluster_count] = ite[:2]
#            [visited.append(n) for n in ite[:2]]
#            cluster_count += 1
#            edge_count += 1
#        # case 2:
#        elif ite[0] in visited and ite[1] not in visited:
#            for key, v in cluster_dict.items():
#                if ite[0] in v:
#                    cluster_dict[key].append(ite[1])
#                    visited.append(ite[1])
#                    edge_count += 1
#        # case 2b: The other one:
#        elif ite[1] in visited and ite[0] not in visited:
#            for key, v in cluster_dict.items():
#                if ite[1] in v:
#                    cluster_dict[key].append(ite[0])
#                    visited.append(ite[0])
#                    edge_count += 1
#        # case 3:
#        elif ite[0] in visited and ite[1] in visited:
#            for num in range(2):
#                for key, v in cluster_dict.items():
#                    if ite[num] in v:
#                        # case 3: They are in different clusters: merge the 2 corresponding clusters
#                        if ite[1-num] not in v:
#                            if num == 0:
#                                #cluster_dict[key].append(ite[num])
#                                store_cluster = cluster_dict[key].copy()
#                                cluster_dict[key] = []
#                            elif num == 1:
#                                [cluster_dict[key].append(n) for n in store_cluster]
#                                edge_count += 1
#                                
#    # drop the keys where values were added to another key due to merging two clusters
#    cluster_dict = {k: v for k, v in cluster_dict.items() if v != []}
#    return cluster_dict

# The Dunn Index

In [None]:
def max_within(ordered_result, cluster_dict):
    for i in range(len(ordered_result)):
        for k,v in cluster_dict.items():
            if ordered_result[-i-1][0] in v and ordered_result[-i-1][1] in v:
                return ordered_result[-i-1][2]

def min_between(ordered_result, cluster_dict):
    for i in range(len(ordered_result)):
        for k,v in cluster_dict.items():
            if ordered_result[i][0] in v and ordered_result[i][1] not in v:
                return ordered_result[i][2]

def dunn(ordered_result, cluster_dict):
    num = max_within(ordered_result, cluster_dict)
    den = min_between(ordered_result, cluster_dict)
    return num/den

# Running the pipeline:

### For the dataset with 2 columns ('Synthetic')

In [None]:
N = 200
X1, X2 = simulate_data_2(N)
df = pd.DataFrame({'X1': X1, 'X2': X2}, columns=['X1', 'X2'])
ordered_result = compute_dist(df)

def main_MST(k, df):

    cluster_dict = MST_clustering(k, N, ordered_result)
    
    df['Cluster']= get_df_2(cluster_dict)['Cluster']
    
    DB = Davies_Bouldin(cluster_dict,df)

    Dunn = dunn(ordered_result, cluster_dict)
    
    return DB, Dunn

In [None]:
DB = main_MST(15,df)[0]
Dunn = main_MST(15,df)[1]

### For the dataset with 5 columns ('Thyroid')

In [None]:
N = 200
X1, X2, X3, X4, X5 = simulate_data_5(N)
df = pd.DataFrame({'X1': X1, 'X2': X2,'X3': X3, 'X4': X4, 'X5': X5}, columns=['X1', 'X2','X3', 'X4', 'X5'])
ordered_result = compute_dist(df)

def main(K, df):

    cluster_dict = MST_clustering(K, N, ordered_result)
    
    df['Cluster']= get_df_5(cluster_dict)['Cluster']
    
    DB = Davies_Bouldin(cluster_dict,df)

    Dunn = dunn(ordered_result, cluster_dict)
    
    return DB, Dunn

n_clusters = []
DB = []
Dunn = []

for K in range(2, N-1, 10):
    n_clusters.append(K)
    DB.append(main(K, df)[0])
    Dunn.append(main(K,df)[1])

In [None]:
import seaborn as sns
sns.lineplot(n_clusters,DB)
plt.show()
sns.lineplot(n_clusters,Dunn)
plt.show()

## K-Means Clustering

In [None]:
# Randomly assign indeces of cluster centroids:
def initiate_centroids(n, df):
    centroids = []
    random.seed(42) # use this since we want to compare the different number of clusters
    
    #generate random centroid indeces
    initial_index_centroid = random.sample(range(0, len(df)), n)
    
    #find the data points corresponding the the indeces:
    for i in initial_index_centroid:
        centroids.append(df.loc[i])
    return centroids


# To calculate the distance between two points:
def calc_distance(X1, X2):
    return(sum((X1 - X2)**2))**0.5


# To find the closest centroid to each data point:
def findClosestCentroids(centroids, df):
    assigned_centroid = []
    
    #iterate over every data point in the dataframe:
    for index, row in df.iterrows():
        distance=[]
        
        #find distance of data point with each cluster:
        for center in centroids:
            distance.append(calc_distance(row[:-1], center[:-1]))
            
        #assign data point to closest cluster:
        assigned_centroid.append(np.argmin(distance))
    return assigned_centroid


#To update the centroid of the clusters:
def calc_centroids(clusters, df):
    
    #initiate empty list for new centroids of each cluster:
    new_centroids = []
    
    #df including each point and its respective cluster
    new_df = pd.concat([pd.DataFrame(df), pd.DataFrame(clusters, columns=['cluster'])],
                      axis=1)
    
    #iterate over the distinct clusters
    for c in set(new_df['cluster']):
        
        #take out the data points corresponding to each cluster:
        current_cluster = new_df[new_df['cluster'] == c][new_df.columns[:-1]]
        
        #find the new cluster centroid which is the mean of the clusters we already assigned
        cluster_mean = current_cluster.mean(axis=0)
        
        #append the new centroid
        new_centroids.append(cluster_mean)
        
    return new_centroids


#Recursively find and update cluster centroids:
#n: number of clusters, df: dataframe of data points, iterations: number of iterations
def recursive_centroid_find(n, df, iterations):
    #initiate centroids:
    centroids = initiate_centroids(n, df)
    # Recursively call the functions again to update the mean of the clusters:
    for i in range(iterations):
        get_centroids = findClosestCentroids(centroids, df)
        centroids = calc_centroids(get_centroids, df)
        #print(pd.DataFrame(centroids))
        
        #plot the centroids after every iteration:
        #plt.figure()
        #plt.scatter(np.array(centroids)[:, 0], np.array(centroids)[:, 1], color='red')
        #plt.scatter(df.X1, df.X2, alpha=0.1)
        #plt.show()
    return pd.DataFrame(centroids)

In [None]:
def get_cluster_dict(df):
    cluster_dict = {}
    visited = []
    for n in range(0,len(df)):
        if int(df.iloc[n][2]) in visited:
            cluster_dict[int(df.iloc[n][2])] += [n]
        else:
            cluster_dict[int(df.iloc[n][2])] = [n]
            visited.append(int(df.iloc[n][2]))
    return cluster_dict

# Running the pipeline:

### For the dataset with 2 columns ('Synthetic')

In [None]:
N = 200
X1, X2 = simulate_data_2(N)
df = pd.DataFrame({'X1': X1, 'X2': X2}, columns=['X1', 'X2'])

def main_KMeans(k, df):
    
    centroids = initiate_centroids(k, df)
    df['Cluster'] = findClosestCentroids(centroids, df)
    cluster_dict = get_cluster_dict(df)
    DB = Davies_Bouldin(cluster_dict,df)
    Dunn = dunn(ordered_result, cluster_dict)
    
    return DB, Dunn

In [None]:
DB, Dunn = main_KMeans(15,df)

In [None]:
DB

In [None]:
Dunn

### For the dataset with 5 columns ('Thyroid')

In [None]:
N = 200
X1, X2, X3, X4, X5 = simulate_data_5(N)
df = pd.DataFrame({'X1': X1, 'X2': X2,'X3': X3, 'X4': X4, 'X5': X5}, columns=['X1', 'X2','X3', 'X4', 'X5'])

def main_KMeans(k, df):
    
    centroids = initiate_centroids(k, df)
    df['Cluster'] = findClosestCentroids(centroids, df)
    cluster_dict = get_cluster_dict(df)
    DB = Davies_Bouldin(cluster_dict,df)
    Dunn = dunn(ordered_result, cluster_dict)
    
    return DB, Dunn

In [None]:
n_clusters = []
DB = []
Dunn = []
for K in range(2, N-1, 10):
    n_clusters.append(K)
    DB.append(main(K, df)[0])
    Dunn.append(main(K,df)[1])

In [None]:
import seaborn as sns
sns.lineplot(n_clusters,DB)
plt.show()
sns.lineplot(n_clusters,Dunn)
plt.show()