# Optimization project: clustering

### We import relevant packages

In [1]:
import numpy as np
from numpy import random
import pandas as pd
from sklearn.neighbors import DistanceMetric
import functools
import operator
import matplotlib.pyplot as plt
import random
from scipy.spatial import distance
from sklearn.metrics import davies_bouldin_score
import seaborn as sns

# Minimum Spanning Tree Clustering


### Computing all the distances

In [2]:
from scipy.spatial import distance
def compute_dist(df):
    """Return every pairwise Euclidean distance between the rows of ``df``.

    Each entry of the returned list is ``[i, j, d]`` where ``i < j`` are
    positional row indices and ``d`` is the Euclidean distance between those
    two rows.  Entries are ordered by ascending ``d``; ties are broken by
    ``j`` and then by ``i``.
    """
    n_rows = len(df)
    # Convert each row to a plain list once instead of once per pair.
    points = [df.iloc[pos].to_list() for pos in range(n_rows)]

    edges = [
        [i, j, distance.euclidean(points[i], points[j])]
        for i in range(n_rows)
        for j in range(i + 1, n_rows)
    ]

    # Reversing an entry gives the sort key (d, j, i).
    edges.sort(key=lambda edge: edge[::-1])
    return edges

To build the minimum spanning tree we process the edges in increasing order of length; for each edge, its two endpoints fall into one of the following cases:

    case 1: None of them are in a cluster:
    case 2: Only one of them is already in a cluster:
    case 3: Both of them are already in a cluster:
        --> 3a: Both of them are in the same cluster: do nothing
        --> 3b: They are in different clusters: merge the 2 corresponding clusters


In [3]:
def MST_clustering(K, N, ordered_result):
    """Cluster points by growing a minimum spanning forest (Kruskal style).

    Edges are consumed in the order given and accepted whenever they join two
    different components.  After ``N - K`` edges have been accepted the forest
    has ``K`` components; every vertex not yet touched is then added as a
    singleton cluster and the result is returned.

    Parameters
    ----------
    K : int
        Desired number of clusters.
    N : int
        Number of points in the dataset.
    ordered_result : sequence
        Entries ``[i, j, d]`` sorted by ascending distance ``d`` (the format
        produced by ``compute_dist``).

    Returns
    -------
    dict
        Maps a cluster label to the list of vertex indices in that cluster.
        Labels are positive integers but need not be contiguous, because the
        label of an absorbed cluster is dropped when two clusters merge.

    Notes
    -----
    Vertices are discovered only through the edges, so a point that appears
    in no edge (e.g. ``N == 1`` with an empty edge list) is not reported.
    If ``N - K`` accepted edges are never reached (``K`` outside ``1..N``),
    the loop simply runs out of edges and the clusters built so far are
    returned.
    """
    cluster_dict = {}
    # vertex -> label of the cluster that currently holds it; replaces the
    # linear scans over a vertex list and over every cluster's members.
    label_of = {}
    next_label = 0
    merges_done = 0
    merges_needed = N - K

    for position, edge in enumerate(ordered_result):
        if merges_done == merges_needed:
            # The forest already has K components: every vertex that was
            # never touched becomes its own singleton cluster.
            for remaining in ordered_result[position:]:
                for vertex in (remaining[0], remaining[1]):
                    if vertex not in label_of:
                        next_label += 1
                        cluster_dict[next_label] = [vertex]
                        label_of[vertex] = next_label
            return cluster_dict

        u, v = edge[0], edge[1]
        key_u = label_of.get(u)
        key_v = label_of.get(v)

        if key_u is None and key_v is None:
            # Neither endpoint seen before: start a new cluster.
            next_label += 1
            cluster_dict[next_label] = [u, v]
            label_of[u] = next_label
            label_of[v] = next_label
        elif key_u is None:
            # Only v is clustered: u joins v's cluster.
            cluster_dict[key_v].append(u)
            label_of[u] = key_v
        elif key_v is None:
            # Only u is clustered: v joins u's cluster.
            cluster_dict[key_u].append(v)
            label_of[v] = key_u
        elif key_u != key_v:
            # Endpoints are in different clusters: merge v's into u's.
            absorbed = cluster_dict.pop(key_v)
            cluster_dict[key_u] += absorbed
            for vertex in absorbed:
                label_of[vertex] = key_u
        else:
            # Same cluster already: accepting the edge would create a cycle.
            continue

        merges_done += 1

    return cluster_dict

In [4]:
def get_df_2(cluster_dict):
    """Turn a ``{label: [point indices]}`` mapping into a label lookup table.

    Returns a DataFrame indexed by point index (index name ``'index'``,
    sorted ascending) with a single column ``'Cluster'`` holding the label of
    the cluster each point belongs to.
    """
    ind_list = []
    clust_list = []

    # Plain extends instead of list comprehensions evaluated only for their
    # side effects.
    for label, members in cluster_dict.items():
        ind_list.extend(members)
        clust_list.extend([label] * len(members))

    df_s = pd.DataFrame({'index': ind_list, 'Cluster': clust_list})
    return df_s.sort_values('index').set_index('index')

In [5]:
def get_df_5(cluster_dict):
    """Turn a ``{label: [point indices]}`` mapping into a label lookup table.

    Returns a DataFrame indexed by point index (index name ``'index'``,
    sorted ascending) with a single column ``'Cluster'`` holding the label of
    the cluster each point belongs to.
    """
    # One (point, label) pair per cluster member; built directly rather than
    # through comprehensions evaluated only for their side effects.
    pairs = [
        (member, label)
        for label, members in cluster_dict.items()
        for member in members
    ]

    df_s = pd.DataFrame(pairs, columns=['index', 'Cluster'])
    return df_s.sort_values('index').set_index('index')

# Davies-Bouldin index

In [6]:
def Davies_Bouldin(cluster_dict, df):
    """Compute the Davies-Bouldin index of a clustering.

    Parameters
    ----------
    cluster_dict : dict
        Maps each cluster label to the list of positional row indices of its
        members in ``df``.
    df : pandas.DataFrame
        Feature columns followed by a ``Cluster`` column as the LAST column;
        the last column is excluded from every distance computation.

    Returns
    -------
    float
        Mean over clusters of the worst ratio
        ``(S_i + S_j) / M_ij``, where ``S_i`` is the average distance of the
        members of cluster ``i`` to its centroid and ``M_ij`` the distance
        between the centroids of clusters ``i`` and ``j``.  Lower is better.

    Notes
    -----
    Centroids are computed from the rows selected by ``df.Cluster`` while the
    scatter uses the positions listed in ``cluster_dict``; the two are
    expected to describe the same partition.  The division by ``M_ij`` is not
    guarded against coincident centroids.  An empty ``cluster_dict`` raises
    ``ZeroDivisionError``.
    """
    n_features = len(df.columns) - 1

    # Centroid of every cluster, restricted to the feature columns.
    centroid_dict = {}
    for label in cluster_dict:
        members = df[df.Cluster == label]
        centroid_dict[label] = members.mean().to_list()[:n_features]

    # S_i: average member-to-centroid distance.  Clusters with no members
    # get no entry and therefore do not take part in the index.
    scatter = {}
    for label, positions in cluster_dict.items():
        if len(positions) == 0:
            continue
        total = sum(
            distance.euclidean(df.iloc[pos][:-1], centroid_dict[label])
            for pos in positions
        )
        scatter[label] = total / len(positions)

    # M_ij: centroid-to-centroid distance, stored under both key orders so
    # the lookup below never has to guess the order (this used to be handled
    # with a bare ``except`` that could also hide genuine errors).
    labels = list(scatter)
    separation = {}
    for idx, first in enumerate(labels):
        for second in labels[idx + 1:]:
            gap = distance.euclidean(centroid_dict[first], centroid_dict[second])
            separation[(first, second)] = gap
            separation[(second, first)] = gap

    # D_i: worst (largest) similarity ratio of cluster i against any other.
    worst_ratio = {}
    for first in labels:
        worst = 0
        for second in labels:
            if first == second:
                continue
            ratio = (scatter[first] + scatter[second]) / separation[(first, second)]
            if ratio >= worst:
                worst = ratio
        worst_ratio[first] = worst

    return sum(worst_ratio.values()) / len(worst_ratio)

# The Dunn Index

In [7]:
def max_within(ordered_result, cluster_dict):
    """Return the largest distance between two points of the same cluster.

    ``ordered_result`` holds ``[i, j, d]`` entries sorted by ascending ``d``;
    it is walked from the longest edge down and the distance of the first
    edge whose two endpoints share a cluster is returned.  Returns ``None``
    when no edge has both endpoints in one cluster.
    """
    groups = list(cluster_dict.values())
    for edge in reversed(ordered_result):
        first, second = edge[0], edge[1]
        for members in groups:
            if first in members and second in members:
                return edge[2]
    return None

def min_between(ordered_result, cluster_dict):
    """Return the smallest distance between two points of different clusters.

    ``ordered_result`` holds ``[i, j, d]`` entries sorted by ascending ``d``;
    it is walked from the shortest edge up and the distance of the first edge
    whose first endpoint lies in a cluster that does not contain the second
    endpoint is returned.  Returns ``None`` when no such edge exists.
    """
    groups = list(cluster_dict.values())
    for edge in ordered_result:
        first, second = edge[0], edge[1]
        for members in groups:
            if first in members and second not in members:
                return edge[2]
    return None

def dunn(ordered_result, cluster_dict):
    """Return the Dunn index of a clustering (higher is better).

    The Dunn index is the smallest distance between points of different
    clusters divided by the largest distance between points of the same
    cluster.  The previous implementation returned the reciprocal of this
    ratio.

    Parameters
    ----------
    ordered_result : sequence
        ``[i, j, d]`` entries sorted by ascending distance ``d``.
    cluster_dict : dict
        Maps each cluster label to the list of point indices it contains.

    Raises
    ------
    ValueError
        If the index is undefined because no two points share a cluster
        (no diameter) or no two points are in different clusters
        (no separation).
    """
    diameter = max_within(ordered_result, cluster_dict)
    separation = min_between(ordered_result, cluster_dict)
    if diameter is None or separation is None:
        raise ValueError(
            "Dunn index is undefined: it needs at least two clusters and at "
            "least one cluster with two or more points"
        )
    return separation / diameter

# Running the pipeline:

### For the dataset with 2 columns ('Synthetic')

In [None]:
# Load the synthetic dataset and precompute the sorted pairwise distances.
# N and ordered_result are module-level names read by main_MST below.
df = pd.read_csv('synthetic_clean.csv')
N = len(df)
ordered_result = compute_dist(df)

def main_MST(k, df):
    """Cluster ``df`` into ``k`` groups with the MST method and score it.

    Reads the module-level ``N`` and ``ordered_result``, and writes the
    resulting labels into ``df['Cluster']`` (the dataframe is modified in
    place).  Returns ``(Davies-Bouldin value, dunn value)``.
    """
    clusters = MST_clustering(k, N, ordered_result)

    # Attach each point's cluster label to the dataframe.
    labels = get_df_2(clusters)
    df['Cluster'] = labels['Cluster']

    db_value = Davies_Bouldin(clusters, df)
    dunn_value = dunn(ordered_result, clusters)
    return db_value, dunn_value

In [None]:
# Score the MST clustering of the synthetic data for 15 clusters.
DB, Dunn = main_MST(15,df)

### For the dataset with 5 columns ('Thyroid')

In [None]:
# Load the thyroid dataset and precompute the sorted pairwise distances.
# N and ordered_result are module-level names read by main below.
df = pd.read_csv('thyroid_clean.csv')
N = len(df)
ordered_result = compute_dist(df)

def main(K, df):
    """Cluster ``df`` into ``K`` groups with the MST method and score it.

    Reads the module-level ``N`` and ``ordered_result``, and writes the
    resulting labels into ``df['Cluster']`` (the dataframe is modified in
    place).  Returns a tuple of the hand-written Davies-Bouldin value, the
    ``dunn`` value, and scikit-learn's ``davies_bouldin_score`` computed on
    the same labels for comparison.
    """
    clusters = MST_clustering(K, N, ordered_result)

    # Attach each point's cluster label to the dataframe.
    labels = get_df_5(clusters)
    df['Cluster'] = labels['Cluster']

    own_db = Davies_Bouldin(clusters, df)
    own_dunn = dunn(ordered_result, clusters)

    # Reference value from scikit-learn, computed on the feature columns only.
    features = df.drop('Cluster', axis=1)
    reference_db = davies_bouldin_score(features, labels=df['Cluster'].to_list())

    return own_db, own_dunn, reference_db

# Sweep the number of clusters and collect the three scores for each K.
n_clusters = []
DB_MST = []
Dunn_MST = []
Package_MST = []
for K in range(2, N-1, 10):
    # main() is deterministic, so run the whole pipeline once per K and
    # unpack its three results instead of recomputing it for every score.
    db_value, dunn_value, package_value = main(K, df)
    n_clusters.append(K)
    DB_MST.append(db_value)
    Dunn_MST.append(dunn_value)
    Package_MST.append(package_value)

In [None]:
import seaborn as sns
# One line plot per score against the number of clusters.  x and y are
# passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.lineplot(x=n_clusters, y=DB_MST)
plt.show()
sns.lineplot(x=n_clusters, y=Package_MST)
plt.show()
sns.lineplot(x=n_clusters, y=Dunn_MST)
plt.show()

## K-Means Clustering

In [8]:
# Randomly pick data points to serve as the initial cluster centroids:
def initiate_centroids(n, df, seed=42):
    """Pick ``n`` distinct rows of ``df`` to serve as initial centroids.

    Parameters
    ----------
    n : int
        Number of centroids to draw.
    df : pandas.DataFrame
        Data points, one per row.
    seed : int, optional
        Seed passed to ``random.seed`` before sampling.  The fixed default
        keeps runs with different numbers of clusters comparable.  Note that
        this reseeds the module-level ``random`` generator.

    Returns
    -------
    list of pandas.Series
        The selected rows, in the order they were sampled.

    Raises
    ------
    ValueError
        From ``random.sample`` when ``n`` exceeds the number of rows.
    """
    random.seed(seed)

    # Sample row POSITIONS, then fetch them positionally: ``.loc`` would
    # treat the positions as index labels and fail (or pick the wrong rows)
    # whenever the dataframe does not carry the default 0..len-1 index.
    chosen_positions = random.sample(range(len(df)), n)
    return [df.iloc[position] for position in chosen_positions]


# To calculate the distance between two points:
def calc_distance(X1, X2):
    """Return the Euclidean distance between ``X1`` and ``X2``.

    Both arguments must support element-wise subtraction and squaring
    (e.g. numpy arrays or pandas Series of equal length).
    """
    difference = X1 - X2
    squared_total = sum(difference ** 2)
    return squared_total ** 0.5


# To find the closest centroid to each data point:
def findClosestCentroids(centroids, df):
    """Return, for each row of ``df``, the position of its nearest centroid.

    The last element of every row and of every centroid is dropped before
    the distance is computed.
    NOTE(review): this presumably expects the last column to be a cluster
    label rather than a feature; confirm against the callers.

    The result is a list with one entry per row, each the index into
    ``centroids`` (as returned by ``np.argmin``) of the closest centroid.
    """
    assigned_centroid = []

    for _, point in df.iterrows():
        features = point[:-1]
        # Distance from this point to every centroid, in centroid order.
        gaps = [calc_distance(features, center[:-1]) for center in centroids]
        assigned_centroid.append(np.argmin(gaps))

    return assigned_centroid


#To update the centroid of the clusters:
def calc_centroids(clusters, df):
    """Return the mean point of every cluster.

    ``clusters`` holds one cluster label per row of ``df``.  The labels are
    attached to the data as an extra trailing column named ``'cluster'``
    (joined on the index), and for each distinct label the column-wise mean
    of the matching rows is computed over all columns except that trailing
    one.  The result is a list of Series, one per distinct label, in the
    iteration order of the set of labels.
    """
    labels = pd.DataFrame(clusters, columns=['cluster'])
    labelled = pd.concat([pd.DataFrame(df), labels], axis=1)

    # Everything except the label column appended above.
    data_columns = labelled.columns[:-1]

    new_centroids = []
    for label in set(labelled['cluster']):
        in_cluster = labelled['cluster'] == label
        members = labelled[in_cluster][data_columns]
        new_centroids.append(members.mean(axis=0))

    return new_centroids


#Iteratively find and update cluster centroids (a plain loop, despite the function's name):
#n: number of clusters, df: dataframe of data points, iterations: number of iterations
def recursive_centroid_find(n, df, iterations):
    """Run ``iterations`` rounds of centroid refinement and return the result.

    Starts from ``n`` centroids chosen by ``initiate_centroids`` and then
    repeats, in a loop (not by recursion), the two steps of assigning every
    point to its closest centroid and recomputing the centroids from that
    assignment.  The final centroids are returned as a DataFrame with one
    row per centroid.
    """
    centroids = initiate_centroids(n, df)

    for _ in range(iterations):
        assignments = findClosestCentroids(centroids, df)
        centroids = calc_centroids(assignments, df)

    return pd.DataFrame(centroids)

In [9]:
def get_cluster_dict_2(df):
    """Group row positions by the cluster label stored in column position 2.

    Returns a dict mapping each label (converted with ``int``) to the list
    of positional row indices carrying that label.  Keys appear in order of
    first occurrence and each list is in ascending row order.
    """
    cluster_dict = {}
    # Read the label column positionally in one go: ``df.iloc[n][2]`` relied
    # on Series[int] falling back to positional access, which pandas has
    # deprecated for non-integer indexes.
    for position, label in enumerate(df.iloc[:, 2]):
        cluster_dict.setdefault(int(label), []).append(position)
    return cluster_dict

In [10]:
def get_cluster_dict_5(df):
    """Group row positions by the cluster label stored in column position 5.

    Returns a dict mapping each label (converted with ``int``) to the list
    of positional row indices carrying that label.  Keys appear in order of
    first occurrence and each list is in ascending row order.
    """
    cluster_dict = {}
    # Read the label column positionally in one go: ``df.iloc[n][5]`` relied
    # on Series[int] falling back to positional access, which pandas has
    # deprecated for non-integer indexes.
    labels = df.iloc[:, 5]
    for position, label in enumerate(labels):
        key = int(label)
        if key in cluster_dict:
            cluster_dict[key].append(position)
        else:
            cluster_dict[key] = [position]
    return cluster_dict

# Running the pipeline:

### For the dataset with 2 columns ('Synthetic')

In [None]:
# Build the synthetic two-feature dataset.
# NOTE(review): simulate_data_2 is not defined in the visible part of this
# notebook; it must be provided before this cell runs.
N = 200
X1, X2 = simulate_data_2(N)
df = pd.DataFrame({'X1': X1, 'X2': X2}, columns=['X1', 'X2'])
# Recompute the pairwise distances for THIS dataframe: main_KMeans passes the
# module-level ordered_result to dunn(), and the value left over from the
# previous dataset would refer to different points.
ordered_result = compute_dist(df)

def main_KMeans(k, df):
    """Assign the points of ``df`` to ``k`` centroids and score the result.

    Reads the module-level ``ordered_result`` and writes the assignment into
    ``df['Cluster']`` (the dataframe is modified in place).  Returns
    ``(Davies-Bouldin value, dunn value)``.

    NOTE(review): only a single assignment against the initial random
    centroids is performed here; ``recursive_centroid_find`` is not used.
    """
    # findClosestCentroids ignores the last element of every row.  Make sure
    # that slot is the label column even on the very first call; otherwise
    # the last feature would be dropped on that call only.
    if 'Cluster' not in df.columns:
        df['Cluster'] = 0

    centroids = initiate_centroids(k, df)
    df['Cluster'] = findClosestCentroids(centroids, df)
    cluster_dict = get_cluster_dict_2(df)

    db_value = Davies_Bouldin(cluster_dict, df)
    # Previously stored under a different name than the one returned,
    # which raised NameError (or returned a stale global).
    dunn_value = dunn(ordered_result, cluster_dict)

    return db_value, dunn_value

In [None]:
# Score the centroid assignment of the synthetic data for 15 clusters.
DB_KMeans, Dunn_KMeans = main_KMeans(15,df)

In [None]:
# Notebook cell output: display the Davies-Bouldin value computed above.
DB_KMeans

In [None]:
# Notebook cell output: display the dunn value computed above.
Dunn_KMeans

### For the dataset with 5 columns ('Thyroid')

In [11]:
# Reload the thyroid dataset and precompute the sorted pairwise distances.
# ordered_result is a module-level name read by main_KMeans below.
df = pd.read_csv('thyroid_clean.csv')
N = len(df)
ordered_result = compute_dist(df)

def main_KMeans(k, df):
    """Assign the points of ``df`` to ``k`` centroids and score the result.

    Reads the module-level ``ordered_result`` and writes the assignment into
    ``df['Cluster']`` (the dataframe is modified in place).  Returns
    ``(Davies-Bouldin value, dunn value)``.

    NOTE(review): only a single assignment against the initial random
    centroids is performed here; ``recursive_centroid_find`` is not used.
    """
    # findClosestCentroids ignores the last element of every row.  Make sure
    # that slot is the label column even on the very first call; otherwise
    # the last feature would be dropped on that call only, making its scores
    # incomparable with those of every later call.
    if 'Cluster' not in df.columns:
        df['Cluster'] = 0

    centroids = initiate_centroids(k, df)
    df['Cluster'] = findClosestCentroids(centroids, df)
    cluster_dict = get_cluster_dict_5(df)

    db_value = Davies_Bouldin(cluster_dict, df)
    dunn_value = dunn(ordered_result, cluster_dict)

    return db_value, dunn_value

In [None]:
# Sweep the number of clusters and collect both scores for each K.
n_clusters = []
DB_KMeans = []
Dunn_KMeans = []
for K in range(2, N-1, 10):
    # Run the pipeline once per K and unpack both scores instead of
    # recomputing the whole assignment for each of them.
    db_value, dunn_value = main_KMeans(K, df)
    n_clusters.append(K)
    DB_KMeans.append(db_value)
    Dunn_KMeans.append(dunn_value)

In [None]:
import seaborn as sns
# One line plot per score against the number of clusters.  x and y are
# passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.lineplot(x=n_clusters, y=DB_KMeans)
plt.show()
sns.lineplot(x=n_clusters, y=Dunn_KMeans)
plt.show()