# Optimization projec: clustering

In [1]:
import numpy as np
from numpy import random
import pandas as pd
from sklearn.neighbors import DistanceMetric
import functools
import operator

__Given that we don't have the proper datafiles to run the code, we will use synthetic ones to still do the work, and
as soon as we have the real files we will just substitute them.__

In [2]:
N=200

In [3]:
#Simulate dataframe
X1 = np.random.uniform(low=0, high=25, size=(N,))
X2 = np.random.uniform(low=5, high=30, size=(N))
df = pd.DataFrame({'X1': X1, 'X2': X2}, columns=['X1', 'X2'])

# Minimum Spanning Tree Clustering


## naive approach: compute all distances:

In [23]:
x1 = df.X1.values
x2 = df.X2.values

In [24]:
result = []
for i in range(len(x1)):
    for j in range(i+1,len(x1)):
        d = np.linalg.norm(np.array([x1[i], x2[i]]) - np.array([x1[j], x2[j]]))
        result.append([i, j, d])

ordered_result = sorted(result, key=lambda t: t[::-1])

## MST

To find the minimum spanning tree it makes sense to differentiate the following cases:

    case 1: None of them are in a cluster:
    case 2: Only one of them is already in a cluster:
    case 3: Both of them are already in a cluster:
        --> 3a: Both of them are in the same cluster: do nothing
        --> 3b: They are in different clusters: merge the 2 corresponding clusters


In [25]:
ordered_result

[[47, 128, 0.11117105681731983],
 [100, 101, 0.12791363941977027],
 [40, 170, 0.13655647108645047],
 [162, 181, 0.1429270634991497],
 [19, 169, 0.21790540717753648],
 [26, 76, 0.2200343126436164],
 [24, 52, 0.2265581004839222],
 [52, 178, 0.24010665095725223],
 [37, 78, 0.2605302002604792],
 [90, 94, 0.2977652032927554],
 [27, 146, 0.31123610085087927],
 [98, 112, 0.3126431961550898],
 [51, 108, 0.31548294602291754],
 [143, 179, 0.3164715911363514],
 [16, 150, 0.3329165844024154],
 [45, 161, 0.33331122899315185],
 [149, 184, 0.3474376939132938],
 [139, 191, 0.36562822139951934],
 [58, 147, 0.3719192553721335],
 [4, 145, 0.38057658644401926],
 [24, 178, 0.38067075249471133],
 [42, 139, 0.3958148907118636],
 [61, 148, 0.4550163361532491],
 [2, 111, 0.46201641305997765],
 [88, 125, 0.47600888993968293],
 [9, 96, 0.47945869120029994],
 [67, 103, 0.5062262590809872],
 [82, 130, 0.5108998268531122],
 [63, 110, 0.5221397716833585],
 [5, 39, 0.5403981455510716],
 [91, 175, 0.5414252471294938],

In [26]:
def MST_clustering(K, ordered_result):
    cluster_dict = {}
    cluster_count = 1
    edge_count = 0
    visited = []
    
    
    for ite in ordered_result:
        # termination condition: if we are at K clusters, we just add the unvisited vertices as lone clusters
        if edge_count == 200 - K:
            for i in range(2):
                if ite[i] not in visited:
                    cluster_dict[cluster_count] = [ite[i]]
                    visited.append(ite[i])
                    cluster_count += 1
        # case 1:
        elif ite[0] not in visited and ite[1] not in visited:
            cluster_dict[cluster_count] = ite[:2]
            [visited.append(n) for n in ite[:2]]
            cluster_count += 1
            edge_count += 1
        # case 2:
        elif ite[0] in visited and ite[1] not in visited:
            for key, v in cluster_dict.items():
                if ite[0] in v:
                    cluster_dict[key].append(ite[1])
                    visited.append(ite[1])
                    edge_count += 1
        # case 2b: The other one:
        elif ite[1] in visited and ite[0] not in visited:
            for key, v in cluster_dict.items():
                if ite[1] in v:
                    cluster_dict[key].append(ite[0])
                    visited.append(ite[0])
                    edge_count += 1
        # case 3:
        elif ite[0] in visited and ite[1] in visited:
            for num in range(2):
                for key, v in cluster_dict.items():
                    if ite[num] in v:
                        # case 3: They are in different clusters: merge the 2 corresponding clusters
                        if ite[1-num] not in v:
                            if num == 0:
                                #cluster_dict[key].append(ite[num])
                                store_cluster = cluster_dict[key].copy()
                                cluster_dict[key] = []
                            elif num == 1:
                                [cluster_dict[key].append(n) for n in store_cluster]
                                edge_count += 1
                                
    # drop the keys where values were added to another key due to merging two clusters
    cluster_dict = {k: v for k, v in cluster_dict.items() if v != []}
    return cluster_dict

In [27]:
%%time
cluster_dict = MST_clustering(4, ordered_result)
len(cluster_dict)

CPU times: user 89.9 ms, sys: 3.22 ms, total: 93.1 ms
Wall time: 95.3 ms


4

In [28]:
ind_list = []
clust_list = []

for k, v in cluster_dict.items():
    [clust_list.append(k) for i in v]
    [ind_list.append(val) for val in v]
    
df_s = pd.DataFrame()
df_s['index'] = ind_list
df_s['cluster'] = clust_list

In [29]:
df_s = df_s.sort_values('index').set_index('index')

In [30]:
df_s['x1'] = x1
df_s['x2'] = x2

In [31]:
df_s

Unnamed: 0_level_0,cluster,x1,x2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,9,10.602915,21.981817
1,9,17.758281,25.431478
2,9,8.059951,7.790991
3,9,7.823131,17.996177
4,9,13.622382,10.415951
...,...,...,...
195,9,11.099997,12.404798
196,9,1.892140,10.492395
197,9,12.077236,28.058176
198,9,12.891923,13.802969


In [35]:
#df_s.cluster = df_s.cluster.astype(str)

In [36]:
#import seaborn as sns
#sns.scatterplot(data=df_s, x="x1", y="x2", hue="cluster")

In [59]:
def max_within(ordered_result, cluster_dict):
    for i in range(len(ordered_result)):
        for k,v in cluster_dict.items():
            if ordered_result[-i-1][0] in v and ordered_result[-i-1][1] in v:
                return ordered_result[-i-1][2]

def min_between(ordered_result, cluster_dict):
    for i in range(len(ordered_result)):
        for k,v in cluster_dict.items():
            if ordered_result[i][0] in v and ordered_result[i][1] not in v:
                return ordered_result[i][2]

def dunn(num, den):

    return num/den

In [56]:
num = max_within(ordered_result, cluster_dict)
den = min_between(ordered_result, cluster_dict)
dunn(num, den)

11.341491098730694