In [33]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import sys
import os
import random
import time
import copy
import matplotlib.pyplot as plt
from scipy import stats
from scipy.spatial.distance import cdist, pdist, squareform
from scipy.cluster.hierarchy import   linkage, single, complete, average, dendrogram, cut_tree, inconsistent
import matplotlib.pyplot as plt
import copy
from PIL import Image
colors = ['aquamarine', 'g', 'r', 'coral', 'm', 'khaki', 'turquoise', 'navy', 'yellowgreen', 'pink']
digitsFile = 'digits-embedding.csv'
digitsData = pd.read_csv('digits-embedding.csv', header=None, names=['id', 'label', 'featureX', 'featureY'])

digits = digitsData.groupby(digitsData.iloc[:, 1]).apply(lambda x: x.sample(n=10)).reset_index(drop=True)


In [34]:
def hierarchicalClustering(data, method):
    dataFeatures = data.ix[:, [2, 3]]
    distances = pdist(dataFeatures)
    if method == 1:
        clusters = single(distances)
    elif method == 2:
        clusters = complete(distances)
    else:
        clusters = average(distances)

    dendrogram(clusters)
    plt.savefig('Fig-Dendogram'+str(method)+'.png', dpi=100)
    plt.close()
    #plt.show()
np.random.seed(0)
colors = ['aquamarine', 'g', 'r', 'coral', 'm', 'khaki', 'turquoise', 'navy', 'yellowgreen', 'pink']
digits = digitsData.groupby(digitsData.iloc[:, 1]).apply(lambda x: x.sample(n=10))
hierarchicalClustering(digits,1)
hierarchicalClustering(digits,2)
hierarchicalClustering(digits,3)

In [37]:
#colors = ['aquamarine', 'g', 'r', 'coral', 'm', 'khaki', 'turquoise', 'navy', 'yellowgreen', 'pink']
digitsFile = 'digits-embedding.csv'
digitsData = pd.read_csv('digits-embedding.csv', header=None, names=['id', 'label', 'featureX', 'featureY'])

digits = digitsData.groupby(digitsData.iloc[:, 1]).apply(lambda x: x.sample(n=10)).reset_index(drop=True)

def getClusters(z,clusters,level):
    numClusters = len(clusters) 
    for i in range(level):
        cluster1 = z[i][0]
        cluster2 = z[i][1]
        clusters[clusters == cluster1] = numClusters
        clusters[clusters == cluster2] = numClusters
        numClusters += 1           
    uniqueClusters = list(set(clusters))    
    for i in range(len(uniqueClusters)):
        clusters[clusters == uniqueClusters[i]] = i
    return clusters

def hierarchicalClusteringPartitions(data, method, K):
    dataFeatures = data.ix[:, [2, 3]]
    distancesToOtherPoints = pdist(dataFeatures)
    if method == 1:
        z = single(distancesToOtherPoints)
    elif method == 2:
        z = complete(distancesToOtherPoints)
    else:
        z = average(distancesToOtherPoints)
    clusters = np.array(range(len(data)))
    level = len(data) - K
    assignedClusters = getClusters(z,clusters,level)  
    data['cluster'] = assignedClusters
    clusterAverages = {}    
    uniqueAssignedClusters = list(set(assignedClusters))    
    for cluster in range(len(uniqueAssignedClusters)):  
        currentCluster = uniqueAssignedClusters[cluster]  
        clusterData = data[data['cluster'] == currentCluster]        
        clusterAverages[cluster] = clusterData.ix[:,[2,3]].mean()         
    distanceToCentroid = [0]*len(data)       
    i = 0
    for index, row in data.iterrows():
        currentCluster = row[4]
        distanceToCentroid[i] = np.linalg.norm((np.array(row[2:4]))-(np.array(clusterAverages[currentCluster]))) 
        i = i + 1                   
    data['distanceToCentroid'] = distanceToCentroid
    return data

def withinClusterSSD(data):
    data['squaredDistance'] = data['distanceToCentroid'] * data['distanceToCentroid']
    return data['squaredDistance'].sum()		

# silhouette coeeficient
def silhouetteCoefficient(data):
    dataFeatures = data.ix[:, [2,3]]
    distancesToOtherPoints = squareform(pdist(dataFeatures))
    A = np.zeros(len(data))
    B = np.zeros(len(data))
    S = np.zeros(len(data))
    for i in range(len(data)):
        cluster = data.loc[i, 'cluster']
        pointsInCluster = data[data['cluster'] == cluster].index.tolist()
        pointsOutCluster = data[data['cluster'] != cluster].index.tolist()
        A[i] = np.mean(distancesToOtherPoints[i][pointsInCluster])
        B[i] = np.mean(distancesToOtherPoints[i][pointsOutCluster])
        S[i] = (B[i]-A[i])/max(A[i], B[i])
    return np.mean(S)
wcSSDMethod1 = list()
wcSSDMethod2 = list()
wcSSDMethod3 = list()
SCMethod1 = list()
SCMethod2 = list()
SCMethod3 = list()

for K in [2, 4, 8, 16, 32]:
    for method in [1, 2, 3]:
        data = hierarchicalClusteringPartitions(digits, method, K)
        wcSSD = withinClusterSSD(data)
        SC = silhouetteCoefficient(data)
        if method == 1:
            wcSSDMethod1.append(wcSSD)
            SCMethod1.append(SC)
        elif method == 2:
            wcSSDMethod2.append(wcSSD)
            SCMethod2.append(SC)
        else:
            wcSSDMethod3.append(wcSSD)
            SCMethod3.append(SC)
        print('WC-SSD for K ', K, ', linkage method', method, ':', wcSSD)
        print('SC for K ', K, ', linkage method', method, ':',SC)

x = [i for i in [2, 4, 8, 16, 32]]
plt.figure()
plt.plot(x, wcSSDMethod1, label='WC-SSD Single Linkage')
plt.plot(x, wcSSDMethod2, label='WC-SSD Complete Linkage')
plt.plot(x, wcSSDMethod3, label='WC-SSD Average Linkage')
plt.title("WC-SSD Vs K")
plt.xlabel('K')
plt.ylabel('WC-SSD')
plt.legend()
plt.show()
plt.close()

plt.figure()
plt.plot(x, SCMethod1, label='SC Single Linkage')
plt.plot(x, SCMethod2, label='SC Complete Linkage')
plt.plot(x, SCMethod3, label='SC Average Linkage')
plt.title("SC Vs K")
plt.xlabel('K')
plt.ylabel('SC')
plt.legend()
plt.show()

('WC-SSD for K ', 2, ', linkage method', 1, ':', 67532.50646625638)
('SC for K ', 2, ', linkage method', 1, ':', 0.139347095970045)
('WC-SSD for K ', 2, ', linkage method', 2, ':', 47698.17834201708)
('SC for K ', 2, ', linkage method', 2, ':', 0.3446204547541821)
('WC-SSD for K ', 2, ', linkage method', 3, ':', 44078.66310893039)
('SC for K ', 2, ', linkage method', 3, ':', 0.391194188349029)
('WC-SSD for K ', 4, ', linkage method', 1, ':', 28166.144319143412)
('SC for K ', 4, ', linkage method', 1, ':', 0.5207556352610638)
('WC-SSD for K ', 4, ', linkage method', 2, ':', 21106.114180883193)
('SC for K ', 4, ', linkage method', 2, ':', 0.5594660878024913)
('WC-SSD for K ', 4, ', linkage method', 3, ':', 20393.925821631063)
('SC for K ', 4, ', linkage method', 3, ':', 0.5678304267553059)
('WC-SSD for K ', 8, ', linkage method', 1, ':', 21583.347914359736)
('SC for K ', 8, ', linkage method', 1, ':', 0.5931146332107271)
('WC-SSD for K ', 8, ', linkage method', 2, ':', 9179.677748569495)

KeyError: 31.0