In [1]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from sklearn import preprocessing
from scipy.spatial import distance
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import pairwise_distances
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import davies_bouldin_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import AgglomerativeClustering


In [2]:
def read_data_from_csv(path):
    """Read a dataset CSV and split it into features and (optional) labels.

    Args:
        path (str): Location of the CSV file. It must exist and carry the
            '.csv' extension.

    Returns:
        (X, y) when the file has a 'Label' column: X (np.ndarray) holds all
        remaining columns and y (np.ndarray) holds the labels cast to int.
        X alone (np.ndarray) when the file has no 'Label' column.

    Raises:
        AssertionError: If the path does not exist or is not a '.csv' file.
    """
    assert os.path.exists(path), f'File not found: {path}!'
    is_csv = os.path.splitext(path)[-1] == '.csv'
    assert is_csv, f'Unsupported file type {os.path.splitext(path)[-1]}!'

    frame = pd.read_csv(path)
    all_columns = frame.columns.values.tolist()

    # Unlabelled (private) dataset: every column is a feature.
    if 'Label' not in all_columns:
        return frame[all_columns].values

    # Labelled (public) dataset: separate the label column from the features.
    feature_columns = [name for name in all_columns if name != 'Label']
    X = frame[feature_columns].values
    y = frame['Label'].astype('int').values
    return X, y

# Load the feature matrix (this CSV has no 'Label' column, so only X is returned).
X_public = read_data_from_csv('assignment_4_public.csv')
print('Shape of X_public:', X_public.shape)


# Our implementation
# Rescale every feature with MinMaxScaler (default output range) before clustering.
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X_public)

# Fit the chosen model in this cell so that `preds` is defined before the
# export below. Previously `preds` was first assigned in a later cell, which
# raised NameError on a fresh "Restart & Run All" and otherwise exported
# whichever labels happened to be left in the kernel.
kmeans = KMeans(n_clusters=7, random_state=0, n_init=10)
kmeans.fit(X_normalized)
preds = kmeans.labels_

# export the result to csv
submission = pd.DataFrame({'Label': preds})
submission.to_csv('assignment_4.csv', index=True, index_label='Id')

Shape of X_public: (3000, 784)


In [3]:
# K-Means with 7 clusters, fitted on the min-max scaled features.
kmeans = KMeans(n_clusters=7, random_state=0, n_init=10).fit(X_normalized)

# Cluster index assigned to each sample; the submission export reads `preds`.
preds = kmeans.labels_

In [4]:
def run_silScore_Kmeans(start, end, x_data):
    """Print the silhouette score of K-Means for a range of cluster counts.

    Args:
        start (int): Smallest number of clusters to try.
        end (int): Largest number of clusters to try (inclusive).
        x_data (array-like): Samples to cluster, one row per sample.
    """
    for n_clusters in range(start, end + 1):
        # Fixed seed and n_init so repeated runs print the same scores.
        model = KMeans(n_clusters=n_clusters, random_state=0, n_init=10).fit(x_data)
        score = silhouette_score(x_data, model.labels_)
        print("The Silhouette score of ",  n_clusters, " clustering is:", score)

In [5]:
# Sweep K-Means over 2..11 clusters on the min-max scaled features.
run_silScore_Kmeans(start=2, end=11, x_data=X_normalized)

The Silhouette score of  2  clustering is: 0.2021366100910641
The Silhouette score of  3  clustering is: 0.19218709632654576
The Silhouette score of  4  clustering is: 0.19619066770730031
The Silhouette score of  5  clustering is: 0.16340727177454903
The Silhouette score of  6  clustering is: 0.15726491876408735
The Silhouette score of  7  clustering is: 0.15263987653918293
The Silhouette score of  8  clustering is: 0.14060211178682744
The Silhouette score of  9  clustering is: 0.13517294682517436
The Silhouette score of  10  clustering is: 0.13822270938928588
The Silhouette score of  11  clustering is: 0.12990498463572428


In [6]:
# try hierarchical clustering
def run_silScore_Hierar(start, end, X_public):
    """Print the silhouette score of agglomerative clustering for a range of cluster counts.

    Args:
        start (int): Smallest number of clusters to try.
        end (int): Largest number of clusters to try (inclusive).
        X_public (array-like): Samples to cluster. Inside the function this
            parameter shadows the notebook-level variable of the same name.
    """
    for n_clusters in range(start, end + 1):
        model = AgglomerativeClustering(n_clusters=n_clusters).fit(X_public)
        score = silhouette_score(X_public, model.labels_)
        print("The Silhouette score of ",  n_clusters, " clustering is:", score)


In [7]:
# Sweep agglomerative clustering over 2..11 clusters.
# NOTE(review): this sweep runs on the raw `X_public`, whereas the K-Means
# sweep ran on `X_normalized`, so the two sets of scores are not computed on
# the same feature scale — confirm this is intended.
run_silScore_Hierar(start=2, end=11, X_public=X_public)

The Silhouette score of  2  clustering is: 0.17139249952972613
The Silhouette score of  3  clustering is: 0.16806957137240286
The Silhouette score of  4  clustering is: 0.17827236111183653
The Silhouette score of  5  clustering is: 0.1593330984274979
The Silhouette score of  6  clustering is: 0.17328816455371748
The Silhouette score of  7  clustering is: 0.13028257047495287
The Silhouette score of  8  clustering is: 0.10917634085821112
The Silhouette score of  9  clustering is: 0.10563334060469272
The Silhouette score of  10  clustering is: 0.11205942065952319
The Silhouette score of  11  clustering is: 0.10770343636092493


For hierarchical clustering, 4 and 6 clusters seem to be the best choices.<br>
For K-means, 2, 4 and 5 clusters also look like good choices.

In [8]:
def comparsionIndex(num_cluster, x_data, preds):
    """Print and return three internal validity scores for one clustering.

    Args:
        num_cluster: Number of clusters; used only in the printed header.
        x_data (array-like): Samples that were clustered, one row per sample.
        preds (array-like): Cluster label for each row of ``x_data``. Lists
            are accepted; they are converted to an ndarray internally.

    Returns:
        dict: The scores under the keys 'silhouette', 'davies_bouldin' and
        'dunn' (the same values that are printed).
    """
    # Broadcasting below needs an ndarray; a plain list would fail on [:, None].
    labels = np.asarray(preds)

    print("\n\nClustering result/score of ", num_cluster, ": ")
    # Calculate Silhouette Coefficient
    silhouette_avg = silhouette_score(x_data, labels)
    print("Silhouette Coefficient:", silhouette_avg)

    # Calculate Davies-Bouldin Index
    davies_bouldin_index = davies_bouldin_score(x_data, labels)
    print("Davies-Bouldin Index:", davies_bouldin_index)

    # Calculate Dunn Index: smallest distance between points of different
    # clusters divided by the largest distance between points of one cluster.
    # The n x n distance matrix is built once and shared by both terms so the
    # numerator and denominator come from the same computation.
    pairwise = distance.cdist(x_data, x_data)
    same_cluster = labels[:, None] == labels
    min_inter_cluster_distance = pairwise[~same_cluster].min()
    max_intra_cluster_distance = pairwise[same_cluster].max()
    dunn_index = min_inter_cluster_distance / max_intra_cluster_distance
    print("Dunn Index:", dunn_index)

    return {
        'silhouette': silhouette_avg,
        'davies_bouldin': davies_bouldin_index,
        'dunn': dunn_index,
    }

Silhouette Coefficient: ranges from -1 to 1; the closer to 1, the better the clustering.

Davies-Bouldin Index: the <b>lower</b> the value, the better the clustering.

Dunn Index: the <b>higher</b> the value, the better the clustering.


In [9]:
# Validity scores of K-Means for each candidate cluster count (2 through 9)
KM_clustering_list = list(range(2, 10))

# NOTE: every pass rebinds the notebook-level `kmeans` and `preds`.
for i in KM_clustering_list:
    kmeans = KMeans(n_clusters=i, random_state=0, n_init=10).fit(X_normalized)
    preds = kmeans.labels_
    comparsionIndex(i, X_normalized, preds)





Clustering result/score of  2 : 
Silhouette Coefficient: 0.2021366100910641
Davies-Bouldin Index: 1.7695002307827108
Dunn Index: 0.12236488554632793


Clustering result/score of  3 : 
Silhouette Coefficient: 0.19218709632654576
Davies-Bouldin Index: 1.7381384681926655
Dunn Index: 0.12051730760191069


Clustering result/score of  4 : 
Silhouette Coefficient: 0.19619066770730031
Davies-Bouldin Index: 1.824353488486126
Dunn Index: 0.12194701435017224


Clustering result/score of  5 : 
Silhouette Coefficient: 0.16340727177454903
Davies-Bouldin Index: 1.9229197080888256
Dunn Index: 0.14494718187781444


Clustering result/score of  6 : 
Silhouette Coefficient: 0.15726491876408735
Davies-Bouldin Index: 1.904549178087236
Dunn Index: 0.11391932734150427


Clustering result/score of  7 : 
Silhouette Coefficient: 0.15263987653918293
Davies-Bouldin Index: 1.8728248355981292
Dunn Index: 0.10018454147161485


Clustering result/score of  8 : 
Silhouette Coefficient: 0.14060211178682744
Davies-Bould

In [10]:
# Reload the raw (unscaled) features from disk
X_public = read_data_from_csv('assignment_4_public.csv')

# Validity scores of hierarchical clustering for the candidate cluster counts
Hier_clustering_list = [4, 6, 8, 9]

# NOTE: every pass rebinds the notebook-level `agg_clustering` and `preds`.
for i in Hier_clustering_list:
    agg_clustering = AgglomerativeClustering(n_clusters=i).fit(X_public)
    preds = agg_clustering.labels_
    comparsionIndex(i, X_public, preds)



Clustering result/score of  4 : 
Silhouette Coefficient: 0.17827236111183653
Davies-Bouldin Index: 1.9264056895507702
Dunn Index: 0.16355222884327095


Clustering result/score of  6 : 
Silhouette Coefficient: 0.17328816455371748
Davies-Bouldin Index: 1.6770835203576937
Dunn Index: 0.17947883637871917


Clustering result/score of  8 : 
Silhouette Coefficient: 0.10917634085821112
Davies-Bouldin Index: 2.135054633829895
Dunn Index: 0.1818786931627627


Clustering result/score of  9 : 
Silhouette Coefficient: 0.10563334060469272
Davies-Bouldin Index: 2.2513469329628197
Dunn Index: 0.1818786931627627
