In [26]:
from sklearn.datasets import fetch_mldata
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import math
import pandas as pd

In [2]:
mnist = fetch_mldata('MNIST original')
normalized_dataset = np.divide(mnist.data,255)
mnist_data = normalized_dataset
mnist_target_data = mnist.target

In [3]:
mnist_data_sample = mnist_data[:10000]
mnist_target_sample = mnist_target_data[:10000]

In [22]:
def get_purity(clusters_label,N):
    purity=0.0
    for each_centroid in clusters_label:
        cluster_targets = clusters_label[each_centroid]
        frequency=Counter(cluster_targets)
        key_max=max(cluster_targets, key=cluster_targets.count)
        purity += frequency[key_max]
    purity = purity/N
    return purity

def calculate_purity(labels,target,N):
    clusters_label = {}
    label_len = N
    for i in range(N):
        if(labels[i]==-1):
            label_len=label_len-1
            continue
        clusters_label.setdefault(labels[i],[]).append(target[i])
    
    return get_purity(clusters_label,label_len)

### sum(m[i][j]) where 0<=i<=number of true clusters
def get_Mj(m,j):
    Mj=0.0
    for i in range(m.shape[0]):
        if(m[i][j]==0.0):
            continue
            
        Mj += m[i][j]
    return Mj

def column_gini_index(m,j):
    Gj=0.0
    Mj = get_Mj(m,j)
    for i in range(m.shape[0]):
        if(m[i][j]==0.0):
            continue
        Gj+=math.pow(m[i][j]/Mj,2)
    return 1-Gj

def get_gini_index(m):
    gini_index = 0.0
    gini_index_denominator = 0.0
    gini_index_numerator = 0.0
    for j in range(m.shape[1]):
        Mj = get_Mj(m,j)
        gini_index_denominator += Mj
        gini_index_numerator += column_gini_index(m,j)*Mj
    gini_index = gini_index_numerator/gini_index_denominator
    return gini_index

def calculate_gini(labels,target,tk,td,N):
    m = np.zeros((tk,td))
    
    for i in range(N):
        #if(labels[i]==-1):
        #    continue
        m[int(target[i])][int(labels[i])] += 1
    
    return get_gini_index(m)




In [5]:
x = AgglomerativeClustering(n_clusters=10)
pred_labels = x.fit_predict(mnist_data_sample)

In [24]:
print("purity:",calculate_purity(pred_labels,mnist_target_sample,mnist_data_sample.shape[0]))
print("gini index:",calculate_gini(pred_labels,mnist_target_sample,len(set(mnist_target_sample)),len(set(pred_labels)),mnist_data_sample.shape[0]))

purity: 0.9991
gini index: 0.0017953892241567388


In [25]:
x = AgglomerativeClustering(n_clusters=20)
pred_labels = x.fit_predict(mnist_data_sample)

In [27]:
print("purity:",calculate_purity(pred_labels,mnist_target_sample,mnist_data_sample.shape[0]))
print("gini index:",calculate_gini(pred_labels,mnist_target_sample,len(set(mnist_target_sample)),len(set(pred_labels)),mnist_data_sample.shape[0]))

purity: 0.9991
gini index: 0.0017945466335888875
