In [2]:
from keras.datasets import mnist
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics.cluster import rand_score


In [1]:
!pip install --upgrade scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.0.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (23.2 MB)
[K     |████████████████████████████████| 23.2 MB 1.7 MB/s 
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.0.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-1.0.1 threadpoolctl-3.0.0


In [3]:
from sklearn import metrics

def purity_score(y_true, y_pred):
    """Return the purity of a clustering with respect to reference labels.

    Every cluster (a distinct value in ``y_pred``) is credited with the number
    of its samples that belong to its most frequent true class. Purity is the
    sum of those counts divided by the total number of samples.

    Parameters
    ----------
    y_true : array-like
        Reference class label of each sample.
    y_pred : array-like
        Cluster label assigned to each sample, aligned with ``y_true``.

    Returns
    -------
    float
        Fraction of samples that fall in the majority class of their cluster.
    """
    # Rows correspond to true classes, columns to predicted clusters.
    counts = metrics.cluster.contingency_matrix(y_true, y_pred)
    majority_per_cluster = counts.max(axis=0)
    return majority_per_cluster.sum() / counts.sum()

In [4]:
# Load MNIST (downloaded on first use) as (images, labels) pairs for the
# training and the test split.
(x_train, y_train), (x_test, y_test) = mnist.load_data()


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [5]:
# Sanity-check the raw arrays: split sizes, pixel value range and container type.
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape, sep="\n")
print(x_train.min(), x_train.max(), sep="\n")
print(type(x_train))

(60000, 28, 28)
(10000, 28, 28)
(60000,)
(10000,)
0
255
<class 'numpy.ndarray'>


In [6]:
# Convert the raw pixel intensities (0-255) to float32 and scale them to [0, 1].
# NOTE: x_train / x_test are overwritten, so re-running this cell on its own
# rescales data that has already been normalised.
x_train, x_test = (
    pixels.astype('float32') / 255.0
    for pixels in (x_train, x_test)
)

In [7]:
# Flatten every image into one feature row so the estimators get a 2-D matrix
# of shape (n_samples, n_pixels).
X_train = x_train.reshape(x_train.shape[0], -1)
X_test = x_test.reshape(x_test.shape[0], -1)
print(X_train.shape, X_test.shape, sep="\n")

(60000, 784)
(10000, 784)


K-means

In [None]:
from sklearn.cluster import KMeans

# k-means with the default k-means++ initialisation. The seed is fixed so the
# clustering, and the scores derived from it, can be reproduced.
# `algorithm` and `verbose` are left at their defaults: an explicit
# algorithm='auto' is rejected by recent scikit-learn releases, which the
# unpinned scikit-learn upgrade at the top of the notebook can install.
kmeanModel1 = KMeans(n_clusters=10, n_init=5, max_iter=100, random_state=55)
kmeanModel1.fit(X_train)

KMeans(max_iter=100, n_clusters=10, n_init=5, random_state=55)

In [None]:
# Fitted centroids, and the cluster index assigned to every training image.
centers = kmeanModel1.cluster_centers_
y_kmeans1 = kmeanModel1.predict(X_train)

In [None]:
# Purity of the first k-means clustering against the true digit labels.
purity_score(y_true=y_train, y_pred=y_kmeans1)

0.5910166666666666

In [None]:
# Rand index between the first k-means clustering and the true digit labels
# (the index is symmetric, so the argument order does not affect the value).
rand_score(y_kmeans1,y_train)

0.879543699061651

In [None]:
# Same number of clusters, but with purely random initial centroids, for
# comparison with the k-means++ run. The seed is fixed (same value as for
# kmeanModel1) so that this fit is repeatable as well.
kmeanModel2 = KMeans(n_clusters=10, init='random', random_state=55)
kmeanModel2.fit(X_train)

KMeans(init='random', n_clusters=10)

In [None]:
# Centroids of the randomly initialised model, and its cluster index for every
# training image. `centers` now refers to this second model.
centers = kmeanModel2.cluster_centers_
y_kmeans2 = kmeanModel2.predict(X_train)

In [None]:
# Purity of the randomly initialised k-means clustering.
purity_score(y_true=y_train, y_pred=y_kmeans2)

0.5909666666666666

In [None]:
# Rand index of the randomly initialised k-means clustering
# (symmetric measure: the argument order does not affect the value).
rand_score(y_kmeans2,y_train)

0.8795313488558143

K-medoids


In [None]:
!pip install scikit-learn-extra



Collecting scikit-learn-extra
  Downloading scikit_learn_extra-0.2.0-cp37-cp37m-manylinux2010_x86_64.whl (1.7 MB)
[?25l[K     |▏                               | 10 kB 28.6 MB/s eta 0:00:01[K     |▍                               | 20 kB 23.5 MB/s eta 0:00:01[K     |▋                               | 30 kB 15.3 MB/s eta 0:00:01[K     |▊                               | 40 kB 14.8 MB/s eta 0:00:01[K     |█                               | 51 kB 5.5 MB/s eta 0:00:01[K     |█▏                              | 61 kB 6.0 MB/s eta 0:00:01[K     |█▍                              | 71 kB 5.6 MB/s eta 0:00:01[K     |█▌                              | 81 kB 6.2 MB/s eta 0:00:01[K     |█▊                              | 92 kB 6.1 MB/s eta 0:00:01[K     |██                              | 102 kB 5.3 MB/s eta 0:00:01[K     |██                              | 112 kB 5.3 MB/s eta 0:00:01[K     |██▎                             | 122 kB 5.3 MB/s eta 0:00:01[K     |██▌                    

In [None]:
from sklearn_extra.cluster import KMedoids
import numpy as np

In [None]:
# Fit k-medoids with 10 clusters on the flattened training images and show the
# cluster index assigned to each sample.
# NOTE(review): KMedoids works from pairwise distances between samples; on all
# 60000 rows this presumably needs a 60000 x 60000 matrix and may exhaust
# memory — confirm, and consider fitting on a subsample.
kmedoids = KMedoids(n_clusters=10, random_state=10).fit(X_train)
kmedoids.labels_


In [None]:
# Inspect the fitted k-medoids model.
# predict() needs rows with the same number of features the model was fitted
# on, so two flattened test images are used as the query points.
print(kmedoids.predict(X_test[:2]))
# One medoid per cluster, each with one value per pixel.
print(kmedoids.cluster_centers_.shape)
# Sum of distances from every sample to its closest medoid.
kmedoids.inertia_

Mean shift

In [8]:
from sklearn.cluster import MeanShift
import numpy as np


In [16]:
# Mean shift with a fixed bandwidth; bin_seeding starts the search from a
# coarse grid of binned points instead of from every sample.
clustering = MeanShift(bandwidth=10, bin_seeding=True)
clustering.fit(X_train)
clustering.labels_


array([0, 0, 0, ..., 0, 0, 0])

In [17]:
# Purity of the mean-shift clustering against the true digit labels.
purity_score(y_true=y_train, y_pred=clustering.labels_)

0.11236666666666667

In [18]:
# Rand index of the mean-shift clustering
# (symmetric measure: the argument order does not affect the value).
rand_score(clustering.labels_,y_train)

0.10027316399717773

DBSCAN

In [None]:
from sklearn.cluster import DBSCAN


In [None]:
# DBSCAN, first setting: neighbourhood radius 2, at least 10 neighbours for a
# core point. Samples labelled -1 are the ones DBSCAN treats as noise.
# `clustering` is rebound here to the DBSCAN model.
clustering = DBSCAN(eps=2, min_samples=10)
clustering.fit(X_train)
clustering.labels_

array([-1, -1, -1, ..., -1, -1, -1])

In [None]:
# Purity of the DBSCAN (eps=2) result. The noise label -1 is counted by
# purity_score as one more cluster.
purity_score(y_true=y_train, y_pred=clustering.labels_)

0.16733333333333333

In [None]:
# Rand index of the DBSCAN (eps=2) result
# (symmetric measure: the argument order does not affect the value).
rand_score(clustering.labels_,y_train)

0.20569828719367544

In [None]:
# DBSCAN, second setting: larger neighbourhood radius (3), same density
# threshold of 10 neighbours.
clustering1 = DBSCAN(eps=3, min_samples=10)
clustering1.fit(X_train)
clustering1.labels_

array([-1, -1, -1, ..., -1, -1, -1])

In [None]:
# Purity of the DBSCAN (eps=3) result; the noise label -1 counts as a cluster.
purity_score(y_true=y_train, y_pred=clustering1.labels_)

0.20835

In [None]:
# Rand index of the DBSCAN (eps=3) result
# (symmetric measure: the argument order does not affect the value).
rand_score(clustering1.labels_,y_train)

0.286215336922282

In [None]:
# DBSCAN, third setting: radius 5 and a lower density threshold of 5 neighbours.
clustering2 = DBSCAN(eps=5, min_samples=5)
clustering2.fit(X_train)
clustering2.labels_

array([-1,  0, -1, ...,  0,  0, -1])

In [None]:
# Purity of the DBSCAN (eps=5) result; the noise label -1 counts as a cluster.
purity_score(y_true=y_train, y_pred=clustering2.labels_)

0.19828333333333334

In [None]:
# Rand index of the DBSCAN (eps=5) result
# (symmetric measure: the argument order does not affect the value).
rand_score(clustering2.labels_,y_train)

0.4836267743351278

In [None]:
# DBSCAN, fourth setting: radius 8, density threshold of 5 neighbours.
# Kept as a single chained assignment so the cell displays nothing.
clustering3 = DBSCAN(
    eps=8,
    min_samples=5,
).fit(X_train)


In [None]:
# Purity of the DBSCAN (eps=8) result; a noise label -1, if present, counts
# as a cluster.
purity_score(y_true=y_train, y_pred=clustering3.labels_)

0.11255

In [None]:
# Rand index of the DBSCAN (eps=8) result
# (symmetric measure: the argument order does not affect the value).
rand_score(clustering3.labels_,y_train)

0.10070263782174148

**Agglomerative**

In [None]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np

In [None]:
# Agglomerative clustering into 10 groups with single linkage.
# `clustering` is rebound here to the agglomerative model. The call is kept as
# a single chained assignment so the cell displays nothing.
clustering = AgglomerativeClustering(
    n_clusters=10,
    linkage='single',
).fit(X_train)

In [None]:
# Cluster index assigned to each training image by the single-linkage model.
clustering.labels_

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Purity of the single-linkage agglomerative clustering.
purity_score(y_true=y_train, y_pred=clustering.labels_)

0.11251666666666667

In [None]:
# Rand index of the single-linkage agglomerative clustering
# (symmetric measure: the argument order does not affect the value).
rand_score(clustering.labels_,y_train)

0.10051444190736512

In [None]:
# Agglomerative clustering into 10 groups with average linkage.
# `clustering1` is rebound here (it previously held a DBSCAN model).
clustering1 = AgglomerativeClustering(
    n_clusters=10,
    linkage='average',
).fit(X_train)