In [1]:
from sklearn.datasets import fetch_openml
import numpy as np

# Fetch version 1 of the 'mnist_784' dataset from OpenML as arrays
# (as_frame=False) rather than as a pandas DataFrame.
mnist = fetch_openml(
    'mnist_784',
    version=1,
    as_frame=False,
    parser='auto',
)
# Store the targets as uint8 so downstream metrics get numeric labels.
mnist.target = mnist.target.astype(np.uint8)
X, y = mnist['data'], mnist['target']

In [2]:
# Quick look at the raw feature matrix; NumPy abbreviates large arrays when printing.
print(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [3]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# Fit one KMeans model for each cluster count in 8..12 and keep every model
# together with its cluster assignments as a [model, labels] pair.
lista = []
for n_clusters in range(8, 13):
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    y_pred = kmeans.fit_predict(X)
    lista.append([kmeans, y_pred])

# Silhouette score of each clustering, in the same order as `lista`.
sil_scores = [silhouette_score(X, labels) for _, labels in lista]

print(sil_scores)

[0.07337977998298922, 0.05681625379289227, 0.0586915389505002, 0.05835878745275728, 0.05817356340885259]


In [4]:
import pickle
# Persist the list of silhouette scores to disk.
with open('kmeans_sil.pkl', 'wb') as sil_file:
    pickle.dump(sil_scores, sil_file)

In [5]:
from sklearn.metrics import confusion_matrix

In [6]:
# Cluster assignments of every fitted model, in the order they were stored in `lista`.
predictions = [entry[1] for entry in lista]

# `lista` was filled for n_clusters = 8, 9, 10, 11, 12, so index 2 is the
# 10-cluster model. Rows of the matrix are true labels, columns are cluster ids.
cm = confusion_matrix(y, predictions[2])

# For each true label, the cluster id that received most of its samples.
# Cast to plain int so the printed and pickled values do not depend on NumPy scalars.
sett = {int(cluster_id) for cluster_id in cm.argmax(axis=1)}

print(sett)
with open('kmeans_argmax.pkl', 'wb') as f:
    # A set has no guaranteed iteration order; sort so the saved list is deterministic.
    pickle.dump(sorted(sett), f)

{0, 1, 2, 3, 5, 6, 8, 9}


In [7]:
# Euclidean distances from each of the first 300 samples to every sample in X.
# Zero distances (a sample compared with itself, or with an exact duplicate)
# are discarded. The result is a flat list sorted in ascending order.
lengths = []
for anchor in X[:300]:  # slicing also copes with X having fewer than 300 rows
    # Casting the anchor to float64 makes the subtraction happen in floating
    # point, so it cannot wrap around if X has an unsigned integer dtype.
    # One vectorised norm per anchor replaces a Python-level loop over all of X;
    # note that it allocates one temporary the size of X per iteration.
    dists = np.linalg.norm(X - anchor.astype(np.float64), axis=1)
    lengths.extend(dists[dists != 0])

lengths.sort()

    

In [8]:
# `lengths` was sorted in ascending order, so this slice holds the 10 smallest distances.
pickler = lengths[:10]
print(pickler)

[279.26152617215286, 304.37641170103836, 317.5893575043093, 328.7658741414626, 333.4546445920344, 352.89800226127664, 355.1774204534967, 358.07401469528617, 359.64287842247063, 360.42474942767177]


In [9]:
# Persist the selected distances to disk.
with open('dist.pkl', 'wb') as dist_file:
    pickle.dump(pickler, dist_file)

In [14]:
# Mean of the first three entries of `pickler`; `s` is the base eps for the DBSCAN runs.
d1, d2, d3 = pickler[0], pickler[1], pickler[2]
s = (d1 + d2 + d3) / 3
print(s)

300.40909845916684


In [28]:
from sklearn.cluster import DBSCAN
# eps grid: start at s and advance in steps of 4% of s, stopping before s + 10% of s.
eps_start, eps_stop, eps_step = s, s+0.1*s, 0.04*s

unique_labels = []
for ep in np.arange(eps_start, eps_stop, eps_step):
    dbscan = DBSCAN(eps=ep).fit(X)
    labels = dbscan.labels_
    print(dbscan)
    print(labels)
    print(ep)
    # Keep only the distinct labels other than -1 (DBSCAN's noise marker).
    unique_labels.append(np.unique(labels[labels != -1]))

DBSCAN(eps=300.40909845916684)
[-1 -1 -1 ... -1 -1 -1]
300.40909845916684
DBSCAN(eps=312.4254623975335)
[-1 -1 -1 ... -1 -1 -1]
312.4254623975335
DBSCAN(eps=324.4418263359002)
[-1 -1 -1 ... -1 -1 -1]
324.4418263359002


In [31]:
# Number of distinct DBSCAN labels for each eps that was tried.
# `unique_labels` holds the labels with -1 already filtered out, so 1 is added
# back for the -1 label.
# NOTE(review): the +1 assumes every run produced at least one -1 label — confirm.
# Iterating over the whole list (instead of hard-coded indices 0..2) keeps this
# in step with however many eps values were evaluated.
unique_numbers = [len(labels) + 1 for labels in unique_labels]
print(unique_numbers)
with open('dbscan_len.pkl', 'wb') as f:
    pickle.dump(unique_numbers, f)

[4, 7, 22]


In [36]:
# Read the pickled object back from 'kmeans_sil.pkl' and show it.
# The file is produced by this notebook; do not unpickle untrusted files this way.
with open('kmeans_sil.pkl', 'rb') as sil_file:
    my_list = pickle.load(sil_file)

print(my_list)

[0.07337977998298922, 0.05681625379289227, 0.0586915389505002, 0.05835878745275728, 0.05817356340885259]
