# Clustering k-means

In [None]:
from scipy.io import arff

# Load the diamond9 dataset. The context manager closes the file handle as soon
# as parsing is finished (loadarff reads the whole file before returning).
with open('data/diamond9.arff', 'r') as f:
    data, meta = arff.loadarff(f)

# Quick look at the parsed records: first row, then the 'x' and 'class' columns.
print(data[0])
print(data['x'])
print(data['class'])

In [None]:
import matplotlib.pyplot as plt

# One colour per class; the integer class label is used as the list index.
colors = ['red', 'blue', 'gray', 'cyan', 'green',
          'brown', 'magenta', 'orange', 'purple']

plt.figure(figsize=(20,15))

# Colour of every record, looked up from its 'class' field.
c_used = [colors[int(label)] for label in data['class']]

plt.scatter(data['x'], data['y'], c=c_used, s=10)

In [None]:
from sklearn.cluster import KMeans

# Feature matrix: one [x, y] pair per record.
X = [[x, y] for x, y in zip(data['x'], data['y'])]

# Fit k-means with 9 clusters and a fixed seed.
kmeans = KMeans(n_clusters=9, random_state=0)
kmeans.fit(X)

centers = kmeans.cluster_centers_

plt.figure(figsize=(20,15))

# Points are drawn with the colours in c_used (computed in an earlier cell);
# the fitted centroids are overlaid as large black crosses.
plt.scatter(data['x'], data['y'], c=c_used, s=10)
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=500, marker='x')

In [None]:
# Show the first two records. A bare `data[0]` followed by `data[1]` only echoes
# the last expression of the cell, so the first record was never displayed.
data[:2]


In [None]:
# Ask the fitted k-means model which cluster index it assigns to the single point (x=1, y=0.0).
kmeans.predict([[1, 0.0]])

In [None]:
from sklearn import metrics

# Class of every record, converted to an integer.
labels_true = list(map(int, data['class']))

# Adjusted Rand index between the dataset classes and the k-means assignment.
metrics.adjusted_rand_score(labels_true, kmeans.labels_)

In [None]:
import time

# Per-k results of the sweep; entry i corresponds to k = i + 2.
times = []
values_silhouette = []
values_calinski = []
values_bouldin = []

for k in range(2, 20):
    # Time only the fit itself (CPU time of the current process).
    start_time = time.process_time()
    kmeans = KMeans(n_clusters=k, random_state=0).fit(X)
    elapsed = time.process_time() - start_time
    times.append(elapsed)

    # Internal quality scores of the resulting partition.
    labels = kmeans.labels_
    values_silhouette.append(metrics.silhouette_score(X, labels, metric='euclidean'))
    values_calinski.append(metrics.calinski_harabasz_score(X, labels))
    values_bouldin.append(metrics.davies_bouldin_score(X, labels))

In [None]:
# The x axis is derived from the number of recorded scores (the k-means sweep
# starts at k=2), so the panels cannot drift out of sync with the sweep range.
k_values = range(2, 2 + len(values_silhouette))

# (panel title, scores) for each internal metric collected during the sweep.
score_panels = [
    ('Score avec silhouette', values_silhouette),
    ('Score avec Calinski', values_calinski),
    ('Score avec Bouldin', values_bouldin),
]

plt.figure(figsize=(17, 3))

# One panel per metric, side by side, all with the same axes decoration.
for position, (panel_title, panel_values) in enumerate(score_panels, start=1):
    plt.subplot(1, len(score_panels), position)
    plt.title(panel_title)
    plt.xlabel('Nombre de clusters')
    plt.ylabel('Score')
    plt.grid(color='#dddddd', linestyle='-', linewidth=1)
    plt.xticks(k_values)
    plt.plot(k_values, panel_values)

plt.show()


In [None]:
# Work on the current axes (pyplot creates them on demand).
ax = plt.gca()

ax.set_title('Temps de recherche')
ax.set_xlabel('Nombre de clusters')
ax.set_ylabel('Temps d\'entraînement')
ax.grid(color='#dddddd', linestyle='-', linewidth=1)

# Fit time recorded for each k of the sweep.
ax.set_xticks(range(2,20))
ax.plot(range(2,20), times)
plt.show()

## Expérimentations avec d'autres formes

### Convexe / non séparés

In [None]:
from scipy.io import arff

# Load the elly dataset. The context manager closes the file handle as soon as
# parsing is finished (loadarff reads the whole file before returning).
with open('data/elly.arff', 'r') as f:
    data, meta = arff.loadarff(f)

# Quick look at the parsed records: first row, then the 'x' and 'class' columns.
print(data[0])
print(data['x'])
print(data['class'])

In [None]:
import matplotlib.pyplot as plt

# One colour per class; the integer class label is used as the list index.
colors = ['red', 'blue', 'gray', 'cyan', 'green',
          'brown', 'magenta', 'orange', 'purple', 'darkblue']

plt.figure(figsize=(20,15))

# Colour of every record, looked up from its 'class' field.
c_used = [colors[int(label)] for label in data['class']]

plt.scatter(data['x'], data['y'], c=c_used, s=10)

In [None]:
from sklearn.cluster import KMeans

# Feature matrix: one [x, y] pair per record.
X = [[x, y] for x, y in zip(data['x'], data['y'])]

# Fit k-means with 10 clusters and a fixed seed.
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(X)

centers = kmeans.cluster_centers_

plt.figure(figsize=(20,15))

# Points are drawn with the colours in c_used (computed in an earlier cell);
# the fitted centroids are overlaid as large black crosses.
plt.scatter(data['x'], data['y'], c=c_used, s=10)
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=500, marker='x')

In [None]:
# Ask the fitted k-means model which cluster index it assigns to the single point (x=1, y=0.0).
kmeans.predict([[1, 0.0]])

In [None]:
from sklearn import metrics

# Class of every record, converted to an integer.
labels_true = list(map(int, data['class']))

# Adjusted Rand index between the dataset classes and the k-means assignment.
metrics.adjusted_rand_score(labels_true, kmeans.labels_)

In [None]:
import time

# Per-k results of the sweep; entry i corresponds to k = i + 2.
times = []
values_silhouette = []
values_calinski = []
values_bouldin = []

for k in range(2, 20):
    # Time only the fit itself (CPU time of the current process).
    start_time = time.process_time()
    kmeans = KMeans(n_clusters=k, random_state=0).fit(X)
    elapsed = time.process_time() - start_time
    times.append(elapsed)

    # Internal quality scores of the resulting partition.
    labels = kmeans.labels_
    values_silhouette.append(metrics.silhouette_score(X, labels, metric='euclidean'))
    values_calinski.append(metrics.calinski_harabasz_score(X, labels))
    values_bouldin.append(metrics.davies_bouldin_score(X, labels))
    



In [None]:
# The x axis is derived from the number of recorded scores (the k-means sweep
# starts at k=2), so the panels cannot drift out of sync with the sweep range.
k_values = range(2, 2 + len(values_silhouette))

# (panel title, scores) for each internal metric collected during the sweep.
score_panels = [
    ('Score avec silhouette', values_silhouette),
    ('Score avec Calinski', values_calinski),
    ('Score avec Bouldin', values_bouldin),
]

plt.figure(figsize=(17, 3))

# One panel per metric, side by side, all with the same axes decoration.
for position, (panel_title, panel_values) in enumerate(score_panels, start=1):
    plt.subplot(1, len(score_panels), position)
    plt.title(panel_title)
    plt.xlabel('Nombre de clusters')
    plt.ylabel('Score')
    plt.grid(color='#dddddd', linestyle='-', linewidth=1)
    plt.xticks(k_values)
    plt.plot(k_values, panel_values)

plt.show()

### Convexe / bien séparés / densités non similaires / bruité

In [None]:
from scipy.io import arff

# Load the CURE dataset. The context manager closes the file handle as soon as
# parsing is finished (loadarff reads the whole file before returning).
with open('data/CURE.arff', 'r') as f:
    data, meta = arff.loadarff(f)

# Quick look at the parsed records: first row, then the 'x' and 'class' columns.
print(data[0])
print(data['x'])
print(data['class'])

In [None]:
import matplotlib.pyplot as plt

# Colours 0-5 are indexed by the integer class label; index 6 is used for 'noise'.
colors = ['red', 'blue', 'gray', 'cyan', 'green',
          'brown', 'magenta']

plt.figure(figsize=(20,15))

# The 'class' field is a byte string: either the literal 'noise' or an integer label.
c_used = [
    colors[6] if label.decode("utf-8") == 'noise' else colors[int(label)]
    for label in data['class']
]

plt.scatter(data['x'], data['y'], c=c_used, s=10)

In [None]:
from sklearn.cluster import KMeans

# Feature matrix: one [x, y] pair per record.
X = [[x, y] for x, y in zip(data['x'], data['y'])]

# Fit k-means with 6 clusters and a fixed seed.
kmeans = KMeans(n_clusters=6, random_state=0)
kmeans.fit(X)

centers = kmeans.cluster_centers_

plt.figure(figsize=(20,15))

# Points are drawn with the colours in c_used (computed in an earlier cell);
# the fitted centroids are overlaid as large black crosses.
plt.scatter(data['x'], data['y'], c=c_used, s=10)
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=500, marker='x')

In [None]:
# Ask the fitted k-means model which cluster index it assigns to the single point (x=1, y=0.0).
kmeans.predict([[1, 0.0]])

In [None]:
from sklearn import metrics

# Integer label per record; the textual 'noise' class is mapped to 6.
labels_true = [
    6 if label.decode("utf-8") == 'noise' else int(label)
    for label in data['class']
]

# Adjusted Rand index between the dataset classes and the k-means assignment.
metrics.adjusted_rand_score(labels_true, kmeans.labels_)

In [None]:
import time

# Per-k results of the sweep; entry i corresponds to k = i + 2.
times = []
values_silhouette = []
values_calinski = []
values_bouldin = []

for k in range(2, 20):
    # Time only the fit itself (CPU time of the current process).
    start_time = time.process_time()
    kmeans = KMeans(n_clusters=k, random_state=0).fit(X)
    elapsed = time.process_time() - start_time
    times.append(elapsed)

    # Internal quality scores of the resulting partition.
    labels = kmeans.labels_
    values_silhouette.append(metrics.silhouette_score(X, labels, metric='euclidean'))
    values_calinski.append(metrics.calinski_harabasz_score(X, labels))
    values_bouldin.append(metrics.davies_bouldin_score(X, labels))
    
    

In [None]:
# The x axis is derived from the number of recorded scores (the k-means sweep
# starts at k=2), so the panels cannot drift out of sync with the sweep range.
k_values = range(2, 2 + len(values_silhouette))

# (panel title, scores) for each internal metric collected during the sweep.
score_panels = [
    ('Score avec silhouette', values_silhouette),
    ('Score avec Calinski', values_calinski),
    ('Score avec Bouldin', values_bouldin),
]

plt.figure(figsize=(17, 3))

# One panel per metric, side by side, all with the same axes decoration.
for position, (panel_title, panel_values) in enumerate(score_panels, start=1):
    plt.subplot(1, len(score_panels), position)
    plt.title(panel_title)
    plt.xlabel('Nombre de clusters')
    plt.ylabel('Score')
    plt.grid(color='#dddddd', linestyle='-', linewidth=1)
    plt.xticks(k_values)
    plt.plot(k_values, panel_values)

plt.show()

Penser à tester avec des formes non convexes pour voir si cela influe sur le résultat.

# Clustering agglomératif

In [None]:
from scipy.io import arff

# Dataset names noted for this section: CURE, diamond9, spiral.
# The context manager closes the file handle as soon as parsing is finished
# (loadarff reads the whole file before returning).
with open('data/spiral.arff', 'r') as f:
    data, meta = arff.loadarff(f)

# Quick look at the parsed records: first row, then the 'x' and 'class' columns.
print(data[0])
print(data['x'])
print(data['class'])

In [None]:
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
import numpy as np

colors = ['red', 'blue', 'gray', 'cyan', 'green',
          'brown', 'magenta', 'orange', 'purple']

# Colour per record from its 'class' field; a textual 'noise' class maps to index 6.
c_used = [
    colors[6] if label.decode("utf-8") == 'noise' else colors[int(label)]
    for label in data['class']
]

# Feature matrix: one [x, y] pair per record.
X = [[x, y] for x, y in zip(data['x'], data['y'])]

agglo = AgglomerativeClustering(n_clusters=9, linkage='ward').fit(X)

# NOTE: the original cell opened an empty 20x15 figure here that was never drawn
# on; it has been removed (the dendrogram cell creates its own figure).

# Merge history of the fitted model: one row (pair of merged nodes) per merge.
children = agglo.children_

# Merge distances are not taken from the model here, so a uniform, increasing
# placeholder (0, 1, 2, ...) is used purely to be able to draw the tree.
distance = np.arange(children.shape[0])

# Placeholder observation counts (2, 3, ..., n_merges + 1); these are not the
# true sizes of the merged subtrees.
no_of_observations = np.arange(2, children.shape[0]+2)

# Assemble the 4-column linkage matrix expected by scipy's dendrogram().
linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float)

In [None]:
# Plot the dendrogram built from the linkage matrix.
plt.figure(figsize=(20,20))
dendrogram(linkage_matrix)
# dendrogram() returns a large dict of plot coordinates; ending the cell with
# plt.show() keeps that dict from being echoed as the cell output.
plt.show()

In [None]:
# Palette indexed directly by the predicted cluster label (up to 12 clusters).
colors = ['red', 'blue', 'gray', 'cyan', 'green',
          'brown', 'magenta', 'orange', 'purple',
          'darkblue', 'salmon', 'steelblue']

# Feature matrix: one [x, y] pair per record.
X = [[x, y] for x, y in zip(data['x'], data['y'])]

plt.figure(figsize=(20,20))

# One panel per requested cluster count, k = 2..12, placed on a 5x3 grid.
for k in range(2, 13):
    agglo = AgglomerativeClustering(n_clusters=k, linkage='ward').fit(X)
    c_used = [colors[label] for label in agglo.labels_]

    plt.subplot(5, 3, k-1)
    plt.scatter(data['x'], data['y'], c=c_used, s=10)

diamond9 > nickel (0.99)

CURE > Ca marche plutôt bien sauf pour le gros qui est toujours divisé (même problème que k-means)
Le bruit est rattaché à un cluster

Spiral > dégueu (0.01)

In [None]:
from sklearn import metrics

# Dataset classes as integers. For a dataset with a textual 'noise' class
# (e.g. CURE) map it to a spare integer instead:
#   6 if label.decode("utf-8") == 'noise' else int(label)
labels_true = [int(label) for label in data['class']]

# Score a model fitted with as many clusters as there are distinct classes,
# rather than whichever `agglo` the k = 2..12 loop left behind (k=12), which
# made the reported score depend on the loop's last iteration.
agglo_eval = AgglomerativeClustering(
    n_clusters=len(set(labels_true)), linkage='ward'
).fit(X)

metrics.adjusted_rand_score(labels_true, agglo_eval.labels_)

## Test de différentes méthodes de linkage

In [None]:
from scipy.io import arff

# Palette indexed directly by the predicted cluster label.
colors = ['red', 'blue', 'gray', 'cyan', 'green',
          'brown', 'magenta', 'orange', 'purple',
          'darkblue', 'salmon', 'steelblue']

# Linkage criteria compared on every dataset.
linkages = ['ward', 'complete', 'average', 'single']

# (ARFF file, number of clusters requested) for each dataset. This replaces
# three copy-pasted sections that differed only in these two values.
datasets = [
    ('data/aggregation.arff', 7),
    ('data/diamond9.arff', 9),
    ('data/CURE.arff', 6),
]

for path, n_clusters in datasets:
    # The context manager closes each file once it has been parsed.
    with open(path, 'r') as f:
        data, meta = arff.loadarff(f)

    # Feature matrix: one [x, y] pair per record.
    X = [[x, y] for x, y in zip(data['x'], data['y'])]

    # One figure per dataset, one panel per linkage (2x2 grid for the 4 linkages).
    plt.figure(figsize=(15,15))
    plt.suptitle(path)

    for position, linkage in enumerate(linkages, start=1):
        agglo = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage).fit(X)
        c_used = [colors[label] for label in agglo.labels_]

        plt.subplot(2, 2, position)
        plt.scatter(data['x'], data['y'], c=c_used, s=10)
        plt.title(linkage)

TODO : Calculer les scores

# Clustering DBSCAN

In [None]:
from scipy.io import arff
from sklearn.cluster import DBSCAN

# The context manager closes the file once it has been parsed.
with open('data/diamond9.arff', 'r') as f:
    data, meta = arff.loadarff(f)


colors = ['red', 'blue', 'gray', 'cyan', 'green',
          'brown', 'magenta', 'orange', 'purple',
          'darkblue', 'salmon', 'steelblue']

# Feature matrix: one [x, y] pair per record.
X = [[x, y] for x, y in zip(data['x'], data['y'])]

plt.figure(figsize=(15,15))

# One panel per eps value (min_samples left at the DBSCAN default).
for j, eps in enumerate([0.01, 0.1, 0.2, 0.35, 0.5], start=1):
    dbscan = DBSCAN(eps=eps).fit(X)

    # Highest label + 1 is used as the number of clusters found.
    n_found = max(dbscan.labels_) + 1

    plt.subplot(2, 3, j)
    plt.scatter(data['x'], data['y'], c=dbscan.labels_, s=10)
    plt.title("Avec eps=" + str(eps) + " (" + str(n_found) + " clusters trouvés)")

In [None]:
import time

from scipy.io import arff
from sklearn import metrics
from sklearn.cluster import DBSCAN

# The context manager closes the file once it has been parsed.
with open('data/diamond9.arff', 'r') as f:
    data, meta = arff.loadarff(f)


# Palette indexed by cluster label; label -1 selects the last entry through
# Python's negative indexing.
colors = ['red', 'blue', 'gray', 'cyan', 'green',
          'brown', 'magenta', 'orange', 'purple',
          'darkblue', 'salmon', 'steelblue']

# Feature matrix: one [x, y] pair per record.
X = [[x, y] for x, y in zip(data['x'], data['y'])]

plt.figure(figsize=(15,50))

# Panel grid for the retained configurations; plotting stops once it is full
# (plt.subplot raises if the index exceeds rows * cols).
n_rows, n_cols = 10, 3
max_panels = n_rows * n_cols
j = 1

# One entry per (eps, min_samples) pair, in loop order (eps outer, min_samples inner).
values_silhouette = []
values_calinski = []
values_bouldin = []

times = []

for eps in range(5, 16):
    eps = eps / 100

    for min_samples in range(2, 15):
        # Time only the fit itself (CPU time of the current process).
        start_time = time.process_time()
        dbscan = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
        times.append(time.process_time() - start_time)

        labels = dbscan.labels_
        # Highest label + 1 is used as the number of clusters found.
        n_found = max(labels) + 1

        if n_found >= 2:
            values_silhouette.append(
                metrics.silhouette_score(X, labels, metric='euclidean')
            )
            values_calinski.append(
                metrics.calinski_harabasz_score(X, labels)
            )
            values_bouldin.append(
                metrics.davies_bouldin_score(X, labels)
            )
        else:
            # Fewer than two clusters found: record 0 so the lists stay aligned with the grid.
            values_silhouette.append(0)
            values_calinski.append(0)
            values_bouldin.append(0)

        # Report configurations that find 8 to 10 clusters.
        if 8 <= n_found <= 10:
            print("eps : " + str(eps) + " / min_s = " + str(min_samples) + " / clusters trouvés : " + str(n_found))

            if j <= max_panels:
                plt.subplot(n_rows, n_cols, j)
                c_used = [colors[label] for label in labels]
                plt.scatter(data['x'], data['y'], c=c_used, s=10)
                plt.title("eps=" + str(eps) + " / min_samples=" + str(min_samples) + " (" + str(n_found) + " clusters trouvés)")
                j += 1


In [None]:
from mpl_toolkits import mplot3d
import numpy as np

# Rebuild the (eps, min_samples) coordinates in the order the DBSCAN sweep on
# diamond9 filled values_silhouette: eps in range(5, 16) / 100 is the outer loop,
# min_samples in range(2, 15) the inner one. The original passed the two bare
# ranges (11 and 13 values) against 143 scores, which scatter3D cannot broadcast.
eps_values = np.arange(5, 16) / 100
min_samples_values = np.arange(2, 15)

xdata = np.repeat(eps_values, len(min_samples_values))
ydata = np.tile(min_samples_values, len(eps_values))
zdata = np.asarray(values_silhouette)

# One score per grid point is required; a mismatch means values_silhouette was
# produced by a different sweep than the one assumed above.
assert zdata.shape == xdata.shape, "values_silhouette does not match the eps x min_samples grid"

ax = plt.axes(projection='3d')
ax.set_xlabel('eps')
ax.set_ylabel('min_samples')
ax.set_zlabel('silhouette')
ax.scatter3D(xdata, ydata, zdata, c=zdata, cmap="Greens")

In [None]:
# Importing Axes3D registers the '3d' projection on older matplotlib versions.
from mpl_toolkits.mplot3d import Axes3D
import numpy as np

# The original called Axes3D.plot as an unbound method (a range was passed as
# `self`), which cannot work. Build real 3D axes and draw the silhouette score
# as a surface over the (eps, min_samples) grid of the DBSCAN sweep on diamond9.
eps_values = np.arange(5, 16) / 100
min_samples_values = np.arange(2, 15)

# indexing='ij' gives arrays of shape (n_eps, n_min_samples), matching the sweep
# order (eps outer loop, min_samples inner loop).
eps_grid, min_samples_grid = np.meshgrid(eps_values, min_samples_values, indexing='ij')

# reshape raises ValueError if values_silhouette does not hold one score per grid point.
silhouette_grid = np.asarray(values_silhouette).reshape(eps_grid.shape)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel('eps')
ax.set_ylabel('min_samples')
ax.set_zlabel('silhouette')
ax.plot_surface(eps_grid, min_samples_grid, silhouette_grid, cmap="Greens")
plt.show()

On retient eps=0.15 et min_samples=9

On retrouve bien nos 9 clusters, et peu de points restent non classés

In [None]:
import time

from scipy.io import arff
from sklearn import metrics
from sklearn.cluster import DBSCAN

# The context manager closes the file once it has been parsed.
with open('data/spiral.arff', 'r') as f:
    data, meta = arff.loadarff(f)


# Palette indexed by cluster label; label -1 selects the last entry through
# Python's negative indexing.
colors = ['red', 'blue', 'gray', 'cyan', 'green',
          'brown', 'magenta', 'orange', 'purple',
          'darkblue', 'salmon', 'steelblue']

# Feature matrix: one [x, y] pair per record.
X = [[x, y] for x, y in zip(data['x'], data['y'])]

# At most MAX_PLOTS matching configurations are reported and drawn (2x3 grid).
# The original reset its `printed` counter inside the inner loop, so the cap
# never applied and plt.subplot eventually ran past its 10x3 grid.
MAX_PLOTS = 6
printed = 0
j = 1

plt.figure(figsize=(15, 10))

# One entry per (eps, min_samples) pair, in loop order (eps outer, min_samples inner).
values_silhouette = []
values_calinski = []
values_bouldin = []

times = []

# NOTE: this is a large grid (109 eps values x 148 min_samples values = 16 132 fits).
for eps in range(1, 110):
    eps = eps / 100

    for min_samples in range(2, 150):
        # Time only the fit itself (CPU time of the current process).
        start_time = time.process_time()
        dbscan = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
        times.append(time.process_time() - start_time)

        labels = dbscan.labels_
        # Highest label + 1 is used as the number of clusters found.
        score = max(labels) + 1

        if score >= 2:
            values_silhouette.append(
                metrics.silhouette_score(X, labels, metric='euclidean')
            )
            values_calinski.append(
                metrics.calinski_harabasz_score(X, labels)
            )
            values_bouldin.append(
                metrics.davies_bouldin_score(X, labels)
            )
        else:
            # Fewer than two clusters found: record 0 so the lists stay aligned with the grid.
            values_silhouette.append(0)
            values_calinski.append(0)
            values_bouldin.append(0)

        # Report the first few configurations that find exactly 3 clusters.
        if score == 3 and printed < MAX_PLOTS:
            print("eps : " + str(eps) + " / min_s = " + str(min_samples) + " / clusters trouvés : " + str(score))
            plt.subplot(2, 3, j)
            c_used = [colors[label] for label in labels]
            plt.scatter(data['x'], data['y'], c=c_used, s=10)
            plt.title("eps=" + str(eps) + " / min_samples=" + str(min_samples) + " (" + str(score) + " clusters trouvés)")
            j += 1
            printed += 1


Dire que les métriques ne sont pas vraiment utiles ici

A tester sur les spirales (ça marche bien 1/5)

# HDBSCAN

In [None]:
import hdbscan

# The context manager closes the file once it has been parsed.
with open('data/diamond9.arff', 'r') as f:
    data, meta = arff.loadarff(f)


# Palette indexed by cluster label; label -1 selects the last entry (black)
# through Python's negative indexing.
colors = ['red', 'blue', 'gray', 'cyan', 'green',
          'brown', 'magenta', 'orange', 'purple',
          'darkblue', 'salmon', '#000000']

# Feature matrix: one [x, y] pair per record.
X = [[x, y] for x, y in zip(data['x'], data['y'])]

# Result containers are reset here; only `times` is filled in this cell.
values_silhouette = []
values_calinski = []
values_bouldin = []

times = []


# Time only the clustering itself (CPU time of the current process).
start_time = time.process_time()
hdbscan_labels = hdbscan.HDBSCAN(min_cluster_size=10).fit_predict(X)
times.append(time.process_time() - start_time)


c_used = [colors[label] for label in hdbscan_labels]
plt.scatter(data['x'], data['y'], c=c_used, s=10)
# Highest label + 1 is used as the number of clusters found. The stray closing
# parenthesis of the original title and a leftover `j += 1` have been removed.
plt.title(str(max(hdbscan_labels) + 1) + " clusters trouvés")