In [84]:
import pandas as pd
import numpy as np

import plotly.express as px
import matplotlib.pyplot as plt

from sklearn.datasets import load_digits
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score, accuracy_score, classification_report

In Scikit Learn, import load_digits

In [85]:
# Chargement des données digits
digits = load_digits()
digits.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])

Look at the Load_digit documentation and store the numbers in a numbers variable and your target variable in a target variable.

In [86]:
numbers = digits.data
target = digits.target

Tentons de visualiser quelques nombres. Regardez d’abord la taille de votre dataset. Combien de colonnes voyez vous ?

Devinez ensuite quelle taille d’image ce nombre de colonnes devraient donner

In [87]:
numbers.shape

(1797, 64)

Now look at the documentation related to imshow from plotly. Try to view a random number. Add as a title, the number this image corresponds to. Then try to view 10 random numbers in the dataset

In [88]:
# Simulation des données
numbers = np.random.rand(100, 64)  # 100 exemples de chiffres 8x8 aplatis
target = np.random.randint(0, 10, size=100)  # Valeurs cibles aléatoires entre 0 et 9

# Visualisation de 10 chiffres aléatoires
for i in np.random.choice(len(numbers), size=10, replace=False):
    # Reshape pour restaurer la forme 8x8
    fig = px.imshow(numbers[i].reshape(8, 8), color_continuous_scale='gray')
    
    # Suppression des ticks et ajustements
    fig.update_xaxes(showticklabels=False)
    fig.update_yaxes(showticklabels=False)
    fig.update_layout(
        title=f"Nombre : {target[i]}",
        coloraxis_showscale=False
    )
    
    # Afficher l'image
    fig.show()

We're going to apply the KMeans to our dataset, how many clusters do you think we're going to initialize the algorithm on?

---> 10 because we have 10 numbers in the dataset!

In [89]:
np.unique(target)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

Create your KMeans algorithm with the right number of clusters

In [90]:
kmeans = KMeans(init='k-means++', n_clusters=10)

Let's evaluate our model, calculate the accuracy_score of our predictions by importing the sklearn module. What do you conclude?

In [91]:
clusters = kmeans.fit_predict(numbers)

In [92]:
accuracy_score(target, clusters)
print(classification_report(target, clusters))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        11
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00        10
           3       0.11      0.20      0.14        10
           4       0.00      0.00      0.00         7
           5       0.00      0.00      0.00         6
           6       0.27      0.40      0.32        10
           7       0.18      0.17      0.17        12
           8       0.17      0.20      0.18        15
           9       0.30      0.23      0.26        13

    accuracy                           0.14       100
   macro avg       0.10      0.12      0.11       100
weighted avg       0.12      0.14      0.13       100



Look at the coordinates of the centroids (cf. cluster_center_)

In [93]:
kmeans.cluster_centers_

array([[0.48722229, 0.33098773, 0.09681688, 0.83602892, 0.97927423,
        0.26127752, 0.43497304, 0.23518885, 0.05487188, 0.10698057,
        0.56206465, 0.95834548, 0.06528344, 0.18793536, 0.84741464,
        0.29677634, 0.02747373, 0.8815747 , 0.69341538, 0.74794707,
        0.80674246, 0.47467694, 0.645332  , 0.51993494, 0.76474779,
        0.33928906, 0.24155604, 0.17931004, 0.63388072, 0.383335  ,
        0.62341995, 0.85204229, 0.32217841, 0.54149311, 0.39597791,
        0.29423541, 0.12956121, 0.21292839, 0.22532069, 0.38359979,
        0.46938053, 0.26421825, 0.28925609, 0.84916414, 0.02723893,
        0.57871226, 0.87999875, 0.27587015, 0.94259194, 0.97362344,
        0.06045574, 0.28334925, 0.91016849, 0.02738197, 0.6305866 ,
        0.313369  , 0.56557747, 0.18128935, 0.86468769, 0.27908731,
        0.8223572 , 0.85618252, 0.67797313, 0.90954735],
       [0.34574755, 0.39671016, 0.48787641, 0.36582622, 0.27614884,
        0.41830922, 0.24577986, 0.61820425, 0.43959032, 0.5

Try to visualize each of the centroids and compare them with the different labels. What do you notice?

In [94]:
# Reshape des centroïdes pour correspondre à des images 8x8
centroids_images = kmeans.cluster_centers_.reshape(10, 8, 8)

# Visualisation de chaque centroïde
for i in range(len(centroids_images)):
    fig = px.imshow(
        centroids_images[i],
        color_continuous_scale='gray',  # Palette de couleurs en niveaux de gris
    )
    fig.update_xaxes(showticklabels=False)
    fig.update_yaxes(showticklabels=False)
    fig.update_layout(
        title=f"Centroïde du cluster : {i}",
        coloraxis_showscale=False  # Masquer l'échelle de couleur
    )
    fig.show()

----> Looks like good predictions! It just seems that centroids don't necessarily match the label.

We will try to match our cluster labels with the target values. Here are some clues:

a. Identify the most frequent target value for observations in cluster 1.

In [95]:
# Let's look at which clusters each of the numbers belong in
clust = pd.DataFrame(target[clusters==1])
clust.iloc[:,0].value_counts()
# In this example, we can see that it looks like only the 9 belonging to cluster 1

0
8    3
2    2
6    2
4    2
0    1
9    1
Name: count, dtype: int64

b. Programming a loop which allows to create a label vector which contains for each observation the target value corresponding to the cluster to which it belongs.

In [96]:
# Generalize: We must find the mode of each of the arrays corresponds to the desired cluster
from scipy.stats import mode

labels = np.zeros_like(clusters)
for i in range(10):
    mask = (clusters == i)
    labels[mask] = mode(target[mask])[0]

Re-evaluate your model. What is your new accuracy_score?

In [97]:
accuracy_score(labels, target)

0.28

In [99]:
# Create heatmap with plotly
import plotly.figure_factory as ff
from sklearn.metrics import confusion_matrix
mat = confusion_matrix(target, labels)

fig = ff.create_annotated_heatmap(mat,
                                  x = digits.target_names.tolist(),
                                  y = digits.target_names.tolist())


fig.show()