# Homework

1. Do clustering on the digits dataset from sklearn using at least three different approaches (k-means, hierarchical, birch)
2. Evaluate the quality of clustering using three metrics from the lecture. Apply dimensionality reduction, then do clustering and calculate metrics.
3. Make a comparison table of different clustering approaches with and without dimensionality reduction.
4. Draw a conclusion about how to perform validation and which metrics to use in a real-life task.

In this task, we want to use 10 clusters, as we have 10 digits.

In [36]:
# load the digits dataset

# train k-means on the digits dataset
# calculate metrics for the k-means model: silhouette_score, adjusted_rand_score, adjusted_mutual_info_score
# save the metrics
# do the same for other two clustering algorithms of your choice

from sklearn import datasets
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, SpectralClustering, Birch
from sklearn.metrics import silhouette_score, adjusted_rand_score, adjusted_mutual_info_score

# Load the scikit-learn digits dataset. `digits.data` is the feature matrix
# fed to every clusterer and `digits.target` holds the true labels used by the
# external metrics (ARI / AMI).
digits = datasets.load_digits()

# Exploratory t-SNE embedding of the raw features. The seed is fixed so that
# re-running the notebook reproduces the same embedding (t-SNE is stochastic).
# NOTE(review): `embeddings_tsne` is not referenced again in the visible
# notebook (the t-SNE experiment fits its own `X_tsne`) - consider dropping
# this fit if it is not needed for plotting elsewhere.
tsne = TSNE(random_state=42)
embeddings_tsne = tsne.fit_transform(digits.data)

In [43]:
# Train k-means on the raw digits features, one cluster per digit class.
# random_state is fixed (as in the other k-means runs of this notebook) so the
# centroid initialisation, and therefore the scores, are reproducible.
model_kmeans = KMeans(n_clusters=10, random_state=42)
data_clusters = model_kmeans.fit_predict(digits.data)

# Internal metric: silhouette only needs the features and predicted labels.
silhouette_km = silhouette_score(digits.data, data_clusters)
# External metrics: agreement between predicted clusters and true labels.
ari_km = adjusted_rand_score(digits.target, data_clusters)
ami_km = adjusted_mutual_info_score(digits.target, data_clusters)

In [44]:
# Bundle the k-means scores as (silhouette, ARI, AMI) and display them.
kmeans_metrics = (silhouette_km, ari_km, ami_km)
kmeans_metrics

(0.18826050698876154, 0.6554583959905361, 0.7489677266765028)

In [34]:
# Second algorithm: spectral clustering on the raw (untransformed) features,
# using a nearest-neighbours affinity graph and a fixed seed.
spectral_model = SpectralClustering(
    n_clusters=10,
    affinity='nearest_neighbors',
    random_state=42,
)
spectral_clusters = spectral_model.fit_predict(digits.data)

# External metrics against the true labels, then the internal silhouette.
ari_spectral = adjusted_rand_score(digits.target, spectral_clusters)
ami_spectral = adjusted_mutual_info_score(digits.target, spectral_clusters)
silhouette_spectral = silhouette_score(digits.data, spectral_clusters)

In [35]:
# Scores of spectral clustering on the raw features: (silhouette, ARI, AMI).
spectral_metrics = (silhouette_spectral, ari_spectral, ami_spectral)
# Backward-compatible alias: the tuple was originally stored under this name,
# which wrongly suggests it holds only silhouette scores; later cells still
# read it, so the old name is kept.
silhouette_metrics = spectral_metrics
spectral_metrics

(0.18272860285856599, 0.7564608880380487, 0.8520396374862457)

In [39]:
# Third algorithm: BIRCH on the raw features, asked for 10 final clusters.
birch_model = Birch(n_clusters=10)
birch_clusters = birch_model.fit_predict(digits.data)

# External metrics against the true labels, then the internal silhouette.
ari_birch = adjusted_rand_score(digits.target, birch_clusters)
ami_birch = adjusted_mutual_info_score(digits.target, birch_clusters)
silhouette_birch = silhouette_score(digits.data, birch_clusters)

In [40]:
# Bundle the BIRCH scores as (silhouette, ARI, AMI) and display them.
birch_metrics = (silhouette_birch, ari_birch, ami_birch)
birch_metrics

(0.17849659940596496, 0.7940031835568753, 0.8668321489750319)

In [75]:
# Results on the raw features. Rows: k-means, spectral, BIRCH;
# columns within each row: silhouette, ARI, AMI.
digits_metrics = (
    kmeans_metrics,
    silhouette_metrics,  # spectral clustering scores, despite the name
    birch_metrics,
)
digits_metrics

((0.18826050698876154, 0.6554583959905361, 0.7489677266765028),
 (0.18272860285856599, 0.7564608880380487, 0.8520396374862457),
 (0.17849659940596496, 0.7940031835568753, 0.8668321489750319))

In [45]:
# apply PCA to the dataset
from sklearn.decomposition import PCA


# Fit PCA on the digits features and keep a 2-component projection; `X_pca`
# is the input for the PCA-based clustering runs.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(digits.data)

# train the same clustering algorithms on the PCA-transformed dataset
# calculate the same metrics for the PCA-transformed dataset
# save the metrics
# do the same with TSNE and UMAP transformations



In [49]:
# k-means on the 2-component PCA projection (seeded).
model_kmeans_pca = KMeans(n_clusters=10, random_state=42).fit(X_pca)
data_clusters_kmeans_pca = model_kmeans_pca.predict(X_pca)

# Silhouette is measured in the PCA space the clusters were found in;
# ARI / AMI compare the clusters with the true digit labels.
silhouette_kmeans_pca = silhouette_score(X_pca, data_clusters_kmeans_pca)
ari_kmeans_pca = adjusted_rand_score(digits.target, data_clusters_kmeans_pca)
ami_kmeans_pca = adjusted_mutual_info_score(digits.target, data_clusters_kmeans_pca)

# (silhouette, ARI, AMI) for k-means on PCA; the bare name displays the tuple.
pca_kmeans_metrics = (silhouette_kmeans_pca, ari_kmeans_pca, ami_kmeans_pca)
pca_kmeans_metrics

(0.4003584991908948, 0.36027928369709045, 0.509809700094859)

In [51]:
# Spectral clustering (nearest-neighbours affinity, seeded) on the PCA projection.
spectral_model_pca = SpectralClustering(
    n_clusters=10,
    affinity='nearest_neighbors',
    random_state=42,
)
spectral_clusters_pca = spectral_model_pca.fit_predict(X_pca)

# Internal score in PCA space, external scores against the true labels.
silhouette_spectral_pca = silhouette_score(X_pca, spectral_clusters_pca)
ari_spectral_pca = adjusted_rand_score(digits.target, spectral_clusters_pca)
ami_spectral_pca = adjusted_mutual_info_score(digits.target, spectral_clusters_pca)

# (silhouette, ARI, AMI) for spectral clustering on PCA.
pca_spectral_metrics = (
    silhouette_spectral_pca,
    ari_spectral_pca,
    ami_spectral_pca,
)
pca_spectral_metrics

(0.3771817281176335, 0.36074987488964066, 0.5184746702052185)

In [52]:
# BIRCH on the PCA projection, asked for 10 final clusters.
birch_model_pca = Birch(n_clusters=10)
birch_clusters_pca = birch_model_pca.fit_predict(X_pca)

# Internal score in PCA space, external scores against the true labels.
silhouette_birch_pca = silhouette_score(X_pca, birch_clusters_pca)
ari_birch_pca = adjusted_rand_score(digits.target, birch_clusters_pca)
ami_birch_pca = adjusted_mutual_info_score(digits.target, birch_clusters_pca)

# (silhouette, ARI, AMI) for BIRCH on PCA.
pca_birch_metrics = (silhouette_birch_pca, ari_birch_pca, ami_birch_pca)
pca_birch_metrics

(0.3168153424420648, 0.36997579338357345, 0.49343717177935814)

In [74]:
# Results on the PCA projection. Rows: k-means, spectral, BIRCH;
# columns within each row: silhouette, ARI, AMI.
pca_metrics = (
    pca_kmeans_metrics,
    pca_spectral_metrics,
    pca_birch_metrics,
)
pca_metrics

((0.4003584991908948, 0.36027928369709045, 0.509809700094859),
 (0.3771817281176335, 0.36074987488964066, 0.5184746702052185),
 (0.3168153424420648, 0.36997579338357345, 0.49343717177935814))

In [58]:
#Umap
import umap.umap_ as umap


# Embed the digits into 2 dimensions with UMAP. The seed is fixed because
# UMAP is stochastic: without it every run produces a different embedding and
# the downstream cluster scores cannot be reproduced.
umap_model = umap.UMAP(n_components=2, random_state=42)
X_umap = umap_model.fit_transform(digits.data)

# Train KMeans on UMAP-transformed data
model_kmeans_umap = KMeans(n_clusters=10, random_state=42)
data_clusters_kmeans_umap = model_kmeans_umap.fit_predict(X_umap)

# Train Spectral Clustering on UMAP-transformed data
# NOTE(review): this is the only spectral run that overrides n_neighbors (the
# other runs rely on the library default) - confirm the override is
# intentional, since it makes the spectral scores less directly comparable
# across embeddings.
spectral_model_umap = SpectralClustering(
    n_clusters=10,
    random_state=42,
    affinity='nearest_neighbors',
    n_neighbors=15,
)
spectral_clusters_umap = spectral_model_umap.fit_predict(X_umap)

# Train Birch Clustering on UMAP-transformed data
birch_model_umap = Birch(n_clusters=10)
birch_clusters_umap = birch_model_umap.fit_predict(X_umap)



In [73]:
# Score each clustering of the UMAP embedding. Silhouette is computed in the
# UMAP space; ARI / AMI compare the clusters with the true digit labels.
# Every tuple is ordered (silhouette, ARI, AMI).

# k-means
silhouette_kmeans_umap = silhouette_score(X_umap, data_clusters_kmeans_umap)
ari_kmeans_umap = adjusted_rand_score(digits.target, data_clusters_kmeans_umap)
ami_kmeans_umap = adjusted_mutual_info_score(digits.target, data_clusters_kmeans_umap)
kmean_umap_metrics = (silhouette_kmeans_umap, ari_kmeans_umap, ami_kmeans_umap)

# spectral clustering
silhouette_spectral_umap = silhouette_score(X_umap, spectral_clusters_umap)
ari_spectral_umap = adjusted_rand_score(digits.target, spectral_clusters_umap)
ami_spectral_umap = adjusted_mutual_info_score(digits.target, spectral_clusters_umap)
spectral_umap_metrics = (
    silhouette_spectral_umap,
    ari_spectral_umap,
    ami_spectral_umap,
)

# BIRCH
silhouette_birch_umap = silhouette_score(X_umap, birch_clusters_umap)
ari_birch_umap = adjusted_rand_score(digits.target, birch_clusters_umap)
ami_birch_umap = adjusted_mutual_info_score(digits.target, birch_clusters_umap)
birch_umap_metrics = (silhouette_birch_umap, ari_birch_umap, ami_birch_umap)

# Rows: k-means, spectral, BIRCH.
umap_metrics = (kmean_umap_metrics, spectral_umap_metrics, birch_umap_metrics)
umap_metrics

((0.7948192, 0.820685984210742, 0.9034593114504199),
 (-0.1819423, 0.17053367335538905, 0.5343717874889509),
 (0.7948192, 0.820685984210742, 0.9034593114504201))

In [65]:
# t-SNE: seeded 2-component embedding of the digits features.
tsne_model = TSNE(
    n_components=2,
    random_state=42,
    max_iter=1000,
)
X_tsne = tsne_model.fit_transform(digits.data)

In [72]:
# Cluster the t-SNE embedding with the same three algorithms and score each.
# Silhouette is computed in the t-SNE space; ARI / AMI compare the clusters
# with the true digit labels. Every tuple is ordered (silhouette, ARI, AMI).

# --- k-means ---
model_kmeans_tsne = KMeans(n_clusters=10, random_state=42).fit(X_tsne)
data_clusters_kmeans_tsne = model_kmeans_tsne.predict(X_tsne)

silhouette_kmeans_tsne = silhouette_score(X_tsne, data_clusters_kmeans_tsne)
ari_kmeans_tsne = adjusted_rand_score(digits.target, data_clusters_kmeans_tsne)
ami_kmeans_tsne = adjusted_mutual_info_score(digits.target, data_clusters_kmeans_tsne)
kmeans_tsne_metrics = (silhouette_kmeans_tsne, ari_kmeans_tsne, ami_kmeans_tsne)

# --- spectral clustering ---
spectral_model_tsne = SpectralClustering(
    n_clusters=10,
    affinity='nearest_neighbors',
    random_state=42,
)
spectral_clusters_tsne = spectral_model_tsne.fit_predict(X_tsne)

silhouette_spectral_tsne = silhouette_score(X_tsne, spectral_clusters_tsne)
ari_spectral_tsne = adjusted_rand_score(digits.target, spectral_clusters_tsne)
ami_spectral_tsne = adjusted_mutual_info_score(digits.target, spectral_clusters_tsne)
spectral_tsne_metrics = (
    silhouette_spectral_tsne,
    ari_spectral_tsne,
    ami_spectral_tsne,
)

# --- BIRCH ---
birch_model_tsne = Birch(n_clusters=10)
birch_clusters_tsne = birch_model_tsne.fit_predict(X_tsne)

silhouette_birch_tsne = silhouette_score(X_tsne, birch_clusters_tsne)
ari_birch_tsne = adjusted_rand_score(digits.target, birch_clusters_tsne)
ami_birch_tsne = adjusted_mutual_info_score(digits.target, birch_clusters_tsne)
birch_tsne_metrics = (silhouette_birch_tsne, ari_birch_tsne, ami_birch_tsne)

# Rows: k-means, spectral, BIRCH.
tsne_metrics = (kmeans_tsne_metrics, spectral_tsne_metrics, birch_tsne_metrics)
tsne_metrics



((0.640311, 0.8814982769237074, 0.9065868102431197),
 (-0.13350049, 0.2140155233039428, 0.5857424914900585),
 (0.6423378, 0.9002355914417323, 0.9201660724550942))

In [80]:
# Dump the four result tables, one per line, in the order:
# raw features, PCA, UMAP, t-SNE.
print(
    digits_metrics,
    pca_metrics,
    umap_metrics,
    tsne_metrics,
    sep='\n',
)

((0.18826050698876154, 0.6554583959905361, 0.7489677266765028), (0.18272860285856599, 0.7564608880380487, 0.8520396374862457), (0.17849659940596496, 0.7940031835568753, 0.8668321489750319))
((0.4003584991908948, 0.36027928369709045, 0.509809700094859), (0.3771817281176335, 0.36074987488964066, 0.5184746702052185), (0.3168153424420648, 0.36997579338357345, 0.49343717177935814))
((0.7948192, 0.820685984210742, 0.9034593114504199), (-0.1819423, 0.17053367335538905, 0.5343717874889509), (0.7948192, 0.820685984210742, 0.9034593114504201))
((0.640311, 0.8814982769237074, 0.9065868102431197), (-0.13350049, 0.2140155233039428, 0.5857424914900585), (0.6423378, 0.9002355914417323, 0.9201660724550942))


In [None]:
# compare the metrics for the original dataset and the transformed datasets
# make a conclusion about the quality of the clustering algorithms and the transformations

In [None]:
# what metrics are the most informative in this case?
# what metrics should be used to compare the quality of the clustering algorithms and the transformations in real-life, when the true labels are unknown?
# provide an example of a real-life task and explain the validation strategy and metrics for it 