## Section 3 – Unsupervised learning – clustering

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.model_selection import GridSearchCV
from wordcloud import WordCloud
from sklearn.manifold import TSNE

In [3]:
# Read the feature table from disk (presumably TF-IDF features, going by the
# file name — confirm against the cell that wrote it).
df = pd.read_parquet("df_tfidf.parquet")

# Clustering input: every column except the "Set_Fingerprint" column.
X = df.drop("Set_Fingerprint", axis="columns")

## 3.1 Determine the number of clusters

### 3.1.1 K-means

In [None]:
# Sweep the number of clusters k for K-means and record, for each k:
#   * WCSS (within-cluster sum of squares): the sum of squared distances between
#     points and their assigned cluster centroid (KMeans exposes it as `inertia_`);
#   * the silhouette score of the resulting cluster assignment.
wcss = []
silhouette_scores_kmeans = []
n_cluster_list_kmeans = []

for n_clusters in range(3, 10):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    # fit_predict() returns the label array; plain fit() returns the estimator
    # itself, which is not what a variable called `cluster_labels` should hold.
    cluster_labels = kmeans.fit_predict(X)

    wcss.append(kmeans.inertia_)
    silhouette_scores_kmeans.append(silhouette_score(X, cluster_labels))
    n_cluster_list_kmeans.append(n_clusters)



In [None]:
# Side-by-side diagnostics for choosing k with K-means:
# elbow curve (WCSS) on the left, silhouette score on the right.
fig, (ax_elbow, ax_silhouette) = plt.subplots(1, 2, figsize=(14, 6))

# Plot the Elbow Method
ax_elbow.plot(n_cluster_list_kmeans, wcss, marker='o', linestyle='--', color='blue')
ax_elbow.set_title('Elbow Method for Kmeans')
ax_elbow.set_xlabel('Number of Clusters (k)')
ax_elbow.set_ylabel('WCSS')
# Tick every k that was actually evaluated. The previous hard-coded range(3, 8)
# stopped at 7 and left the last points of the sweep without a tick.
ax_elbow.set_xticks(n_cluster_list_kmeans)

# Plot the Silhouette Method
ax_silhouette.plot(n_cluster_list_kmeans, silhouette_scores_kmeans, marker='o', linestyle='--', color='orange')
ax_silhouette.set_title('Silhouette Method for Kmeans')
ax_silhouette.set_xlabel('Number of Clusters (k)')
ax_silhouette.set_ylabel('Silhouette Score')
ax_silhouette.set_xticks(n_cluster_list_kmeans)

fig.tight_layout()
plt.show()

### 3.1.2 GMM

In [None]:
# Sweep the number of mixture components for a Gaussian Mixture Model and
# record, for each value, the silhouette score of the hard cluster assignment
# and the model's log-likelihood score on X.
n_cluster_list_gmm = []
silhouette_list_gmm = []
log_l_list = []

for n_clusters in range(3, 10):
    # Fixed seed so the sweep is reproducible across kernel restarts, matching
    # the random_state=42 used by the other estimators in this notebook.
    gmm = GaussianMixture(n_components=n_clusters, random_state=42)
    cl_labels = gmm.fit_predict(X)

    n_cluster_list_gmm.append(n_clusters)
    silhouette_list_gmm.append(silhouette_score(X, cl_labels))
    # GaussianMixture.score() is the per-sample average log-likelihood of X.
    log_l_list.append(gmm.score(X))


In [None]:
# Raw numbers behind the GMM sweep: silhouette scores first, then log-likelihood scores.
print(silhouette_list_gmm, log_l_list, sep="\n")

In [None]:
# Pick the component count with the highest silhouette score in the GMM sweep.
# (The lists are named *_gmm; the previous reference to `silhouette_list`
# pointed at a name that is never defined and raised NameError.)
best_idx = int(np.argmax(silhouette_list_gmm))
best_silhouette = silhouette_list_gmm[best_idx]  # Best silhouette
best_n_cluster = n_cluster_list_gmm[best_idx]  # n_clusters related to the best silhouette
print("best k: ", best_n_cluster, ", corresponding silhouette: ", best_silhouette)

# Silhouette score per number of components, best point marked with a red cross.
plt.figure(figsize=(5, 3.5))
plt.plot(n_cluster_list_gmm, silhouette_list_gmm, marker='o', markersize=5)
plt.scatter(best_n_cluster, best_silhouette, color='r', marker='x', s=90)
plt.grid()
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')
plt.show()

# Log-likelihood score per number of components.
plt.figure(figsize=(5, 3.5))
plt.plot(n_cluster_list_gmm, log_l_list, marker='o', markersize=5)
plt.grid()
plt.xlabel('Number of clusters')
# The plotted values come from GaussianMixture.score(), which is the average
# log-likelihood per sample rather than a total over the data set.
plt.ylabel('GMM average log-likelihood per sample')
plt.show()

## 3.2 Tuning other hyperparameters

### 3.2.1 K-means

In [None]:
# Hyperparameter search for K-means with the number of clusters fixed at 8.
# No `scoring` is passed, so GridSearchCV ranks candidates with the
# estimator's own score() method on each of the 3 validation folds.
param_grid_kmeans = dict(
    init=['k-means++', 'random'],
    n_init=[1, 4, 10],
    max_iter=[50, 100, 150],
)

kmeans = KMeans(n_clusters=8, random_state=42)
grid_search_kmeans = GridSearchCV(
    estimator=kmeans,
    param_grid=param_grid_kmeans,
    cv=3,
)
grid_search_kmeans.fit(X)

# Keep the winning combination for the next cell.
best_params_kmeans = grid_search_kmeans.best_params_
print("Best parameters:", best_params_kmeans)


In [None]:
# Refit K-means on the full data with the hyperparameters chosen by the grid
# search, then report its silhouette score and inertia.
kmeans_t = KMeans(
    n_clusters=8,
    init=best_params_kmeans['init'],
    n_init=best_params_kmeans['n_init'],
    max_iter=best_params_kmeans['max_iter'],
    random_state=42,
)
cl_labels_tuned = kmeans_t.fit_predict(X)

silhouette_kmeans_tuned = silhouette_score(X, cl_labels_tuned)
# Stored under its own name: assigning this scalar to `wcss` would overwrite the
# per-k list built by the K-means sweep and break a re-run of the elbow plot.
wcss_tuned = kmeans_t.inertia_
print("Silhouette: ", silhouette_kmeans_tuned)
print("Inertia: ", wcss_tuned)

### 3.2.2 GMM

In [10]:
# Candidate hyperparameter values explored for the Gaussian mixture model.
param_grid_gmm = dict(
    covariance_type=['full', 'spherical'],
    tol=[1e-3, 1e-4],
    max_iter=[50, 100, 150],
)

def silhouette_scoring(gmm, X, y=None):
    """Score an already-fitted mixture model by the silhouette of its clustering of X.

    Intended as the ``scoring`` callable of GridSearchCV, which fits the
    estimator on the training fold and then calls the scorer with the
    validation fold.

    Parameters
    ----------
    gmm : fitted estimator exposing ``predict`` (e.g. GaussianMixture)
    X : samples to assign to clusters and to compute the silhouette on
    y : ignored; accepted so the function also works when a scorer is
        called as ``scorer(estimator, X, y)``

    Returns
    -------
    The value of ``silhouette_score(X, labels)`` for the predicted labels.
    """
    # Use predict(), not fit_predict(): refitting here would throw away the
    # model trained on the training fold and fit a new one on the validation
    # fold, so the cross-validation would no longer evaluate the fitted model.
    labels = gmm.predict(X)
    return silhouette_score(X, labels)

# Base mixture model: 8 components, fixed seed. The grid search varies only
# the parameters listed in param_grid_gmm.
gmm_to_tune = GaussianMixture(n_components=8, random_state=42)

# 3-fold search, ranking candidates with the custom silhouette scorer.
grid_search_gmm = GridSearchCV(
    estimator=gmm_to_tune,
    param_grid=param_grid_gmm,
    scoring=silhouette_scoring,
    cv=3,
)
grid_search_gmm.fit(X)

# Keep the winning combination for the final fit.
best_params_gmm = grid_search_gmm.best_params_
print("Best parameters:", best_params_gmm)

Best parameters: {'covariance_type': 'full', 'max_iter': 50, 'tol': 0.001}


In [None]:
# (intentionally empty — stray keystrokes removed so the notebook survives Restart & Run All)

In [12]:
# Final Gaussian mixture fitted on the full data with the hyperparameters
# selected by the grid search.
# NOTE(review): this uses 10 components, while the grid search above was run
# with n_components=8 and the earlier sweep only covered 3..9 — confirm that
# 10 is the intended value.
gmm_t = GaussianMixture(
    n_components=10,
    random_state=42,
    covariance_type=best_params_gmm['covariance_type'],
    max_iter=best_params_gmm['max_iter'],
    tol=best_params_gmm['tol'],
)
gmm_label_t = gmm_t.fit_predict(X)

# Quality of the final model: silhouette of its labels and its score() on X.
silhouette_gmm_tuned = silhouette_score(X, gmm_label_t)
log_l_t = gmm_t.score(X)
print("Silhouette: ", silhouette_gmm_tuned)
print("Log-l: ", log_l_t)

Silhouette:  0.8874511047300687
Log-l:  238.7523602706648
