# **Clustering**

Una vez reducida la dimensionalidad de las características extraídas, se procederá a agruparlas con **KMeans** y **Gaussian Mixture Model**.

## **Load packages**

In [1]:
import numpy as np
import GMM
from importlib import reload
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import ParameterGrid
import pandas as pd

## **Useful functions**

In [27]:
from sklearn.metrics import silhouette_score, rand_score, adjusted_rand_score, mutual_info_score, normalized_mutual_info_score

def calculate_clustering_metrics(X, cluster_labels, true_labels):
    """Score a clustering with one internal and four external metrics.

    The silhouette score is computed from ``X`` and ``cluster_labels`` alone;
    the remaining four metrics compare ``true_labels`` with ``cluster_labels``.

    Args:
        X: Samples that were clustered.
        cluster_labels: Cluster assignment of every sample in ``X``.
        true_labels: Reference label of every sample in ``X``.

    Returns:
        dict mapping a human-readable metric name to its value, in the order
        silhouette, Rand index, adjusted Rand index, mutual information,
        normalized mutual information.
    """
    # Dict literals evaluate their values top to bottom, so the metrics are
    # computed (and inserted) in the order listed here.
    return {
        "Silhouette Score": silhouette_score(X, cluster_labels),
        "Rand Index (RI)": rand_score(true_labels, cluster_labels),
        "Adjusted Rand Index": adjusted_rand_score(true_labels, cluster_labels),
        "Mutual Information Score (MI)": mutual_info_score(true_labels, cluster_labels),
        "Normalized Mutual Information (NMI)": normalized_mutual_info_score(true_labels, cluster_labels),
    }

def print_clustering_metrics(metrics):
    """Print a metrics mapping as a small report, one ``name: value`` per line.

    Args:
        metrics: Mapping of metric name to a numeric value; every value is
            rendered with six decimal places.
    """
    print("Clustering Metrics:")
    print("--------------------")
    # Lazy generator: each row is formatted right before it is printed.
    report_rows = (f"{name}: {score:.6f}" for name, score in metrics.items())
    for row in report_rows:
        print(row)

def load_features_ids_labels(filename: str, has_labels: bool = True):
    """Load features, ids and (optionally) labels from an ``.npz`` archive.

    Args:
        filename: Path to an ``.npz`` archive holding the arrays ``features``
            and ``ids`` (and ``labels`` when ``has_labels`` is true).
        has_labels: Set to ``False`` for test data; the ``labels`` array is
            then not read and not returned.

    Returns:
        ``(features, ids, labels)`` when ``has_labels`` is true, otherwise
        ``(features, ids)``.

    Raises:
        KeyError: If one of the requested arrays is missing from the archive.
    """
    # NOTE(security): allow_pickle=True lets NumPy unpickle object arrays,
    # which can execute arbitrary code. Only load archives you trust.
    #
    # np.load returns a lazy NpzFile that keeps the archive open; the context
    # manager closes it once the requested arrays have been read into memory.
    with np.load(filename, allow_pickle=True) as contents:
        features = contents['features']
        ids = contents['ids']
        if has_labels:
            return features, ids, contents['labels']
    return features, ids

## **Load datasets**

In [3]:
# Reduced representations are loaded per method/dimension as
# (train, test, val) triples; judging by the file names these are t-SNE and
# UMAP embeddings with 2 and 3 components.
train_tsne_2d, test_tsne_2d, val_tsne_2d = (
    np.load('reduction/train/train_tsne_2d.npy'),
    np.load('reduction/test/test_tsne_2d.npy'),
    np.load('reduction/val/val_tsne_2d.npy'),
)

train_tsne_3d, test_tsne_3d, val_tsne_3d = (
    np.load('reduction/train/train_tsne_3d.npy'),
    np.load('reduction/test/test_tsne_3d.npy'),
    np.load('reduction/val/val_tsne_3d.npy'),
)

train_umap_2d, test_umap_2d, val_umap_2d = (
    np.load('reduction/train/train_umap_2d.npy'),
    np.load('reduction/test/test_umap_2d.npy'),
    np.load('reduction/val/val_umap_2d.npy'),
)

train_umap_3d, test_umap_3d, val_umap_3d = (
    np.load('reduction/train/train_umap_3d.npy'),
    np.load('reduction/test/test_umap_3d.npy'),
    np.load('reduction/val/val_umap_3d.npy'),
)

# Training labels, stored as their own array.
y_train = np.load('reduction/train_numeric_labels.npy')

# Only the labels of the validation archive are needed; features and ids are
# bound to the throwaway name `_`, which is removed again right after.
_, _, y_val = load_features_ids_labels('features_np/features_val.npz')
del _

In [4]:
# Encode the validation labels as integers: `unique_labels` holds the sorted
# distinct values and `y_val_encoded` the index of each sample's label in it.
unique_labels, y_val_encoded = np.unique(y_val, return_inverse=True)
# Force a 1-D array (presumably guards against the inverse keeping the shape
# of a multi-dimensional `y_val` - TODO confirm the shape of `y_val`).
y_val_encoded = y_val_encoded.flatten()

## **Modelos**

In [5]:
from kmeans_plus_plus import KMeans

### **Useful functions**

Se crearon funciones que ayudan a encontrar los mejores hiperparámetros con ayuda del dataset `val`

In [23]:
def calculate_best_hiperparameters(X_train, X_val, y_val, max_clusters=10, *, tol_range=None, random_state=42):
    """Sweep the KMeans tolerance and tabulate the metrics obtained on validation data.

    For every tolerance a KMeans model is fitted on ``X_train``; the clusters
    it predicts for ``X_val`` are scored against ``y_val`` with
    ``calculate_clustering_metrics``.

    Args:
        X_train: Data the model is fitted on.
        X_val: Data whose predicted clusters are scored.
        y_val: Reference labels of ``X_val``.
        max_clusters: Value passed to KMeans as ``n_clusters``. Despite the
            name it is used as-is; no search over cluster counts is performed.
        tol_range: Iterable of tolerances to try. ``None`` (the default) uses
            the grid ``1, 5e-1, 1e-1, 5e-2, 1e-2, 5e-3, 1e-3, 5e-4, 1e-4``.
        random_state: Seed forwarded to KMeans for every run.

    Returns:
        pandas.DataFrame with one row per tolerance, in the order tried. The
        columns are ``Tolerance``, ``Num Iterations`` (``kmeans.num_iter``
        after fitting) and one column per clustering metric.
    """
    if tol_range is None:
        tol_range = (1, 5e-1, 1e-1, 5e-2, 1e-2, 5e-3, 1e-3, 5e-4, 1e-4)

    results = []
    for tol in tol_range:
        kmeans = KMeans(n_clusters=max_clusters, tol=tol, random_state=random_state)
        kmeans.fit(X_train)

        # The model is scored on data it was not fitted on.
        cluster_labels = kmeans.predict(X_val)
        metrics = calculate_clustering_metrics(X_val, cluster_labels, true_labels=y_val)

        # Sweep bookkeeping first, then the metric columns.
        results.append({'Tolerance': tol, 'Num Iterations': kmeans.num_iter, **metrics})

    return pd.DataFrame(results)

def find_best_row_multiple_metrics(df):
    """Return the top-ranked row of a metrics table as a one-row DataFrame.

    Rows are ranked from highest to lowest by silhouette score; ties are
    broken by the Rand index and then by the mutual information score.

    Args:
        df: DataFrame containing the columns ``Silhouette Score``,
            ``Rand Index (RI)`` and ``Mutual Information Score (MI)``.

    Returns:
        DataFrame holding only the best row (empty if ``df`` is empty), with
        its original index label preserved.
    """
    ranking_columns = [
        'Silhouette Score',
        'Rand Index (RI)',
        'Mutual Information Score (MI)',
    ]
    ranked = df.sort_values(by=ranking_columns, ascending=False)
    return ranked.head(1)

### **t-SNE | KMeans**

In [33]:
# One cluster per distinct label found in the validation set.
num_clusters = len(unique_labels)

# Tolerance sweep on the 2-D t-SNE data: fit on train, score on validation.
results_tsne_2d = calculate_best_hiperparameters(
    X_train=train_tsne_2d,
    X_val=val_tsne_2d,
    y_val=y_val_encoded,
    max_clusters=num_clusters,
)
results_tsne_2d  # bare expression so the notebook displays the table

Unnamed: 0,Tolerance,Num Iterations,Silhouette Score,Rand Index (RI),Adjusted Rand Index,Mutual Information Score (MI),Normalized Mutual Information (NMI)
0,1.0,14,0.483378,0.945628,0.705374,1.800261,0.790272
1,0.5,18,0.489934,0.947153,0.71476,1.814473,0.797094
2,0.1,20,0.489934,0.947153,0.71476,1.814473,0.797094
3,0.05,21,0.489934,0.947153,0.71476,1.814473,0.797094
4,0.01,21,0.489934,0.947153,0.71476,1.814473,0.797094
5,0.005,21,0.489934,0.947153,0.71476,1.814473,0.797094
6,0.001,21,0.489934,0.947153,0.71476,1.814473,0.797094
7,0.0005,21,0.489934,0.947153,0.71476,1.814473,0.797094
8,0.0001,21,0.489934,0.947153,0.71476,1.814473,0.797094


In [34]:
# Keep only the top-ranked row of the t-SNE tolerance sweep.
best_tsne_2d = find_best_row_multiple_metrics(results_tsne_2d)
best_tsne_2d  # bare expression so the notebook displays the selected row

Unnamed: 0,Tolerance,Num Iterations,Silhouette Score,Rand Index (RI),Adjusted Rand Index,Mutual Information Score (MI),Normalized Mutual Information (NMI)
1,0.5,18,0.489934,0.947153,0.71476,1.814473,0.797094


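Por lo tanto, la mejor tolerancia encontrada es $0.5$. Todas las tolerancias menores obtienen exactamente las mismas métricas, por lo que se conserva la primera fila del empate.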

In [39]:
# Refit on the 2-D t-SNE training data using the tolerance selected above;
# `.values[0]` extracts the scalar from the one-row DataFrame.
kmeans = KMeans(n_clusters=num_clusters, random_state=42, tol=best_tsne_2d['Tolerance'].values[0])
cluster_labels = kmeans.fit_predict(train_tsne_2d)

# Score the training-set clustering against the training labels and report it.
metrics = calculate_clustering_metrics(train_tsne_2d, cluster_labels, y_train)
print_clustering_metrics(metrics)


Clustering Metrics:
--------------------
Silhouette Score: 0.492065
Rand Index (RI): 0.956135
Adjusted Rand Index: 0.769656
Mutual Information Score (MI): 1.975055
Normalized Mutual Information (NMI): 0.870614


### **UMAP | KMeans**

In [36]:
# One cluster per distinct label found in the validation set.
num_clusters = len(unique_labels)

# Tolerance sweep on the 2-D UMAP data: fit on train, score on validation.
results_umap_2d = calculate_best_hiperparameters(
    X_train=train_umap_2d,
    X_val=val_umap_2d,
    y_val=y_val_encoded,
    max_clusters=num_clusters,
)
results_umap_2d  # bare expression so the notebook displays the table

Unnamed: 0,Tolerance,Num Iterations,Silhouette Score,Rand Index (RI),Adjusted Rand Index,Mutual Information Score (MI),Normalized Mutual Information (NMI)
0,1.0,1,0.72643,0.95842,0.769477,1.905324,0.831783
1,0.5,1,0.72643,0.95842,0.769477,1.905324,0.831783
2,0.1,2,0.72643,0.95842,0.769477,1.905324,0.831783
3,0.05,3,0.725311,0.958884,0.77169,1.907823,0.832527
4,0.01,5,0.723756,0.959878,0.776572,1.913154,0.83423
5,0.005,6,0.723756,0.959878,0.776572,1.913154,0.83423
6,0.001,7,0.723756,0.959878,0.776572,1.913154,0.83423
7,0.0005,7,0.723756,0.959878,0.776572,1.913154,0.83423
8,0.0001,7,0.723756,0.959878,0.776572,1.913154,0.83423


In [37]:
# Keep only the top-ranked row of the UMAP tolerance sweep.
best_umap_2d = find_best_row_multiple_metrics(results_umap_2d)
best_umap_2d  # bare expression so the notebook displays the selected row

Unnamed: 0,Tolerance,Num Iterations,Silhouette Score,Rand Index (RI),Adjusted Rand Index,Mutual Information Score (MI),Normalized Mutual Information (NMI)
0,1.0,1,0.72643,0.95842,0.769477,1.905324,0.831783


In [41]:
# KMeans is already imported in an earlier cell; repeating the import here
# only matters if this cell is run on its own.
from kmeans_plus_plus import KMeans

# NOTE(review): here the cluster count comes from the training labels, whereas
# the sweep cells derive it from the validation labels (`unique_labels`).
num_clusters = len(np.unique(y_train)) 

# Refit on the 2-D UMAP training data using the tolerance selected above;
# `.values[0]` extracts the scalar from the one-row DataFrame.
kmeans = KMeans(n_clusters=num_clusters, random_state=42, tol=best_umap_2d['Tolerance'].values[0])
cluster_labels = kmeans.fit_predict(train_umap_2d)

# Score the training-set clustering against the training labels and report it.
metrics_umap = calculate_clustering_metrics(train_umap_2d, cluster_labels, y_train)
print_clustering_metrics(metrics_umap)

Clustering Metrics:
--------------------
Silhouette Score: 0.776862
Rand Index (RI): 0.984051
Adjusted Rand Index: 0.915143
Mutual Information Score (MI): 2.140184
Normalized Mutual Information (NMI): 0.940949
