# **Clustering**

Una vez que se ha reducido la dimensionalidad de las características extraídas, se procederá a agruparlas con **KMeans** y **Gaussian Mixture Model**.

## **Load packages**

In [4]:
import numpy as np
import GMM
from importlib import reload
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import ParameterGrid
import pandas as pd
import os

## **Useful functions**

In [5]:
from sklearn.metrics import silhouette_score, rand_score, mutual_info_score, normalized_mutual_info_score

def calculate_clustering_metrics(X, cluster_labels, true_labels):
    """Score a clustering with one internal and three external metrics.

    Args:
        X: Samples that were clustered; forwarded to ``silhouette_score``.
        cluster_labels: Cluster assignment of each sample in ``X``.
        true_labels: Reference labels the assignments are compared against.

    Returns:
        A dict mapping a display name to each score, in the order
        Silhouette, Rand Index, Mutual Information, Normalized Mutual
        Information.
    """
    # The silhouette only needs the data and the assignments; the remaining
    # three scores compare the assignments with the reference labels.
    return {
        "Silhouette Score": silhouette_score(X, cluster_labels),
        "Rand Index (RI)": rand_score(true_labels, cluster_labels),
        "Mutual Information Score (MI)": mutual_info_score(
            true_labels, cluster_labels
        ),
        "Normalized Mutual Information (NMI)": normalized_mutual_info_score(
            true_labels, cluster_labels
        ),
    }

def print_clustering_metrics(metrics):
    """Print a metrics mapping as a small report, one ``name: value`` per line.

    Args:
        metrics: Mapping of metric name to a numeric score; every value is
            rendered with six decimal places.
    """
    # Title and underline go out in a single call, separated by a newline.
    print("Clustering Metrics:", "--------------------", sep="\n")
    for name, score in metrics.items():
        print(f"{name}: {score:.6f}")

def load_features_ids_labels(filename: str, has_labels=True):
    """Load features, ids and (optionally) labels from an ``.npz`` archive.

    Args:
        filename: Path to an ``.npz`` file holding ``features`` and ``ids``
            arrays, plus a ``labels`` array when ``has_labels`` is true.
        has_labels: Pass ``False`` for test data, whose archive carries no
            labels.

    Returns:
        ``(features, ids, labels)`` when ``has_labels`` is true, otherwise
        ``(features, ids)``.

    Raises:
        KeyError: If a requested array is missing from the archive.
    """
    # NOTE(security): allow_pickle=True lets NumPy unpickle object arrays,
    # which can execute arbitrary code. Only load archives you trust.
    # The context manager closes the underlying file handle; each array is
    # read into memory when indexed, so it stays valid after the close.
    with np.load(filename, allow_pickle=True) as contents:
        features = contents['features']
        ids = contents['ids']
        labels = contents['labels'] if has_labels else None

    if has_labels:
        return features, ids, labels
    return features, ids

## **Load datasets**

In [6]:
# Reduced embeddings saved as .npy files, one per split (train / test / val)
# for each reducer (t-SNE, UMAP) and each output dimensionality (2-D, 3-D).
train_tsne_2d = np.load('reduction/train/train_tsne_2d.npy')
test_tsne_2d = np.load('reduction/test/test_tsne_2d.npy')
val_tsne_2d = np.load('reduction/val/val_tsne_2d.npy')

train_tsne_3d = np.load('reduction/train/train_tsne_3d.npy')
test_tsne_3d = np.load('reduction/test/test_tsne_3d.npy')
val_tsne_3d = np.load('reduction/val/val_tsne_3d.npy')

train_umap_2d = np.load('reduction/train/train_umap_2d.npy')
test_umap_2d = np.load('reduction/test/test_umap_2d.npy')
val_umap_2d = np.load('reduction/val/val_umap_2d.npy')

train_umap_3d = np.load('reduction/train/train_umap_3d.npy')
test_umap_3d = np.load('reduction/test/test_umap_3d.npy')
val_umap_3d = np.load('reduction/val/val_umap_3d.npy')

# Training labels come from a separate .npy file.
y_train = np.load('reduction/train_numeric_labels.npy')

# Only the labels of the validation archive are needed; features and ids are
# bound to the throwaway name `_`.
_,_, y_val = load_features_ids_labels('features_np/features_val.npz')
# Remove the throwaway name, which still references the ids array.
del _

In [7]:
# Map each validation label to the index of its value in the sorted array of
# unique labels (np.unique with return_inverse=True).
unique_labels, y_val_encoded = np.unique(y_val, return_inverse=True)
# flatten() guarantees a 1-D copy of the codes whatever shape y_val has.
y_val_encoded = y_val_encoded.flatten()

## **Modelos**

In [8]:
from kmeans_plus_plus import KMeans

### **Useful functions**

Se crearon funciones que ayudan a encontrar los mejores hiperparámetros con ayuda del dataset `val`

In [9]:
def calculate_best_hyperparameters(X_train, X_val, y_val, max_clusters=10,
                                   tol_range=None, random_state=42):
    """Sweep the KMeans tolerance and score every fit on the validation split.

    For each tolerance a fresh ``KMeans`` is fitted on ``X_train``; the
    clusters predicted for ``X_val`` are then scored with
    ``calculate_clustering_metrics`` against ``y_val``.

    Args:
        X_train: Samples used to fit each model.
        X_val: Samples whose predicted clusters are scored.
        y_val: Reference labels for ``X_val``.
        max_clusters: Passed to ``KMeans`` as ``n_clusters``. Despite the
            name, this is the exact number of clusters, not an upper bound.
        tol_range: Iterable of tolerances to try. ``None`` (the default)
            uses the original grid from 1 down to 1e-4.
        random_state: Seed forwarded to every ``KMeans`` instance.

    Returns:
        A ``pandas.DataFrame`` with one row per tolerance: the tolerance,
        the model's ``num_iter`` after fitting, and one column per metric.
    """
    if tol_range is None:
        tol_range = (1, 5e-1, 1e-1, 5e-2, 1e-2, 5e-3, 1e-3, 5e-4, 1e-4)

    results = []
    for tol in tol_range:
        kmeans = KMeans(n_clusters=max_clusters, tol=tol, random_state=random_state)
        kmeans.fit(X_train)

        # Score on the validation split so the tolerance is not chosen on
        # the same data the centroids were fitted to.
        cluster_labels = kmeans.predict(X_val)
        metrics = calculate_clustering_metrics(X_val, cluster_labels, true_labels=y_val)

        results.append({'Tolerance': tol, 'Num Iterations': kmeans.num_iter, **metrics})

    return pd.DataFrame(results)

def find_best_row_multiple_metrics(df):
    """Pick the row that wins the most metrics by simple voting.

    The row holding the maximum of each of Silhouette Score, Rand Index and
    Mutual Information receives one vote. The row with the most votes is
    returned; on a tie, the row that appears first in ``df.index`` wins.

    Args:
        df: Results table containing the three metric columns.

    Returns:
        A one-row ``DataFrame`` (``df.loc[[label]]``) for the winning row.
    """
    voting_columns = (
        'Silhouette Score',
        'Rand Index (RI)',
        'Mutual Information Score (MI)',
    )

    # Start every row at zero so ties resolve in index order.
    tally = dict.fromkeys(df.index, 0)
    for column in voting_columns:
        tally[df[column].idxmax()] += 1

    top_votes = max(tally.values())
    winner = next(label for label, count in tally.items() if count == top_votes)

    return df.loc[[winner]]

In [10]:
# One cluster per distinct label found in the validation set.
num_clusters = len(unique_labels)

### **KMeans | t-SNE - 2 components**

In [11]:
# Tolerance sweep on the 2-D t-SNE embedding (fit on train, score on val).
results_tsne_2d = calculate_best_hyperparameters(train_tsne_2d, val_tsne_2d, y_val_encoded, max_clusters=num_clusters)
# Bare name as the last expression so the notebook displays the table.
results_tsne_2d

Unnamed: 0,Tolerance,Num Iterations,Silhouette Score,Rand Index (RI),Mutual Information Score (MI),Normalized Mutual Information (NMI)
0,1.0,14,0.483378,0.945628,1.800261,0.790272
1,0.5,18,0.489934,0.947153,1.814473,0.797094
2,0.1,20,0.489934,0.947153,1.814473,0.797094
3,0.05,21,0.489934,0.947153,1.814473,0.797094
4,0.01,21,0.489934,0.947153,1.814473,0.797094
5,0.005,21,0.489934,0.947153,1.814473,0.797094
6,0.001,21,0.489934,0.947153,1.814473,0.797094
7,0.0005,21,0.489934,0.947153,1.814473,0.797094
8,0.0001,21,0.489934,0.947153,1.814473,0.797094


In [12]:
# Select the sweep row that wins the most metrics.
best_tsne_2d = find_best_row_multiple_metrics(results_tsne_2d)
best_tsne_2d

Unnamed: 0,Tolerance,Num Iterations,Silhouette Score,Rand Index (RI),Mutual Information Score (MI),Normalized Mutual Information (NMI)
1,0.5,18,0.489934,0.947153,1.814473,0.797094


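Por lo tanto, la mejor tolerancia encontrada es $0.5$: todas las tolerancias $\le 0.5$ producen métricas idénticas y, ante un empate, se conserva la primera fila (la de mayor tolerancia).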

In [13]:
# Refit on the 2-D t-SNE training embedding with the selected tolerance.
kmeans_tsne_2d = KMeans(n_clusters=num_clusters, random_state=42, tol=best_tsne_2d['Tolerance'].values[0])
cluster_labels_tsne_2d = kmeans_tsne_2d.fit_predict(train_tsne_2d)

# Report the metrics of the training assignments against y_train.
metrics_tsne_2d = calculate_clustering_metrics(train_tsne_2d, cluster_labels_tsne_2d, y_train)
print_clustering_metrics(metrics_tsne_2d)

Clustering Metrics:
--------------------
Silhouette Score: 0.492065
Rand Index (RI): 0.956135
Mutual Information Score (MI): 1.975055
Normalized Mutual Information (NMI): 0.870614


### **KMeans | t-SNE - 3 components**

In [14]:
# Tolerance sweep on the 3-D t-SNE embedding (fit on train, score on val).
results_tsne_3d = calculate_best_hyperparameters(train_tsne_3d, val_tsne_3d, y_val_encoded, max_clusters=num_clusters)
# Bare name as the last expression so the notebook displays the table.
results_tsne_3d

Unnamed: 0,Tolerance,Num Iterations,Silhouette Score,Rand Index (RI),Mutual Information Score (MI),Normalized Mutual Information (NMI)
0,1.0,11,0.368669,0.956244,1.858218,0.811378
1,0.5,13,0.368008,0.956752,1.86099,0.812226
2,0.1,16,0.368008,0.956752,1.86099,0.812226
3,0.05,18,0.36746,0.957283,1.863876,0.813146
4,0.01,18,0.36746,0.957283,1.863876,0.813146
5,0.005,18,0.36746,0.957283,1.863876,0.813146
6,0.001,18,0.36746,0.957283,1.863876,0.813146
7,0.0005,18,0.36746,0.957283,1.863876,0.813146
8,0.0001,18,0.36746,0.957283,1.863876,0.813146


In [15]:
# Select the sweep row that wins the most metrics.
best_tsne_3d = find_best_row_multiple_metrics(results_tsne_3d)
best_tsne_3d

Unnamed: 0,Tolerance,Num Iterations,Silhouette Score,Rand Index (RI),Mutual Information Score (MI),Normalized Mutual Information (NMI)
3,0.05,18,0.36746,0.957283,1.863876,0.813146


In [16]:
# Refit on the 3-D t-SNE training embedding with the selected tolerance.
kmeans_tsne_3d = KMeans(n_clusters=num_clusters, random_state=42, tol=best_tsne_3d['Tolerance'].values[0])
cluster_labels_tsne_3d = kmeans_tsne_3d.fit_predict(train_tsne_3d)

# Report the metrics of the training assignments against y_train.
metrics_tsne_3d = calculate_clustering_metrics(train_tsne_3d, cluster_labels_tsne_3d, y_train)
print_clustering_metrics(metrics_tsne_3d)

Clustering Metrics:
--------------------
Silhouette Score: 0.413569
Rand Index (RI): 0.980594
Mutual Information Score (MI): 2.093024
Normalized Mutual Information (NMI): 0.919907


### **KMeans | UMAP - 2 components**

In [17]:
# Tolerance sweep on the 2-D UMAP embedding (fit on train, score on val).
results_umap_2d = calculate_best_hyperparameters(train_umap_2d, val_umap_2d, y_val_encoded, max_clusters=num_clusters)
# Bare name as the last expression so the notebook displays the table.
results_umap_2d

Unnamed: 0,Tolerance,Num Iterations,Silhouette Score,Rand Index (RI),Mutual Information Score (MI),Normalized Mutual Information (NMI)
0,1.0,1,0.76844,0.94533,1.831711,0.812378
1,0.5,1,0.76844,0.94533,1.831711,0.812378
2,0.1,3,0.770126,0.945363,1.831711,0.812453
3,0.05,4,0.770126,0.945363,1.831711,0.812453
4,0.01,6,0.770126,0.945363,1.831711,0.812453
5,0.005,7,0.770126,0.945363,1.831711,0.812453
6,0.001,11,0.770126,0.945363,1.831711,0.812453
7,0.0005,11,0.770126,0.945363,1.831711,0.812453
8,0.0001,11,0.770126,0.945363,1.831711,0.812453


In [18]:
# Select the sweep row that wins the most metrics.
best_umap_2d = find_best_row_multiple_metrics(results_umap_2d)
best_umap_2d

Unnamed: 0,Tolerance,Num Iterations,Silhouette Score,Rand Index (RI),Mutual Information Score (MI),Normalized Mutual Information (NMI)
2,0.1,3,0.770126,0.945363,1.831711,0.812453


In [19]:
# Refit on the 2-D UMAP training embedding with the selected tolerance.
kmeans_umap_2d = KMeans(n_clusters=num_clusters, random_state=42, tol=best_umap_2d['Tolerance'].values[0])
cluster_labels_umap_2d = kmeans_umap_2d.fit_predict(train_umap_2d)

# Report the metrics of the training assignments against y_train.
metrics_umap_2d = calculate_clustering_metrics(train_umap_2d, cluster_labels_umap_2d, y_train)
print_clustering_metrics(metrics_umap_2d)

Clustering Metrics:
--------------------
Silhouette Score: 0.800982
Rand Index (RI): 0.968290
Mutual Information Score (MI): 2.065485
Normalized Mutual Information (NMI): 0.925883


### **KMeans | UMAP - 3 components**

In [20]:
# Tolerance sweep on the 3-D UMAP embedding (fit on train, score on val).
results_umap_3d = calculate_best_hyperparameters(train_umap_3d, val_umap_3d, y_val_encoded, max_clusters=num_clusters)
# Bare name as the last expression so the notebook displays the table.
results_umap_3d

Unnamed: 0,Tolerance,Num Iterations,Silhouette Score,Rand Index (RI),Mutual Information Score (MI),Normalized Mutual Information (NMI)
0,1.0,1,0.724054,0.958619,1.895212,0.82662
1,0.5,1,0.724054,0.958619,1.895212,0.82662
2,0.1,2,0.724636,0.958487,1.896803,0.827843
3,0.05,3,0.724506,0.957879,1.893571,0.82673
4,0.01,6,0.724553,0.957294,1.890474,0.825697
5,0.005,6,0.724553,0.957294,1.890474,0.825697
6,0.001,12,0.724553,0.957294,1.890474,0.825697
7,0.0005,12,0.724553,0.957294,1.890474,0.825697
8,0.0001,12,0.724553,0.957294,1.890474,0.825697


In [21]:
# Select the sweep row that wins the most metrics.
best_umap_3d = find_best_row_multiple_metrics(results_umap_3d)
best_umap_3d

Unnamed: 0,Tolerance,Num Iterations,Silhouette Score,Rand Index (RI),Mutual Information Score (MI),Normalized Mutual Information (NMI)
2,0.1,2,0.724636,0.958487,1.896803,0.827843


In [22]:
# Refit on the 3-D UMAP training embedding with the selected tolerance.
kmeans_umap_3d = KMeans(n_clusters=num_clusters, random_state=42, tol=best_umap_3d['Tolerance'].values[0])
cluster_labels_umap_3d = kmeans_umap_3d.fit_predict(train_umap_3d)

# Report the metrics of the training assignments against y_train.
metrics_umap_3d = calculate_clustering_metrics(train_umap_3d, cluster_labels_umap_3d, y_train)
print_clustering_metrics(metrics_umap_3d)

Clustering Metrics:
--------------------
Silhouette Score: 0.780104
Rand Index (RI): 0.986301
Mutual Information Score (MI): 2.147657
Normalized Mutual Information (NMI): 0.944350


## **Save cluster files**

Una vez aplicado `fit` a cada uno de los modelos, se harán las predicciones sobre el dataset `test_subset_10.csv` y se guardarán en archivos `.csv` para la competencia de Kaggle.

In [23]:
# Only the ids of the test archive are needed here; the features are discarded.
_, test_ids = load_features_ids_labels('features_np/features_test.npz', has_labels=False)
# Create the output directory; exist_ok=True makes re-running the cell safe.
os.makedirs('submission', exist_ok=True)

In [24]:
# For each embedding: predict a cluster for every test sample with the model
# fitted on the matching training embedding, pair it with the test id, and
# write a two-column CSV (youtube_id, label) without the DataFrame index.
# NOTE(review): 'label' holds the cluster indices returned by predict();
# confirm that this is the format the competition expects.

# TSNE 2D
test_pred_tsne_2d = kmeans_tsne_2d.predict(test_tsne_2d)
km_tsne_2d = pd.DataFrame({'youtube_id':test_ids.flatten(), 'label': test_pred_tsne_2d.flatten()})
km_tsne_2d.to_csv('submission/km_tsne_2d.csv', index=False)

# TSNE 3D
test_pred_tsne_3d = kmeans_tsne_3d.predict(test_tsne_3d)
km_tsne_3d = pd.DataFrame({'youtube_id':test_ids.flatten(), 'label': test_pred_tsne_3d.flatten()})
km_tsne_3d.to_csv('submission/km_tsne_3d.csv', index=False)

# UMAP 2D
test_pred_umap_2d = kmeans_umap_2d.predict(test_umap_2d)
km_umap_2d = pd.DataFrame({'youtube_id':test_ids.flatten(), 'label': test_pred_umap_2d.flatten()})
km_umap_2d.to_csv('submission/km_umap_2d.csv', index=False)

# UMAP 3D
test_pred_umap_3d = kmeans_umap_3d.predict(test_umap_3d)
km_umap_3d = pd.DataFrame({'youtube_id':test_ids.flatten(), 'label': test_pred_umap_3d.flatten()})
km_umap_3d.to_csv('submission/km_umap_3d.csv', index=False)