In [1]:
import os
os.chdir('../')
print(os.getcwd())

/Users/Placebo/OMSCS/CS7641-ML/MachineLearningProjects/UnsupervisedLearning


In [2]:
import pandas as pd
import numpy as np
%matplotlib inline

In [3]:
%load_ext autoreload
%autoreload 2
from src.utils.data_loader import load_processed_data, save_csv
from src.utils.plotting import plot_metrics_vs_cluster, plot_cluster_evaluation
from src.experiments.experiment1_clustering import ClusteringExperiment

# Dataset 1

In [4]:
X_train, X_test, y_train, y_test = load_processed_data('data/dataset1')

# combine the train and test data
X_train = np.concatenate([X_train, X_test], axis=0)
y_train = np.concatenate([y_train, y_test], axis=0)
dataset = 'dataset1'
experiment_name = 'experiment1'
y_train = (y_train + 1) / 2

## Running clustering analysis

In [5]:
experiment = ClusteringExperiment()

In [6]:
kmean_metrics, em_metrics = experiment.run_clustering_analysis(X_train, [2, 3, 4, 5, 6, 7, 8, 9, 10])

In [7]:
# save the metrics
save_csv(kmean_metrics, f'results/{dataset}/{experiment_name}', 'kmean_metrics')
save_csv(em_metrics, f'results/{dataset}/{experiment_name}', 'em_metrics')

Dataframe saved at results/dataset1/experiment1/kmean_metrics.csv
Dataframe saved at results/dataset1/experiment1/em_metrics.csv


## Plotting metrics vs number of clusters

### KMeans

In [8]:
plot_metrics_vs_cluster(
    df = kmean_metrics,
    metric_col = 'inertia',
    k_col = 'k',
    dataset = 'dataset1',
    experiment = 'experiment1',
    algo_name='kmeans'
)

plot_metrics_vs_cluster(
    df = kmean_metrics,
    metric_col = 'silhouette_score',
    k_col = 'k',
    dataset = 'dataset1',
    experiment = 'experiment1',
    algo_name='kmeans'
)

plot_metrics_vs_cluster(
    df = kmean_metrics,
    metric_col = 'calinski_harabasz_score',
    k_col = 'k',
    dataset = 'dataset1',
    experiment = 'experiment1',
    algo_name='kmeans'
)

Plot saved at figs/dataset1/experiment1/inertia_vs_k_kmeans.png
Plot saved at figs/dataset1/experiment1/silhouette_score_vs_k_kmeans.png
Plot saved at figs/dataset1/experiment1/calinski_harabasz_score_vs_k_kmeans.png


### EM


In [9]:
plot_metrics_vs_cluster(
    df = em_metrics,
    metric_col = 'bic',
    k_col = 'k',
    dataset = 'dataset1',
    experiment = 'experiment1',
    algo_name='em'
)

plot_metrics_vs_cluster(
    df = em_metrics,
    metric_col = 'aic',
    k_col = 'k',
    dataset = 'dataset1',
    experiment = 'experiment1',
    algo_name='em'
)

plot_metrics_vs_cluster(
    df = em_metrics,
    metric_col = 'silhouette_score',
    k_col = 'k',
    dataset = 'dataset1',
    experiment = 'experiment1',
    algo_name='em'
)

plot_metrics_vs_cluster(
    df = em_metrics,
    metric_col = 'calinski_harabasz_score',
    k_col = 'k',
    dataset = 'dataset1',
    experiment = 'experiment1',
    algo_name='em'
)



Plot saved at figs/dataset1/experiment1/bic_vs_k_em.png
Plot saved at figs/dataset1/experiment1/aic_vs_k_em.png
Plot saved at figs/dataset1/experiment1/silhouette_score_vs_k_em.png
Plot saved at figs/dataset1/experiment1/calinski_harabasz_score_vs_k_em.png


## Combine EM and KMeans metrics

In [10]:
kmean_metrics_copy = kmean_metrics[['k', 'silhouette_score', 'calinski_harabasz_score']].copy()
kmean_metrics_copy['algo'] = 'kmeans'
em_metrics_copy = em_metrics[['k', 'silhouette_score', 'calinski_harabasz_score']].copy()
em_metrics_copy['algo'] = 'em'

combined_metrics = pd.concat([kmean_metrics_copy, em_metrics_copy], axis=0)
# draw the combined plot
plot_metrics_vs_cluster(
    df = combined_metrics,
    metric_col = 'silhouette_score',
    k_col = 'k',
    group_col= 'algo',
    dataset = 'dataset1',
    experiment = 'experiment1'
)

plot_metrics_vs_cluster(
    df = combined_metrics,
    metric_col = 'calinski_harabasz_score',
    k_col = 'k',
    group_col= 'algo',
    dataset = 'dataset1',
    experiment = 'experiment1'
)

Plot saved at figs/dataset1/experiment1/silhouette_score_vs_k_em_kmeans.png
Plot saved at figs/dataset1/experiment1/calinski_harabasz_score_vs_k_em_kmeans.png


## Supervised metrics

In [11]:
optimal_k = {
    'kmeans': 2,
    'em': 2
}
evaluation_results = experiment.evaluate_clustering(X_train, y_train, optimal_k)

In [12]:
# Analyze cluster composition for each algorithm
kmeans_labels = experiment.kmeans.fit(X_train, optimal_k['kmeans'])
em_labels = experiment.em.fit(X_train, optimal_k['em'])
kmeans_composition = experiment.analyze_cluster_composition(kmeans_labels, y_train)
em_composition = experiment.analyze_cluster_composition(em_labels, y_train)

print("\nK-Means Cluster Composition:")
print(kmeans_composition)
print("\nEM Cluster Composition:")
print(em_composition)


K-Means Cluster Composition:
True_Label        0.0        1.0   Size
Cluster                                
0           76.367770  23.632230  14074
1           95.153795   4.846205  27114

EM Cluster Composition:
True_Label        0.0        1.0   Size
Cluster                                
0           76.416475  23.583525  14349
1           95.320243   4.679757  26839


In [13]:
evaluation_results

{'kmeans': {'adjusted_rand': 0.11861320217946517,
  'normalized_mutual_info': np.float64(0.07543964029224692),
  'adjusted_mutual_info': np.float64(0.07541705831893174),
  'homogeneity': np.float64(0.10652688887140759),
  'completeness': np.float64(0.058397707726779526),
  'v_measure': np.float64(0.07543964029224691)},
 'em': {'adjusted_rand': 0.1165391720599892,
  'normalized_mutual_info': np.float64(0.07730453817491507),
  'adjusted_mutual_info': np.float64(0.07728209837408168),
  'homogeneity': np.float64(0.10963013387832135),
  'completeness': np.float64(0.05970105535358838),
  'v_measure': np.float64(0.07730453817491507)}}

In [14]:
# Plot evaluation metrics
plot_cluster_evaluation(
    evaluation_results,
    dataset = dataset,
    experiment = experiment_name
)

Plot saved at figs/dataset1/experiment1/clustering_evaluation_metrics_comparison.png


# Dataset 2

In [15]:
X_train, X_test, y_train, y_test = load_processed_data('data/dataset2')
X_train = np.concatenate([X_train, X_test], axis=0)
y_train = np.concatenate([y_train, y_test], axis=0)
dataset = 'dataset2'
experiment_name = 'experiment1'

## Running clustering analysis

In [16]:
# Plot evaluation metrics
plot_cluster_evaluation(
    evaluation_results,
    dataset = dataset,
    experiment = experiment_name
)   


Plot saved at figs/dataset2/experiment1/clustering_evaluation_metrics_comparison.png


In [17]:
experiment = ClusteringExperiment()

In [18]:
kmean_metrics, em_metrics = experiment.run_clustering_analysis(X_train, [2, 3, 4, 5, 6, 7, 8, 9, 10])

# save the metrics
save_csv(kmean_metrics, f'results/{dataset}/{experiment_name}', 'kmean_metrics')
save_csv(em_metrics, f'results/{dataset}/{experiment_name}', 'em_metrics')

Dataframe saved at results/dataset2/experiment1/kmean_metrics.csv
Dataframe saved at results/dataset2/experiment1/em_metrics.csv


## Plotting metrics vs number of clusters

### KMeans

In [19]:
plot_metrics_vs_cluster(
    df = kmean_metrics,
    metric_col = 'inertia',
    k_col = 'k',
    dataset = dataset,
    experiment = experiment_name,
    algo_name='kmeans'
)

plot_metrics_vs_cluster(
    df = kmean_metrics,
    metric_col = 'silhouette_score',
    k_col = 'k',
    dataset = dataset,
    experiment = experiment_name,
    algo_name='kmeans'
)

plot_metrics_vs_cluster(
    df = kmean_metrics,
    metric_col = 'calinski_harabasz_score',
    k_col = 'k',
    dataset = dataset,
    experiment = experiment_name,
    algo_name='kmeans'
)

Plot saved at figs/dataset2/experiment1/inertia_vs_k_kmeans.png
Plot saved at figs/dataset2/experiment1/silhouette_score_vs_k_kmeans.png
Plot saved at figs/dataset2/experiment1/calinski_harabasz_score_vs_k_kmeans.png


### EM

In [20]:
plot_metrics_vs_cluster(
    df = em_metrics,
    metric_col = 'bic',
    k_col = 'k',
    dataset = dataset,
    experiment = experiment_name,
    algo_name='em'
)

plot_metrics_vs_cluster(
    df = em_metrics,
    metric_col = 'aic',
    k_col = 'k',
    dataset = dataset,
    experiment = experiment_name,
    algo_name='em'
)

plot_metrics_vs_cluster(
    df = em_metrics,
    metric_col = 'silhouette_score',
    k_col = 'k',
    dataset = dataset,
    experiment = experiment_name,
    algo_name='em'
)

plot_metrics_vs_cluster(
    df = em_metrics,
    metric_col = 'calinski_harabasz_score',
    k_col = 'k',
    dataset = dataset,
    experiment = experiment_name,
    algo_name='em'
)

Plot saved at figs/dataset2/experiment1/bic_vs_k_em.png
Plot saved at figs/dataset2/experiment1/aic_vs_k_em.png
Plot saved at figs/dataset2/experiment1/silhouette_score_vs_k_em.png
Plot saved at figs/dataset2/experiment1/calinski_harabasz_score_vs_k_em.png


## Combine EM and KMeans metrics

In [21]:
kmean_metrics_copy = kmean_metrics[['k', 'silhouette_score', 'calinski_harabasz_score']].copy()
kmean_metrics_copy['algo'] = 'kmeans'
em_metrics_copy = em_metrics[['k', 'silhouette_score', 'calinski_harabasz_score']].copy()
em_metrics_copy['algo'] = 'em'

combined_metrics = pd.concat([kmean_metrics_copy, em_metrics_copy], axis=0)
# draw the combined plot
plot_metrics_vs_cluster(
    df = combined_metrics,
    metric_col = 'silhouette_score',
    k_col = 'k',
    group_col= 'algo',
    dataset = dataset,
    experiment = experiment_name
)

plot_metrics_vs_cluster(
    df = combined_metrics,
    metric_col = 'calinski_harabasz_score',
    k_col = 'k',
    group_col= 'algo',
    dataset = dataset,
    experiment = experiment_name
)

Plot saved at figs/dataset2/experiment1/silhouette_score_vs_k_em_kmeans.png
Plot saved at figs/dataset2/experiment1/calinski_harabasz_score_vs_k_em_kmeans.png


## Supervised metrics

In [22]:
optimal_k = {
    'kmeans': 5,
    'em': 5
}
evaluation_results = experiment.evaluate_clustering(X_train, y_train, optimal_k)

# Analyze cluster composition for each algorithm
kmeans_labels = experiment.kmeans.fit(X_train, optimal_k['kmeans'])
em_labels = experiment.em.fit(X_train, optimal_k['em'])
kmeans_composition = experiment.analyze_cluster_composition(kmeans_labels, y_train)
em_composition = experiment.analyze_cluster_composition(em_labels, y_train)

print("\nK-Means Cluster Composition:")
print(kmeans_composition)
print("\nEM Cluster Composition:")
print(em_composition)


K-Means Cluster Composition:
True_Label          0          1  Size
Cluster                               
0           53.551913  46.448087   366
1           50.143266  49.856734   349
2           49.157303  50.842697   356
3           46.118721  53.881279   438
4           45.413870  54.586130   447

EM Cluster Composition:
True_Label          0          1  Size
Cluster                               
0           53.551913  46.448087   366
1           50.143266  49.856734   349
2           49.157303  50.842697   356
3           46.118721  53.881279   438
4           45.413870  54.586130   447
