In [1]:
import os

# Step up one directory so the relative paths used throughout the notebook
# ('data/...', 'results/...', 'figs/...') resolve from the printed directory.
# The guard makes the cell safe to re-run: without it, every extra execution
# in the same kernel would climb one more level.
if 'PROJECT_ROOT' not in globals():
    os.chdir('../')
    PROJECT_ROOT = os.getcwd()
print(os.getcwd())

/Users/Placebo/OMSCS/CS7641-ML/MachineLearningProjects/UnsupervisedLearning


In [2]:
import pandas as pd
%matplotlib inline

In [3]:
%load_ext autoreload
%autoreload 2
from src.utils.data_loader import load_processed_data, save_csv
from src.utils.plotting import plot_metrics_vs_cluster, plot_cluster_evaluation
from src.experiments.experiment1_clustering import ClusteringExperiment

# Dataset 1

In [4]:
# Names used to build the input path here and the results/ and figs/ paths later.
dataset = 'dataset1'
experiment_name = 'experiment1'

# Only the first and third returned values are kept; the other two are discarded.
X_train, _, y_train, _ = load_processed_data(f'data/{dataset}')

# Affine relabelling: -1 -> 0 and +1 -> 1 (true division, so labels become floats).
# NOTE(review): presumes the stored labels are -1/+1 — confirm against the data prep.
y_train = (y_train + 1) / 2

## Running clustering analysis

In [5]:
# Experiment object for dataset1; the analysis, evaluation and composition cells
# of this section all call methods on it.
experiment = ClusteringExperiment()

In [6]:
# Sweep k = 2..10 (inclusive); one metrics table is returned per algorithm.
cluster_counts = list(range(2, 11))
kmean_metrics, em_metrics = experiment.run_clustering_analysis(X_train, cluster_counts)

In [7]:
# Write both metric tables to results/<dataset>/<experiment_name>/.
metrics_dir = f'results/{dataset}/{experiment_name}'
for file_stem, metrics_table in (
    ('kmean_metrics', kmean_metrics),
    ('em_metrics', em_metrics),
):
    save_csv(metrics_table, metrics_dir, file_stem)

Dataframe saved at results/dataset1/experiment1/kmean_metrics.csv
Dataframe saved at results/dataset1/experiment1/em_metrics.csv


## Plotting metrics vs number of clusters

### KMeans

In [8]:
# One figure per K-Means metric, each plotted against the number of clusters.
# `dataset` / `experiment_name` are used instead of repeating the literals so the
# output location follows the values set when the data was loaded.
for metric in ('inertia', 'silhouette_score', 'calinski_harabasz_score'):
    plot_metrics_vs_cluster(
        df=kmean_metrics,
        metric_col=metric,
        k_col='k',
        dataset=dataset,
        experiment=experiment_name,
        algo_name='kmeans',
    )

Plot saved at figs/dataset1/experiment1/inertia_vs_k_kmeans.png
Plot saved at figs/dataset1/experiment1/silhouette_score_vs_k_kmeans.png
Plot saved at figs/dataset1/experiment1/calinski_harabasz_score_vs_k_kmeans.png


### EM


In [9]:
# One figure per EM metric, each plotted against the number of clusters.
# `dataset` / `experiment_name` are used instead of repeating the literals so the
# output location follows the values set when the data was loaded.
for metric in ('bic', 'aic', 'silhouette_score', 'calinski_harabasz_score'):
    plot_metrics_vs_cluster(
        df=em_metrics,
        metric_col=metric,
        k_col='k',
        dataset=dataset,
        experiment=experiment_name,
        algo_name='em',
    )



Plot saved at figs/dataset1/experiment1/bic_vs_k_em.png
Plot saved at figs/dataset1/experiment1/aic_vs_k_em.png
Plot saved at figs/dataset1/experiment1/silhouette_score_vs_k_em.png
Plot saved at figs/dataset1/experiment1/calinski_harabasz_score_vs_k_em.png


## Combine EM and KMeans metrics

In [10]:
# Columns present in both metric tables, i.e. the ones that can be compared directly.
shared_cols = ['k', 'silhouette_score', 'calinski_harabasz_score']

# .assign returns a new frame, so the original metric tables are left untouched.
kmean_metrics_copy = kmean_metrics[shared_cols].assign(algo='kmeans')
em_metrics_copy = em_metrics[shared_cols].assign(algo='em')

# Stack the two tables; the 'algo' column tells the rows apart.
combined_metrics = pd.concat([kmean_metrics_copy, em_metrics_copy], axis=0)

# Draw one combined plot per shared metric, grouped by algorithm.
# `dataset` / `experiment_name` replace the repeated literals so the output
# location follows the values set when the data was loaded.
for metric in ('silhouette_score', 'calinski_harabasz_score'):
    plot_metrics_vs_cluster(
        df=combined_metrics,
        metric_col=metric,
        k_col='k',
        group_col='algo',
        dataset=dataset,
        experiment=experiment_name,
    )

Plot saved at figs/dataset1/experiment1/silhouette_score_vs_k_em_kmeans.png
Plot saved at figs/dataset1/experiment1/calinski_harabasz_score_vs_k_em_kmeans.png


## Supervised metrics

In [11]:
# Number of clusters per algorithm used for the comparison against the true labels.
# NOTE(review): presumably read off the metric-vs-k plots above — confirm.
optimal_k = dict(kmeans=2, em=2)

evaluation_results = experiment.evaluate_clustering(X_train, y_train, optimal_k)

In [12]:
# Fit each algorithm at its chosen k, then tabulate the clusters against the true labels.
kmeans_labels = experiment.kmeans.fit(X_train, optimal_k['kmeans'])
em_labels = experiment.em.fit(X_train, optimal_k['em'])
kmeans_composition = experiment.analyze_cluster_composition(kmeans_labels, y_train)
em_composition = experiment.analyze_cluster_composition(em_labels, y_train)

# Print a heading followed by the table, K-Means first and EM second.
for heading, composition in (
    ("\nK-Means Cluster Composition:", kmeans_composition),
    ("\nEM Cluster Composition:", em_composition),
):
    print(heading)
    print(composition)


K-Means Cluster Composition:
True_Label        0.0        1.0   Size
Cluster                                
0           95.202765   4.797235  21700
1           76.177778  23.822222  11250

EM Cluster Composition:
True_Label        0.0        1.0   Size
Cluster                                
0           95.351870   4.648130  21471
1           76.278421  23.721579  11479


In [13]:
# Display the per-algorithm scores returned by evaluate_clustering for dataset1.
evaluation_results

{'kmeans': {'adjusted_rand': 0.1205605385237297,
  'normalized_mutual_info': np.float64(0.07715521390770433),
  'adjusted_mutual_info': np.float64(0.07712704835389132),
  'homogeneity': np.float64(0.10881666504885362),
  'completeness': np.float64(0.05976570085273393),
  'v_measure': np.float64(0.07715521390770433)},
 'em': {'adjusted_rand': 0.1177957347269449,
  'normalized_mutual_info': np.float64(0.07851231469374532),
  'adjusted_mutual_info': np.float64(0.07848431613410364),
  'homogeneity': np.float64(0.11122707025529976),
  'completeness': np.float64(0.06066822268503489),
  'v_measure': np.float64(0.07851231469374532)}}

In [14]:
# Plot the supervised evaluation scores of both algorithms for the current dataset.
plot_cluster_evaluation(evaluation_results, dataset=dataset, experiment=experiment_name)

Plot saved at figs/dataset1/experiment1/clustering_evaluation_metrics_comparison.png


# Dataset 2

In [15]:
# Switch the naming variables to dataset2; later cells build output paths from them.
dataset = 'dataset2'
experiment_name = 'experiment1'

# Only the first and third returned values are kept; the other two are discarded.
X_train, _, y_train, _ = load_processed_data(f'data/{dataset}')

## Running clustering analysis

In [16]:
# Plot evaluation metrics
# NOTE(review): at this point `evaluation_results` still holds the values computed
# for dataset1 earlier in the notebook, while `dataset` has already been switched
# to 'dataset2'. The figure written here therefore stores dataset1's scores under
# the dataset2 path. TODO: run this only after evaluate_clustering has been called
# on the dataset2 data.
plot_cluster_evaluation(
    evaluation_results,
    dataset = dataset,
    experiment = experiment_name
)   


Plot saved at figs/dataset2/experiment1/clustering_evaluation_metrics_comparison.png


In [17]:
# New experiment object for dataset2; rebinding the name means the cells below
# no longer use the instance that was created for dataset1.
experiment = ClusteringExperiment()

In [18]:
# Sweep k = 2..10 (inclusive); one metrics table is returned per algorithm.
cluster_counts = list(range(2, 11))
kmean_metrics, em_metrics = experiment.run_clustering_analysis(X_train, cluster_counts)

# Write both metric tables to results/<dataset>/<experiment_name>/.
metrics_dir = f'results/{dataset}/{experiment_name}'
for file_stem, metrics_table in (
    ('kmean_metrics', kmean_metrics),
    ('em_metrics', em_metrics),
):
    save_csv(metrics_table, metrics_dir, file_stem)

Dataframe saved at results/dataset2/experiment1/kmean_metrics.csv
Dataframe saved at results/dataset2/experiment1/em_metrics.csv


## Plotting metrics vs number of clusters

### KMeans

In [19]:
# One figure per K-Means metric, each plotted against the number of clusters.
for metric in ('inertia', 'silhouette_score', 'calinski_harabasz_score'):
    plot_metrics_vs_cluster(
        df=kmean_metrics,
        metric_col=metric,
        k_col='k',
        dataset=dataset,
        experiment=experiment_name,
        algo_name='kmeans',
    )

Plot saved at figs/dataset2/experiment1/inertia_vs_k_kmeans.png
Plot saved at figs/dataset2/experiment1/silhouette_score_vs_k_kmeans.png
Plot saved at figs/dataset2/experiment1/calinski_harabasz_score_vs_k_kmeans.png


### EM

In [20]:
# One figure per EM metric, each plotted against the number of clusters.
for metric in ('bic', 'aic', 'silhouette_score', 'calinski_harabasz_score'):
    plot_metrics_vs_cluster(
        df=em_metrics,
        metric_col=metric,
        k_col='k',
        dataset=dataset,
        experiment=experiment_name,
        algo_name='em',
    )

Plot saved at figs/dataset2/experiment1/bic_vs_k_em.png
Plot saved at figs/dataset2/experiment1/aic_vs_k_em.png
Plot saved at figs/dataset2/experiment1/silhouette_score_vs_k_em.png
Plot saved at figs/dataset2/experiment1/calinski_harabasz_score_vs_k_em.png


## Combine EM and KMeans metrics

In [21]:
# Columns present in both metric tables, i.e. the ones that can be compared directly.
shared_cols = ['k', 'silhouette_score', 'calinski_harabasz_score']

# .assign returns a new frame, so the original metric tables are left untouched.
kmean_metrics_copy = kmean_metrics[shared_cols].assign(algo='kmeans')
em_metrics_copy = em_metrics[shared_cols].assign(algo='em')

# Stack the two tables; the 'algo' column tells the rows apart.
combined_metrics = pd.concat([kmean_metrics_copy, em_metrics_copy], axis=0)

# Draw one combined plot per shared metric, grouped by algorithm.
for metric in ('silhouette_score', 'calinski_harabasz_score'):
    plot_metrics_vs_cluster(
        df=combined_metrics,
        metric_col=metric,
        k_col='k',
        group_col='algo',
        dataset=dataset,
        experiment=experiment_name,
    )

Plot saved at figs/dataset2/experiment1/silhouette_score_vs_k_em_kmeans.png
Plot saved at figs/dataset2/experiment1/calinski_harabasz_score_vs_k_em_kmeans.png


## Supervised metrics

In [22]:
# Number of clusters per algorithm used for the comparison against the true labels.
# NOTE(review): presumably read off the metric-vs-k plots above — confirm.
optimal_k = {
    'kmeans': 5,
    'em': 5
}
evaluation_results = experiment.evaluate_clustering(X_train, y_train, optimal_k)

# Fit each algorithm at its chosen k, then tabulate the clusters against the true labels.
kmeans_labels = experiment.kmeans.fit(X_train, optimal_k['kmeans'])
em_labels = experiment.em.fit(X_train, optimal_k['em'])
kmeans_composition = experiment.analyze_cluster_composition(kmeans_labels, y_train)
em_composition = experiment.analyze_cluster_composition(em_labels, y_train)

# NOTE(review): in the recorded run the two tables printed here were identical row
# for row (same percentages and sizes) — worth confirming that the EM fit is not
# simply reproducing the K-Means assignment.
print("\nK-Means Cluster Composition:")
print(kmeans_composition)
print("\nEM Cluster Composition:")
print(em_composition)

# Plot the scores computed above. This is the first point in the notebook where
# `evaluation_results` holds dataset2 values, so the figure saved under the
# dataset2 path must be written from here.
plot_cluster_evaluation(
    evaluation_results,
    dataset=dataset,
    experiment=experiment_name
)


K-Means Cluster Composition:
True_Label          0          1  Size
Cluster                               
0           46.590909  53.409091   352
1           44.759207  55.240793   353
2           55.442177  44.557823   294
3           53.113553  46.886447   273
4           52.054795  47.945205   292

EM Cluster Composition:
True_Label          0          1  Size
Cluster                               
0           46.590909  53.409091   352
1           44.759207  55.240793   353
2           55.442177  44.557823   294
3           53.113553  46.886447   273
4           52.054795  47.945205   292
