In [1]:
import os
from random import sample

# Move the working directory one level up from where the kernel started, so that the
# relative paths used throughout this notebook (`data/...`, `results/...`) and the
# `src.*` imports are resolved from there.
# The target is computed only once per kernel: re-running this cell used to climb one
# more directory each time, silently breaking every relative path that follows.
if '_PROJECT_ROOT' not in globals():
    _PROJECT_ROOT = os.path.abspath('..')
os.chdir(_PROJECT_ROOT)
print(os.getcwd())

/Users/Placebo/OMSCS/CS7641-ML/MachineLearningProjects/UnsupervisedLearning


In [2]:
import pandas as pd
import numpy as np

In [38]:
%load_ext autoreload
%autoreload 2
from src.experiments.experiment3_combined import CombinedExperiment, find_optimal_combinations, evaluate_clustering, run_combined_cluster
from src.utils.data_loader import load_processed_data, save_csv, load_csv, save_csv, save_pickle, load_pickle
from src.utils.evaluation import find_elbow_indice
from src.utils.plotting import plot_metrics_vs_cluster, plot_2d_projection, plot_3d_projection, visualize_clusters_tsne


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Dataset 1

In [63]:
# Names of the dataset and experiment; the result paths in later cells are built from them.
dataset = 'dataset1'
experiment_name = 'experiment3'

# Load the processed splits, then merge train and test back into single arrays
# (kept under the X_train / y_train names that the rest of the notebook uses).
X_train, X_test, y_train, y_test = load_processed_data(f'data/{dataset}')
X_train, y_train = (
    np.concatenate(pair, axis=0)
    for pair in ((X_train, X_test), (y_train, y_test))
)

n_features = X_train.shape[1]
print(f"Number of features: {n_features}")

Number of features: 67


## Understand the optimal number of components from Experiment 2

In [7]:
# --- PCA: first position where the cumulative explained variance reaches 95% ---
# NOTE(review): `idx` is a 0-based array position, not necessarily a component count;
# how the curve was stored by Experiment 2 is not visible here — confirm before using it.
pca_metrics = load_pickle('results/dataset1/experiment2/pca_metrics.pkl')
explained_variance = pca_metrics['cumulative_explained_variance'][0]
idx = np.where(explained_variance >= 0.95)[0][0]
print(f"Number of components that explains 95% of the variance: {idx}")

# --- ICA: elbow of the absolute mean kurtosis curve ---
ica_metrics = load_pickle('results/dataset1/experiment2/ica_metrics.pkl')
abs_mean_kurtosis = ica_metrics['abs_mean_kurtosis']
idx = find_elbow_indice(abs_mean_kurtosis)
print(f"Number of components that explains the elbow point of the kurtosis: {idx}")

# --- RP: elbow of the mean reconstruction error curve ---
rp_metrics = load_pickle('results/dataset1/experiment2/rp_metrics.pkl')
reconstruction_error = rp_metrics['reconstruction_error_mean']
idx = find_elbow_indice(reconstruction_error)
print(f"Number of components that explains the elbow point of the reconstruction error: {idx}")

Number of components that explains 95% of the variance: 26
Number of components that explains the elbow point of the kurtosis: 16
Number of components that explains the elbow point of the reconstruction error: 3


## Run Experiment

In [7]:
# Component counts 22, 25, ..., up to n_features and cluster counts k = 2..10 are handed
# to the combined analysis, which returns the K-Means metrics, the EM metrics and the
# transformed data.
combined_experiment = CombinedExperiment()
kmeans_metrics, em_metrics, transformed_data = combined_experiment.run_combined_analyis(
    X=X_train,
    dr_components=np.arange(22, n_features + 1, 3),
    k_range=list(range(2, 11)),
)




In [8]:
# Persist the sweep outputs under results/<dataset>/<experiment>.
results_dir = f'results/{dataset}/{experiment_name}'
save_csv(kmeans_metrics, results_dir, 'kmeans_metrics')
save_csv(em_metrics, results_dir, 'em_metrics')
save_pickle(transformed_data, results_dir, 'transformed_data')

Dataframe saved at results/dataset1/experiment3/kmeans_metrics.csv
Dataframe saved at results/dataset1/experiment3/em_metrics.csv
Data saved at results/dataset1/experiment3/transformed_data.pkl


## Result Analysis

In [9]:
# Display the grid of component counts that was passed to the combined analysis.
np.arange(start=22, stop=n_features + 1, step=3)

array([22, 25, 28, 31, 34, 37, 40, 43, 46, 49, 52, 55, 58, 61, 64, 67])

In [26]:
# Reload the saved sweep outputs so this section can run without repeating the sweep.
transformed_data = load_pickle('results/dataset1/experiment3/transformed_data.pkl')
kmeans_metrics = load_csv('results/dataset1/experiment3/kmeans_metrics.csv')
em_metrics = load_csv('results/dataset1/experiment3/em_metrics.csv')

# Best (n_components, k) per reduction method and clusterer; shown as the cell output.
optimal_combinations = find_optimal_combinations(kmeans_metrics, em_metrics)
optimal_combinations


{'ica_kmeans': {'dr_method': 'ica',
  'n_components': np.int64(61),
  'k': np.int64(2),
  'score': np.float64(0.59982926024279)},
 'pca_kmeans': {'dr_method': 'pca',
  'n_components': np.int64(22),
  'k': np.int64(2),
  'score': np.float64(0.2883938372274702)},
 'rp_kmeans': {'dr_method': 'rp',
  'n_components': np.int64(25),
  'k': np.int64(2),
  'score': np.float64(0.2782360637093942)},
 'ica_em': {'dr_method': 'ica',
  'n_components': np.int64(61),
  'k': np.int64(2),
  'score': np.float64(0.59982926024279)},
 'pca_em': {'dr_method': 'pca',
  'n_components': np.int64(22),
  'k': np.int64(2),
  'score': np.float64(0.2885203014062371)},
 'rp_em': {'dr_method': 'rp',
  'n_components': np.int64(25),
  'k': np.int64(2),
  'score': np.float64(0.2770328655134566)}}

#### Use the results from Experiment 2
The numbers of components for PCA, ICA, and RP are 25, 55, and 37, respectively.

In [64]:
transformed_data = load_pickle('results/dataset1/experiment3/transformed_data.pkl')

# Pick one projection per reduction method: PCA at 25, ICA at 55 and RP at 37 components.
pca_transformed_data = transformed_data['pca'][25]
ica_transformed_data = transformed_data['ica'][55]
rp_transformed_data = transformed_data['rp'][37]

# One entry per reduction method: its short name and the matching projection.
config = [
    {'rd': rd_name, 'X': projection}
    for rd_name, projection in (
        ('pca', pca_transformed_data),
        ('ica', ica_transformed_data),
        ('rp', rp_transformed_data),
    )
]

# Cluster every projection with k = 2; y_train is passed along with the config.
metrics, labels, external_metrics = run_combined_cluster(config, [2], y_train)


In [44]:
# Show the silhouette / Calinski-Harabasz scores for each clusterer and reduction method.
metrics

Unnamed: 0,k,silhouette_score,calinski_harabasz_score,method,rd_method
0,2,0.282563,14474.691194,kmeans,pca
1,2,0.282513,14473.416219,em,pca
2,2,0.583054,762.703704,kmeans,ica
3,2,0.583054,762.703704,em,ica
4,2,0.25116,11534.515714,kmeans,rp
5,2,0.241779,10939.99738,em,rp


In [42]:
# Show the label-agreement scores (ARI, NMI, AMI, homogeneity, completeness);
# presumably computed against the y_train passed to run_combined_cluster — confirm.
external_metrics

Unnamed: 0,adjusted_rand,normalized_mutual_info,adjusted_mutual_info,homogeneity,completeness,method,rd_method
0,0.131564,0.080416,0.080393,0.112471,0.06258,kmeans,pca
1,0.131391,0.080286,0.080264,0.112295,0.062478,em,pca
2,0.016098,0.007303,0.007237,0.003831,0.077805,kmeans,ica
3,0.016098,0.007303,0.007237,0.003831,0.077805,em,ica
4,0.133033,0.08746,0.087438,0.123093,0.067826,kmeans,rp
5,0.127697,0.082334,0.082312,0.11594,0.063832,em,rp


In [65]:
# Persist the combined-clustering outputs under results/<dataset>/<experiment>.
results_dir = f'results/{dataset}/{experiment_name}'
save_csv(metrics, results_dir, 'combined_metrics')
save_pickle(labels, results_dir, 'combined_labels')
save_csv(external_metrics, results_dir, 'combined_external_metrics')

Dataframe saved at results/dataset1/experiment3/combined_metrics.csv
Data saved at results/dataset1/experiment3/combined_labels.pkl
Dataframe saved at results/dataset1/experiment3/combined_external_metrics.csv


In [59]:
# Reload the cluster labels for the dataset currently being analysed. The path is built
# from `dataset` / `experiment_name`, matching where the save cell wrote them; the
# previous hard-coded 'dataset2' path made the Dataset 1 plots below use labels from
# the other dataset.
labels = load_pickle(f'results/{dataset}/{experiment_name}/combined_labels.pkl')


np.int32(27683)

In [22]:
# Show the internal clustering scores again for reference next to the plots below.
metrics

Unnamed: 0,k,silhouette_score,calinski_harabasz_score,method,rd_method
0,2,0.282563,14474.691194,kmeans,pca
1,2,0.282513,14473.416219,em,pca
2,2,0.583054,762.703704,kmeans,ica
3,2,0.583054,762.703704,em,ica
4,2,0.25116,11534.515714,kmeans,rp
5,2,0.241779,10939.99738,em,rp


In [23]:
# t-SNE plots of the cluster assignments for every (clusterer, reduction) pair.
# The six calls differed only in the projection and the label set, so they are generated
# from a table; the order is unchanged: kmeans x (pca, ica, rp), then em x (pca, ica, rp).
projections = (
    ('pca', pca_transformed_data),
    ('ica', ica_transformed_data),
    ('rp', rp_transformed_data),
)
for clusterer in ('kmeans', 'em'):
    for rd_name, projection in projections:
        visualize_clusters_tsne(
            X=projection,
            labels=labels[rd_name][clusterer],
            dataset=dataset,
            experiment=experiment_name,
            algorithm=f'{clusterer}_{rd_name}',  # e.g. 'kmeans_pca'; ends up in the saved file name
            random_state=17,
            sample_size=2000,
        )


Plot saved at figs/dataset1/experiment3/tsne_visualization_kmeans_pca.png
Plot saved at figs/dataset1/experiment3/tsne_visualization_kmeans_ica.png
Plot saved at figs/dataset1/experiment3/tsne_visualization_kmeans_rp.png
Plot saved at figs/dataset1/experiment3/tsne_visualization_em_pca.png
Plot saved at figs/dataset1/experiment3/tsne_visualization_em_ica.png
Plot saved at figs/dataset1/experiment3/tsne_visualization_em_rp.png


In [24]:
# t-SNE plots of each projection coloured by the true labels (y_train), for comparison
# with the cluster plots. One call per reduction method, in the order pca, ica, rp.
for rd_name, projection in (
    ('pca', pca_transformed_data),
    ('ica', ica_transformed_data),
    ('rp', rp_transformed_data),
):
    visualize_clusters_tsne(
        X=projection,
        labels=y_train,
        dataset=dataset,
        experiment=experiment_name,
        algorithm=f'{rd_name}_true',  # e.g. 'pca_true'; ends up in the saved file name
        random_state=17,
        sample_size=2000,
    )


Plot saved at figs/dataset1/experiment3/tsne_visualization_pca_true.png
Plot saved at figs/dataset1/experiment3/tsne_visualization_ica_true.png
Plot saved at figs/dataset1/experiment3/tsne_visualization_rp_true.png


# Dataset 2

In [67]:
# Switch the notebook to the second dataset; later result paths are built from these names.
dataset = 'dataset2'
experiment_name = 'experiment3'

# Load the processed splits and merge train and test into single arrays
# (stored back under the X_train / y_train names).
X_train, X_test, y_train, y_test = load_processed_data(f'data/{dataset}')
X_train, y_train = (
    np.concatenate(pair, axis=0)
    for pair in ((X_train, X_test), (y_train, y_test))
)

n_features = X_train.shape[1]
print(f"Number of features: {n_features}")

Number of features: 384


## Understand the optimal number of components from Experiment 2

In [8]:
# --- PCA: first position where the cumulative explained variance reaches 95% ---
# NOTE(review): `idx` is a 0-based array position, not necessarily a component count;
# how the curve was stored by Experiment 2 is not visible here — confirm before using it.
pca_metrics = load_pickle('results/dataset2/experiment2/pca_metrics.pkl')
explained_variance = pca_metrics['cumulative_explained_variance'][0]
idx = np.where(explained_variance >= 0.95)[0][0]
print(f"Number of components that explains 95% of the variance: {idx}")

# --- ICA: elbow of the absolute mean kurtosis curve ---
ica_metrics = load_pickle('results/dataset2/experiment2/ica_metrics.pkl')
abs_mean_kurtosis = ica_metrics['abs_mean_kurtosis']
idx = find_elbow_indice(abs_mean_kurtosis)
print(f"Number of components that explains the elbow point of the kurtosis: {idx}")

# --- RP: elbow of the mean reconstruction error curve ---
rp_metrics = load_pickle('results/dataset2/experiment2/rp_metrics.pkl')
reconstruction_error = rp_metrics['reconstruction_error_mean']
idx = find_elbow_indice(reconstruction_error)
print(f"Number of components that explains the elbow point of the reconstruction error: {idx}")

Number of components that explains 95% of the variance: 145
Number of components that explains the elbow point of the kurtosis: 33
Number of components that explains the elbow point of the reconstruction error: 2


## Run Experiment

In [11]:
# Component counts 40, 60, ..., 280 and cluster counts k = 2..10 are handed to the
# combined analysis, which returns the K-Means metrics, the EM metrics and the
# transformed data.
combined_experiment = CombinedExperiment()
n_components_range = np.arange(40, 300, 20)
kmeans_metrics, em_metrics, transformed_data = combined_experiment.run_combined_analyis(
    X=X_train,
    dr_components=n_components_range,
    k_range=list(range(2, 11)),
)

# Persist the sweep outputs under results/<dataset>/<experiment>.
results_dir = f'results/{dataset}/{experiment_name}'
save_csv(kmeans_metrics, results_dir, 'kmeans_metrics')
save_csv(em_metrics, results_dir, 'em_metrics')
save_pickle(transformed_data, results_dir, 'transformed_data')




Dataframe saved at results/dataset2/experiment3/kmeans_metrics.csv
Dataframe saved at results/dataset2/experiment3/em_metrics.csv
Data saved at results/dataset2/experiment3/transformed_data.pkl.pkl


## Result Analysis

In [31]:
# Reload the saved metric tables so this section can run without repeating the sweep.
kmeans_metrics = load_csv('results/dataset2/experiment3/kmeans_metrics.csv')
em_metrics = load_csv('results/dataset2/experiment3/em_metrics.csv')

# Best (n_components, k) per reduction method and clusterer; shown as the cell output.
optimal_combinations = find_optimal_combinations(kmeans_metrics, em_metrics)
optimal_combinations

{'ica_kmeans': {'dr_method': 'ica',
  'n_components': np.int64(120),
  'k': np.int64(2),
  'score': np.float64(0.05200101)},
 'pca_kmeans': {'dr_method': 'pca',
  'n_components': np.int64(40),
  'k': np.int64(5),
  'score': np.float64(0.31027284)},
 'rp_kmeans': {'dr_method': 'rp',
  'n_components': np.int64(220),
  'k': np.int64(5),
  'score': np.float64(0.19503097)},
 'ica_em': {'dr_method': 'ica',
  'n_components': np.int64(120),
  'k': np.int64(2),
  'score': np.float64(0.05228002)},
 'pca_em': {'dr_method': 'pca',
  'n_components': np.int64(40),
  'k': np.int64(5),
  'score': np.float64(0.3104371)},
 'rp_em': {'dr_method': 'rp',
  'n_components': np.int64(220),
  'k': np.int64(5),
  'score': np.float64(0.19503097)}}

### Use the results from Experiment 2
The numbers of components for PCA, ICA, and RP are 145, 145, and 145, respectively.

In [71]:
# Re-run the combined analysis for a single setting: 145 components and k = 5.
# This overwrites kmeans_metrics, em_metrics and transformed_data from the sweep.
combined_experiment = CombinedExperiment()
n_components_range = [145]
kmeans_metrics, em_metrics, transformed_data = combined_experiment.run_combined_analyis(
    X=X_train,
    dr_components=n_components_range,
    k_range=[5],
)

In [72]:
# Take the 145-component projection of each reduction method.
pca_transformed_data = transformed_data['pca'][145]
ica_transformed_data = transformed_data['ica'][145]
rp_transformed_data = transformed_data['rp'][145]

# One entry per reduction method: its short name and the matching projection.
config = [
    {'rd': rd_name, 'X': projection}
    for rd_name, projection in (
        ('pca', pca_transformed_data),
        ('ica', ica_transformed_data),
        ('rp', rp_transformed_data),
    )
]

# Cluster every projection with k = 5; y_train is passed along with the config.
metrics, labels, external_metrics = run_combined_cluster(config, [5], y_train)

# Persist the combined-clustering outputs under results/<dataset>/<experiment>.
results_dir = f'results/{dataset}/{experiment_name}'
save_csv(metrics, results_dir, 'combined_metrics')
save_pickle(labels, results_dir, 'combined_labels')
save_csv(external_metrics, results_dir, 'combined_external_metrics')

Dataframe saved at results/dataset2/experiment3/combined_metrics.csv
Data saved at results/dataset2/experiment3/combined_labels.pkl
Dataframe saved at results/dataset2/experiment3/combined_external_metrics.csv


In [73]:
# Show the silhouette / Calinski-Harabasz scores for each clusterer and reduction method.
metrics

Unnamed: 0,k,silhouette_score,calinski_harabasz_score,method,rd_method
0,5,0.220943,305.840454,kmeans,pca
1,5,0.220943,305.840454,em,pca
2,5,-0.008717,8.926818,kmeans,ica
3,5,-0.00588,8.868317,em,ica
4,5,0.193176,274.370483,kmeans,rp
5,5,0.193176,274.370483,em,rp


In [74]:
# Show the label-agreement scores (ARI, NMI, AMI, homogeneity, completeness);
# presumably computed against the y_train passed to run_combined_cluster — confirm.
external_metrics

Unnamed: 0,adjusted_rand,normalized_mutual_info,adjusted_mutual_info,homogeneity,completeness,method,rd_method
0,3.9e-05,9e-05,-0.000351,7.5e-05,0.000111,kmeans,pca
1,3.9e-05,9.1e-05,-0.000349,7.6e-05,0.000112,em,pca
2,0.017688,0.01341,0.01303,0.012876,0.01399,kmeans,ica
3,0.02236,0.018234,0.017847,0.017099,0.01953,em,ica
4,-0.000518,1e-05,-0.000365,1e-05,1e-05,kmeans,rp
5,-0.000513,6e-06,-0.00037,5e-06,6e-06,em,rp


In [57]:
# plot
visualize_clusters_tsne(
    X = pca_transformed_data,
    labels = labels['pca']['kmeans'],
    dataset = dataset,
    experiment = experiment_name,
    algorithm='kmeans_pca',
    random_state=17,
    sample_size=2000
)
visualize_clusters_tsne(
    X = ica_transformed_data,
    labels = labels['ica']['kmeans'],
    dataset = dataset,
    experiment = experiment_name,
    algorithm='kmeans_ica',
    random_state=17,
    sample_size=2000
)
visualize_clusters_tsne(
    X = rp_transformed_data,
    labels = labels['rp']['kmeans'],
    dataset = dataset,
    experiment = experiment_name,
    algorithm='kmeans_rp',
    random_state=17,
    sample_size=2000
)
visualize_clusters_tsne(
    X = pca_transformed_data,
    labels = labels['pca']['em'],
    dataset = dataset,
    experiment = experiment_name,
    algorithm='em_pca',
    random_state=17,
    sample_size=2000
)
visualize_clusters_tsne(
    X = ica_transformed_data,
    labels = labels['ica']['em'],
    dataset = dataset,
    experiment = experiment_name,
    algorithm='em_ica',
    random_state=17,
    sample_size=2000
)
visualize_clusters_tsne(
    X = rp_transformed_data,
    labels = labels['rp']['em'],
    dataset = dataset,
    experiment = experiment_name,
    algorithm='em_rp',
    random_state=17,
    sample_size=2000
)
# true labels
visualize_clusters_tsne(
    X = pca_transformed_data,
    labels = y_train,
    dataset = dataset,
    experiment = experiment_name,
    algorithm='pca_true',
    random_state=17,
    sample_size=2000
)
visualize_clusters_tsne(
    X = ica_transformed_data,
    labels = y_train,
    dataset = dataset,
    experiment = experiment_name,
    algorithm='ica_true',
    random_state=17,
    sample_size=2000
)
visualize_clusters_tsne(
    X = rp_transformed_data,
    labels = y_train,
    dataset = dataset,
    experiment = experiment_name,
    algorithm='rp_true',
    random_state=17,
    sample_size=2000
)

Plot saved at figs/dataset2/experiment3/tsne_visualization_kmeans_pca.png
Plot saved at figs/dataset2/experiment3/tsne_visualization_kmeans_ica.png
Plot saved at figs/dataset2/experiment3/tsne_visualization_kmeans_rp.png
Plot saved at figs/dataset2/experiment3/tsne_visualization_em_pca.png
Plot saved at figs/dataset2/experiment3/tsne_visualization_em_ica.png
Plot saved at figs/dataset2/experiment3/tsne_visualization_em_rp.png
Plot saved at figs/dataset2/experiment3/tsne_visualization_pca_true.png
Plot saved at figs/dataset2/experiment3/tsne_visualization_ica_true.png
Plot saved at figs/dataset2/experiment3/tsne_visualization_rp_true.png
