In [1]:
import os
# Step up one directory level; the relative imports and paths in later cells
# are resolved from the directory printed below.
# NOTE: this is relative to the current directory, so re-running the cell
# climbs one more level each time — restart the kernel before re-executing.
os.chdir(os.pardir)
print(os.getcwd())

/Users/Placebo/OMSCS/CS7641-ML/MachineLearningProjects/UnsupervisedLearning


In [2]:
import pandas as pd
import numpy as np

In [10]:
%load_ext autoreload
%autoreload 2
from src.experiments.experiment3_combined import CombinedExperiment, find_optimal_combinations, evaluate_clustering
from src.utils.data_loader import load_processed_data, save_csv, load_csv, save_csv, save_pickle, load_pickle
from src.utils.evaluation import find_elbow_indice

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Dataset 1

In [4]:
# Identifiers reused by later cells when building result paths
dataset, experiment_name = 'dataset1', 'experiment3'

# Load the stored split, then append the test portion to the train portion
# along axis 0; from here on X_train / y_train hold both portions.
X_train, X_test, y_train, y_test = load_processed_data(f'data/{dataset}')
X_train, y_train = (
    np.concatenate((X_train, X_test), axis=0),
    np.concatenate((y_train, y_test), axis=0),
)

# Second axis of the feature matrix = number of input features
n_features = X_train.shape[1]
print(f"Number of features: {n_features}")

Number of features: 30


## Understand the optimal number of components from Experiment 2

In [5]:
# --- PCA: first position where cumulative explained variance reaches 95% ---
pca_metrics = load_pickle('results/dataset1/experiment2/pca_metrics.pkl')
explained_variance = pca_metrics['cumulative_explained_variance'][0]
# np.where with a single argument returns a tuple of index arrays;
# [0][0] picks the first matching position along the first axis.
idx = np.where(explained_variance >= 0.95)[0][0]
# NOTE(review): idx is a zero-based array position that is printed as a
# component count — confirm whether an offset is needed for how the metric was stored.
print(f"Number of components that explains 95% of the variance: {idx}")

# --- ICA: elbow of the absolute mean kurtosis curve ---
ica_metrics = load_pickle('results/dataset1/experiment2/ica_metrics.pkl')
abs_mean_kurtosis = ica_metrics['abs_mean_kurtosis']
# NOTE(review): presumably find_elbow_indice returns an index into the curve,
# not a component count — verify against src.utils.evaluation.
idx = find_elbow_indice(abs_mean_kurtosis)
print(f"Number of components that explains the elbow point of the kurtosis: {idx}")

# --- RP: elbow of the mean reconstruction error curve ---
rp_metrics = load_pickle('results/dataset1/experiment2/rp_metrics.pkl')
reconstruction_error = rp_metrics['reconstruction_error_mean']
idx = find_elbow_indice(reconstruction_error)
print(f"Number of components that explains the elbow point of the reconstruction error: {idx}")

Number of components that explains 95% of the variance: 10
Number of components that explains the elbow point of the kurtosis: 18
Number of components that explains the elbow point of the reconstruction error: 15


## Run Experiment

In [6]:
# Run the combined analysis on the merged dataset over the two grids below.
# (What exactly is computed per grid point lives in CombinedExperiment.)
combined_experiment = CombinedExperiment()
kmeans_metrics, em_metrics, transformed_data = combined_experiment.run_combined_analyis(
    X=X_train,
    dr_components=list(range(6, 21, 2)),  # 6, 8, ..., 20
    k_range=list(range(2, 11)),           # 2, 3, ..., 10
)


In [7]:
# save
# Persist both metric tables and the transformed data under the same folder
output_dir = f'results/{dataset}/{experiment_name}'
save_csv(kmeans_metrics, output_dir, 'kmeans_metrics')
save_csv(em_metrics, output_dir, 'em_metrics')
save_pickle(transformed_data, output_dir, 'transformed_data')

Dataframe saved at results/dataset1/experiment3/kmeans_metrics.csv
Dataframe saved at results/dataset1/experiment3/em_metrics.csv
Data saved at results/dataset1/experiment3/transformed_data.pkl


## Result Analysis

In [5]:
# Read the saved metric tables back from disk so this section does not
# depend on the experiment cell having run in the current kernel.
kmeans_metrics, em_metrics = (
    load_csv('results/dataset1/experiment3/kmeans_metrics.csv'),
    load_csv('results/dataset1/experiment3/em_metrics.csv'),
)

# Pick the best combination per (DR method, clusterer) pair and display it
optimal_combinations = find_optimal_combinations(kmeans_metrics, em_metrics)
optimal_combinations


{'ica_kmeans': {'dr_method': 'ica',
  'n_components': np.int64(6),
  'k': np.int64(2),
  'score': np.float64(0.2728713998359047)},
 'pca_kmeans': {'dr_method': 'pca',
  'n_components': np.int64(6),
  'k': np.int64(2),
  'score': np.float64(0.4343992805784901)},
 'rp_kmeans': {'dr_method': 'rp',
  'n_components': np.int64(6),
  'k': np.int64(2),
  'score': np.float64(0.532047658972778)},
 'ica_em': {'dr_method': 'ica',
  'n_components': np.int64(6),
  'k': np.int64(2),
  'score': np.float64(0.2499317320511711)},
 'pca_em': {'dr_method': 'pca',
  'n_components': np.int64(6),
  'k': np.int64(2),
  'score': np.float64(0.4337397202912334)},
 'rp_em': {'dr_method': 'rp',
  'n_components': np.int64(6),
  'k': np.int64(2),
  'score': np.float64(0.4150771849689905)}}

### Supervised metrics

In [11]:
# Reload the transformed data saved by the experiment, then score the selected
# combinations against the labels.
# NOTE: unpickling executes arbitrary code — only load files produced by this project.
transformed_data = load_pickle(
    'results/dataset1/experiment3/transformed_data.pkl'
)
results = evaluate_clustering(
    transformed_data,
    y_train,
    optimal_combinations,
)

In [12]:
# Display the dictionary returned by evaluate_clustering in the previous cell
results

{'ica_kmeans': {'metrics': {'adjusted_rand': 0.1514352733975298,
   'normalized_mutual_info': np.float64(0.0415134595551896),
   'adjusted_mutual_info': np.float64(0.0414827992464123),
   'homogeneity': np.float64(0.04476598846669214),
   'completeness': np.float64(0.03870155000878906),
   'v_measure': np.float64(0.0415134595551896)},
  'composition': col_0         -1          1
  row_0                      
  0      91.275339   8.724661
  1      73.285199  26.714801},
 'pca_kmeans': {'metrics': {'adjusted_rand': 0.11591219516909969,
   'normalized_mutual_info': np.float64(0.07422293225037535),
   'adjusted_mutual_info': np.float64(0.07420035997478884),
   'homogeneity': np.float64(0.10499174589658755),
   'completeness': np.float64(0.057401024780501825),
   'v_measure': np.float64(0.07422293225037535)},
  'composition': col_0         -1          1
  row_0                      
  0      76.543993  23.456007
  1      95.137757   4.862243},
 'rp_kmeans': {'metrics': {'adjusted_rand': 0.1

# Dataset 2

In [15]:
# Identifiers reused by later cells when building result paths
dataset, experiment_name = 'dataset2', 'experiment3'

# Load the stored split, then append the test portion to the train portion
# along axis 0; from here on X_train / y_train hold both portions.
X_train, X_test, y_train, y_test = load_processed_data(f'data/{dataset}')
X_train, y_train = (
    np.concatenate((X_train, X_test), axis=0),
    np.concatenate((y_train, y_test), axis=0),
)

# Second axis of the feature matrix = number of input features
n_features = X_train.shape[1]
print(f"Number of features: {n_features}")

Number of features: 384


## Understand the optimal number of components from Experiment 2

In [10]:
# --- PCA: first position where cumulative explained variance reaches 95% ---
pca_metrics = load_pickle('results/dataset2/experiment2/pca_metrics.pkl')
explained_variance = pca_metrics['cumulative_explained_variance'][0]
# np.where with a single argument returns a tuple of index arrays;
# [0][0] picks the first matching position along the first axis.
idx = np.where(explained_variance >= 0.95)[0][0]
# NOTE(review): idx is a zero-based array position that is printed as a
# component count — confirm whether an offset is needed for how the metric was stored.
print(f"Number of components that explains 95% of the variance: {idx}")

# --- ICA: elbow of the absolute mean kurtosis curve ---
ica_metrics = load_pickle('results/dataset2/experiment2/ica_metrics.pkl')
abs_mean_kurtosis = ica_metrics['abs_mean_kurtosis']
# NOTE(review): presumably find_elbow_indice returns an index into the curve,
# not a component count — verify against src.utils.evaluation.
idx = find_elbow_indice(abs_mean_kurtosis)
print(f"Number of components that explains the elbow point of the kurtosis: {idx}")

# --- RP: elbow of the mean reconstruction error curve ---
rp_metrics = load_pickle('results/dataset2/experiment2/rp_metrics.pkl')
reconstruction_error = rp_metrics['reconstruction_error_mean']
idx = find_elbow_indice(reconstruction_error)
print(f"Number of components that explains the elbow point of the reconstruction error: {idx}")

Number of components that explains 95% of the variance: 145
Number of components that explains the elbow point of the kurtosis: 7
Number of components that explains the elbow point of the reconstruction error: 27


## Run Experiment

In [11]:
# Component grid: 40, 60, ..., 280 (np.arange excludes the stop value 300)
n_components_range = np.arange(40, 300, 20)

# Run the combined analysis on the merged dataset over the grids.
# (What exactly is computed per grid point lives in CombinedExperiment.)
combined_experiment = CombinedExperiment()
kmeans_metrics, em_metrics, transformed_data = combined_experiment.run_combined_analyis(
    X=X_train,
    dr_components=n_components_range,
    k_range=list(range(2, 11)),  # 2, 3, ..., 10
)

# Persist both metric tables and the transformed data under the same folder
output_dir = f'results/{dataset}/{experiment_name}'
save_csv(kmeans_metrics, output_dir, 'kmeans_metrics')
save_csv(em_metrics, output_dir, 'em_metrics')
save_pickle(transformed_data, output_dir, 'transformed_data')




Dataframe saved at results/dataset2/experiment3/kmeans_metrics.csv
Dataframe saved at results/dataset2/experiment3/em_metrics.csv
Data saved at results/dataset2/experiment3/transformed_data.pkl.pkl


## Result Analysis

In [31]:
# Read the saved metric tables back from disk so this section does not
# depend on the experiment cell having run in the current kernel.
kmeans_metrics, em_metrics = (
    load_csv('results/dataset2/experiment3/kmeans_metrics.csv'),
    load_csv('results/dataset2/experiment3/em_metrics.csv'),
)

# Pick the best combination per (DR method, clusterer) pair and display it
optimal_combinations = find_optimal_combinations(kmeans_metrics, em_metrics)
optimal_combinations

{'ica_kmeans': {'dr_method': 'ica',
  'n_components': np.int64(120),
  'k': np.int64(2),
  'score': np.float64(0.05200101)},
 'pca_kmeans': {'dr_method': 'pca',
  'n_components': np.int64(40),
  'k': np.int64(5),
  'score': np.float64(0.31027284)},
 'rp_kmeans': {'dr_method': 'rp',
  'n_components': np.int64(220),
  'k': np.int64(5),
  'score': np.float64(0.19503097)},
 'ica_em': {'dr_method': 'ica',
  'n_components': np.int64(120),
  'k': np.int64(2),
  'score': np.float64(0.05228002)},
 'pca_em': {'dr_method': 'pca',
  'n_components': np.int64(40),
  'k': np.int64(5),
  'score': np.float64(0.3104371)},
 'rp_em': {'dr_method': 'rp',
  'n_components': np.int64(220),
  'k': np.int64(5),
  'score': np.float64(0.19503097)}}

### Supervised metrics

In [33]:
# Reload the transformed data saved by the experiment, then score the selected
# combinations against the labels.
# NOTE: unpickling executes arbitrary code — only load files produced by this project.
# NOTE(review): the recorded output of the save cell shows
# 'transformed_data.pkl.pkl' — confirm the file below is the current one.
transformed_data = load_pickle(
    'results/dataset2/experiment3/transformed_data.pkl'
)
results = evaluate_clustering(
    transformed_data,
    y_train,
    optimal_combinations,
)

# Display the evaluation dictionary
results

{'ica_kmeans': {'metrics': {'adjusted_rand': 0.004989518574043607,
   'normalized_mutual_info': np.float64(0.009313430227058231),
   'adjusted_mutual_info': np.float64(0.008901283402351216),
   'homogeneity': np.float64(0.008273861193597298),
   'completeness': np.float64(0.010651771760000447),
   'v_measure': np.float64(0.00931343022705823)},
  'composition': col_0          0          1
  row_0                      
  0      51.525199  48.474801
  1      38.839286  61.160714},
 'pca_kmeans': {'metrics': {'adjusted_rand': 0.0008325987914871675,
   'normalized_mutual_info': np.float64(0.0016507411041766454),
   'adjusted_mutual_info': np.float64(0.0007595338577092207),
   'homogeneity': np.float64(0.0027360302425331793),
   'completeness': np.float64(0.0011819157556436305),
   'v_measure': np.float64(0.0016507411041766457)},
  'composition': col_0          0          1
  row_0                      
  0      45.413870  54.586130
  1      46.118721  53.881279
  2      50.000000  50.000000