In [1]:
import os
os.chdir('../')
print(os.getcwd())

/Users/Placebo/OMSCS/CS7641-ML/MachineLearningProjects/UnsupervisedLearning


In [34]:
%load_ext autoreload
%autoreload 2
from src.utils.data_loader import load_processed_data, save_csv, save_pickle, load_pickle, save_json, load_json
from src.utils.plotting import plot_metrics_vs_cluster, plot_2d_projection, plot_3d_projection
from src.experiments.experiment2_dimensionality import DimensionalityReductionExperiment
from src.utils.evaluation import find_elbow_indice


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import pandas as pd
import numpy as np

# DataSet 1
## Load data set

In [4]:
dataset = 'dataset1'
experiment_name = 'experiment2'
X_train, X_test,y_train, y_test = load_processed_data(f'data/{dataset}')
X_train = np.concatenate((X_train, X_test), axis=0)
y_train = np.concatenate((y_train, y_test), axis=0)

## Run experiment

In [10]:
dr_experiment = DimensionalityReductionExperiment()

In [11]:
n_features = X_train.shape[1]
print(f"Number of features: {n_features}")

n_components_range = np.arange(10, n_features + 1, 3)
print(f"Number of components: {n_components_range}")

Number of features: 67
Number of components: [10 13 16 19 22 25 28 31 34 37 40 43 46 49 52 55 58 61 64 67]


### PCA

In [12]:
pca_metrics, pca_transformed = dr_experiment.run_pca_analysis(X_train, [n_features])
# save
save_pickle(pca_metrics, f'results/{dataset}/{experiment_name}', 'pca_metrics')
save_pickle(pca_transformed, f'results/{dataset}/{experiment_name}', 'pca_transformed')


Data saved at results/dataset1/experiment2/pca_metrics.pkl
Data saved at results/dataset1/experiment2/pca_transformed.pkl


In [22]:
explained_variance = pca_metrics['cumulative_explained_variance'][0]
idx = np.where(explained_variance >= 0.95)
idx = idx[0][0]
print(f"Number of components that explains 95% of the variance: {idx}")

Number of components that explains 95% of the variance: 26


In [15]:
pca_metrics_df = pd.DataFrame(
    {
        'n_components': range(1, n_features + 1),
        'cumulative_explained_variance': pca_metrics['cumulative_explained_variance'][0]
    }
)
plot_metrics_vs_cluster(
    df=pca_metrics_df,
    metric_col='cumulative_explained_variance',
    k_col='n_components',
    dataset=dataset,
    experiment=experiment_name,
    algo_name='pca',
    y_label='Cumulative Var. Ratio',
    x_label='Number of Components'
)
# plot 2d projection
plot_2d_projection(
    X_transformed=pca_transformed.get(n_features),
    y=y_train,
    dataset=dataset,
    experiment=experiment_name,
    algo_name='pca',
    sample_size=2000
)
# plot 3d projection
plot_3d_projection(
    X_transformed=pca_transformed.get(n_features),
    y=y_train,
    dataset=dataset,
    experiment=experiment_name,
    algo_name='pca',
    sample_size=1000
)

Plot saved at figs/dataset1/experiment2/cumulative_explained_variance_vs_n_components_pca.png
Plot saved at figs/dataset1/experiment2/2d_projection_pca.png
Plot saved at figs/dataset1/experiment2/3d_projection_pca.png


### ICA

In [16]:
ica_metrics, ica_transformed = dr_experiment.run_ica_analysis(X_train, n_components_range)
# save
save_pickle(ica_metrics, f'results/{dataset}/{experiment_name}', 'ica_metrics')
save_pickle(ica_transformed, f'results/{dataset}/{experiment_name}', 'ica_transformed')



Data saved at results/dataset1/experiment2/ica_metrics.pkl
Data saved at results/dataset1/experiment2/ica_transformed.pkl


In [24]:
abs_mean_kurtosis = ica_metrics['abs_mean_kurtosis']
idx = find_elbow_indice(abs_mean_kurtosis)
print(f"Number of components that explains the elbow point of the kurtosis: {idx}")

Number of components that explains the elbow point of the kurtosis: 16


In [21]:
# Plot ICA metrics
plot_metrics_vs_cluster(
    df=ica_metrics,
    metric_col='abs_mean_kurtosis',
    k_col='n_components',
    dataset=dataset,
    experiment=experiment_name,
    algo_name='ica', 
    x_label='Number of Components'
)
# plot 2d projection
plot_2d_projection(
    X_transformed=ica_transformed.get(55),
    y=y_train,
    dataset=dataset,
    experiment=experiment_name,
    algo_name='ica',
    sample_size=2000
)

# plot 3d projection
plot_3d_projection(
    X_transformed=ica_transformed.get(55),
    y=y_train,
    dataset=dataset,
    experiment=experiment_name,
    algo_name='ica',
    sample_size=1000
)

Plot saved at figs/dataset1/experiment2/abs_mean_kurtosis_vs_n_components_ica.png
Plot saved at figs/dataset1/experiment2/2d_projection_ica.png
Plot saved at figs/dataset1/experiment2/3d_projection_ica.png


### RP

In [18]:
rp_metrics, rp_transformed = dr_experiment.run_rp_analysis(X_train, n_components_range)
# save
save_pickle(rp_metrics, f'results/{dataset}/{experiment_name}', 'rp_metrics')
save_pickle(rp_transformed, f'results/{dataset}/{experiment_name}', 'rp_transformed')

Data saved at results/dataset1/experiment2/rp_metrics.pkl
Data saved at results/dataset1/experiment2/rp_transformed.pkl


In [27]:
# RP: find the elbow point of the reconstruction error
rp_metrics = load_pickle('results/dataset1/experiment2/rp_metrics.pkl')
reconstruction_error = rp_metrics['reconstruction_error_mean']
idx = find_elbow_indice(reconstruction_error)
print(f"Number of components that explains the elbow point of the reconstruction error: {idx}")

Number of components that explains the elbow point of the reconstruction error: 3


In [28]:
# Plot RP metrics
plot_metrics_vs_cluster(
    df=rp_metrics,
    metric_col='reconstruction_error_mean',
    k_col='n_components',
    dataset=dataset,
    experiment=experiment_name,
    algo_name='rp',
    x_label='Number of Components'
)
# plot 2d projection
plot_2d_projection(
    X_transformed=rp_transformed.get(49),
    y=y_train,
    dataset=dataset,
    experiment=experiment_name,
    algo_name='rp',
    sample_size=2000
)
# plot 3d projection
plot_3d_projection(
    X_transformed=rp_transformed.get(49),
    y=y_train,
    dataset=dataset,
    experiment=experiment_name,
    algo_name='rp',
    sample_size=1000
)

Plot saved at figs/dataset1/experiment2/reconstruction_error_mean_vs_n_components_rp.png
Plot saved at figs/dataset1/experiment2/2d_projection_rp.png
Plot saved at figs/dataset1/experiment2/3d_projection_rp.png


### UMAP

In [35]:
umap_metrics, umap_transformed = dr_experiment.run_umap_analysis(X_train, [20])
# save
save_pickle(umap_metrics, f'results/{dataset}/{experiment_name}', 'umap_metrics')
save_pickle(umap_transformed, f'results/{dataset}/{experiment_name}', 'umap_transformed')

KeyboardInterrupt: 

In [31]:
# plot
plot_metrics_vs_cluster(
    df=umap_metrics,
    metric_col='trustworthiness',
    k_col='n_components',
    dataset=dataset,
    experiment=experiment_name,
    algo_name='umap',
    x_label='Number of Components'
)

# plot 2d projection
plot_2d_projection(
    X_transformed=umap_transformed.get(20),
    y=y_train,
    dataset=dataset,
    experiment=experiment_name,
    algo_name='umap',
    sample_size=2000
)

# plot 3d projection
plot_3d_projection(
    X_transformed=umap_transformed.get(20),
    y=y_train,
    dataset=dataset,
    experiment=experiment_name,
    algo_name='umap',
    sample_size=1000
)

NameError: name 'umap_metrics' is not defined

# DataSet 2
## Load data set

In [6]:
dataset = 'dataset2'
experiment_name = 'experiment2'
X_train, X_test,y_train, y_test = load_processed_data(f'data/{dataset}')
X_train = np.concatenate((X_train, X_test), axis=0)
y_train = np.concatenate((y_train, y_test), axis=0)

## Run experiment

In [16]:
dr_experiment = DimensionalityReductionExperiment()


In [17]:
n_features = X_train.shape[1]
print(f"Number of features: {n_features}")

n_components_range = np.arange(10, n_features + 1, 10)
print(f"Number of components: {n_components_range}")

Number of features: 384
Number of components: [ 10  20  30  40  50  60  70  80  90 100 110 120 130 140 150 160 170 180
 190 200 210 220 230 240 250 260 270 280 290 300 310 320 330 340 350 360
 370 380]


### PCA

In [18]:
pca_metrics, pca_transformed = dr_experiment.run_pca_analysis(X_train, [n_features])
# save
save_pickle(pca_metrics, f'results/{dataset}/{experiment_name}', 'pca_metrics')
save_pickle(pca_transformed, f'results/{dataset}/{experiment_name}', 'pca_transformed')
pca_metrics_df = pd.DataFrame(
    {
        'n_components': range(1, n_features + 1),
        'cumulative_explained_variance': pca_metrics['cumulative_explained_variance'][0]
    }
)
plot_metrics_vs_cluster(
    df=pca_metrics_df,
    metric_col='cumulative_explained_variance',
    k_col='n_components',
    dataset=dataset,
    experiment=experiment_name,
    algo_name='pca',
    y_label='Cumulative Var. Ratio',
    x_label='Number of Components'
)

# plot 2d projection
plot_2d_projection(
    X_transformed=pca_transformed.get(n_features),
    y=y_train,
    dataset=dataset,
    experiment=experiment_name,
    algo_name='pca',
    sample_size=1000
)
# plot 3d projection
plot_3d_projection(
    X_transformed=pca_transformed.get(n_features),
    y=y_train,
    dataset=dataset,
    experiment=experiment_name,
    algo_name='pca',
    sample_size=1000
)

Data saved at results/dataset2/experiment2/pca_metrics.pkl
Data saved at results/dataset2/experiment2/pca_transformed.pkl
Plot saved at figs/dataset2/experiment2/cumulative_explained_variance_vs_n_components_pca.png
Plot saved at figs/dataset2/experiment2/2d_projection_pca.png
Plot saved at figs/dataset2/experiment2/3d_projection_pca.png


### ICA

In [None]:
ica_metrics, ica_transformed = dr_experiment.run_ica_analysis(X_train, n_components_range)
# save
save_pickle(ica_metrics, f'results/{dataset}/{experiment_name}', 'ica_metrics')
save_pickle(ica_transformed, f'results/{dataset}/{experiment_name}', 'ica_transformed')

In [12]:
ica_metrics = load_pickle('results/dataset2/experiment2/ica_metrics.pkl')
ica_metrics

Unnamed: 0,n_components,kurtosis_values,mean_kurtosis,abs_mean_kurtosis,explained_variance_ratio,cumulative_explained_variance,n_iter
0,10,"[0.4567809, 3.51343, 0.13892603, -0.043465853,...",1.643678,1.799506,"[0.10000000894069672, 0.10000000149011612, 0.0...","[0.10000000894069672, 0.20000001788139343, 0.3...",1000
1,20,"[2.564324, 6.7196264, 1.3655548, 4.278918, 2.8...",2.812381,2.903326,"[0.05000000447034836, 0.05000000447034836, 0.0...","[0.05000000447034836, 0.10000000894069672, 0.1...",1000
2,30,"[2.0930786, 3.9649053, 5.6955805, 1.5950851, 3...",3.721171,3.749579,"[0.03333333507180214, 0.03333333879709244, 0.0...","[0.03333333507180214, 0.06666667759418488, 0.1...",1000
3,40,"[-0.63864374, 2.0598216, 7.9935474, 2.353959, ...",5.643517,5.675448,"[0.02499999850988388, 0.02500000037252903, 0.0...","[0.02499999850988388, 0.04999999701976776, 0.0...",1000
4,50,"[3.0914626, 8.844305, 5.6800814, 1.8785439, 5....",6.034774,6.034774,"[0.019999997690320015, 0.019999999552965164, 0...","[0.019999997690320015, 0.03999999910593033, 0....",78
5,60,"[3.7257648, 5.452957, 1.1105886, 9.522581, 2.3...",7.181997,7.181997,"[0.01666666753590107, 0.01666666753590107, 0.0...","[0.01666666753590107, 0.03333333507180214, 0.0...",96
6,70,"[3.4515314, 7.0619745, 5.1533566, 9.009675, 2....",7.703835,7.703835,"[0.014285714365541935, 0.014285716228187084, 0...","[0.014285714365541935, 0.02857143059372902, 0....",106
7,80,"[4.6960936, 9.882365, 9.569339, 5.105257, 2.89...",8.22998,8.22998,"[0.01250000111758709, 0.012500000186264515, 0....","[0.01250000111758709, 0.02500000223517418, 0.0...",143
8,90,"[7.0119057, 5.4753866, 3.2037406, 3.2083664, 4...",9.072213,9.072213,"[0.01111111231148243, 0.01111111231148243, 0.0...","[0.01111111231148243, 0.02222222462296486, 0.0...",163
9,100,"[12.291693, 2.8731594, 2.517136, 3.1923223, 9....",9.767244,9.767244,"[0.009999999776482582, 0.009999999776482582, 0...","[0.009999999776482582, 0.019999999552965164, 0...",137


In [17]:
def _count_kurtosis(kursttosis: np.ndarray, threshold=3) -> int:
    return np.sum(np.abs(kursttosis) > threshold)


ica_metrics['count'] = ica_metrics['kurtosis_values'].apply(lambda x: _count_kurtosis(x, 3))
ica_metrics

Unnamed: 0,n_components,kurtosis_values,mean_kurtosis,abs_mean_kurtosis,explained_variance_ratio,cumulative_explained_variance,n_iter,count
0,10,"[0.4567809, 3.51343, 0.13892603, -0.043465853,...",1.643678,1.799506,"[0.10000000894069672, 0.10000000149011612, 0.0...","[0.10000000894069672, 0.20000001788139343, 0.3...",1000,2
1,20,"[2.564324, 6.7196264, 1.3655548, 4.278918, 2.8...",2.812381,2.903326,"[0.05000000447034836, 0.05000000447034836, 0.0...","[0.05000000447034836, 0.10000000894069672, 0.1...",1000,10
2,30,"[2.0930786, 3.9649053, 5.6955805, 1.5950851, 3...",3.721171,3.749579,"[0.03333333507180214, 0.03333333879709244, 0.0...","[0.03333333507180214, 0.06666667759418488, 0.1...",1000,17
3,40,"[-0.63864374, 2.0598216, 7.9935474, 2.353959, ...",5.643517,5.675448,"[0.02499999850988388, 0.02500000037252903, 0.0...","[0.02499999850988388, 0.04999999701976776, 0.0...",1000,21
4,50,"[3.0914626, 8.844305, 5.6800814, 1.8785439, 5....",6.034774,6.034774,"[0.019999997690320015, 0.019999999552965164, 0...","[0.019999997690320015, 0.03999999910593033, 0....",78,29
5,60,"[3.7257648, 5.452957, 1.1105886, 9.522581, 2.3...",7.181997,7.181997,"[0.01666666753590107, 0.01666666753590107, 0.0...","[0.01666666753590107, 0.03333333507180214, 0.0...",96,41
6,70,"[3.4515314, 7.0619745, 5.1533566, 9.009675, 2....",7.703835,7.703835,"[0.014285714365541935, 0.014285716228187084, 0...","[0.014285714365541935, 0.02857143059372902, 0....",106,56
7,80,"[4.6960936, 9.882365, 9.569339, 5.105257, 2.89...",8.22998,8.22998,"[0.01250000111758709, 0.012500000186264515, 0....","[0.01250000111758709, 0.02500000223517418, 0.0...",143,65
8,90,"[7.0119057, 5.4753866, 3.2037406, 3.2083664, 4...",9.072213,9.072213,"[0.01111111231148243, 0.01111111231148243, 0.0...","[0.01111111231148243, 0.02222222462296486, 0.0...",163,77
9,100,"[12.291693, 2.8731594, 2.517136, 3.1923223, 9....",9.767244,9.767244,"[0.009999999776482582, 0.009999999776482582, 0...","[0.009999999776482582, 0.019999999552965164, 0...",137,91


In [21]:
# Plot ICA metrics
plot_metrics_vs_cluster(
    df=ica_metrics,
    metric_col='abs_mean_kurtosis',
    k_col='n_components',
    dataset=dataset,
    experiment=experiment_name,
    algo_name='ica', 
    x_label='Number of Components'
)

Plot saved at figs/dataset2/experiment2/abs_mean_kurtosis_vs_n_components_ica.png


### RP

In [22]:
rp_metrics, rp_transformed = dr_experiment.run_rp_analysis(X_train, n_components_range)
# save
save_pickle(rp_metrics, f'results/{dataset}/{experiment_name}', 'rp_metrics')
save_pickle(rp_transformed, f'results/{dataset}/{experiment_name}', 'rp_transformed')

Data saved at results/dataset2/experiment2/rp_metrics.pkl
Data saved at results/dataset2/experiment2/rp_transformed.pkl


In [23]:
# Plot RP metrics
plot_metrics_vs_cluster(
    df=rp_metrics,
    metric_col='reconstruction_error_mean',
    k_col='n_components',
    dataset=dataset,
    experiment=experiment_name,
    algo_name='rp',
    x_label='Number of Components'
)
# plot 2d projection
plot_2d_projection(
    X_transformed=rp_transformed.get(30),
    y=y_train,
    dataset=dataset,
    experiment=experiment_name,
    algo_name='rp',
    sample_size=1000
)
# plot 3d projection
plot_3d_projection(
    X_transformed=rp_transformed.get(30),
    y=y_train,
    dataset=dataset,
    experiment=experiment_name,
    algo_name='rp',
    sample_size=1000
)

Plot saved at figs/dataset2/experiment2/reconstruction_error_mean_vs_n_components_rp.png
Plot saved at figs/dataset2/experiment2/2d_projection_rp.png
Plot saved at figs/dataset2/experiment2/3d_projection_rp.png
