In [1]:
import os

# Move up to the project root, but only when we are not already there.
# An unconditional os.chdir('../') climbs one more directory every time this
# cell is re-run, which silently breaks the relative `src`, `data/` and
# `results/` paths used by the cells below.
# The root is recognised by its `src` package (imported in the next cell);
# assumes the notebook's own folder does not contain a `src` directory.
if not os.path.isdir('src'):
    os.chdir('../')
print(os.getcwd())

/Users/Placebo/OMSCS/CS7641-ML/MachineLearningProjects/UnsupervisedLearning


In [2]:
%load_ext autoreload
%autoreload 2
from src.utils.data_loader import load_processed_data, save_csv, save_pickle, load_pickle, save_json, load_json
from src.utils.plotting import plot_metrics_vs_cluster, plot_2d_projection, plot_3d_projection
from src.experiments.experiment2_dimensionality import DimensionalityReductionExperiment


In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# DataSet 1
## Load data set

In [4]:
# Identifiers that select the input folder and name the results sub-folder
# used by the cells below.
dataset = 'dataset1'
experiment_name = 'experiment2'

# Load the four processed splits, then pool train and test row-wise.
# From here on X_train / y_train hold the combined data, not only the
# training split.
X_train, X_test, y_train, y_test = load_processed_data(f'data/{dataset}')
X_train, y_train = (
    np.concatenate((X_train, X_test), axis=0),
    np.concatenate((y_train, y_test), axis=0),
)

## Run experiment

In [5]:
# Experiment object (default constructor arguments) whose run_pca_analysis /
# run_ica_analysis / run_rp_analysis methods are called in the cells below.
dr_experiment = DimensionalityReductionExperiment()

In [6]:
# Number of columns of the data matrix.
n_features = X_train.shape[1]
# Component counts swept by the ICA and RP cells: every integer 1..n_features.
n_components_range = np.arange(1, n_features + 1)

print(f"Number of features: {n_features}")
print(f"Number of components: {n_components_range}")

Number of features: 30
Number of components: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30]


### PCA

In [7]:
# Run the PCA analysis for a single configuration that keeps all n_features
# components (the component list has exactly one entry).
pca_metrics, pca_transformed = dr_experiment.run_pca_analysis(X_train, [n_features])

# Persist both artefacts under results/<dataset>/<experiment>/.
results_dir = f'results/{dataset}/{experiment_name}'
save_pickle(pca_metrics, results_dir, 'pca_metrics')
save_pickle(pca_transformed, results_dir, 'pca_transformed')


Data saved at results/dataset1/experiment2/pca_metrics.pkl
Data saved at results/dataset1/experiment2/pca_transformed.pkl


In [8]:
# Display the PCA metrics table returned by run_pca_analysis (one row per
# requested component count; only [n_features] was requested above).
pca_metrics

Unnamed: 0,n_components,explained_variance_ratio,cumulative_explained_variance,singular_values
0,30,"[0.35945698162449585, 0.10581679635307847, 0.0...","[0.35945698162449585, 0.4652737779775743, 0.56...","[402.27445188961894, 218.26107738561186, 208.6..."


In [9]:
# --- Cumulative explained-variance curve -----------------------------------
# pca_metrics holds a single row (one full fit). Take it by position with
# .iloc so the lookup does not depend on the row label happening to be 0.
cumulative_variance = pca_metrics['cumulative_explained_variance'].iloc[0]
pca_metrics_df = pd.DataFrame(
    {
        # Size the x-axis from the curve itself: PCA can return fewer
        # components than n_features (e.g. fewer samples than features), and
        # a fixed range(1, n_features + 1) would then raise a length mismatch.
        'n_components': range(1, len(cumulative_variance) + 1),
        'cumulative_explained_variance': cumulative_variance,
    }
)
plot_metrics_vs_cluster(
    df=pca_metrics_df,
    metric_col='cumulative_explained_variance',
    k_col='n_components',
    dataset=dataset,
    experiment=experiment_name,
    algo_name='pca',
    y_label='Cumulative Var. Ratio',
    x_label='Number of Components',
)

# --- 2D / 3D scatter plots ---------------------------------------------------
# Both projections use the transformed data stored under the n_features key
# (the only configuration requested from run_pca_analysis).
pca_projection = pca_transformed.get(n_features)

# plot 2d projection
plot_2d_projection(
    X_transformed=pca_projection,
    y=y_train,
    dataset=dataset,
    experiment=experiment_name,
    algo_name='pca',
    sample_size=2000,
)
# plot 3d projection
plot_3d_projection(
    X_transformed=pca_projection,
    y=y_train,
    dataset=dataset,
    experiment=experiment_name,
    algo_name='pca',
    sample_size=1000,
)

Plot saved at figs/dataset1/experiment2/cumulative_explained_variance_vs_n_components_pca.png
Plot saved at figs/dataset1/experiment2/2d_projection_pca.png
Plot saved at figs/dataset1/experiment2/3d_projection_pca.png


### ICA

In [None]:
# Sweep ICA over every component count in n_components_range.
ica_metrics, ica_transformed = dr_experiment.run_ica_analysis(X_train, n_components_range)

# Persist the metrics table and the transformed data under
# results/<dataset>/<experiment>/.
results_dir = f'results/{dataset}/{experiment_name}'
save_pickle(ica_metrics, results_dir, 'ica_metrics')
save_pickle(ica_transformed, results_dir, 'ica_transformed')

In [11]:
# Curve of the `abs_mean_kurtosis` metric against the number of ICA components.
plot_metrics_vs_cluster(
    df=ica_metrics,
    metric_col='abs_mean_kurtosis',
    k_col='n_components',
    dataset=dataset,
    experiment=experiment_name,
    algo_name='ica',
    x_label='Number of Components',
)

# Both scatter plots use the embedding stored under key 16.
# NOTE(review): 16 is hard-coded; presumably chosen from the kurtosis curve
# above — confirm and record the rationale.
ica_projection = ica_transformed.get(16)
ica_plot_kwargs = dict(
    y=y_train,
    dataset=dataset,
    experiment=experiment_name,
    algo_name='ica',
)
plot_2d_projection(X_transformed=ica_projection, sample_size=2000, **ica_plot_kwargs)
plot_3d_projection(X_transformed=ica_projection, sample_size=1000, **ica_plot_kwargs)

Plot saved at figs/dataset1/experiment2/abs_mean_kurtosis_vs_n_components_ica.png
Plot saved at figs/dataset1/experiment2/2d_projection_ica.png
Plot saved at figs/dataset1/experiment2/3d_projection_ica.png


### RP

In [12]:
# Sweep random projection over every component count in n_components_range.
# NOTE(review): random projection depends on the RNG; confirm that
# run_rp_analysis fixes a seed so the saved pickles are reproducible.
rp_metrics, rp_transformed = dr_experiment.run_rp_analysis(X_train, n_components_range)

# Persist the metrics table and the transformed data under
# results/<dataset>/<experiment>/.
results_dir = f'results/{dataset}/{experiment_name}'
save_pickle(rp_metrics, results_dir, 'rp_metrics')
save_pickle(rp_transformed, results_dir, 'rp_transformed')

Data saved at results/dataset1/experiment2/rp_metrics.pkl
Data saved at results/dataset1/experiment2/rp_transformed.pkl


In [13]:
# Curve of the mean reconstruction error against the number of RP components.
plot_metrics_vs_cluster(
    df=rp_metrics,
    metric_col='reconstruction_error_mean',
    k_col='n_components',
    dataset=dataset,
    experiment=experiment_name,
    algo_name='rp',
    x_label='Number of Components',
)

# Both scatter plots use the embedding stored under key 24.
# NOTE(review): 24 is hard-coded; presumably chosen from the reconstruction
# error curve above — confirm and record the rationale.
rp_projection = rp_transformed.get(24)
rp_plot_kwargs = dict(
    y=y_train,
    dataset=dataset,
    experiment=experiment_name,
    algo_name='rp',
)
plot_2d_projection(X_transformed=rp_projection, sample_size=2000, **rp_plot_kwargs)
plot_3d_projection(X_transformed=rp_projection, sample_size=1000, **rp_plot_kwargs)

Plot saved at figs/dataset1/experiment2/reconstruction_error_mean_vs_n_components_rp.png
Plot saved at figs/dataset1/experiment2/2d_projection_rp.png
Plot saved at figs/dataset1/experiment2/3d_projection_rp.png


In [14]:
# Display the RP metrics table (one row per n_components value, with the
# reconstruction-error mean/std and components_rank columns).
rp_metrics

Unnamed: 0,n_components,reconstruction_error_mean,reconstruction_error_std,components_rank
0,1,12.736701,0.0,1.0
1,2,5.477891,0.0,2.0
2,3,10.002844,0.0,3.0
3,4,7.946015,0.0,4.0
4,5,5.726547,0.0,5.0
5,6,3.96407,0.0,6.0
6,7,3.169823,0.0,7.0
7,8,2.617806,0.0,8.0
8,9,2.153437,0.0,9.0
9,10,2.109402,0.0,10.0


# DataSet 2
## Load data set

In [15]:
# Switch the notebook to the second dataset; the experiment name is unchanged.
dataset = 'dataset2'
experiment_name = 'experiment2'

# Load the four processed splits, then pool train and test row-wise.
# X_train / y_train are overwritten with the combined dataset2 data, replacing
# the dataset1 arrays used in the cells above.
X_train, X_test, y_train, y_test = load_processed_data(f'data/{dataset}')
X_train, y_train = (
    np.concatenate((X_train, X_test), axis=0),
    np.concatenate((y_train, y_test), axis=0),
)

## Run experiment

In [16]:
# Fresh experiment object (default constructor arguments) for the dataset2
# PCA / ICA / RP cells below; rebinds the name used earlier in the notebook.
dr_experiment = DimensionalityReductionExperiment()


In [17]:
# Number of columns of the data matrix.
n_features = X_train.shape[1]
# Component counts swept by the ICA and RP cells: multiples of 10 starting at
# 10 and not exceeding n_features (a coarser grid than a step of 1).
n_components_range = np.arange(10, n_features + 1, 10)

print(f"Number of features: {n_features}")
print(f"Number of components: {n_components_range}")

Number of features: 384
Number of components: [ 10  20  30  40  50  60  70  80  90 100 110 120 130 140 150 160 170 180
 190 200 210 220 230 240 250 260 270 280 290 300 310 320 330 340 350 360
 370 380]


### PCA

In [18]:
# --- Fit and persist -----------------------------------------------------------
# Run the PCA analysis for a single configuration that keeps all n_features
# components (the component list has exactly one entry).
pca_metrics, pca_transformed = dr_experiment.run_pca_analysis(X_train, [n_features])
# save
save_pickle(pca_metrics, f'results/{dataset}/{experiment_name}', 'pca_metrics')
save_pickle(pca_transformed, f'results/{dataset}/{experiment_name}', 'pca_transformed')

# --- Cumulative explained-variance curve -----------------------------------
# pca_metrics holds a single row (one full fit). Take it by position with
# .iloc so the lookup does not depend on the row label happening to be 0.
cumulative_variance = pca_metrics['cumulative_explained_variance'].iloc[0]
pca_metrics_df = pd.DataFrame(
    {
        # Size the x-axis from the curve itself: PCA can return fewer
        # components than n_features (e.g. fewer samples than features), and
        # a fixed range(1, n_features + 1) would then raise a length mismatch.
        'n_components': range(1, len(cumulative_variance) + 1),
        'cumulative_explained_variance': cumulative_variance,
    }
)
plot_metrics_vs_cluster(
    df=pca_metrics_df,
    metric_col='cumulative_explained_variance',
    k_col='n_components',
    dataset=dataset,
    experiment=experiment_name,
    algo_name='pca',
    y_label='Cumulative Var. Ratio',
    x_label='Number of Components',
)

# --- 2D / 3D scatter plots ---------------------------------------------------
# Both projections use the transformed data stored under the n_features key
# (the only configuration requested from run_pca_analysis).
pca_projection = pca_transformed.get(n_features)

# plot 2d projection
plot_2d_projection(
    X_transformed=pca_projection,
    y=y_train,
    dataset=dataset,
    experiment=experiment_name,
    algo_name='pca',
    sample_size=1000,
)
# plot 3d projection
plot_3d_projection(
    X_transformed=pca_projection,
    y=y_train,
    dataset=dataset,
    experiment=experiment_name,
    algo_name='pca',
    sample_size=1000,
)

Data saved at results/dataset2/experiment2/pca_metrics.pkl
Data saved at results/dataset2/experiment2/pca_transformed.pkl
Plot saved at figs/dataset2/experiment2/cumulative_explained_variance_vs_n_components_pca.png
Plot saved at figs/dataset2/experiment2/2d_projection_pca.png
Plot saved at figs/dataset2/experiment2/3d_projection_pca.png


### ICA

In [19]:
# Sweep ICA over every component count in n_components_range.
ica_metrics, ica_transformed = dr_experiment.run_ica_analysis(X_train, n_components_range)

# Persist the metrics table and the transformed data under
# results/<dataset>/<experiment>/.
results_dir = f'results/{dataset}/{experiment_name}'
save_pickle(ica_metrics, results_dir, 'ica_metrics')
save_pickle(ica_transformed, results_dir, 'ica_transformed')



Data saved at results/dataset2/experiment2/ica_metrics.pkl
Data saved at results/dataset2/experiment2/ica_transformed.pkl


In [20]:
# Display the ICA metrics table (kurtosis statistics and n_iter per
# n_components value).
# NOTE(review): in the saved output n_iter reads 1000 for n_components 10-40
# but stays below 200 for the larger settings shown — presumably those fits hit
# an iteration cap without converging; verify before relying on their kurtosis.
ica_metrics

Unnamed: 0,n_components,kurtosis_values,mean_kurtosis,abs_mean_kurtosis,explained_variance_ratio,cumulative_explained_variance,n_iter
0,10,"[0.4567809, 3.51343, 0.13892603, -0.043465853,...",1.643678,1.799506,"[0.10000000894069672, 0.10000000149011612, 0.0...","[0.10000000894069672, 0.20000001788139343, 0.3...",1000
1,20,"[2.564324, 6.7196264, 1.3655548, 4.278918, 2.8...",2.812381,2.903326,"[0.05000000447034836, 0.05000000447034836, 0.0...","[0.05000000447034836, 0.10000000894069672, 0.1...",1000
2,30,"[2.0930786, 3.9649053, 5.6955805, 1.5950851, 3...",3.721171,3.749579,"[0.03333333507180214, 0.03333333879709244, 0.0...","[0.03333333507180214, 0.06666667759418488, 0.1...",1000
3,40,"[-0.63864374, 2.0598216, 7.9935474, 2.353959, ...",5.643517,5.675448,"[0.02499999850988388, 0.02500000037252903, 0.0...","[0.02499999850988388, 0.04999999701976776, 0.0...",1000
4,50,"[3.0914626, 8.844305, 5.6800814, 1.8785439, 5....",6.034774,6.034774,"[0.019999997690320015, 0.019999999552965164, 0...","[0.019999997690320015, 0.03999999910593033, 0....",78
5,60,"[3.7257648, 5.452957, 1.1105886, 9.522581, 2.3...",7.181997,7.181997,"[0.01666666753590107, 0.01666666753590107, 0.0...","[0.01666666753590107, 0.03333333507180214, 0.0...",96
6,70,"[3.4515314, 7.0619745, 5.1533566, 9.009675, 2....",7.703835,7.703835,"[0.014285714365541935, 0.014285716228187084, 0...","[0.014285714365541935, 0.02857143059372902, 0....",106
7,80,"[4.6960936, 9.882365, 9.569339, 5.105257, 2.89...",8.22998,8.22998,"[0.01250000111758709, 0.012500000186264515, 0....","[0.01250000111758709, 0.02500000223517418, 0.0...",143
8,90,"[7.0119057, 5.4753866, 3.2037406, 3.2083664, 4...",9.072213,9.072213,"[0.01111111231148243, 0.01111111231148243, 0.0...","[0.01111111231148243, 0.02222222462296486, 0.0...",163
9,100,"[12.291693, 2.8731594, 2.517136, 3.1923223, 9....",9.767244,9.767244,"[0.009999999776482582, 0.009999999776482582, 0...","[0.009999999776482582, 0.019999999552965164, 0...",137


In [21]:
# Plot ICA metrics
# Curve of the `abs_mean_kurtosis` column against `n_components`; the saved
# output shows the helper writing the figure under figs/<dataset>/<experiment>/.
# Unlike the dataset1 section, no 2D/3D ICA projections are plotted here.
plot_metrics_vs_cluster(
    df=ica_metrics,
    metric_col='abs_mean_kurtosis',
    k_col='n_components',
    dataset=dataset,
    experiment=experiment_name,
    algo_name='ica', 
    x_label='Number of Components'
)

Plot saved at figs/dataset2/experiment2/abs_mean_kurtosis_vs_n_components_ica.png


### RP

In [22]:
# Sweep random projection over every component count in n_components_range.
# NOTE(review): random projection depends on the RNG; confirm that
# run_rp_analysis fixes a seed so the saved pickles are reproducible.
rp_metrics, rp_transformed = dr_experiment.run_rp_analysis(X_train, n_components_range)

# Persist the metrics table and the transformed data under
# results/<dataset>/<experiment>/.
results_dir = f'results/{dataset}/{experiment_name}'
save_pickle(rp_metrics, results_dir, 'rp_metrics')
save_pickle(rp_transformed, results_dir, 'rp_transformed')

Data saved at results/dataset2/experiment2/rp_metrics.pkl
Data saved at results/dataset2/experiment2/rp_transformed.pkl


In [23]:
# Curve of the mean reconstruction error against the number of RP components.
plot_metrics_vs_cluster(
    df=rp_metrics,
    metric_col='reconstruction_error_mean',
    k_col='n_components',
    dataset=dataset,
    experiment=experiment_name,
    algo_name='rp',
    x_label='Number of Components',
)

# Both scatter plots use the embedding stored under key 30.
# NOTE(review): 30 is hard-coded; presumably chosen from the reconstruction
# error curve above — confirm and record the rationale.
rp_projection = rp_transformed.get(30)
rp_plot_kwargs = dict(
    y=y_train,
    dataset=dataset,
    experiment=experiment_name,
    algo_name='rp',
    sample_size=1000,
)
plot_2d_projection(X_transformed=rp_projection, **rp_plot_kwargs)
plot_3d_projection(X_transformed=rp_projection, **rp_plot_kwargs)

Plot saved at figs/dataset2/experiment2/reconstruction_error_mean_vs_n_components_rp.png
Plot saved at figs/dataset2/experiment2/2d_projection_rp.png
Plot saved at figs/dataset2/experiment2/3d_projection_rp.png
