In [1]:
import glob
import os

import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, FastICA as ICA
from sklearn.feature_selection import GenericUnivariateSelect, mutual_info_classif
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.metrics.cluster import normalized_mutual_info_score as NMI
from sklearn.mixture import GaussianMixture as GMM
from sklearn.preprocessing import StandardScaler
from sklearn.random_projection import GaussianRandomProjection as GRP

%matplotlib inline

In [2]:
# Student dataset: every column except the last is a feature (cast to float),
# the last column is the class label.
df_main = pd.read_csv("./assets/cleaned_student_data.csv")
X_student, y_student = df_main.iloc[:, :-1].astype('float'), df_main.iloc[:, -1]

In [3]:
# Housing dataset: every column except the last is a feature (cast to float),
# the last column is the class label.
df_main = pd.read_csv("./assets/cleaned_housing_data.csv")
X_housing = df_main.iloc[:, :-1].astype('float')
y_housing = df_main.iloc[:, -1]

# Convert the 'Group x' labels to int codes. Codes are assigned in reverse
# order of first appearance, exactly as before.
re_label = dict(zip(y_housing.unique()[::-1], range(y_housing.unique().size)))
# Every label is a key of re_label, so .map is a complete lookup and yields an
# integer Series directly. Series.replace(dict) on an object column depends on
# implicit downcasting to reach an integer dtype, which newer pandas deprecates.
y_housing = y_housing.map(re_label)

In [4]:
def benchmark_decorator_km(func):
    """Turn a transformer ``func(X, comp)`` into a repeated K-Means benchmark.

    ``func`` must return the transformed feature matrix. The decorated name
    is replaced by ``benchmark`` (see below), so callers use the
    ``(X, y, comp, n_class, title, n_iterations)`` signature.
    """
    def benchmark(X, y, comp, n_class, title, n_iterations):
        """Cluster the transformed data ``n_iterations`` times and score each run.

        Parameters
        ----------
        X : feature matrix handed to ``func``.
        y : reference labels the cluster assignments are scored against.
        comp : forwarded to ``func`` as its second argument.
        n_class : number of K-Means clusters.
        title : str appended to the index name ("N_components: <title>").
        n_iterations : number of transform + clustering repetitions.

        Returns
        -------
        pandas.DataFrame with columns 'acc' and 'nmi': one row per iteration
        followed by a 'mean' and a 'std' row.
        """
        scores = []
        for _ in range(n_iterations):
            X_trans = func(X, comp)
            # `n_jobs` is no longer a KMeans parameter (deprecated in
            # scikit-learn 0.23, removed in 1.0); passing it raises TypeError
            # on current releases and it never influenced the clustering.
            preds = KMeans(n_clusters=n_class).fit_predict(X_trans)
            # Arguments are passed in the documented (true, predicted) order.
            # NOTE(review): cluster ids are arbitrary, so plain accuracy depends
            # on how K-Means happens to number its clusters - confirm whether a
            # label-matching step is wanted. NMI is permutation invariant.
            scores.append((accuracy_score(y, preds), NMI(y, preds)))

        # Build the frame once with a float dtype instead of filling an
        # object-dtype frame cell by cell.
        df_results = pd.DataFrame(scores, columns=['acc', 'nmi'], dtype=float)

        mean = df_results.mean(0)
        std = df_results.std(0)
        df_stats = pd.DataFrame(index=['mean', 'std'],
                                columns=['acc', 'nmi'],
                                data=np.vstack([mean, std]))

        df_results = pd.concat([df_results, df_stats], axis=0)
        df_results.index.name = "N_components: " + title
        return df_results
    return benchmark

def benchmark_decorator_km_fs(func):
    """Turn a supervised selector ``func(X, y, comp)`` into a K-Means benchmark.

    Same as the unsupervised K-Means benchmark except that ``func`` also
    receives the labels ``y`` (needed by feature-selection transformers).
    The decorated name is replaced by ``benchmark`` (see below).
    """
    def benchmark(X, y, comp, n_class, title, n_iterations):
        """Cluster the selected features ``n_iterations`` times and score each run.

        Parameters
        ----------
        X : feature matrix handed to ``func``.
        y : reference labels; passed to ``func`` and used for scoring.
        comp : forwarded to ``func`` as its third argument.
        n_class : number of K-Means clusters.
        title : str appended to the index name ("N_components: <title>").
        n_iterations : number of transform + clustering repetitions.

        Returns
        -------
        pandas.DataFrame with columns 'acc' and 'nmi': one row per iteration
        followed by a 'mean' and a 'std' row.
        """
        scores = []
        for _ in range(n_iterations):
            X_trans = func(X, y, comp)
            # `n_jobs` is no longer a KMeans parameter (deprecated in
            # scikit-learn 0.23, removed in 1.0); passing it raises TypeError
            # on current releases and it never influenced the clustering.
            preds = KMeans(n_clusters=n_class).fit_predict(X_trans)
            # Arguments are passed in the documented (true, predicted) order.
            # NOTE(review): cluster ids are arbitrary, so plain accuracy depends
            # on how K-Means happens to number its clusters - confirm whether a
            # label-matching step is wanted. NMI is permutation invariant.
            scores.append((accuracy_score(y, preds), NMI(y, preds)))

        # Build the frame once with a float dtype instead of filling an
        # object-dtype frame cell by cell.
        df_results = pd.DataFrame(scores, columns=['acc', 'nmi'], dtype=float)

        mean = df_results.mean(0)
        std = df_results.std(0)
        df_stats = pd.DataFrame(index=['mean', 'std'],
                                columns=['acc', 'nmi'],
                                data=np.vstack([mean, std]))

        df_results = pd.concat([df_results, df_stats], axis=0)
        df_results.index.name = "N_components: " + title
        return df_results
    return benchmark

def benchmark_decorator_em(func):
    """Wrap a transformer ``func(X, comp)`` in a Gaussian-mixture benchmark.

    The decorated name is replaced by ``benchmark``, which repeats
    transform -> GMM fit -> GMM predict and scores each run against ``y``.
    """
    def benchmark(X, y, comp, n_class, title, n_iterations):
        """Return per-iteration 'acc'/'nmi' scores plus 'mean' and 'std' rows.

        ``n_class`` is the number of mixture components; ``title`` is appended
        to the index name "N_components: ".
        """
        score_names = ['acc', 'nmi']
        df_results = pd.DataFrame(index=range(n_iterations), columns=score_names)

        for run in range(n_iterations):
            projected = func(X, comp)
            # fit() returns the estimator itself, so this is fit-then-predict
            # on the same data.
            preds = GMM(n_class).fit(projected).predict(projected)
            acc = accuracy_score(preds, y)
            nmi = NMI(preds, y)

            df_results.loc[run, 'acc'] = acc
            df_results.loc[run, 'nmi'] = nmi

        # Column-wise summary over the iteration rows, appended as two rows.
        summary = np.vstack([df_results.mean(0), df_results.std(0)])
        df_stats = pd.DataFrame(data=summary,
                                index=['mean', 'std'],
                                columns=score_names)

        df_results = pd.concat([df_results, df_stats], axis=0)
        df_results.index.name = "N_components: " + title
        return df_results
    return benchmark

def benchmark_decorator_em_fs(func):
    """Wrap a supervised selector ``func(X, y, comp)`` in a Gaussian-mixture benchmark.

    Identical in shape to the unsupervised EM benchmark, except that ``func``
    also receives the labels ``y``.
    """
    def benchmark(X, y, comp, n_class, title, n_iterations):
        """Return per-iteration 'acc'/'nmi' scores plus 'mean' and 'std' rows.

        ``n_class`` is the number of mixture components; ``title`` is appended
        to the index name "N_components: ".
        """
        metric_cols = ['acc', 'nmi']
        df_results = pd.DataFrame(index=range(n_iterations), columns=metric_cols)

        for trial in range(n_iterations):
            selected = func(X, y, comp)
            # fit() returns the estimator itself, so this is fit-then-predict
            # on the same data.
            preds = GMM(n_class).fit(selected).predict(selected)
            acc = accuracy_score(preds, y)
            nmi = NMI(preds, y)

            df_results.loc[trial, 'acc'] = acc
            df_results.loc[trial, 'nmi'] = nmi

        # Column-wise summary over the iteration rows, appended as two rows.
        stacked_stats = np.vstack([df_results.mean(0), df_results.std(0)])
        df_stats = pd.DataFrame(data=stacked_stats,
                                index=['mean', 'std'],
                                columns=metric_cols)

        df_results = pd.concat([df_results, df_stats], axis=0)
        df_results.index.name = "N_components: " + title
        return df_results
    return benchmark

@benchmark_decorator_km
def perform_pca_km(X, comps, *args):
    """Return X projected by PCA with `comps` components (random_state 7308).

    Extra positional arguments are accepted and ignored.
    """
    return PCA(n_components=comps, random_state=7308).fit_transform(X)


@benchmark_decorator_km
def perform_ica_km(X, comps, *args):
    """Return X transformed by FastICA with `comps` components (random_state 7308).

    Extra positional arguments are accepted and ignored.
    """
    return ICA(n_components=comps, random_state=7308).fit_transform(X)


@benchmark_decorator_km
def perform_grp_km(X, comps, *args):
    """Return X after a Gaussian random projection to `comps` dimensions (random_state 7308).

    Extra positional arguments are accepted and ignored.
    """
    return GRP(n_components=comps, random_state=7308).fit_transform(X)


@benchmark_decorator_km
def perform_tsne_km(X, comps, *args):
    """Return a t-SNE embedding of X (PCA initialisation, random_state 7308).

    `comps` is accepted so the function fits the K-Means benchmark wrapper,
    but it is NOT used: the embedding is always 2-dimensional.
    NOTE(review): confirm the fixed dimensionality is intended.
    Extra positional arguments are accepted and ignored, matching the other
    perform_* helpers.

    Relies on TSNE (sklearn.manifold) being imported at the top of the notebook.
    """
    transformer = TSNE(n_components=2, init='pca', random_state=7308)
    return transformer.fit_transform(X)


@benchmark_decorator_km_fs
def perform_gus_km(X, y, comps):
    """Return the `comps` best columns of X as ranked by mutual information with y.

    Uses GenericUnivariateSelect in 'k_best' mode with mutual_info_classif
    as the scoring function.
    """
    selector = GenericUnivariateSelect(mutual_info_classif, mode='k_best', param=comps)
    X_selected = selector.fit_transform(X, y)
    return X_selected


@benchmark_decorator_em
def perform_pca_em(X, comps, *args):
    """Return X projected by PCA with `comps` components (random_state 7308).

    Extra positional arguments are accepted and ignored.
    """
    return PCA(n_components=comps, random_state=7308).fit_transform(X)


@benchmark_decorator_em
def perform_ica_em(X, comps, *args):
    """Return X transformed by FastICA with `comps` components (random_state 7308).

    Extra positional arguments are accepted and ignored.
    """
    return ICA(n_components=comps, random_state=7308).fit_transform(X)


@benchmark_decorator_em
def perform_grp_em(X, comps, *args):
    """Return X after a Gaussian random projection to `comps` dimensions (random_state 7308).

    Extra positional arguments are accepted and ignored.
    """
    return GRP(n_components=comps, random_state=7308).fit_transform(X)


@benchmark_decorator_em_fs
def perform_gus_em(X, y, comps):
    """Return the `comps` best columns of X as ranked by mutual information with y.

    Uses GenericUnivariateSelect in 'k_best' mode with mutual_info_classif
    as the scoring function.
    """
    selector = GenericUnivariateSelect(mutual_info_classif, mode='k_best', param=comps)
    X_selected = selector.fit_transform(X, y)
    return X_selected

In [5]:
# Create KM benchmark data for student
# Runs each transformer + K-Means benchmark for 2..15 components on the
# standardised student features and writes one CSV per (transformer, components).

save_path = "./assets/results_base/student"
iterations = 10
scaler = StandardScaler()
X_std, y = scaler.fit_transform(X_student), y_student
n_class = y.nunique()  # loop-invariant: one cluster per distinct label

# DataFrame.to_csv does not create missing directories, so make sure the
# per-transformer output folders exist before the long loop starts.
for folder in ("km_PCA", "km_ICA", "km_GRP", "km_GUS"):
    os.makedirs("{}/{}".format(save_path, folder), exist_ok=True)

for comp in range(2, 16):
    df_pca = perform_pca_km(X_std, y, comp, n_class=n_class, title=str(comp), n_iterations=iterations)
    # Fixed copy-paste bug: this used to call perform_pca_km, so the files
    # written under km_ICA actually held PCA results.
    df_ica = perform_ica_km(X_std, y, comp, n_class=n_class, title=str(comp), n_iterations=iterations)
    df_grp = perform_grp_km(X_std, y, comp, n_class=n_class, title=str(comp), n_iterations=iterations)
    df_gus = perform_gus_km(X_std, y, comp, n_class=n_class, title=str(comp), n_iterations=iterations)

    df_pca.to_csv("{}/km_PCA/components_{}.csv".format(save_path, comp))
    df_ica.to_csv("{}/km_ICA/components_{}.csv".format(save_path, comp))
    df_grp.to_csv("{}/km_GRP/components_{}.csv".format(save_path, comp))
    df_gus.to_csv("{}/km_GUS/components_{}.csv".format(save_path, comp))

In [6]:
# Create KM benchmark data for housing
# Runs each transformer + K-Means benchmark for 2..24 components on the
# standardised housing features and writes one CSV per (transformer, components).

save_path = "./assets/results_base/housing"
iterations = 5
scaler = StandardScaler()
X_std, y = scaler.fit_transform(X_housing), y_housing
n_class = y.nunique()  # loop-invariant: one cluster per distinct label

# DataFrame.to_csv does not create missing directories, so make sure the
# per-transformer output folders exist before the long loop starts.
for folder in ("km_PCA", "km_ICA", "km_GRP", "km_GUS"):
    os.makedirs("{}/{}".format(save_path, folder), exist_ok=True)

for comp in range(2, 25):
    df_pca = perform_pca_km(X_std, y, comp, n_class=n_class, title=str(comp), n_iterations=iterations)
    # Fixed copy-paste bug: this used to call perform_pca_km, so the files
    # written under km_ICA actually held PCA results.
    df_ica = perform_ica_km(X_std, y, comp, n_class=n_class, title=str(comp), n_iterations=iterations)
    df_grp = perform_grp_km(X_std, y, comp, n_class=n_class, title=str(comp), n_iterations=iterations)
    df_gus = perform_gus_km(X_std, y, comp, n_class=n_class, title=str(comp), n_iterations=iterations)

    df_pca.to_csv("{}/km_PCA/components_{}.csv".format(save_path, comp))
    df_ica.to_csv("{}/km_ICA/components_{}.csv".format(save_path, comp))
    df_grp.to_csv("{}/km_GRP/components_{}.csv".format(save_path, comp))
    df_gus.to_csv("{}/km_GUS/components_{}.csv".format(save_path, comp))

In [7]:
# Create EM benchmark data for student
# Runs each transformer + Gaussian-mixture benchmark for 2..15 components on the
# standardised student features and writes one CSV per (transformer, components).

save_path = "./assets/results_base/student"
iterations = 10
scaler = StandardScaler()
X_std, y = scaler.fit_transform(X_student), y_student
n_class = y.nunique()  # loop-invariant: one mixture component per distinct label

# DataFrame.to_csv does not create missing directories, so make sure the
# per-transformer output folders exist before the long loop starts.
for folder in ("em_PCA", "em_ICA", "em_GRP", "em_GUS"):
    os.makedirs("{}/{}".format(save_path, folder), exist_ok=True)

for comp in range(2, 16):
    df_pca = perform_pca_em(X_std, y, comp, n_class=n_class, title=str(comp), n_iterations=iterations)
    # Fixed copy-paste bug: this used to call perform_pca_em, so the files
    # written under em_ICA actually held PCA results.
    df_ica = perform_ica_em(X_std, y, comp, n_class=n_class, title=str(comp), n_iterations=iterations)
    df_grp = perform_grp_em(X_std, y, comp, n_class=n_class, title=str(comp), n_iterations=iterations)
    df_gus = perform_gus_em(X_std, y, comp, n_class=n_class, title=str(comp), n_iterations=iterations)

    df_pca.to_csv("{}/em_PCA/components_{}.csv".format(save_path, comp))
    df_ica.to_csv("{}/em_ICA/components_{}.csv".format(save_path, comp))
    df_grp.to_csv("{}/em_GRP/components_{}.csv".format(save_path, comp))
    df_gus.to_csv("{}/em_GUS/components_{}.csv".format(save_path, comp))

In [None]:
# Create EM benchmark data for housing
# Runs each transformer + Gaussian-mixture benchmark for 2..24 components on the
# standardised housing features and writes one CSV per (transformer, components).

save_path = "./assets/results_base/housing"
iterations = 5
scaler = StandardScaler()
X_std, y = scaler.fit_transform(X_housing), y_housing
n_class = y.nunique()  # loop-invariant: one mixture component per distinct label

# DataFrame.to_csv does not create missing directories, so make sure the
# per-transformer output folders exist before the long loop starts.
for folder in ("em_PCA", "em_ICA", "em_GRP", "em_GUS"):
    os.makedirs("{}/{}".format(save_path, folder), exist_ok=True)

for comp in range(2, 25):
    df_pca = perform_pca_em(X_std, y, comp, n_class=n_class, title=str(comp), n_iterations=iterations)
    # Fixed copy-paste bug: this used to call perform_pca_em, so the files
    # written under em_ICA actually held PCA results.
    df_ica = perform_ica_em(X_std, y, comp, n_class=n_class, title=str(comp), n_iterations=iterations)
    df_grp = perform_grp_em(X_std, y, comp, n_class=n_class, title=str(comp), n_iterations=iterations)
    df_gus = perform_gus_em(X_std, y, comp, n_class=n_class, title=str(comp), n_iterations=iterations)

    df_pca.to_csv("{}/em_PCA/components_{}.csv".format(save_path, comp))
    df_ica.to_csv("{}/em_ICA/components_{}.csv".format(save_path, comp))
    df_grp.to_csv("{}/em_GRP/components_{}.csv".format(save_path, comp))
    df_gus.to_csv("{}/em_GUS/components_{}.csv".format(save_path, comp))

In [None]:
# Evaluate km benchmark data for student
# For every km_* result folder, gather the per-component CSVs into one wide
# frame and report the component count with the best mean accuracy / NMI.

file_path = "./assets/results_base/student"
results = []
# Sorted so the report order does not depend on filesystem listing order.
for base in sorted(glob.glob(file_path + "/km_*")):
    transformer = os.path.basename(base)  # separator-agnostic, unlike split("/")

    # One frame per components_<n>.csv with columns '<nn>_acc' / '<nn>_nmi'.
    # The previous zip(range(2, 25), ...) bound an unused counter and silently
    # ignored any file beyond the 23rd.
    frames = []
    for f in glob.glob(os.path.join(base, "*.csv")):
        n_comps = os.path.splitext(os.path.basename(f))[0].split('_')[-1]
        n_comps = n_comps.zfill(2)  # zero-pad so the lexical column sort is numeric

        df_temp = pd.read_csv(f, usecols=['acc', 'nmi'])
        frames.append(df_temp.rename(columns={'acc': '{}_acc'.format(n_comps),
                                              'nmi': '{}_nmi'.format(n_comps)}))

    if not frames:
        # No CSVs for this transformer: skip it instead of failing on (or
        # reusing) a df_temp left over from another folder.
        continue

    # Single concat instead of growing the frame inside the loop.
    df_results = pd.concat(frames, axis=1)

    # The CSV index column is not read back, so rebuild the row labels:
    # iteration numbers, then the 'mean' and 'std' rows the benchmark appended.
    orig_cols = list(range(df_results.shape[0]))
    orig_cols[-2] = 'mean'
    orig_cols[-1] = 'std'

    df_results.index = orig_cols
    df_results.index.name = transformer
    df_results = df_results.reindex(sorted(df_results.columns), axis=1)

    results.append(df_results)

for result in results:
    accuracies = result.filter(regex='acc', axis=1)
    nmis = result.filter(regex='nmi', axis=1)

    highest_acc = accuracies.loc['mean', :].max()
    high_acc_index = accuracies.loc['mean', :].idxmax()
    high_acc_std = accuracies.loc['std', high_acc_index]

    highest_nmi = nmis.loc['mean', :].max()
    high_nmi_index = nmis.loc['mean', :].idxmax()
    high_nmi_std = nmis.loc['std', high_nmi_index]

    transformer_ = result.index.name

    print("{} achieved a highest accuracy of {:.4f}\u00B1{:.4f} with n_comps: {}"
      .format(transformer_, highest_acc, high_acc_std, high_acc_index.split("_")[0]))

    print("{} achieved a highest NMI of {:.4f}\u00B1{:.5f} with n_comps: {}"
      .format(transformer_, highest_nmi, high_nmi_std, high_nmi_index.split("_")[0]))
    print()

In [None]:
# Evaluate EM benchmark data for student
# For every em_* result folder, gather the per-component CSVs into one wide
# frame and report the component count with the best mean accuracy / NMI.
file_path = "./assets/results_base/student"
results = []
# Sorted so the report order does not depend on filesystem listing order.
for base in sorted(glob.glob(file_path + "/em_*")):
    transformer = os.path.basename(base)  # separator-agnostic, unlike split("/")

    # One frame per components_<n>.csv with columns '<nn>_acc' / '<nn>_nmi'.
    # The previous zip(range(2, 25), ...) bound an unused counter and silently
    # ignored any file beyond the 23rd.
    frames = []
    for f in glob.glob(os.path.join(base, "*.csv")):
        n_comps = os.path.splitext(os.path.basename(f))[0].split('_')[-1]
        n_comps = n_comps.zfill(2)  # zero-pad so the lexical column sort is numeric

        df_temp = pd.read_csv(f, usecols=['acc', 'nmi'])
        frames.append(df_temp.rename(columns={'acc': '{}_acc'.format(n_comps),
                                              'nmi': '{}_nmi'.format(n_comps)}))

    if not frames:
        # No CSVs for this transformer: skip it instead of failing on (or
        # reusing) a df_temp left over from another folder.
        continue

    # Single concat instead of growing the frame inside the loop.
    df_results = pd.concat(frames, axis=1)

    # The CSV index column is not read back, so rebuild the row labels:
    # iteration numbers, then the 'mean' and 'std' rows the benchmark appended.
    orig_cols = list(range(df_results.shape[0]))
    orig_cols[-2] = 'mean'
    orig_cols[-1] = 'std'

    df_results.index = orig_cols
    df_results.index.name = transformer
    df_results = df_results.reindex(sorted(df_results.columns), axis=1)

    results.append(df_results)

for result in results:
    accuracies = result.filter(regex='acc', axis=1)
    nmis = result.filter(regex='nmi', axis=1)

    highest_acc = accuracies.loc['mean', :].max()
    high_acc_index = accuracies.loc['mean', :].idxmax()
    high_acc_std = accuracies.loc['std', high_acc_index]

    highest_nmi = nmis.loc['mean', :].max()
    high_nmi_index = nmis.loc['mean', :].idxmax()
    high_nmi_std = nmis.loc['std', high_nmi_index]

    transformer_ = result.index.name

    print("{} achieved a highest accuracy of {:.4f}\u00B1{:.4f} with n_comps: {}"
      .format(transformer_, highest_acc, high_acc_std, high_acc_index.split("_")[0]))

    print("{} achieved a highest NMI of {:.4f}\u00B1{:.5f} with n_comps: {}"
      .format(transformer_, highest_nmi, high_nmi_std, high_nmi_index.split("_")[0]))
    print()

In [None]:
# Evaluate km benchmark data for housing
# For every km_* result folder, gather the per-component CSVs into one wide
# frame and report the component count with the best mean accuracy / NMI.
file_path = "./assets/results_base/housing"
results = []
# Sorted so the report order does not depend on filesystem listing order.
for base in sorted(glob.glob(file_path + "/km_*")):
    transformer = os.path.basename(base)  # separator-agnostic, unlike split("/")

    # One frame per components_<n>.csv with columns '<nn>_acc' / '<nn>_nmi'.
    # The previous zip(range(2, 25), ...) bound an unused counter and silently
    # ignored any file beyond the 23rd.
    frames = []
    for f in glob.glob(os.path.join(base, "*.csv")):
        n_comps = os.path.splitext(os.path.basename(f))[0].split('_')[-1]
        n_comps = n_comps.zfill(2)  # zero-pad so the lexical column sort is numeric

        df_temp = pd.read_csv(f, usecols=['acc', 'nmi'])
        frames.append(df_temp.rename(columns={'acc': '{}_acc'.format(n_comps),
                                              'nmi': '{}_nmi'.format(n_comps)}))

    if not frames:
        # No CSVs for this transformer: skip it instead of failing on (or
        # reusing) a df_temp left over from another folder.
        continue

    # Single concat instead of growing the frame inside the loop.
    df_results = pd.concat(frames, axis=1)

    # The CSV index column is not read back, so rebuild the row labels:
    # iteration numbers, then the 'mean' and 'std' rows the benchmark appended.
    orig_cols = list(range(df_results.shape[0]))
    orig_cols[-2] = 'mean'
    orig_cols[-1] = 'std'

    df_results.index = orig_cols
    df_results.index.name = transformer
    df_results = df_results.reindex(sorted(df_results.columns), axis=1)

    results.append(df_results)

for result in results:
    accuracies = result.filter(regex='acc', axis=1)
    nmis = result.filter(regex='nmi', axis=1)

    highest_acc = accuracies.loc['mean', :].max()
    high_acc_index = accuracies.loc['mean', :].idxmax()
    high_acc_std = accuracies.loc['std', high_acc_index]

    highest_nmi = nmis.loc['mean', :].max()
    high_nmi_index = nmis.loc['mean', :].idxmax()
    high_nmi_std = nmis.loc['std', high_nmi_index]

    transformer_ = result.index.name

    print("{} achieved a highest accuracy of {:.4f}\u00B1{:.4f} with n_comps: {}"
      .format(transformer_, highest_acc, high_acc_std, high_acc_index.split("_")[0]))

    print("{} achieved a highest NMI of {:.4f}\u00B1{:.5f} with n_comps: {}"
      .format(transformer_, highest_nmi, high_nmi_std, high_nmi_index.split("_")[0]))
    print()

In [None]:
# Evaluate EM benchmark data for housing
# For every em_* result folder, gather the per-component CSVs into one wide
# frame and report the component count with the best mean accuracy / NMI.
file_path = "./assets/results_base/housing"
results = []
# Sorted so the report order does not depend on filesystem listing order.
for base in sorted(glob.glob(file_path + "/em_*")):
    transformer = os.path.basename(base)  # separator-agnostic, unlike split("/")

    # One frame per components_<n>.csv with columns '<nn>_acc' / '<nn>_nmi'.
    # The previous zip(range(2, 25), ...) bound an unused counter and silently
    # ignored any file beyond the 23rd.
    frames = []
    for f in glob.glob(os.path.join(base, "*.csv")):
        n_comps = os.path.splitext(os.path.basename(f))[0].split('_')[-1]
        n_comps = n_comps.zfill(2)  # zero-pad so the lexical column sort is numeric

        df_temp = pd.read_csv(f, usecols=['acc', 'nmi'])
        frames.append(df_temp.rename(columns={'acc': '{}_acc'.format(n_comps),
                                              'nmi': '{}_nmi'.format(n_comps)}))

    if not frames:
        # No CSVs for this transformer: skip it instead of failing on (or
        # reusing) a df_temp left over from another folder.
        continue

    # Single concat instead of growing the frame inside the loop.
    df_results = pd.concat(frames, axis=1)

    # The CSV index column is not read back, so rebuild the row labels:
    # iteration numbers, then the 'mean' and 'std' rows the benchmark appended.
    orig_cols = list(range(df_results.shape[0]))
    orig_cols[-2] = 'mean'
    orig_cols[-1] = 'std'

    df_results.index = orig_cols
    df_results.index.name = transformer
    df_results = df_results.reindex(sorted(df_results.columns), axis=1)

    results.append(df_results)

for result in results:
    accuracies = result.filter(regex='acc', axis=1)
    nmis = result.filter(regex='nmi', axis=1)

    highest_acc = accuracies.loc['mean', :].max()
    high_acc_index = accuracies.loc['mean', :].idxmax()
    high_acc_std = accuracies.loc['std', high_acc_index]

    highest_nmi = nmis.loc['mean', :].max()
    high_nmi_index = nmis.loc['mean', :].idxmax()
    high_nmi_std = nmis.loc['std', high_nmi_index]

    transformer_ = result.index.name

    print("{} achieved a highest accuracy of {:.4f}\u00B1{:.4f} with n_comps: {}"
      .format(transformer_, highest_acc, high_acc_std, high_acc_index.split("_")[0]))

    print("{} achieved a highest NMI of {:.4f}\u00B1{:.5f} with n_comps: {}"
      .format(transformer_, highest_nmi, high_nmi_std, high_nmi_index.split("_")[0]))
    print()