In [3]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.manifold import Isomap
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.decomposition import TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA, KernelPCA

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [5]:
df = pd.read_csv('train_data.csv')

X = df.drop(['class4', 'class2'], axis=1)
X = X.loc[:, X.columns[range(2, X.shape[1], 2)]]

X_scaled = pd.DataFrame(StandardScaler().fit_transform(X), columns = X.columns)

y_class2 = df['class2']
y_class4 = df['class4']

In [6]:
classifiers = [
    ('logistic', LogisticRegression()),
    ('kNeighbour', KNeighborsClassifier(3)),
    ('svcLinear', SVC(kernel="linear", C=0.025, probability=True)),
    ('svc', SVC(gamma=2, C=1, probability=True)),
    ('gaussian', GaussianProcessClassifier(1.0 * RBF(1.0))),
    ('decissionTree', DecisionTreeClassifier(max_depth=5)),
    ('rfc', RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ('mlp', MLPClassifier(alpha=1, max_iter=1000)),
    ('ada', AdaBoostClassifier()),
    ('gaussianNB', GaussianNB()),
    ('qda', QuadraticDiscriminantAnalysis())]

In [7]:
p = 0.7
variance = p * (1 - p)

In [8]:
dimension_reductions_y2 = [
    ('iso', Isomap(n_components=30)),
    ('lle', LocallyLinearEmbedding(n_components=10)), 
    ('llemodified', LocallyLinearEmbedding(n_components=10, method='modified', n_neighbors=90)),
    ('svd', TruncatedSVD(n_components=10)),
    ('lda', LinearDiscriminantAnalysis(n_components=1)),
    ('pca', PCA()),
    ('kpca', KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=1)),
    ('sel', VarianceThreshold(threshold=variance)),
    ('kbest', SelectKBest(f_classif, k=10)), 
    ('kbestmutual', SelectKBest(mutual_info_classif, k=10)),
    ('select', SelectFromModel(LinearSVC(penalty="l2"))),
    ('selecttree', SelectFromModel(ExtraTreesClassifier(n_estimators=20))),
    ('rfe', RFE(estimator=DecisionTreeClassifier(), n_features_to_select=20))]

dimension_reductions_y4 = [
    ('iso', Isomap(n_components=10)),
    ('lle', LocallyLinearEmbedding(n_components=10)), 
    ('llemodified', LocallyLinearEmbedding(n_components=10, method='modified', n_neighbors=90)),
    ('svd', TruncatedSVD(n_components=5)),
    ('lda', LinearDiscriminantAnalysis(n_components=2)),
    ('pca', PCA()),
    ('kpca', KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=1)),
    ('sel', VarianceThreshold(threshold=variance)),
    ('kbest', SelectKBest(f_classif, k=10)), 
    ('kbestmutual', SelectKBest(mutual_info_classif, k=10)),
    ('select', SelectFromModel(LinearSVC(penalty="l2"))),
    ('selecttree', SelectFromModel(ExtraTreesClassifier(n_estimators=10))),
    ('rfe', RFE(estimator=DecisionTreeClassifier(), n_features_to_select=10))]

In [11]:
def k_fold_cross_validation(ml_pipeline, X, y, n=5, k=10, score='accuracy'):
   
    cv = RepeatedStratifiedKFold(n_splits = n, 
                                 n_repeats = k, 
                                 random_state = 1)
    n_scores = cross_val_score(ml_pipeline, X, y, 
                               scoring = score, cv = cv, 
                               n_jobs = -1)
    
    return(np.array([np.mean(n_scores), np.std(n_scores)]))

In [12]:
columns = ['accuracy_mean', 'accuracy_std', 
           'accuracy_scaled_mean', 'accuracy_scaled_std']
statistics_y2 = pd.DataFrame(index = columns)
statistics_y4 = pd.DataFrame(index = columns)

In [14]:
y = y_class2

for model_used in classifiers:
    model = Pipeline([model_used])

    not_scaled = k_fold_cross_validation(model, X, y)
    scaled = k_fold_cross_validation(model, X_scaled, y)

    data = np.concatenate((not_scaled, scaled))
    statistics_y2[ model_used[0] ] = data
    break
    for feature_selection in dimension_reductions_y2:
        model = Pipeline([feature_selection, model_used])

        not_scaled = k_fold_cross_validation(model, X, y)
        scaled = k_fold_cross_validation(model, X_scaled, y)

        column = model_used[0] + '_' + feature_selection[0]
        data = np.concatenate((not_scaled, scaled))
        statistics_y2[ column ] = data
        break

In [15]:
statistics_transpose_y2 = statistics_y2.transpose(copy=True)
statistics_transpose_y2


Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
logistic,0.829052,0.032567,0.840397,0.031526
logistic_iso,0.761407,0.048968,0.836065,0.037499
logistic_lle,0.749572,0.040934,0.821847,0.037107
logistic_llemodified,0.837365,0.035189,0.836464,0.034385
logistic_svd,0.831448,0.030128,0.836897,0.033532
logistic_lda,0.81026,0.035057,0.81026,0.035057
logistic_pca,0.834288,0.032526,0.840397,0.031526
logistic_kpca,0.497802,0.002692,0.684634,0.055254
logistic_sel,0.829489,0.03159,0.840397,0.031526
logistic_kbest,0.77796,0.036783,0.803698,0.038598


In [16]:
statistics_transpose_y2.describe()

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
count,29.0,29.0,29.0,29.0
mean,0.775814,0.03424,0.818464,0.035684
std,0.082357,0.009667,0.035651,0.004779
min,0.497802,0.002692,0.684634,0.030155
25%,0.761407,0.032887,0.803705,0.032608
50%,0.793065,0.035005,0.836046,0.034511
75%,0.821839,0.039871,0.837797,0.037966
max,0.837365,0.048968,0.846519,0.055254


In [34]:
y = y_class4

for model_used in classifiers:
    model = Pipeline([model_used])

    not_scaled = k_fold_cross_validation(model, X, y)
    scaled = k_fold_cross_validation(model, X_scaled, y)

    data = np.concatenate((not_scaled, scaled))
    statistics_y4[ model_used[0] ] = data
    break

    for feature_selection in dimension_reductions_y4:
        model = Pipeline([feature_selection, model_used])

        not_scaled = k_fold_cross_validation(model, X, y)
        scaled = k_fold_cross_validation(model, X_scaled, y)

        column = model_used[0] + '_' + feature_selection[0]
        data = np.concatenate((not_scaled, scaled))
        statistics_y4[ column ] = data
        break

In [35]:
statistics_transpose_y4 = statistics_y4.transpose(copy=True)
statistics_transpose_y4

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
logistic,0.614458,0.032233,0.627296,0.035211
logistic_iso,0.548497,0.040611,0.647246,0.032124
kNeighbour,0.591746,0.037752,0.64741,0.039943
kNeighbour_iso,0.542807,0.045137,0.646271,0.039556
svcLinear,0.626658,0.031396,0.634307,0.031839
logistic_lle,0.5,0.003475,0.512664,0.015332


In [31]:
statistics_transpose_y4.describe()

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
count,5.0,5.0,5.0,5.0
mean,0.584833,0.037426,0.640506,0.035735
std,0.037951,0.005767,0.00921,0.003898
min,0.542807,0.031396,0.627296,0.031839
25%,0.548497,0.032233,0.634307,0.032124
50%,0.591746,0.037752,0.646271,0.035211
75%,0.614458,0.040611,0.647246,0.039556
max,0.626658,0.045137,0.64741,0.039943
