In [5]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.manifold import Isomap
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.decomposition import TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA, KernelPCA

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [6]:
df = pd.read_csv('train_data.csv')

X = df.drop(['class4', 'class2'], axis=1)
X = X.loc[:, X.columns[range(2, X.shape[1], 2)]]

X_scaled = pd.DataFrame(StandardScaler().fit_transform(X), columns = X.columns)

y_class2 = df['class2']
y_class4 = df['class4']

In [7]:
classifiers = [
    ('logistic', LogisticRegression()),
    ('kNeighbour', KNeighborsClassifier(3)),
    ('svcLinear', SVC(kernel="linear", C=0.025, probability=True)),
    ('svc', SVC(gamma=2, C=1, probability=True)),
    ('gaussian', GaussianProcessClassifier(1.0 * RBF(1.0))),
    ('decissionTree', DecisionTreeClassifier(max_depth=5)),
    ('rfc', RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ('mlp', MLPClassifier(alpha=1, max_iter=1000)),
    ('ada', AdaBoostClassifier()),
    ('gaussianNB', GaussianNB()),
    ('qda', QuadraticDiscriminantAnalysis())]

In [8]:
p = 0.7
variance = p * (1 - p)

In [9]:
dimension_reductions_y2 = [
    ('iso', Isomap(n_components=30)),
    ('lle', LocallyLinearEmbedding(n_components=10)), 
    ('llemodified', LocallyLinearEmbedding(n_components=10, method='modified', n_neighbors=90)),
    ('svd', TruncatedSVD(n_components=10)),
    ('lda', LinearDiscriminantAnalysis(n_components=1)),
    ('pca', PCA()),
    ('kpca', KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=1)),
    ('sel', VarianceThreshold(threshold=variance)),
    ('kbest', SelectKBest(f_classif, k=10)), 
    ('kbestmutual', SelectKBest(mutual_info_classif, k=10)),
    ('select', SelectFromModel(LinearSVC(penalty="l2"))),
    ('selecttree', SelectFromModel(ExtraTreesClassifier(n_estimators=20))),
    ('rfe', RFE(estimator=DecisionTreeClassifier(), n_features_to_select=20))]

dimension_reductions_y4 = [
    ('iso', Isomap(n_components=10)),
    ('lle', LocallyLinearEmbedding(n_components=10)), 
    ('llemodified', LocallyLinearEmbedding(n_components=10, method='modified', n_neighbors=90)),
    ('svd', TruncatedSVD(n_components=5)),
    ('lda', LinearDiscriminantAnalysis(n_components=2)),
    ('pca', PCA()),
    ('kpca', KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=1)),
    ('sel', VarianceThreshold(threshold=variance)),
    ('kbest', SelectKBest(f_classif, k=10)), 
    ('kbestmutual', SelectKBest(mutual_info_classif, k=10)),
    ('select', SelectFromModel(LinearSVC(penalty="l2"))),
    ('selecttree', SelectFromModel(ExtraTreesClassifier(n_estimators=10))),
    ('rfe', RFE(estimator=DecisionTreeClassifier(), n_features_to_select=10))]

In [10]:
def k_fold_cross_validation(ml_pipeline, X, y, n=5, k=10, score='accuracy'):
   
    cv = RepeatedStratifiedKFold(n_splits = n, 
                                 n_repeats = k, 
                                 random_state = 1)
    n_scores = cross_val_score(ml_pipeline, X, y, 
                               scoring = score, cv = cv, 
                               n_jobs = -1)
    
    return(np.array([np.mean(n_scores), np.std(n_scores)]))

In [11]:
columns = ['accuracy_mean', 'accuracy_std', 
           'accuracy_scaled_mean', 'accuracy_scaled_std']
statistics_y2 = pd.DataFrame(index = columns)
statistics_y4 = pd.DataFrame(index = columns)

In [12]:
y = y_class2

for model_used in classifiers:
    model = Pipeline([model_used])

    not_scaled = k_fold_cross_validation(model, X, y)
    scaled = k_fold_cross_validation(model, X_scaled, y)

    data = np.concatenate((not_scaled, scaled))
    statistics_y2[ model_used[0] ] = data
    break
    for feature_selection in dimension_reductions_y2:
        model = Pipeline([feature_selection, model_used])

        not_scaled = k_fold_cross_validation(model, X, y)
        scaled = k_fold_cross_validation(model, X_scaled, y)

        column = model_used[0] + '_' + feature_selection[0]
        data = np.concatenate((not_scaled, scaled))
        statistics_y2[ column ] = data
        break

In [13]:
statistics_transpose_y2 = statistics_y2.transpose(copy=True)
statistics_transpose_y2


Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
logistic,0.829052,0.032567,0.840397,0.031526


In [14]:
statistics_transpose_y2.describe()

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
count,1.0,1.0,1.0,1.0
mean,0.829052,0.032567,0.840397,0.031526
std,,,,
min,0.829052,0.032567,0.840397,0.031526
25%,0.829052,0.032567,0.840397,0.031526
50%,0.829052,0.032567,0.840397,0.031526
75%,0.829052,0.032567,0.840397,0.031526
max,0.829052,0.032567,0.840397,0.031526


In [15]:
y = y_class4

for model_used in classifiers:
    model = Pipeline([model_used])

    not_scaled = k_fold_cross_validation(model, X, y)
    scaled = k_fold_cross_validation(model, X_scaled, y)

    data = np.concatenate((not_scaled, scaled))
    statistics_y4[ model_used[0] ] = data
    break

    for feature_selection in dimension_reductions_y4:
        model = Pipeline([feature_selection, model_used])

        not_scaled = k_fold_cross_validation(model, X, y)
        scaled = k_fold_cross_validation(model, X_scaled, y)

        column = model_used[0] + '_' + feature_selection[0]
        data = np.concatenate((not_scaled, scaled))
        statistics_y4[ column ] = data
        break

In [16]:
statistics_transpose_y4 = statistics_y4.transpose(copy=True)
statistics_transpose_y4

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
logistic,0.614458,0.032233,0.627296,0.035211


In [17]:
statistics_transpose_y4.describe()

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
count,1.0,1.0,1.0,1.0
mean,0.614458,0.032233,0.627296,0.035211
std,,,,
min,0.614458,0.032233,0.627296,0.035211
25%,0.614458,0.032233,0.627296,0.035211
50%,0.614458,0.032233,0.627296,0.035211
75%,0.614458,0.032233,0.627296,0.035211
max,0.614458,0.032233,0.627296,0.035211


In [19]:
df = pd.read_csv('npf_test_hidden.csv', index_col='id')
#filter data first, then save theminto a new csv for easier acces later

class2 = df['class4'].copy()
class2[class2 != 'nonevent'] = 'event'
df['class2'] = class2

df.drop(['date', 'partlybad'], axis=1, inplace=True)

In [28]:
df = pd.read_csv('npf_test_hidden.csv', index_col='id')
df

Unnamed: 0_level_0,date,class4,partlybad,CO2168.mean,CO2168.std,CO2336.mean,CO2336.std,CO242.mean,CO242.std,CO2504.mean,...,T672.mean,T672.std,T84.mean,T84.std,UV_A.mean,UV_A.std,UV_B.mean,UV_B.std,CS.mean,CS.std
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
459,,,False,376.610169,0.526617,376.660339,0.500203,376.934655,0.564360,376.634746,...,-2.851967,0.156421,-2.356066,0.174219,2.374336,1.318965,0.040709,0.024647,0.000644,0.000119
460,,,False,390.624932,0.453585,390.580694,0.455308,391.000685,0.505836,390.487945,...,-18.646384,0.448865,-18.030984,0.478829,3.850439,2.056031,0.103446,0.060432,0.003707,0.000115
461,,,False,375.280258,1.249087,375.479806,1.241525,375.624129,1.423506,375.571474,...,3.485822,1.526998,4.649689,1.978654,15.498463,11.001410,0.557332,0.494026,0.001637,0.000428
462,,,False,382.642176,3.222805,382.890412,3.026140,383.136941,3.466259,382.937706,...,6.933127,3.737176,7.657725,3.823123,16.909366,13.924594,0.646806,0.602040,0.002480,0.000510
463,,,False,381.492971,4.386929,381.608000,4.333558,382.177784,4.491875,381.588857,...,11.941411,2.195680,12.570801,2.458619,7.596930,6.722838,0.275559,0.312076,0.009429,0.004438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1419,,,False,387.945654,8.594281,387.195208,6.395089,390.564398,12.503105,386.868542,...,13.846229,1.908796,14.671475,2.278146,14.888953,13.891135,0.724499,0.741484,0.002978,0.000775
1420,,,False,381.204386,0.172730,381.193333,0.174513,381.289123,0.174594,381.174737,...,-5.539621,0.189930,-5.018333,0.192347,1.208696,0.588515,0.022685,0.011805,0.002506,0.000101
1421,,,False,383.915986,2.407693,383.944965,2.349077,384.052183,2.456562,383.862958,...,6.970141,2.270736,7.698230,2.916700,16.280512,10.828638,0.572551,0.464658,0.006474,0.000857
1422,,,False,379.521641,1.199564,379.525194,1.147027,379.778906,1.257693,379.473203,...,1.643628,2.549993,2.697919,2.916053,15.014020,8.972121,0.546366,0.416863,0.000829,0.000227


In [29]:
df = pd.read_csv('npf_test_hidden.csv', index_col='id')
df=df.drop(columns=['date', 'class4', 'partlybad'])

In [30]:
df

Unnamed: 0_level_0,CO2168.mean,CO2168.std,CO2336.mean,CO2336.std,CO242.mean,CO242.std,CO2504.mean,CO2504.std,Glob.mean,Glob.std,...,T672.mean,T672.std,T84.mean,T84.std,UV_A.mean,UV_A.std,UV_B.mean,UV_B.std,CS.mean,CS.std
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
459,376.610169,0.526617,376.660339,0.500203,376.934655,0.564360,376.634746,0.471236,37.115592,24.180794,...,-2.851967,0.156421,-2.356066,0.174219,2.374336,1.318965,0.040709,0.024647,0.000644,0.000119
460,390.624932,0.453585,390.580694,0.455308,391.000685,0.505836,390.487945,0.481292,69.134531,59.895057,...,-18.646384,0.448865,-18.030984,0.478829,3.850439,2.056031,0.103446,0.060432,0.003707,0.000115
461,375.280258,1.249087,375.479806,1.241525,375.624129,1.423506,375.571474,1.200556,276.485371,201.722672,...,3.485822,1.526998,4.649689,1.978654,15.498463,11.001410,0.557332,0.494026,0.001637,0.000428
462,382.642176,3.222805,382.890412,3.026140,383.136941,3.466259,382.937706,2.837540,308.904304,287.444652,...,6.933127,3.737176,7.657725,3.823123,16.909366,13.924594,0.646806,0.602040,0.002480,0.000510
463,381.492971,4.386929,381.608000,4.333558,382.177784,4.491875,381.588857,4.263299,111.456879,104.807009,...,11.941411,2.195680,12.570801,2.458619,7.596930,6.722838,0.275559,0.312076,0.009429,0.004438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1419,387.945654,8.594281,387.195208,6.395089,390.564398,12.503105,386.868542,5.561891,225.755653,245.560692,...,13.846229,1.908796,14.671475,2.278146,14.888953,13.891135,0.724499,0.741484,0.002978,0.000775
1420,381.204386,0.172730,381.193333,0.174513,381.289123,0.174594,381.174737,0.179355,12.633440,6.528276,...,-5.539621,0.189930,-5.018333,0.192347,1.208696,0.588515,0.022685,0.011805,0.002506,0.000101
1421,383.915986,2.407693,383.944965,2.349077,384.052183,2.456562,383.862958,2.212343,312.786155,206.675746,...,6.970141,2.270736,7.698230,2.916700,16.280512,10.828638,0.572551,0.464658,0.006474,0.000857
1422,379.521641,1.199564,379.525194,1.147027,379.778906,1.257693,379.473203,1.078942,315.506008,166.933151,...,1.643628,2.549993,2.697919,2.916053,15.014020,8.972121,0.546366,0.416863,0.000829,0.000227
