In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.colors as mcolors
from matplotlib.patches import Patch
from pathlib import Path
from multiprocessing.dummy import Pool as ThreadPool
from collections import defaultdict
from natsort import natsorted
import tsfresh as tf
import sklearn

In [21]:
pd.set_option('max_columns', None)

### Data loading

In [22]:
import os
import glob

dataset_path = 'data_feats_90_c/'
csv_files = glob.glob(dataset_path+'*.csv')

all_df = []

for filename in csv_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    all_df.append(df)

df = pd.concat(all_df, axis=0, ignore_index=True)
df.head(1)

Unnamed: 0.1,Unnamed: 0,P-PDG__mean,P-PDG__variance,P-PDG__skewness,P-PDG__kurtosis,"P-PDG__fft_aggregated__aggtype_""centroid""","P-PDG__fft_aggregated__aggtype_""variance""","P-PDG__fft_aggregated__aggtype_""skew""","P-PDG__fft_aggregated__aggtype_""kurtosis""",P-PDG__maximum,P-PDG__minimum,P-PDG__median,P-PDG__quantile__q_0.1,P-PDG__quantile__q_0.2,P-PDG__quantile__q_0.3,P-PDG__quantile__q_0.4,P-PDG__quantile__q_0.6,P-PDG__quantile__q_0.7,P-PDG__quantile__q_0.8,P-PDG__quantile__q_0.9,P-PDG__variation_coefficient,P-PDG__mean_change,P-PDG__mean_second_derivative_central,P-PDG__friedrich_coefficients__coeff_1__m_3__r_30,P-PDG__friedrich_coefficients__coeff_3__m_3__r_30,P-TPT__mean,P-TPT__variance,P-TPT__skewness,P-TPT__kurtosis,"P-TPT__fft_aggregated__aggtype_""centroid""","P-TPT__fft_aggregated__aggtype_""variance""","P-TPT__fft_aggregated__aggtype_""skew""","P-TPT__fft_aggregated__aggtype_""kurtosis""",P-TPT__maximum,P-TPT__minimum,P-TPT__median,P-TPT__quantile__q_0.1,P-TPT__quantile__q_0.2,P-TPT__quantile__q_0.3,P-TPT__quantile__q_0.4,P-TPT__quantile__q_0.6,P-TPT__quantile__q_0.7,P-TPT__quantile__q_0.8,P-TPT__quantile__q_0.9,P-TPT__variation_coefficient,P-TPT__mean_change,P-TPT__mean_second_derivative_central,P-TPT__friedrich_coefficients__coeff_1__m_3__r_30,P-TPT__friedrich_coefficients__coeff_3__m_3__r_30,T-TPT__mean,T-TPT__variance,T-TPT__skewness,T-TPT__kurtosis,"T-TPT__fft_aggregated__aggtype_""centroid""","T-TPT__fft_aggregated__aggtype_""variance""","T-TPT__fft_aggregated__aggtype_""skew""","T-TPT__fft_aggregated__aggtype_""kurtosis""",T-TPT__maximum,T-TPT__minimum,T-TPT__median,T-TPT__quantile__q_0.1,T-TPT__quantile__q_0.2,T-TPT__quantile__q_0.3,T-TPT__quantile__q_0.4,T-TPT__quantile__q_0.6,T-TPT__quantile__q_0.7,T-TPT__quantile__q_0.8,T-TPT__quantile__q_0.9,T-TPT__variation_coefficient,T-TPT__mean_change,T-TPT__mean_second_derivative_central,T-TPT__friedrich_coefficients__coeff_1__m_3__r_30,T-TPT__friedrich_coefficients__coeff_3__m_3__r_30,P-MON-CKP__mean,P-MON-CKP__variance,P-MON-CKP__skewness,P-MON-CKP__kurtosis,"P-MON-CKP__fft_aggregated__aggtype_""centroid""","P-MON-CKP__fft_aggregated__aggtype_""variance""","P-MON-CKP__fft_aggregated__aggtype_""skew""","P-MON-CKP__fft_aggregated__aggtype_""kurtosis""",P-MON-CKP__maximum,P-MON-CKP__minimum,P-MON-CKP__median,P-MON-CKP__quantile__q_0.1,P-MON-CKP__quantile__q_0.2,P-MON-CKP__quantile__q_0.3,P-MON-CKP__quantile__q_0.4,P-MON-CKP__quantile__q_0.6,P-MON-CKP__quantile__q_0.7,P-MON-CKP__quantile__q_0.8,P-MON-CKP__quantile__q_0.9,P-MON-CKP__variation_coefficient,P-MON-CKP__mean_change,P-MON-CKP__mean_second_derivative_central,P-MON-CKP__friedrich_coefficients__coeff_1__m_3__r_30,P-MON-CKP__friedrich_coefficients__coeff_3__m_3__r_30,T-JUS-CKP__mean,T-JUS-CKP__variance,T-JUS-CKP__skewness,T-JUS-CKP__kurtosis,"T-JUS-CKP__fft_aggregated__aggtype_""centroid""","T-JUS-CKP__fft_aggregated__aggtype_""variance""","T-JUS-CKP__fft_aggregated__aggtype_""skew""","T-JUS-CKP__fft_aggregated__aggtype_""kurtosis""",T-JUS-CKP__maximum,T-JUS-CKP__minimum,T-JUS-CKP__median,T-JUS-CKP__quantile__q_0.1,T-JUS-CKP__quantile__q_0.2,T-JUS-CKP__quantile__q_0.3,T-JUS-CKP__quantile__q_0.4,T-JUS-CKP__quantile__q_0.6,T-JUS-CKP__quantile__q_0.7,T-JUS-CKP__quantile__q_0.8,T-JUS-CKP__quantile__q_0.9,T-JUS-CKP__variation_coefficient,T-JUS-CKP__mean_change,T-JUS-CKP__mean_second_derivative_central,T-JUS-CKP__friedrich_coefficients__coeff_1__m_3__r_30,T-JUS-CKP__friedrich_coefficients__coeff_3__m_3__r_30,QGL__mean,QGL__variance,QGL__skewness,QGL__kurtosis,"QGL__fft_aggregated__aggtype_""centroid""","QGL__fft_aggregated__aggtype_""variance""","QGL__fft_aggregated__aggtype_""skew""","QGL__fft_aggregated__aggtype_""kurtosis""",QGL__maximum,QGL__minimum,QGL__median,QGL__quantile__q_0.1,QGL__quantile__q_0.2,QGL__quantile__q_0.3,QGL__quantile__q_0.4,QGL__quantile__q_0.6,QGL__quantile__q_0.7,QGL__quantile__q_0.8,QGL__quantile__q_0.9,QGL__variation_coefficient,QGL__mean_change,QGL__mean_second_derivative_central,QGL__friedrich_coefficients__coeff_1__m_3__r_30,QGL__friedrich_coefficients__coeff_3__m_3__r_30,class_code
0,33000,30452520.0,39.064321,-0.029904,-1.707532,1.4e-05,0.000532,,,30452532.0,30452512.0,30452523.0,30452514.0,30452515.0,30452516.0,30452518.0,30452526.0,30452527.3,30452528.0,30452529.0,2.052423e-07,-0.168539,0.0,,,21453280.0,12899.168025,0.19382,-0.722604,0.000525,0.015802,,,21453566.0,21453039.0,21453275.0,21453131.0,21453169.0,21453199.7,21453229.6,21453331.0,21453348.3,21453372.2,21453425.2,5e-06,3.022472,1.579545,-7.3e-05,33407980000.0,125.35756,7.086469e-09,0.054108,-0.473179,4.3e-05,0.001215,,,125.35778,125.35736,125.35756,125.35745,125.35748,125.357517,125.357536,125.35758,125.35761,125.357632,125.357661,6.715285e-07,-7.865169e-07,6.25e-07,,,1519597.66,3.915511,-0.008192,-1.022432,3.9e-05,0.000959,,,1519601.0,1519594.0,1519597.7,1519595.0,1519595.98,1519596.37,1519597.0,1519598.0,1519599.0,1519599.82,1519600.31,1e-06,-0.078652,0.0,,,92.644159,2e-06,0.059343,-1.201477,0.000423,0.010458,,,92.646534,92.641924,92.64413,92.642336,92.64277,92.643215,92.64367,92.644594,92.645066,92.645547,92.646035,1.5e-05,-5.2e-05,6.25e-08,-0.116989,1004.144079,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,,,5


In [23]:
df.drop(['Unnamed: 0'], axis='columns', inplace=True)      #leftover index column, unnecessary

df = df[df.class_code != 7]                                #paper does not use event class 7
df['class_code'].replace({8: 7}, inplace=True)


### Handling NA values

In [24]:
def print_na_sum(df):
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        nans = df.isna().sum(axis=0)
        print(nans[nans!=0])

print_na_sum(df)

P-PDG__fft_aggregated__aggtype_"centroid"                 8571
P-PDG__fft_aggregated__aggtype_"variance"                 8571
P-PDG__fft_aggregated__aggtype_"skew"                    37244
P-PDG__fft_aggregated__aggtype_"kurtosis"                37244
P-PDG__variation_coefficient                              8571
P-PDG__friedrich_coefficients__coeff_1__m_3__r_30        20675
P-PDG__friedrich_coefficients__coeff_3__m_3__r_30        20675
P-TPT__fft_aggregated__aggtype_"centroid"                   64
P-TPT__fft_aggregated__aggtype_"variance"                   64
P-TPT__fft_aggregated__aggtype_"skew"                    30450
P-TPT__fft_aggregated__aggtype_"kurtosis"                30450
P-TPT__variation_coefficient                                64
P-TPT__friedrich_coefficients__coeff_1__m_3__r_30        12623
P-TPT__friedrich_coefficients__coeff_3__m_3__r_30        12623
T-TPT__fft_aggregated__aggtype_"centroid"                 6514
T-TPT__fft_aggregated__aggtype_"variance"              


Removing features with too many NA values

In [25]:
df.drop(['QGL__variation_coefficient'], axis='columns', inplace=True)
df.drop(list(df.filter(regex = 'friedrich')), axis = 1, inplace = True)
df.drop(list(df.filter(regex = 'fft_aggregated')), axis = 1, inplace = True)

In [26]:
print_na_sum(df)

P-PDG__variation_coefficient        8571
P-TPT__variation_coefficient          64
T-TPT__variation_coefficient        6514
P-MON-CKP__variation_coefficient    1200
T-JUS-CKP__variation_coefficient    1819
dtype: int64


Imputing leftover na values, for each class separately

In [27]:
imputed_df_list = []
for i in df['class_code'].unique():
    query = 'class_code == ' + str(i)
    imputed = tf.utilities.dataframe_functions.impute(df.query(query))
    imputed_df_list.append(imputed)
imputed_df = pd.concat(imputed_df_list)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._where(cond, other, inplace, axis, level, errors=errors)


In [28]:
print_na_sum(imputed_df)

Series([], dtype: int64)


In [29]:
imputed_df = imputed_df.sample(frac=1).reset_index(drop=True)
sample_df = imputed_df#.sample(frac=0.01, random_state=42)
X = sample_df.iloc[:,:-1].to_numpy()
y = sample_df.iloc[:,-1].to_numpy()
sample_df.shape

(51764, 108)

In [30]:
X

array([[ 2.61855458e+07,  3.45023947e+08, -6.56738512e-02, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 2.55766993e+07,  2.46943281e+05,  1.07613392e-01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [ 2.86204675e+07,  1.35131556e+03,  1.58111161e-01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 2.43568481e+07,  1.70793446e+02, -2.98088327e-01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 3.36992529e+07,  2.53033840e+03, -1.40385069e-01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

In [31]:
y

array([1, 0, 0, ..., 5, 6, 7])

In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [33]:
from sklearn.model_selection import KFold   

#todo: include window size into grid search

window_size_hp = [300, 600, 900]
regularization_hp = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]

In [34]:
from sklearn.model_selection import KFold   
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

kf = KFold(n_splits=5)

### Logistic regression, SVM

In [35]:
from sklearn.linear_model import SGDClassifier

for clf in ['log', 'hinge']:
    if clf == 'log': 
        print('Logistic regression')
    if clf == 'hinge':
        print('Support vector machine')
    for r in regularization_hp:
        i = 0
        for train_index, test_index in kf.split(X_train):
            X_k_train, X_k_test = X_train[train_index], X_train[test_index]
            y_k_train, y_k_test = y_train[train_index], y_train[test_index]
            scaler = StandardScaler().fit(X_k_train)
            X_k_train = scaler.transform(X_k_train)
            X_k_test = scaler.transform(X_k_test)
            log_reg = SGDClassifier(loss='log', max_iter=10000, alpha=r).fit(X_k_train, y_k_train)
            h = log_reg.predict(X_k_test)
            X_test_scaled = scaler.transform(X_test)
            h_test = log_reg.predict(X_test_scaled)
            print('r='+str(r)+', i-th fold='+str(i)+', k-fold f1 score='+str(f1_score(y_k_test, h, average='macro'))+
                 ' , test f1 score=' + str(f1_score(y_test, h_test, average='macro')))
            i += 1
    print(confusion_matrix(y_test, h_test, normalize='true'))

Logistic regression
r=1e-07, i-th fold=0, k-fold f1 score=0.8678378229006349 , test f1 score=0.8582454936726966
r=1e-07, i-th fold=1, k-fold f1 score=0.759098292973039 , test f1 score=0.755358856651479
r=1e-07, i-th fold=2, k-fold f1 score=0.8214627267999598 , test f1 score=0.8214611143150967
r=1e-07, i-th fold=3, k-fold f1 score=0.8687652602167217 , test f1 score=0.8638113734955224
r=1e-07, i-th fold=4, k-fold f1 score=0.8473367523302722 , test f1 score=0.8443225854131544
r=1e-06, i-th fold=0, k-fold f1 score=0.85774363727482 , test f1 score=0.8459651254640032
r=1e-06, i-th fold=1, k-fold f1 score=0.7912519956390462 , test f1 score=0.7878135134110996
r=1e-06, i-th fold=2, k-fold f1 score=0.827149138630988 , test f1 score=0.8238380013534354
r=1e-06, i-th fold=3, k-fold f1 score=0.8575907955249746 , test f1 score=0.8463105285132797
r=1e-06, i-th fold=4, k-fold f1 score=0.8306391167423167 , test f1 score=0.8230884234725191
r=1e-05, i-th fold=0, k-fold f1 score=0.8697184764058766 , test f

### Decision trees

In [36]:
from sklearn.tree import DecisionTreeClassifier

complexity_hp = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
for ccp in complexity_hp:
    i = 0
    for train_index, test_index in kf.split(X_train):
        X_k_train, X_k_test = X_train[train_index], X_train[test_index]
        y_k_train, y_k_test = y_train[train_index], y_train[test_index]
        scaler = StandardScaler().fit(X_k_train)
        X_k_train = scaler.transform(X_k_train)
        X_k_test = scaler.transform(X_k_test)
        tree = DecisionTreeClassifier(random_state=42, ccp_alpha=ccp).fit(X_k_train, y_k_train)
        h = tree.predict(X_k_test)
        X_test_scaled = scaler.transform(X_test)
        h_test = tree.predict(X_test_scaled)
        print('ccp='+str(ccp)+', i-th fold='+str(i)+', k-fold f1 score='+str(f1_score(y_k_test, h, average='macro'))+
            ' , test f1 score=' + str(f1_score(y_test, h_test, average='macro')))
        i += 1
        
confusion_matrix(y_test, h_test, normalize='true')

ccp=0.1, i-th fold=0, k-fold f1 score=0.41646678260000447 , test f1 score=0.4178123694895498
ccp=0.1, i-th fold=1, k-fold f1 score=0.41917845895316624 , test f1 score=0.4178446632863148
ccp=0.1, i-th fold=2, k-fold f1 score=0.4183771396317813 , test f1 score=0.4172179517400608
ccp=0.1, i-th fold=3, k-fold f1 score=0.4167903619114125 , test f1 score=0.4172157919320737
ccp=0.1, i-th fold=4, k-fold f1 score=0.42878883474010304 , test f1 score=0.4261656446745463
ccp=0.01, i-th fold=0, k-fold f1 score=0.9165913962981282 , test f1 score=0.9081017210213513
ccp=0.01, i-th fold=1, k-fold f1 score=0.8919620211705876 , test f1 score=0.8834413796504774
ccp=0.01, i-th fold=2, k-fold f1 score=0.7631186709693819 , test f1 score=0.7649840529378703
ccp=0.01, i-th fold=3, k-fold f1 score=0.8839292874267094 , test f1 score=0.8829128482850073
ccp=0.01, i-th fold=4, k-fold f1 score=0.8936981395758925 , test f1 score=0.8840228458730064
ccp=0.001, i-th fold=0, k-fold f1 score=0.966298315053256 , test f1 scor

array([[9.97159987e-01, 0.00000000e+00, 6.31113916e-04, 0.00000000e+00,
        1.89334175e-03, 3.15556958e-04, 0.00000000e+00, 0.00000000e+00],
       [3.66703337e-04, 9.97066373e-01, 3.66703337e-04, 2.20022002e-03,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.50000000e-02, 0.00000000e+00, 9.10000000e-01, 1.00000000e-02,
        0.00000000e+00, 1.50000000e-02, 0.00000000e+00, 5.00000000e-02],
       [0.00000000e+00, 2.47371676e-03, 2.47371676e-03, 9.80828695e-01,
        6.18429190e-03, 8.03957947e-03, 0.00000000e+00, 0.00000000e+00],
       [2.22804718e-02, 0.00000000e+00, 0.00000000e+00, 1.17955439e-02,
        9.64613368e-01, 1.31061599e-03, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.29200092e-03,
        0.00000000e+00, 9.97707999e-01, 0.00000000e+00, 0.00000000e+00],
       [5.19750520e-04, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 9.99480249e-01, 0.

### Random forest

In [41]:
from sklearn.ensemble import RandomForestClassifier

n_trees_hp = [50, 100, 150, 175]
max_depth_hp = [5, 7, 10, None]
n_feats_at_splits_hp = [5, 10, 15]   #pitati za komentar o ovom hiperparametru

for n_trees in n_trees_hp:
    for max_depth in max_depth_hp:
        for n_feats_at_splits in n_feats_at_splits_hp:
            i = 0
            for train_index, test_index in kf.split(X_train):
                X_k_train, X_k_test = X_train[train_index], X_train[test_index]
                y_k_train, y_k_test = y_train[train_index], y_train[test_index]
                scaler = StandardScaler().fit(X_k_train)
                X_k_train = scaler.transform(X_k_train)
                X_k_test = scaler.transform(X_k_test)
                trees = RandomForestClassifier(random_state=42, n_estimators=n_trees,
                                              max_depth=max_depth, min_samples_split=n_feats_at_splits)
                trees.fit(X_k_train, y_k_train)
                h = trees.predict(X_k_test)
                X_test_scaled = scaler.transform(X_test)
                h_test = trees.predict(X_test_scaled)
                print('n_feats_at_split=' + str(n_feats_at_splits) +', n_trees=' + str(n_trees) + 
                'max depth:'+ str(max_depth) + ', i-th fold='+str(i)+', k-fold f1 score='+
                str(f1_score(y_k_test, h, average='macro'))+
                ' , test f1 score=',str(f1_score(y_test, h_test, average='macro')))
                i += 1
                
confusion_matrix(y_test, h_test, normalize='true')

n_feats_at_split=5, n_trees=50max depth:5, i-th fold=0, k-fold f1 score=0.8491531891672546 , test f1 score= 0.8351855550278393
n_feats_at_split=5, n_trees=50max depth:5, i-th fold=1, k-fold f1 score=0.8766968655355323 , test f1 score= 0.8691373792944275
n_feats_at_split=5, n_trees=50max depth:5, i-th fold=2, k-fold f1 score=0.8566524942964346 , test f1 score= 0.8365403474412018
n_feats_at_split=5, n_trees=50max depth:5, i-th fold=3, k-fold f1 score=0.9159757542866027 , test f1 score= 0.9134607677531563
n_feats_at_split=5, n_trees=50max depth:5, i-th fold=4, k-fold f1 score=0.8348489516164773 , test f1 score= 0.8308757873517293
n_feats_at_split=10, n_trees=50max depth:5, i-th fold=0, k-fold f1 score=0.8552478419311007 , test f1 score= 0.8429991750974793
n_feats_at_split=10, n_trees=50max depth:5, i-th fold=1, k-fold f1 score=0.8454101678020765 , test f1 score= 0.8351047126813532
n_feats_at_split=10, n_trees=50max depth:5, i-th fold=2, k-fold f1 score=0.8532034157493635 , test f1 score= 

KeyboardInterrupt: 

### Linear Discriminant Analysis, Quadratic Discriminant Analysis

In [38]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
    
print('LDA')
i = 0
for train_index, test_index in kf.split(X_train):
    X_k_train, X_k_test = X_train[train_index], X_train[test_index]
    y_k_train, y_k_test = y_train[train_index], y_train[test_index]
    scaler = StandardScaler().fit(X_k_train)
    X_k_train = scaler.transform(X_k_train)
    X_k_test = scaler.transform(X_k_test)
    clf = LinearDiscriminantAnalysis().fit(X_k_train, y_k_train)
    h = clf.predict(X_k_test)
    X_test_scaled = scaler.transform(X_test)
    h_test = clf.predict(X_test_scaled)
    print('i-th fold='+str(i)+', k-fold f1 score='+str(f1_score(y_k_test, h, average='macro'))+
         ' , test f1 score=' + str(f1_score(y_test, h_test, average='macro')))
    i += 1
    
print(confusion_matrix(y_test, h_test, normalize='true'))

print('\nQDA')
i = 0
for train_index, test_index in kf.split(X_train):
    X_k_train, X_k_test = X_train[train_index], X_train[test_index]
    y_k_train, y_k_test = y_train[train_index], y_train[test_index]
    scaler = StandardScaler().fit(X_k_train)
    X_k_train = scaler.transform(X_k_train)
    X_k_test = scaler.transform(X_k_test)
    clf = QuadraticDiscriminantAnalysis().fit(X_k_train, y_k_train)
    h = clf.predict(X_k_test)
    X_test_scaled = scaler.transform(X_test)
    h_test = clf.predict(X_test_scaled)
    print('i-th fold='+str(i)+', k-fold f1 score='+str(f1_score(y_k_test, h, average='macro'))+
         ' , test f1 score=' + str(f1_score(y_test, h_test, average='macro')))
    i += 1
confusion_matrix(y_test, h_test, normalize='true')

LDA
i-th fold=0, k-fold f1 score=0.7787275927106323 , test f1 score=0.759634682972167
i-th fold=1, k-fold f1 score=0.7742538745786314 , test f1 score=0.7596077133937903
i-th fold=2, k-fold f1 score=0.7812390628347605 , test f1 score=0.7596838828028907
i-th fold=3, k-fold f1 score=0.7621340867399615 , test f1 score=0.7590816097564002
i-th fold=4, k-fold f1 score=0.7703531954565328 , test f1 score=0.760842225233435
[[8.12559167e-01 2.20889871e-03 0.00000000e+00 0.00000000e+00
  1.57778479e-03 1.83654150e-01 0.00000000e+00 0.00000000e+00]
 [3.66703337e-04 9.76530986e-01 0.00000000e+00 7.33406674e-04
  0.00000000e+00 2.23689036e-02 0.00000000e+00 0.00000000e+00]
 [9.00000000e-02 0.00000000e+00 1.85000000e-01 1.00000000e-02
  0.00000000e+00 1.10000000e-01 5.05000000e-01 1.00000000e-01]
 [4.32900433e-03 3.03030303e-02 4.94743352e-03 7.60049474e-01
  3.58688930e-02 1.63265306e-01 6.18429190e-04 6.18429190e-04]
 [2.49017038e-02 3.40760157e-02 0.00000000e+00 2.22804718e-02
  8.04718218e-01 1.14



i-th fold=0, k-fold f1 score=0.5759776279207434 , test f1 score=0.5795339106771454




i-th fold=1, k-fold f1 score=0.5479597158786123 , test f1 score=0.5342029913344952




i-th fold=2, k-fold f1 score=0.5763918818697709 , test f1 score=0.5743412195841953




i-th fold=3, k-fold f1 score=0.5341410440956926 , test f1 score=0.5364611238497836




i-th fold=4, k-fold f1 score=0.5486902391121956 , test f1 score=0.5495422343240938


array([[5.57273588e-01, 3.18396971e-01, 1.19280530e-01, 2.20889871e-03,
        2.84001262e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.46681335e-03, 9.88265493e-01, 3.66703337e-03, 6.23395673e-03,
        0.00000000e+00, 3.66703337e-04, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.15000000e-01, 8.15000000e-01, 5.00000000e-02,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.00000000e-02],
       [0.00000000e+00, 1.64502165e-01, 4.14347557e-02, 7.67470625e-01,
        5.56586271e-03, 2.10265925e-02, 0.00000000e+00, 0.00000000e+00],
       [3.49934469e-01, 6.94626474e-02, 0.00000000e+00, 1.11402359e-01,
        4.69200524e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.07724043e-02, 8.46435939e-01, 6.71556269e-02, 3.69012148e-02,
        2.29200092e-04, 3.82764153e-02, 0.00000000e+00, 2.29200092e-04],
       [1.03950104e-03, 1.24740125e-02, 1.03950104e-03, 2.07900208e-03,
        0.00000000e+00, 0.00000000e+00, 9.83367983e-01, 0.

In [43]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

n_trees_hp = [250, 400, 100, 550]
max_depth_hp = [3, 1, 5]
lr_hp = [0.1] #001 01 1

for max_depth in max_depth_hp:
    for n_trees in n_trees_hp:
        for lr in lr_hp:
            i = 0
            for train_index, test_index in kf.split(X_train):
                X_k_train, X_k_test = X_train[train_index], X_train[test_index]
                y_k_train, y_k_test = y_train[train_index], y_train[test_index]
                scaler = StandardScaler().fit(X_k_train)
                X_k_train = scaler.transform(X_k_train)
                X_k_test = scaler.transform(X_k_test)
                trees = AdaBoostClassifier(random_state=42, n_estimators=n_trees,
                                            base_estimator=DecisionTreeClassifier(max_depth=max_depth))
                trees.fit(X_k_train, y_k_train)
                h = trees.predict(X_k_test)
                X_test_scaled = scaler.transform(X_test)
                h_test = trees.predict(X_test_scaled)
                print('n_trees=' + str(n_trees) +', max_depth=' + str(max_depth) + 
                      ', lr=' + str(lr) + ', i-th fold='+str(i)+', k-fold f1 score='+
                      str(f1_score(y_k_test, h, average='macro'))+
                    ' , test f1 score=',str(f1_score(y_test, h_test, average='macro')))
                i += 1

n_trees=250, max_depth=3, lr=0.1, i-th fold=0, k-fold f1 score=0.9419676028804602 , test f1 score= 0.9386927718626493
n_trees=250, max_depth=3, lr=0.1, i-th fold=1, k-fold f1 score=0.954091299302045 , test f1 score= 0.9466078828214144
n_trees=250, max_depth=3, lr=0.1, i-th fold=2, k-fold f1 score=0.9526030867182935 , test f1 score= 0.9570322124331697
n_trees=250, max_depth=3, lr=0.1, i-th fold=3, k-fold f1 score=0.955484499299679 , test f1 score= 0.9505746860347879
n_trees=250, max_depth=3, lr=0.1, i-th fold=4, k-fold f1 score=0.938136652237717 , test f1 score= 0.9343262286646508


KeyboardInterrupt: 

In [None]:
n_trees=100, max_depth=3, lr=0.01, i-th fold=0, k-fold f1 score=0.9098299746638029 , test f1 score= 0.9090440173723331
n_trees=100, max_depth=3, lr=0.01, i-th fold=1, k-fold f1 score=0.9200426525551122 , test f1 score= 0.9158788102994806
n_trees=100, max_depth=3, lr=0.01, i-th fold=2, k-fold f1 score=0.9509625266945874 , test f1 score= 0.9491826272735017
n_trees=100, max_depth=3, lr=0.01, i-th fold=3, k-fold f1 score=0.9285494129988294 , test f1 score= 0.9252001147201362
n_trees=100, max_depth=3, lr=0.01, i-th fold=4, k-fold f1 score=0.8813602357765438 , test f1 score= 0.8791765773337183
n_trees=100, max_depth=3, lr=0.1, i-th fold=0, k-fold f1 score=0.9098299746638029 , test f1 score= 0.9090440173723331
n_trees=100, max_depth=3, lr=0.1, i-th fold=1, k-fold f1 score=0.9200426525551122 , test f1 score= 0.9158788102994806