In [1]:
import warnings
warnings.filterwarnings("ignore")
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#plt.style.use('fivethirtyeight')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score,f1_score,roc_curve, auc,roc_auc_score,precision_score,recall_score,matthews_corrcoef


from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB


from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif,f_classif
np.set_printoptions(precision=3)


In [2]:
def feature_ranking_selection(X_train, y_train, n_features):
    '''
    Rank the columns of X_train by the absolute Pearson correlation with the
    label and pick the best n_features of them.

    X_train: pandas DataFrame of training features (Pearson correlation is
             computed on every column, so they are expected to be numeric)
    y_train: training labels, one per row of X_train (Series, array or list);
             matched to the rows by position, its own index is ignored
    n_features: number of feature to select for training

    returns: (ranking, selected_feature)
             ranking          - DataFrame with columns 'feature' and 'importance',
                                sorted by importance in descending order
             selected_feature - list with the names of the top n_features features

    If n_features is larger than the number of available features a message is
    printed and every feature is selected, so the return value can always be
    unpacked into two names.
    '''

    # Re-index the labels on X_train's index so the correlation is row-by-row
    # even when y_train carries a different index (or none at all).
    target = pd.Series(np.asarray(y_train), index=X_train.index)

    # Correlate each feature with the label only; this avoids building the full
    # feature-by-feature matrix and never adds a helper column that could
    # clash with an existing feature name such as 'label'.
    importance = X_train.corrwith(target, method='pearson').abs()

    ranking = pd.DataFrame({
        'feature': list(importance.index),
        'importance': importance.values,
    })
    ranking = ranking.sort_values(by='importance', ascending=False).reset_index(drop=True)

    if n_features > len(ranking):
        print('Number features to select is too large.')
        n_features = len(ranking)

    selected_feature = ranking['feature'].iloc[:n_features].tolist()
    return ranking, selected_feature

# 1. CLR 202

In [3]:
# Load dataset as pandas data frame
filename = 'CLR_both_202.csv'
dataset = pd.read_csv(filename)

# Split data into input and output variable: the last column is the label,
# every other column is a feature.
X = dataset.iloc[:, :-1]
Y = dataset.iloc[:, -1]

# A single split serves both experiments. The "B" (before feature selection)
# frames keep every column; the "A" (after) frames start as independent copies
# so the filtering below can never touch the "B" data.
X_trainB, X_testB, y_trainB, y_testB = train_test_split(X, Y, test_size=0.3, random_state=100)
X_trainA, X_testA = X_trainB.copy(), X_testB.copy()
y_trainA, y_testA = y_trainB.copy(), y_testB.copy()

# Removing Constant features (filter is fitted on the training split only)
constant_filter = VarianceThreshold()
constant_filter.fit(X_trainA)
constant_columns = list(X_trainA.columns[~constant_filter.get_support()])
X_trainA = X_trainA.drop(columns=constant_columns)
X_testA = X_testA.drop(columns=constant_columns)

# Removing Quasi-Constant features (variance threshold 0.01)
qconstant_filter = VarianceThreshold(threshold=0.01)
qconstant_filter.fit(X_trainA)
qconstant_columns = list(X_trainA.columns[~qconstant_filter.get_support()])
X_trainA = X_trainA.drop(columns=qconstant_columns)
X_testA = X_testA.drop(columns=qconstant_columns)

# Removing Correlated Features: a column is dropped when its absolute Pearson
# correlation with ANY earlier column exceeds 0.4. Masking everything except
# the strict upper triangle leaves, for each column, only the rows of the
# columns that precede it.
correlation_matrix = X_trainA.corr(method='pearson')
above_diagonal = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
earlier_corr = correlation_matrix.abs().where(above_diagonal)
is_correlated = (earlier_corr > 0.4).any(axis=0)
correlated_features = set(is_correlated[is_correlated].index)
X_trainA = X_trainA.drop(columns=list(correlated_features))
X_testA = X_testA.drop(columns=list(correlated_features))

# feature ranking and selection
# Never ask for more features than survived the filters, so the two-value
# unpacking below always receives a (ranking, selected) pair.
n_selected = min(15, X_trainA.shape[1])
ranking_info, selected_features = feature_ranking_selection(X_trainA, y_trainA, n_selected)
print('Fature Ranking information')
print('---------------------------------------------------------------------------------------------')
print(ranking_info)
print(list(ranking_info['feature']))
print('---------------------------------------------------------------------------------------------')
X_trainA = X_trainA[selected_features]
X_testA = X_testA[selected_features]

# Only the decision tree is evaluated here; the KNeighborsClassifier(5, n_jobs=-1)
# and GaussianNB() entries of the wider comparison are switched off.
names = ["Decision Tree"]

# Model fitted on the full feature set ("before" feature selection).
classifiers = [DecisionTreeClassifier(random_state=100)]

# Model fitted on the selected features ("after"); note it is also limited in
# depth and minimum leaf size, unlike the "before" model.
classifier2 = [DecisionTreeClassifier(max_depth=5, min_samples_leaf=50, random_state=100)]

# The evaluation loop appends each fitted estimator to these lists.
clf_bef, clf_aft = [], []

for name, clf, dlf in zip(names, classifiers, classifier2):
    # clf: model trained on all features ("before" feature selection)
    # dlf: model trained on the selected features ("after" feature selection)

    # Time only the fit calls. perf_counter is a monotonic high-resolution
    # clock intended for measuring short durations.
    startB = time.perf_counter()
    clf.fit(X_trainB, y_trainB)
    endB = time.perf_counter()
    clf_bef.append(clf)

    startA = time.perf_counter()
    dlf.fit(X_trainA, y_trainA)
    endA = time.perf_counter()
    clf_aft.append(dlf)

    print('\t\t\t\tClassifier:', name.upper())
    print('---------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------')

    print('\t\tBefore Feature Selection\tAfter Feature Selection')
    print('No. of features:\t', X_trainB.shape[1], '\t\t\t', X_trainA.shape[1])

    # training accuracy (in percent)
    train_predB = clf.predict(X_trainB)
    train_predA = dlf.predict(X_trainA)
    train_accB = round(accuracy_score(y_trainB, train_predB) * 100, 2)
    train_accA = round(accuracy_score(y_trainA, train_predA) * 100, 2)
    print('Train Accuracy:\t\t', train_accB, '\t\t\t', train_accA)

    # test accuracy (in percent)
    test_predB = clf.predict(X_testB)
    test_predA = dlf.predict(X_testA)
    test_accB = round(accuracy_score(y_testB, test_predB) * 100, 2)
    test_accA = round(accuracy_score(y_testA, test_predA) * 100, 2)
    print('Test Accuracy:\t\t', test_accB, '\t\t\t', test_accA)

    # roc_auc_score
    # AUC needs a ranking score, not the thresholded class: with hard 0/1
    # predictions the ROC curve has a single operating point. Use the predicted
    # probability of the second class (classes_[1]) instead.
    # Assumes a binary label, as the default-averaged f1/precision/recall
    # calls below already do.
    test_probB = clf.predict_proba(X_testB)[:, 1]
    test_probA = dlf.predict_proba(X_testA)[:, 1]
    test_roc_aucB = round(roc_auc_score(y_testB, test_probB), 2)
    test_roc_aucA = round(roc_auc_score(y_testA, test_probA), 2)
    print('ROC AUC score:\t\t', test_roc_aucB, '\t\t\t', test_roc_aucA)

    # f1 score
    test_f1B = round(f1_score(y_testB, test_predB), 2)
    test_f1A = round(f1_score(y_testA, test_predA), 2)
    print('f1_score:\t\t', test_f1B, '\t\t\t', test_f1A)

    # precision
    test_precB = round(precision_score(y_testB, test_predB), 2)
    test_precA = round(precision_score(y_testA, test_predA), 2)
    print('Precision:\t\t', test_precB, '\t\t\t', test_precA)

    # recall
    test_recallB = round(recall_score(y_testB, test_predB), 2)
    test_recallA = round(recall_score(y_testA, test_predA), 2)
    print('Recall:\t\t\t', test_recallB, '\t\t\t', test_recallA)

    # Matthews correlation coefficient
    test_MCCB = round(matthews_corrcoef(y_testB, test_predB), 2)
    test_MCCA = round(matthews_corrcoef(y_testA, test_predA), 2)
    print('MCC:\t\t\t', test_MCCB, '\t\t\t', test_MCCA)

    # training time
    timeB = round(endB - startB, 2)
    timeA = round(endA - startA, 2)
    print('Train Time (in seconds):', timeB, '\t\t\t', timeA)

    # confusion matrix
    cm_resultB = confusion_matrix(y_testB, test_predB)
    cm_resultA = confusion_matrix(y_testA, test_predA)
    print('Confusion Matrix(Before):\n', cm_resultB)
    print('Confusion Matrix(after):\n', cm_resultA)

    print('---------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------')
    print('\n')

Fature Ranking information
---------------------------------------------------------------------------------------------
              feature  importance
0            t_std_Gx    0.373362
1   t_Crest_factor_Gy    0.313880
2   t_Crest_factor_Gz    0.275701
3           t_mean_Az    0.225354
4       t_kurtosis_Ay    0.198155
5       t_kurtosis_Ax    0.171519
6       t_kurtosis_Az    0.159185
7           t_mean_Ay    0.156218
8             f_Q1_Ay    0.134174
9       t_skewness_Ay    0.105127
10      t_kurtosis_Gy    0.101014
11          t_mean_Ax    0.063295
12      t_skewness_Gx    0.059554
13  t_Crest_factor_Az    0.056868
14          t_mean_Gy    0.051155
15            f_Q1_Ax    0.048915
16          t_mean_Gx    0.032505
17      t_skewness_Az    0.029633
18      t_skewness_Ax    0.023745
19      t_kurtosis_Gz    0.023675
20      t_skewness_Gy    0.020575
21      f_variance_Gx    0.014517
22      f_variance_Gy    0.012160
23      f_variance_Gz    0.009442
24          t_mean_Gz    0.00

# 2. C 202

In [3]:
# Load dataset as pandas data frame
filename = "centre_both.csv"
dataset = pd.read_csv(filename)

# Split data into input and output variable: the last column is the label,
# every other column is a feature.
X = dataset.iloc[:, :-1]
Y = dataset.iloc[:, -1]

# A single split serves both experiments. The "B" (before feature selection)
# frames keep every column; the "A" (after) frames start as independent copies
# so the filtering below can never touch the "B" data.
X_trainB, X_testB, y_trainB, y_testB = train_test_split(X, Y, test_size=0.3, random_state=100)
X_trainA, X_testA = X_trainB.copy(), X_testB.copy()
y_trainA, y_testA = y_trainB.copy(), y_testB.copy()

# Removing Constant features (filter is fitted on the training split only)
constant_filter = VarianceThreshold()
constant_filter.fit(X_trainA)
constant_columns = list(X_trainA.columns[~constant_filter.get_support()])
X_trainA = X_trainA.drop(columns=constant_columns)
X_testA = X_testA.drop(columns=constant_columns)

# Removing Quasi-Constant features (variance threshold 0.01)
qconstant_filter = VarianceThreshold(threshold=0.01)
qconstant_filter.fit(X_trainA)
qconstant_columns = list(X_trainA.columns[~qconstant_filter.get_support()])
X_trainA = X_trainA.drop(columns=qconstant_columns)
X_testA = X_testA.drop(columns=qconstant_columns)

# Removing Correlated Features: a column is dropped when its absolute Pearson
# correlation with ANY earlier column exceeds 0.4. Masking everything except
# the strict upper triangle leaves, for each column, only the rows of the
# columns that precede it.
correlation_matrix = X_trainA.corr(method='pearson')
above_diagonal = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
earlier_corr = correlation_matrix.abs().where(above_diagonal)
is_correlated = (earlier_corr > 0.4).any(axis=0)
correlated_features = set(is_correlated[is_correlated].index)
X_trainA = X_trainA.drop(columns=list(correlated_features))
X_testA = X_testA.drop(columns=list(correlated_features))

# feature ranking and selection
# Never ask for more features than survived the filters, so the two-value
# unpacking below always receives a (ranking, selected) pair.
n_selected = min(15, X_trainA.shape[1])
ranking_info, selected_features = feature_ranking_selection(X_trainA, y_trainA, n_selected)
print('Fature Ranking information')
print('---------------------------------------------------------------------------------------------')
print(ranking_info)
print(list(ranking_info['feature']))
print('---------------------------------------------------------------------------------------------')
X_trainA = X_trainA[selected_features]
X_testA = X_testA[selected_features]

# Only the decision tree is evaluated here; the KNeighborsClassifier(5, n_jobs=-1)
# and GaussianNB() entries of the wider comparison are switched off.
names = ["Decision Tree"]

# Model fitted on the full feature set ("before" feature selection).
classifiers = [DecisionTreeClassifier(random_state=100)]

# Model fitted on the selected features ("after"); note it is also limited in
# depth and minimum leaf size, unlike the "before" model.
classifier2 = [DecisionTreeClassifier(max_depth=5, min_samples_leaf=50, random_state=100)]

# The evaluation loop appends each fitted estimator to these lists.
clf_bef, clf_aft = [], []

for name, clf, dlf in zip(names, classifiers, classifier2):
    # clf: model trained on all features ("before" feature selection)
    # dlf: model trained on the selected features ("after" feature selection)

    # Time only the fit calls. perf_counter is a monotonic high-resolution
    # clock intended for measuring short durations.
    startB = time.perf_counter()
    clf.fit(X_trainB, y_trainB)
    endB = time.perf_counter()
    clf_bef.append(clf)

    startA = time.perf_counter()
    dlf.fit(X_trainA, y_trainA)
    endA = time.perf_counter()
    clf_aft.append(dlf)

    print('\t\t\t\tClassifier:', name.upper())
    print('---------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------')

    print('\t\tBefore Feature Selection\tAfter Feature Selection')
    print('No. of features:\t', X_trainB.shape[1], '\t\t\t', X_trainA.shape[1])

    # training accuracy (in percent)
    train_predB = clf.predict(X_trainB)
    train_predA = dlf.predict(X_trainA)
    train_accB = round(accuracy_score(y_trainB, train_predB) * 100, 2)
    train_accA = round(accuracy_score(y_trainA, train_predA) * 100, 2)
    print('Train Accuracy:\t\t', train_accB, '\t\t\t', train_accA)

    # test accuracy (in percent)
    test_predB = clf.predict(X_testB)
    test_predA = dlf.predict(X_testA)
    test_accB = round(accuracy_score(y_testB, test_predB) * 100, 2)
    test_accA = round(accuracy_score(y_testA, test_predA) * 100, 2)
    print('Test Accuracy:\t\t', test_accB, '\t\t\t', test_accA)

    # roc_auc_score
    # AUC needs a ranking score, not the thresholded class: with hard 0/1
    # predictions the ROC curve has a single operating point. Use the predicted
    # probability of the second class (classes_[1]) instead.
    # Assumes a binary label, as the default-averaged f1/precision/recall
    # calls below already do.
    test_probB = clf.predict_proba(X_testB)[:, 1]
    test_probA = dlf.predict_proba(X_testA)[:, 1]
    test_roc_aucB = round(roc_auc_score(y_testB, test_probB), 2)
    test_roc_aucA = round(roc_auc_score(y_testA, test_probA), 2)
    print('ROC AUC score:\t\t', test_roc_aucB, '\t\t\t', test_roc_aucA)

    # f1 score
    test_f1B = round(f1_score(y_testB, test_predB), 2)
    test_f1A = round(f1_score(y_testA, test_predA), 2)
    print('f1_score:\t\t', test_f1B, '\t\t\t', test_f1A)

    # precision
    test_precB = round(precision_score(y_testB, test_predB), 2)
    test_precA = round(precision_score(y_testA, test_predA), 2)
    print('Precision:\t\t', test_precB, '\t\t\t', test_precA)

    # recall
    test_recallB = round(recall_score(y_testB, test_predB), 2)
    test_recallA = round(recall_score(y_testA, test_predA), 2)
    print('Recall:\t\t\t', test_recallB, '\t\t\t', test_recallA)

    # Matthews correlation coefficient
    test_MCCB = round(matthews_corrcoef(y_testB, test_predB), 2)
    test_MCCA = round(matthews_corrcoef(y_testA, test_predA), 2)
    print('MCC:\t\t\t', test_MCCB, '\t\t\t', test_MCCA)

    # training time
    timeB = round(endB - startB, 2)
    timeA = round(endA - startA, 2)
    print('Train Time (in seconds):', timeB, '\t\t\t', timeA)

    # confusion matrix
    cm_resultB = confusion_matrix(y_testB, test_predB)
    cm_resultA = confusion_matrix(y_testA, test_predA)
    print('Confusion Matrix(Before):\n', cm_resultB)
    print('Confusion Matrix(after):\n', cm_resultA)

    print('---------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------')
    print('\n')

Fature Ranking information
---------------------------------------------------------------------------------------------
              feature  importance
0           t_mean_Ay    0.892635
1   t_Crest_factor_Gz    0.366428
2   t_Crest_factor_Gy    0.330982
3           t_mean_Az    0.265255
4       t_kurtosis_Az    0.199405
5       t_skewness_Ay    0.196483
6       t_skewness_Gx    0.134665
7       t_kurtosis_Gz    0.111864
8       f_variance_Gx    0.111046
9       t_kurtosis_Ax    0.093200
10      t_kurtosis_Gx    0.081011
11          t_mean_Gy    0.078394
12           f_IQR_Az    0.077315
13      t_kurtosis_Gy    0.070696
14      t_skewness_Gy    0.043170
15      t_skewness_Az    0.032906
16      t_skewness_Ax    0.013089
17      t_skewness_Gz    0.010383
18      f_variance_Gy    0.009470
19  t_Crest_factor_Az    0.008228
20          t_mean_Gx    0.004855
21          t_mean_Gz    0.004785
22      t_variance_Ax    0.003958
['t_mean_Ay', 't_Crest_factor_Gz', 't_Crest_factor_Gy', 't_mean

# 3. L 202

In [5]:
# Load dataset as pandas data frame
filename = 'left_both.csv'
dataset = pd.read_csv(filename)

# Split data into input and output variable: the last column is the label,
# every other column is a feature.
X = dataset.iloc[:, :-1]
Y = dataset.iloc[:, -1]

# A single split serves both experiments. The "B" (before feature selection)
# frames keep every column; the "A" (after) frames start as independent copies
# so the filtering below can never touch the "B" data.
X_trainB, X_testB, y_trainB, y_testB = train_test_split(X, Y, test_size=0.3, random_state=100)
X_trainA, X_testA = X_trainB.copy(), X_testB.copy()
y_trainA, y_testA = y_trainB.copy(), y_testB.copy()

# Removing Constant features (filter is fitted on the training split only)
constant_filter = VarianceThreshold()
constant_filter.fit(X_trainA)
constant_columns = list(X_trainA.columns[~constant_filter.get_support()])
X_trainA = X_trainA.drop(columns=constant_columns)
X_testA = X_testA.drop(columns=constant_columns)

# Removing Quasi-Constant features (variance threshold 0.01)
qconstant_filter = VarianceThreshold(threshold=0.01)
qconstant_filter.fit(X_trainA)
qconstant_columns = list(X_trainA.columns[~qconstant_filter.get_support()])
X_trainA = X_trainA.drop(columns=qconstant_columns)
X_testA = X_testA.drop(columns=qconstant_columns)

# Removing Correlated Features: a column is dropped when its absolute Pearson
# correlation with ANY earlier column exceeds 0.4. Masking everything except
# the strict upper triangle leaves, for each column, only the rows of the
# columns that precede it.
correlation_matrix = X_trainA.corr(method='pearson')
above_diagonal = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
earlier_corr = correlation_matrix.abs().where(above_diagonal)
is_correlated = (earlier_corr > 0.4).any(axis=0)
correlated_features = set(is_correlated[is_correlated].index)
X_trainA = X_trainA.drop(columns=list(correlated_features))
X_testA = X_testA.drop(columns=list(correlated_features))

# feature ranking and selection
# Never ask for more features than survived the filters, so the two-value
# unpacking below always receives a (ranking, selected) pair.
n_selected = min(15, X_trainA.shape[1])
ranking_info, selected_features = feature_ranking_selection(X_trainA, y_trainA, n_selected)
print('Fature Ranking information')
print('---------------------------------------------------------------------------------------------')
print(ranking_info)
print(list(ranking_info['feature']))
print('---------------------------------------------------------------------------------------------')
X_trainA = X_trainA[selected_features]
X_testA = X_testA[selected_features]

# Only the decision tree is evaluated here; the KNeighborsClassifier(5, n_jobs=-1)
# and GaussianNB() entries of the wider comparison are switched off.
names = ["Decision Tree"]

# Model fitted on the full feature set ("before" feature selection).
classifiers = [DecisionTreeClassifier(random_state=100)]

# Model fitted on the selected features ("after"); note it is also limited in
# depth and minimum leaf size, unlike the "before" model.
classifier2 = [DecisionTreeClassifier(max_depth=5, min_samples_leaf=50, random_state=100)]

# The evaluation loop appends each fitted estimator to these lists.
clf_bef, clf_aft = [], []

for name, clf, dlf in zip(names, classifiers, classifier2):
    # clf: model trained on all features ("before" feature selection)
    # dlf: model trained on the selected features ("after" feature selection)

    # Time only the fit calls. perf_counter is a monotonic high-resolution
    # clock intended for measuring short durations.
    startB = time.perf_counter()
    clf.fit(X_trainB, y_trainB)
    endB = time.perf_counter()
    clf_bef.append(clf)

    startA = time.perf_counter()
    dlf.fit(X_trainA, y_trainA)
    endA = time.perf_counter()
    clf_aft.append(dlf)

    print('\t\t\t\tClassifier:', name.upper())
    print('---------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------')

    print('\t\tBefore Feature Selection\tAfter Feature Selection')
    print('No. of features:\t', X_trainB.shape[1], '\t\t\t', X_trainA.shape[1])

    # training accuracy (in percent)
    train_predB = clf.predict(X_trainB)
    train_predA = dlf.predict(X_trainA)
    train_accB = round(accuracy_score(y_trainB, train_predB) * 100, 2)
    train_accA = round(accuracy_score(y_trainA, train_predA) * 100, 2)
    print('Train Accuracy:\t\t', train_accB, '\t\t\t', train_accA)

    # test accuracy (in percent)
    test_predB = clf.predict(X_testB)
    test_predA = dlf.predict(X_testA)
    test_accB = round(accuracy_score(y_testB, test_predB) * 100, 2)
    test_accA = round(accuracy_score(y_testA, test_predA) * 100, 2)
    print('Test Accuracy:\t\t', test_accB, '\t\t\t', test_accA)

    # roc_auc_score
    # AUC needs a ranking score, not the thresholded class: with hard 0/1
    # predictions the ROC curve has a single operating point. Use the predicted
    # probability of the second class (classes_[1]) instead.
    # Assumes a binary label, as the default-averaged f1/precision/recall
    # calls below already do.
    test_probB = clf.predict_proba(X_testB)[:, 1]
    test_probA = dlf.predict_proba(X_testA)[:, 1]
    test_roc_aucB = round(roc_auc_score(y_testB, test_probB), 2)
    test_roc_aucA = round(roc_auc_score(y_testA, test_probA), 2)
    print('ROC AUC score:\t\t', test_roc_aucB, '\t\t\t', test_roc_aucA)

    # f1 score
    test_f1B = round(f1_score(y_testB, test_predB), 2)
    test_f1A = round(f1_score(y_testA, test_predA), 2)
    print('f1_score:\t\t', test_f1B, '\t\t\t', test_f1A)

    # precision
    test_precB = round(precision_score(y_testB, test_predB), 2)
    test_precA = round(precision_score(y_testA, test_predA), 2)
    print('Precision:\t\t', test_precB, '\t\t\t', test_precA)

    # recall
    test_recallB = round(recall_score(y_testB, test_predB), 2)
    test_recallA = round(recall_score(y_testA, test_predA), 2)
    print('Recall:\t\t\t', test_recallB, '\t\t\t', test_recallA)

    # Matthews correlation coefficient
    test_MCCB = round(matthews_corrcoef(y_testB, test_predB), 2)
    test_MCCA = round(matthews_corrcoef(y_testA, test_predA), 2)
    print('MCC:\t\t\t', test_MCCB, '\t\t\t', test_MCCA)

    # training time
    timeB = round(endB - startB, 2)
    timeA = round(endA - startA, 2)
    print('Train Time (in seconds):', timeB, '\t\t\t', timeA)

    # confusion matrix
    cm_resultB = confusion_matrix(y_testB, test_predB)
    cm_resultA = confusion_matrix(y_testA, test_predA)
    print('Confusion Matrix(Before):\n', cm_resultB)
    print('Confusion Matrix(after):\n', cm_resultA)

    print('---------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------')
    print('\n')

Fature Ranking information
---------------------------------------------------------------------------------------------
              feature  importance
0            t_std_Gx    0.374021
1   t_Crest_factor_Gy    0.300990
2   t_Crest_factor_Gz    0.209767
3       t_kurtosis_Ax    0.201654
4           t_mean_Az    0.191012
5       t_kurtosis_Ay    0.151359
6       t_kurtosis_Az    0.139311
7           t_mean_Ax    0.119069
8       t_skewness_Gz    0.114926
9       t_kurtosis_Gy    0.108051
10          t_mean_Gz    0.099613
11      t_skewness_Az    0.083611
12      t_kurtosis_Gx    0.044731
13            f_Q1_Ax    0.037596
14      t_kurtosis_Gz    0.036529
15            f_Q1_Ay    0.032553
16          t_mean_Gy    0.030143
17          t_mean_Gx    0.029421
18          t_mean_Ay    0.024629
19      f_variance_Gx    0.015039
20      t_skewness_Gy    0.011618
21      t_skewness_Gx    0.007027
22      f_variance_Gy    0.003401
23      t_skewness_Ax    0.002070
['t_std_Gx', 't_Crest_factor_

# 4. R 202

In [6]:
# Load dataset as pandas data frame
filename = 'right_both.csv'
dataset = pd.read_csv(filename)

# Split data into input and output variable: the last column is the label,
# every other column is a feature.
X = dataset.iloc[:, :-1]
Y = dataset.iloc[:, -1]

# A single split serves both experiments. The "B" (before feature selection)
# frames keep every column; the "A" (after) frames start as independent copies
# so the filtering below can never touch the "B" data.
X_trainB, X_testB, y_trainB, y_testB = train_test_split(X, Y, test_size=0.3, random_state=100)
X_trainA, X_testA = X_trainB.copy(), X_testB.copy()
y_trainA, y_testA = y_trainB.copy(), y_testB.copy()

# Removing Constant features (filter is fitted on the training split only)
constant_filter = VarianceThreshold()
constant_filter.fit(X_trainA)
constant_columns = list(X_trainA.columns[~constant_filter.get_support()])
X_trainA = X_trainA.drop(columns=constant_columns)
X_testA = X_testA.drop(columns=constant_columns)

# Removing Quasi-Constant features (variance threshold 0.01)
qconstant_filter = VarianceThreshold(threshold=0.01)
qconstant_filter.fit(X_trainA)
qconstant_columns = list(X_trainA.columns[~qconstant_filter.get_support()])
X_trainA = X_trainA.drop(columns=qconstant_columns)
X_testA = X_testA.drop(columns=qconstant_columns)

# Removing Correlated Features: a column is dropped when its absolute Pearson
# correlation with ANY earlier column exceeds 0.4. Masking everything except
# the strict upper triangle leaves, for each column, only the rows of the
# columns that precede it.
correlation_matrix = X_trainA.corr(method='pearson')
above_diagonal = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
earlier_corr = correlation_matrix.abs().where(above_diagonal)
is_correlated = (earlier_corr > 0.4).any(axis=0)
correlated_features = set(is_correlated[is_correlated].index)
X_trainA = X_trainA.drop(columns=list(correlated_features))
X_testA = X_testA.drop(columns=list(correlated_features))

# feature ranking and selection
# Never ask for more features than survived the filters, so the two-value
# unpacking below always receives a (ranking, selected) pair.
n_selected = min(15, X_trainA.shape[1])
ranking_info, selected_features = feature_ranking_selection(X_trainA, y_trainA, n_selected)
print('Fature Ranking information')
print('---------------------------------------------------------------------------------------------')
print(ranking_info)
print(list(ranking_info['feature']))
print('---------------------------------------------------------------------------------------------')
X_trainA = X_trainA[selected_features]
X_testA = X_testA[selected_features]

# Only the decision tree is evaluated here; the KNeighborsClassifier(5, n_jobs=-1)
# and GaussianNB() entries of the wider comparison are switched off.
names = ["Decision Tree"]

# Model fitted on the full feature set ("before" feature selection).
classifiers = [DecisionTreeClassifier(random_state=100)]

# Model fitted on the selected features ("after"); note it is also limited in
# depth and minimum leaf size, unlike the "before" model.
classifier2 = [DecisionTreeClassifier(max_depth=5, min_samples_leaf=50, random_state=100)]

# The evaluation loop appends each fitted estimator to these lists.
clf_bef, clf_aft = [], []

for name, clf, dlf in zip(names, classifiers, classifier2):
    # clf: model trained on all features ("before" feature selection)
    # dlf: model trained on the selected features ("after" feature selection)

    # Time only the fit calls. perf_counter is a monotonic high-resolution
    # clock intended for measuring short durations.
    startB = time.perf_counter()
    clf.fit(X_trainB, y_trainB)
    endB = time.perf_counter()
    clf_bef.append(clf)

    startA = time.perf_counter()
    dlf.fit(X_trainA, y_trainA)
    endA = time.perf_counter()
    clf_aft.append(dlf)

    print('\t\t\t\tClassifier:', name.upper())
    print('---------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------')

    print('\t\tBefore Feature Selection\tAfter Feature Selection')
    print('No. of features:\t', X_trainB.shape[1], '\t\t\t', X_trainA.shape[1])

    # training accuracy (in percent)
    train_predB = clf.predict(X_trainB)
    train_predA = dlf.predict(X_trainA)
    train_accB = round(accuracy_score(y_trainB, train_predB) * 100, 2)
    train_accA = round(accuracy_score(y_trainA, train_predA) * 100, 2)
    print('Train Accuracy:\t\t', train_accB, '\t\t\t', train_accA)

    # test accuracy (in percent)
    test_predB = clf.predict(X_testB)
    test_predA = dlf.predict(X_testA)
    test_accB = round(accuracy_score(y_testB, test_predB) * 100, 2)
    test_accA = round(accuracy_score(y_testA, test_predA) * 100, 2)
    print('Test Accuracy:\t\t', test_accB, '\t\t\t', test_accA)

    # roc_auc_score
    # AUC needs a ranking score, not the thresholded class: with hard 0/1
    # predictions the ROC curve has a single operating point. Use the predicted
    # probability of the second class (classes_[1]) instead.
    # Assumes a binary label, as the default-averaged f1/precision/recall
    # calls below already do.
    test_probB = clf.predict_proba(X_testB)[:, 1]
    test_probA = dlf.predict_proba(X_testA)[:, 1]
    test_roc_aucB = round(roc_auc_score(y_testB, test_probB), 2)
    test_roc_aucA = round(roc_auc_score(y_testA, test_probA), 2)
    print('ROC AUC score:\t\t', test_roc_aucB, '\t\t\t', test_roc_aucA)

    # f1 score
    test_f1B = round(f1_score(y_testB, test_predB), 2)
    test_f1A = round(f1_score(y_testA, test_predA), 2)
    print('f1_score:\t\t', test_f1B, '\t\t\t', test_f1A)

    # precision
    test_precB = round(precision_score(y_testB, test_predB), 2)
    test_precA = round(precision_score(y_testA, test_predA), 2)
    print('Precision:\t\t', test_precB, '\t\t\t', test_precA)

    # recall
    test_recallB = round(recall_score(y_testB, test_predB), 2)
    test_recallA = round(recall_score(y_testA, test_predA), 2)
    print('Recall:\t\t\t', test_recallB, '\t\t\t', test_recallA)

    # Matthews correlation coefficient
    test_MCCB = round(matthews_corrcoef(y_testB, test_predB), 2)
    test_MCCA = round(matthews_corrcoef(y_testA, test_predA), 2)
    print('MCC:\t\t\t', test_MCCB, '\t\t\t', test_MCCA)

    # training time
    timeB = round(endB - startB, 2)
    timeA = round(endA - startA, 2)
    print('Train Time (in seconds):', timeB, '\t\t\t', timeA)

    # confusion matrix
    cm_resultB = confusion_matrix(y_testB, test_predB)
    cm_resultA = confusion_matrix(y_testA, test_predA)
    print('Confusion Matrix(Before):\n', cm_resultB)
    print('Confusion Matrix(after):\n', cm_resultA)

    print('---------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------')
    print('\n')

Fature Ranking information
---------------------------------------------------------------------------------------------
              feature  importance
0           t_mean_Ay    0.681472
1            t_std_Gx    0.382075
2   t_Crest_factor_Gy    0.313831
3           t_mean_Ax    0.306845
4   t_Crest_factor_Gz    0.225561
5       t_kurtosis_Ax    0.213614
6           t_mean_Az    0.211492
7             f_Q1_Ay    0.167735
8       t_kurtosis_Az    0.145834
9       t_skewness_Gz    0.118359
10      t_kurtosis_Gy    0.115324
11          t_mean_Gz    0.104830
12      t_skewness_Ax    0.080060
13      f_variance_Gz    0.074364
14      t_skewness_Ay    0.073285
15      f_variance_Gx    0.051697
16            f_Q1_Ax    0.048555
17          t_mean_Gx    0.046560
18      t_skewness_Gx    0.046246
19          t_mean_Gy    0.038768
20      t_kurtosis_Gx    0.036651
21      f_variance_Gy    0.036031
22      t_kurtosis_Gz    0.025429
23      t_skewness_Az    0.019017
24      t_skewness_Gy    0.00

# 5. CL 202

In [7]:
# Load dataset as pandas data frame
filename = 'CL_both_202.csv'
dataset = pd.read_csv(filename)

# Split data into input and output variable: the last column is the label,
# every other column is a feature.
X = dataset.iloc[:, :-1]
Y = dataset.iloc[:, -1]

# A single split serves both experiments. The "B" (before feature selection)
# frames keep every column; the "A" (after) frames start as independent copies
# so the filtering below can never touch the "B" data.
X_trainB, X_testB, y_trainB, y_testB = train_test_split(X, Y, test_size=0.3, random_state=100)
X_trainA, X_testA = X_trainB.copy(), X_testB.copy()
y_trainA, y_testA = y_trainB.copy(), y_testB.copy()

# Removing Constant features (filter is fitted on the training split only)
constant_filter = VarianceThreshold()
constant_filter.fit(X_trainA)
constant_columns = list(X_trainA.columns[~constant_filter.get_support()])
X_trainA = X_trainA.drop(columns=constant_columns)
X_testA = X_testA.drop(columns=constant_columns)

# Removing Quasi-Constant features (variance threshold 0.01)
qconstant_filter = VarianceThreshold(threshold=0.01)
qconstant_filter.fit(X_trainA)
qconstant_columns = list(X_trainA.columns[~qconstant_filter.get_support()])
X_trainA = X_trainA.drop(columns=qconstant_columns)
X_testA = X_testA.drop(columns=qconstant_columns)

# Removing Correlated Features: a column is dropped when its absolute Pearson
# correlation with ANY earlier column exceeds 0.4. Masking everything except
# the strict upper triangle leaves, for each column, only the rows of the
# columns that precede it.
correlation_matrix = X_trainA.corr(method='pearson')
above_diagonal = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
earlier_corr = correlation_matrix.abs().where(above_diagonal)
is_correlated = (earlier_corr > 0.4).any(axis=0)
correlated_features = set(is_correlated[is_correlated].index)
X_trainA = X_trainA.drop(columns=list(correlated_features))
X_testA = X_testA.drop(columns=list(correlated_features))

# feature ranking and selection
# Never ask for more features than survived the filters, so the two-value
# unpacking below always receives a (ranking, selected) pair.
n_selected = min(15, X_trainA.shape[1])
ranking_info, selected_features = feature_ranking_selection(X_trainA, y_trainA, n_selected)
print('Fature Ranking information')
print('---------------------------------------------------------------------------------------------')
print(ranking_info)
print(list(ranking_info['feature']))
print('---------------------------------------------------------------------------------------------')
X_trainA = X_trainA[selected_features]
X_testA = X_testA[selected_features]

#names = ["Nearest Neighbors","Decision Tree","Naive Bayes"]
names = ["Decision Tree"]

# "Before" models: trained on every column.
classifiers = [
    #KNeighborsClassifier(5, n_jobs= -1 ),
    DecisionTreeClassifier(random_state = 100)
    #GaussianNB(),
   ]
# "After" models: trained on the selected columns.
# NOTE(review): this tree is also limited by max_depth / min_samples_leaf, so
# the two result columns differ in hyper-parameters as well as in features.
classifier2= [
    #KNeighborsClassifier(5, n_jobs= -1 ),
    DecisionTreeClassifier(max_depth = 5, min_samples_leaf = 50,random_state = 100)
    #GaussianNB(),
   ]
# Fitted estimators are collected here so they remain available after the loop.
clf_bef = list()
clf_aft = list()

for name, clf, dlf in zip(names, classifiers, classifier2):
    # clf: model for "before" feature selection, dlf: model for "after".

    # Before Feature Selection
    # perf_counter is a monotonic clock intended for measuring intervals.
    startB = time.perf_counter()
    clf.fit(X_trainB, y_trainB)
    endB = time.perf_counter()
    clf_bef.append(clf)

    # After Feature Selection
    startA = time.perf_counter()
    dlf.fit(X_trainA, y_trainA)
    endA = time.perf_counter()
    clf_aft.append(dlf)

    print('\t\t\t\tClassifier:',name.upper())
    print('---------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------')

    print('\t\tBefore Feature Selection\tAfter Feature Selection')
    print('No. of features:\t', X_trainB.shape[1],'\t\t\t',X_trainA.shape[1])

    # training accuracy
    train_predB = clf.predict(X_trainB)
    train_predA = dlf.predict(X_trainA)
    train_accB = round(accuracy_score(y_trainB, train_predB)*100, 2)
    train_accA = round(accuracy_score(y_trainA, train_predA)*100, 2)
    print('Train Accuracy:\t\t',train_accB,'\t\t\t',train_accA)

    # test accuracy
    test_predB = clf.predict(X_testB)
    test_predA = dlf.predict(X_testA)
    test_accB = round(accuracy_score(y_testB, test_predB)*100, 2)
    test_accA = round(accuracy_score(y_testA, test_predA)*100, 2)
    print('Test Accuracy:\t\t',test_accB,'\t\t\t',test_accA)

    # roc_auc_score
    # AUC is computed from the predicted probability of the positive class
    # (second column of predict_proba); hard labels would reduce the ROC
    # curve to a single operating point.
    test_scoreB = clf.predict_proba(X_testB)[:, 1]
    test_scoreA = dlf.predict_proba(X_testA)[:, 1]
    test_roc_aucB = round(roc_auc_score(y_testB, test_scoreB), 2)
    test_roc_aucA = round(roc_auc_score(y_testA, test_scoreA), 2)
    print('ROC AUC score:\t\t',test_roc_aucB,'\t\t\t',test_roc_aucA)

    # f1 score
    test_f1B = round(f1_score(y_testB, test_predB), 2)
    test_f1A = round(f1_score(y_testA, test_predA), 2)
    print('f1_score:\t\t',test_f1B,'\t\t\t',test_f1A)

    # precision
    test_precB = round(precision_score(y_testB, test_predB), 2)
    test_precA = round(precision_score(y_testA, test_predA), 2)
    print('Precision:\t\t',test_precB,'\t\t\t',test_precA)

    # recall
    test_recallB = round(recall_score(y_testB, test_predB), 2)
    test_recallA = round(recall_score(y_testA, test_predA), 2)
    print('Recall:\t\t\t',test_recallB,'\t\t\t',test_recallA)

    # Matthews correlation coefficient
    test_MCCB = round(matthews_corrcoef(y_testB, test_predB), 2)
    test_MCCA = round(matthews_corrcoef(y_testA, test_predA), 2)
    print('MCC:\t\t\t',test_MCCB,'\t\t\t',test_MCCA)

    # training time
    timeB = round(endB - startB, 2)
    timeA = round(endA - startA, 2)
    print('Train Time (in seconds):',timeB,'\t\t\t',timeA)

    # confusion matrix
    cm_resultB = confusion_matrix(y_testB, test_predB)
    cm_resultA = confusion_matrix(y_testA, test_predA)
    print('Confusion Matrix(Before):\n',cm_resultB)
    print('Confusion Matrix(after):\n',cm_resultA)

    print('---------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------')
    print('\n')

Fature Ranking information
---------------------------------------------------------------------------------------------
              feature  importance
0            t_std_Gx    0.364619
1   t_Crest_factor_Gy    0.311454
2   t_Crest_factor_Gz    0.293485
3           t_mean_Az    0.234338
4       t_kurtosis_Ay    0.209613
5       t_kurtosis_Az    0.170695
6       t_kurtosis_Ax    0.143661
7       t_skewness_Ay    0.124805
8             f_Q1_Ay    0.120271
9           t_mean_Ay    0.105026
10      t_kurtosis_Gy    0.091462
11          t_mean_Ax    0.066357
12      t_skewness_Gx    0.058520
13          t_mean_Gy    0.056213
14      t_skewness_Gz    0.053810
15      f_variance_Gx    0.046632
16  t_Crest_factor_Az    0.045637
17      t_kurtosis_Gz    0.040732
18          t_mean_Gz    0.039471
19      t_skewness_Gy    0.028957
20      t_skewness_Az    0.027161
21          t_mean_Gx    0.020222
22      t_kurtosis_Gx    0.018569
23      t_skewness_Ax    0.007808
24      f_variance_Gy    0.00

# 6. CR 202

In [8]:
#Load dataset as pandas data frame
filename = 'CR_both_202.csv'
dataset = pd.read_csv(filename)

# Split data into input and output variable:
# every column except the last is a feature, the last column is the label.
X = dataset.iloc[:, :-1]
Y = dataset.iloc[:, -1]

# "B" frames = before feature selection (all columns kept),
# "A" frames = after feature selection (filtered below).
# The A frames start as copies of the B frames, which is what a second split
# with the same random_state would return.
X_trainB, X_testB, y_trainB, y_testB = train_test_split(X, Y, test_size=0.3, random_state=100)
X_trainA, X_testA = X_trainB.copy(), X_testB.copy()
y_trainA, y_testA = y_trainB.copy(), y_testB.copy()

# Every filter is fitted on the training split only and then applied to both
# splits. Columns are dropped by reassignment rather than inplace=True so the
# frames are never mutated behind another reference.

# Removing Constant features
constant_filter = VarianceThreshold()
constant_filter.fit(X_trainA)
constant_columns = list(X_trainA.columns[~constant_filter.get_support()])
X_trainA = X_trainA.drop(labels=constant_columns, axis=1)
X_testA = X_testA.drop(labels=constant_columns, axis=1)

# Removing Quasi-Constant features (variance threshold 0.01)
qconstant_filter = VarianceThreshold(threshold=0.01)
qconstant_filter.fit(X_trainA)
qconstant_columns = list(X_trainA.columns[~qconstant_filter.get_support()])
X_trainA = X_trainA.drop(labels=qconstant_columns, axis=1)
X_testA = X_testA.drop(labels=qconstant_columns, axis=1)

# Removing Correlated Features
# A column is dropped when its absolute Pearson correlation with any column
# to its left exceeds 0.4; only entries below the diagonal are inspected.
correlation_matrix = X_trainA.corr(method='pearson')
below_diagonal = np.tril(correlation_matrix.abs().values, k=-1)
correlated_features = set(correlation_matrix.columns[(below_diagonal > 0.4).any(axis=1)])
X_trainA = X_trainA.drop(labels=list(correlated_features), axis=1)
X_testA = X_testA.drop(labels=list(correlated_features), axis=1)

# feature ranking and selection
# feature_ranking_selection returns a bare DataFrame (not a tuple) when asked
# for more features than exist, which would break the unpacking below, so the
# request is capped at the number of remaining columns.
n_selected = min(15, X_trainA.shape[1])
ranking_info, selected_features = feature_ranking_selection(X_trainA, y_trainA, n_selected)
print('Fature Ranking information')
print('---------------------------------------------------------------------------------------------')
print(ranking_info)
print(list(ranking_info['feature']))
print('---------------------------------------------------------------------------------------------')
X_trainA = X_trainA[selected_features]
X_testA = X_testA[selected_features]

#names = ["Nearest Neighbors","Decision Tree","Naive Bayes"]
names = ["Decision Tree"]

# "Before" models: trained on every column.
classifiers = [
    #KNeighborsClassifier(5, n_jobs= -1 ),
    DecisionTreeClassifier(random_state = 100)
    #GaussianNB(),
   ]
# "After" models: trained on the selected columns.
# NOTE(review): this tree is also limited by max_depth / min_samples_leaf, so
# the two result columns differ in hyper-parameters as well as in features.
classifier2= [
    #KNeighborsClassifier(5, n_jobs= -1 ),
    DecisionTreeClassifier(max_depth = 5, min_samples_leaf = 50,random_state = 100)
    #GaussianNB(),
   ]
# Fitted estimators are collected here so they remain available after the loop.
clf_bef = list()
clf_aft = list()

for name, clf, dlf in zip(names, classifiers, classifier2):
    # clf: model for "before" feature selection, dlf: model for "after".

    # Before Feature Selection
    # perf_counter is a monotonic clock intended for measuring intervals.
    startB = time.perf_counter()
    clf.fit(X_trainB, y_trainB)
    endB = time.perf_counter()
    clf_bef.append(clf)

    # After Feature Selection
    startA = time.perf_counter()
    dlf.fit(X_trainA, y_trainA)
    endA = time.perf_counter()
    clf_aft.append(dlf)

    print('\t\t\t\tClassifier:',name.upper())
    print('---------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------')

    print('\t\tBefore Feature Selection\tAfter Feature Selection')
    print('No. of features:\t', X_trainB.shape[1],'\t\t\t',X_trainA.shape[1])

    # training accuracy
    train_predB = clf.predict(X_trainB)
    train_predA = dlf.predict(X_trainA)
    train_accB = round(accuracy_score(y_trainB, train_predB)*100, 2)
    train_accA = round(accuracy_score(y_trainA, train_predA)*100, 2)
    print('Train Accuracy:\t\t',train_accB,'\t\t\t',train_accA)

    # test accuracy
    test_predB = clf.predict(X_testB)
    test_predA = dlf.predict(X_testA)
    test_accB = round(accuracy_score(y_testB, test_predB)*100, 2)
    test_accA = round(accuracy_score(y_testA, test_predA)*100, 2)
    print('Test Accuracy:\t\t',test_accB,'\t\t\t',test_accA)

    # roc_auc_score
    # AUC is computed from the predicted probability of the positive class
    # (second column of predict_proba); hard labels would reduce the ROC
    # curve to a single operating point.
    test_scoreB = clf.predict_proba(X_testB)[:, 1]
    test_scoreA = dlf.predict_proba(X_testA)[:, 1]
    test_roc_aucB = round(roc_auc_score(y_testB, test_scoreB), 2)
    test_roc_aucA = round(roc_auc_score(y_testA, test_scoreA), 2)
    print('ROC AUC score:\t\t',test_roc_aucB,'\t\t\t',test_roc_aucA)

    # f1 score
    test_f1B = round(f1_score(y_testB, test_predB), 2)
    test_f1A = round(f1_score(y_testA, test_predA), 2)
    print('f1_score:\t\t',test_f1B,'\t\t\t',test_f1A)

    # precision
    test_precB = round(precision_score(y_testB, test_predB), 2)
    test_precA = round(precision_score(y_testA, test_predA), 2)
    print('Precision:\t\t',test_precB,'\t\t\t',test_precA)

    # recall
    test_recallB = round(recall_score(y_testB, test_predB), 2)
    test_recallA = round(recall_score(y_testA, test_predA), 2)
    print('Recall:\t\t\t',test_recallB,'\t\t\t',test_recallA)

    # Matthews correlation coefficient
    test_MCCB = round(matthews_corrcoef(y_testB, test_predB), 2)
    test_MCCA = round(matthews_corrcoef(y_testA, test_predA), 2)
    print('MCC:\t\t\t',test_MCCB,'\t\t\t',test_MCCA)

    # training time
    timeB = round(endB - startB, 2)
    timeA = round(endA - startA, 2)
    print('Train Time (in seconds):',timeB,'\t\t\t',timeA)

    # confusion matrix
    cm_resultB = confusion_matrix(y_testB, test_predB)
    cm_resultA = confusion_matrix(y_testA, test_predA)
    print('Confusion Matrix(Before):\n',cm_resultB)
    print('Confusion Matrix(after):\n',cm_resultA)

    print('---------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------')
    print('\n')

Fature Ranking information
---------------------------------------------------------------------------------------------
              feature  importance
0           t_mean_Ay    0.757342
1            t_std_Gx    0.365892
2   t_Crest_factor_Gy    0.315936
3   t_Crest_factor_Gz    0.298926
4           t_mean_Az    0.238681
5           t_mean_Ax    0.182010
6       t_kurtosis_Az    0.171911
7             f_Q1_Ay    0.171663
8       t_kurtosis_Ax    0.155536
9       t_skewness_Ay    0.132511
10      t_skewness_Gx    0.091512
11      t_kurtosis_Gy    0.090136
12          t_mean_Gy    0.058150
13      t_skewness_Gz    0.056084
14          t_mean_Gz    0.054459
15      t_kurtosis_Gz    0.050013
16      t_skewness_Ax    0.043072
17  t_Crest_factor_Az    0.041592
18      f_variance_Gz    0.040691
19            f_Q3_Ax    0.039053
20      f_variance_Gx    0.030553
21          t_mean_Gx    0.028659
22      f_variance_Gy    0.027853
23      t_skewness_Gy    0.019397
24      t_kurtosis_Gx    0.01

# 7. LR 202

In [9]:
#Load dataset as pandas data frame
filename = 'LR_both_202.csv'
dataset = pd.read_csv(filename)

# Split data into input and output variable:
# every column except the last is a feature, the last column is the label.
X = dataset.iloc[:, :-1]
Y = dataset.iloc[:, -1]

# "B" frames = before feature selection (all columns kept),
# "A" frames = after feature selection (filtered below).
# The A frames start as copies of the B frames, which is what a second split
# with the same random_state would return.
X_trainB, X_testB, y_trainB, y_testB = train_test_split(X, Y, test_size=0.3, random_state=100)
X_trainA, X_testA = X_trainB.copy(), X_testB.copy()
y_trainA, y_testA = y_trainB.copy(), y_testB.copy()

# Every filter is fitted on the training split only and then applied to both
# splits. Columns are dropped by reassignment rather than inplace=True so the
# frames are never mutated behind another reference.

# Removing Constant features
constant_filter = VarianceThreshold()
constant_filter.fit(X_trainA)
constant_columns = list(X_trainA.columns[~constant_filter.get_support()])
X_trainA = X_trainA.drop(labels=constant_columns, axis=1)
X_testA = X_testA.drop(labels=constant_columns, axis=1)

# Removing Quasi-Constant features (variance threshold 0.01)
qconstant_filter = VarianceThreshold(threshold=0.01)
qconstant_filter.fit(X_trainA)
qconstant_columns = list(X_trainA.columns[~qconstant_filter.get_support()])
X_trainA = X_trainA.drop(labels=qconstant_columns, axis=1)
X_testA = X_testA.drop(labels=qconstant_columns, axis=1)

# Removing Correlated Features
# A column is dropped when its absolute Pearson correlation with any column
# to its left exceeds 0.4; only entries below the diagonal are inspected.
correlation_matrix = X_trainA.corr(method='pearson')
below_diagonal = np.tril(correlation_matrix.abs().values, k=-1)
correlated_features = set(correlation_matrix.columns[(below_diagonal > 0.4).any(axis=1)])
X_trainA = X_trainA.drop(labels=list(correlated_features), axis=1)
X_testA = X_testA.drop(labels=list(correlated_features), axis=1)

# feature ranking and selection
# feature_ranking_selection returns a bare DataFrame (not a tuple) when asked
# for more features than exist, which would break the unpacking below, so the
# request is capped at the number of remaining columns.
n_selected = min(15, X_trainA.shape[1])
ranking_info, selected_features = feature_ranking_selection(X_trainA, y_trainA, n_selected)
print('Fature Ranking information')
print('---------------------------------------------------------------------------------------------')
print(ranking_info)
print(list(ranking_info['feature']))
print('---------------------------------------------------------------------------------------------')
X_trainA = X_trainA[selected_features]
X_testA = X_testA[selected_features]

#names = ["Nearest Neighbors","Decision Tree","Naive Bayes"]
names = ["Decision Tree"]

# "Before" models: trained on every column.
classifiers = [
    #KNeighborsClassifier(5, n_jobs= -1 ),
    DecisionTreeClassifier(random_state = 100)
    #GaussianNB(),
   ]
# "After" models: trained on the selected columns.
# NOTE(review): this tree is also limited by max_depth / min_samples_leaf, so
# the two result columns differ in hyper-parameters as well as in features.
classifier2= [
    #KNeighborsClassifier(5, n_jobs= -1 ),
    DecisionTreeClassifier(max_depth = 5, min_samples_leaf = 50,random_state = 100)
    #GaussianNB(),
   ]
# Fitted estimators are collected here so they remain available after the loop.
clf_bef = list()
clf_aft = list()

for name, clf, dlf in zip(names, classifiers, classifier2):
    # clf: model for "before" feature selection, dlf: model for "after".

    # Before Feature Selection
    # perf_counter is a monotonic clock intended for measuring intervals.
    startB = time.perf_counter()
    clf.fit(X_trainB, y_trainB)
    endB = time.perf_counter()
    clf_bef.append(clf)

    # After Feature Selection
    startA = time.perf_counter()
    dlf.fit(X_trainA, y_trainA)
    endA = time.perf_counter()
    clf_aft.append(dlf)

    print('\t\t\t\tClassifier:',name.upper())
    print('---------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------')

    print('\t\tBefore Feature Selection\tAfter Feature Selection')
    print('No. of features:\t', X_trainB.shape[1],'\t\t\t',X_trainA.shape[1])

    # training accuracy
    train_predB = clf.predict(X_trainB)
    train_predA = dlf.predict(X_trainA)
    train_accB = round(accuracy_score(y_trainB, train_predB)*100, 2)
    train_accA = round(accuracy_score(y_trainA, train_predA)*100, 2)
    print('Train Accuracy:\t\t',train_accB,'\t\t\t',train_accA)

    # test accuracy
    test_predB = clf.predict(X_testB)
    test_predA = dlf.predict(X_testA)
    test_accB = round(accuracy_score(y_testB, test_predB)*100, 2)
    test_accA = round(accuracy_score(y_testA, test_predA)*100, 2)
    print('Test Accuracy:\t\t',test_accB,'\t\t\t',test_accA)

    # roc_auc_score
    # AUC is computed from the predicted probability of the positive class
    # (second column of predict_proba); hard labels would reduce the ROC
    # curve to a single operating point.
    test_scoreB = clf.predict_proba(X_testB)[:, 1]
    test_scoreA = dlf.predict_proba(X_testA)[:, 1]
    test_roc_aucB = round(roc_auc_score(y_testB, test_scoreB), 2)
    test_roc_aucA = round(roc_auc_score(y_testA, test_scoreA), 2)
    print('ROC AUC score:\t\t',test_roc_aucB,'\t\t\t',test_roc_aucA)

    # f1 score
    test_f1B = round(f1_score(y_testB, test_predB), 2)
    test_f1A = round(f1_score(y_testA, test_predA), 2)
    print('f1_score:\t\t',test_f1B,'\t\t\t',test_f1A)

    # precision
    test_precB = round(precision_score(y_testB, test_predB), 2)
    test_precA = round(precision_score(y_testA, test_predA), 2)
    print('Precision:\t\t',test_precB,'\t\t\t',test_precA)

    # recall
    test_recallB = round(recall_score(y_testB, test_predB), 2)
    test_recallA = round(recall_score(y_testA, test_predA), 2)
    print('Recall:\t\t\t',test_recallB,'\t\t\t',test_recallA)

    # Matthews correlation coefficient
    test_MCCB = round(matthews_corrcoef(y_testB, test_predB), 2)
    test_MCCA = round(matthews_corrcoef(y_testA, test_predA), 2)
    print('MCC:\t\t\t',test_MCCB,'\t\t\t',test_MCCA)

    # training time
    timeB = round(endB - startB, 2)
    timeA = round(endA - startA, 2)
    print('Train Time (in seconds):',timeB,'\t\t\t',timeA)

    # confusion matrix
    cm_resultB = confusion_matrix(y_testB, test_predB)
    cm_resultA = confusion_matrix(y_testA, test_predA)
    print('Confusion Matrix(Before):\n',cm_resultB)
    print('Confusion Matrix(after):\n',cm_resultA)

    print('---------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------')
    print('\n')

Fature Ranking information
---------------------------------------------------------------------------------------------
              feature  importance
0            t_std_Gx    0.379937
1   t_Crest_factor_Gy    0.300106
2   t_Crest_factor_Gz    0.221806
3       t_kurtosis_Ax    0.212675
4           t_mean_Az    0.205662
5       t_kurtosis_Ay    0.167969
6       t_kurtosis_Az    0.140898
7       t_kurtosis_Gy    0.115344
8           t_mean_Ay    0.110007
9             f_Q1_Ay    0.100507
10          t_mean_Ax    0.080866
11      t_skewness_Ay    0.066386
12      t_skewness_Az    0.052049
13          t_mean_Gx    0.044216
14      t_kurtosis_Gx    0.042022
15      t_skewness_Ax    0.038296
16            f_Q1_Ax    0.037819
17          t_mean_Gy    0.036306
18      f_variance_Gx    0.031432
19      t_kurtosis_Gz    0.030132
20      t_skewness_Gx    0.023294
21      f_variance_Gy    0.015630
22      f_variance_Gz    0.012267
23      t_skewness_Gy    0.010211
24      t_skewness_Gz    0.00

# 8. CLR 606

In [10]:
#Load dataset as pandas data frame
filename = 'CLR_both_606.csv'
dataset = pd.read_csv(filename)

# Split data into input and output variable:
# every column except the last is a feature, the last column is the label.
X = dataset.iloc[:, :-1]
Y = dataset.iloc[:, -1]

# "B" frames = before feature selection (all columns kept),
# "A" frames = after feature selection (filtered below).
# The A frames start as copies of the B frames, which is what a second split
# with the same random_state would return.
X_trainB, X_testB, y_trainB, y_testB = train_test_split(X, Y, test_size=0.3, random_state=100)
X_trainA, X_testA = X_trainB.copy(), X_testB.copy()
y_trainA, y_testA = y_trainB.copy(), y_testB.copy()

# Every filter is fitted on the training split only and then applied to both
# splits. Columns are dropped by reassignment rather than inplace=True so the
# frames are never mutated behind another reference.

# Removing Constant features
constant_filter = VarianceThreshold()
constant_filter.fit(X_trainA)
constant_columns = list(X_trainA.columns[~constant_filter.get_support()])
X_trainA = X_trainA.drop(labels=constant_columns, axis=1)
X_testA = X_testA.drop(labels=constant_columns, axis=1)

# Removing Quasi-Constant features (variance threshold 0.01)
qconstant_filter = VarianceThreshold(threshold=0.01)
qconstant_filter.fit(X_trainA)
qconstant_columns = list(X_trainA.columns[~qconstant_filter.get_support()])
X_trainA = X_trainA.drop(labels=qconstant_columns, axis=1)
X_testA = X_testA.drop(labels=qconstant_columns, axis=1)

# Removing Correlated Features
# A column is dropped when its absolute Pearson correlation with any column
# to its left exceeds 0.4; only entries below the diagonal are inspected.
correlation_matrix = X_trainA.corr(method='pearson')
below_diagonal = np.tril(correlation_matrix.abs().values, k=-1)
correlated_features = set(correlation_matrix.columns[(below_diagonal > 0.4).any(axis=1)])
X_trainA = X_trainA.drop(labels=list(correlated_features), axis=1)
X_testA = X_testA.drop(labels=list(correlated_features), axis=1)

# feature ranking and selection
# feature_ranking_selection returns a bare DataFrame (not a tuple) when asked
# for more features than exist, which would break the unpacking below, so the
# request is capped at the number of remaining columns.
n_selected = min(15, X_trainA.shape[1])
ranking_info, selected_features = feature_ranking_selection(X_trainA, y_trainA, n_selected)
print('Fature Ranking information')
print('---------------------------------------------------------------------------------------------')
print(ranking_info)
print(list(ranking_info['feature']))
print('---------------------------------------------------------------------------------------------')
X_trainA = X_trainA[selected_features]
X_testA = X_testA[selected_features]

#names = ["Nearest Neighbors","Decision Tree","Naive Bayes"]
names = ["Decision Tree"]

# "Before" models: trained on every column.
classifiers = [
    #KNeighborsClassifier(5, n_jobs= -1 ),
    DecisionTreeClassifier(random_state = 100)
    #GaussianNB(),
   ]
# "After" models: trained on the selected columns.
# NOTE(review): this tree is also limited by max_depth / min_samples_leaf, so
# the two result columns differ in hyper-parameters as well as in features.
classifier2= [
    #KNeighborsClassifier(5, n_jobs= -1 ),
    DecisionTreeClassifier(max_depth = 5, min_samples_leaf = 50,random_state = 100)
    #GaussianNB(),
   ]
# Fitted estimators are collected here so they remain available after the loop.
clf_bef = list()
clf_aft = list()

for name, clf, dlf in zip(names, classifiers, classifier2):
    # clf: model for "before" feature selection, dlf: model for "after".

    # Before Feature Selection
    # perf_counter is a monotonic clock intended for measuring intervals.
    startB = time.perf_counter()
    clf.fit(X_trainB, y_trainB)
    endB = time.perf_counter()
    clf_bef.append(clf)

    # After Feature Selection
    startA = time.perf_counter()
    dlf.fit(X_trainA, y_trainA)
    endA = time.perf_counter()
    clf_aft.append(dlf)

    print('\t\t\t\tClassifier:',name.upper())
    print('---------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------')

    print('\t\tBefore Feature Selection\tAfter Feature Selection')
    print('No. of features:\t', X_trainB.shape[1],'\t\t\t',X_trainA.shape[1])

    # training accuracy
    train_predB = clf.predict(X_trainB)
    train_predA = dlf.predict(X_trainA)
    train_accB = round(accuracy_score(y_trainB, train_predB)*100, 2)
    train_accA = round(accuracy_score(y_trainA, train_predA)*100, 2)
    print('Train Accuracy:\t\t',train_accB,'\t\t\t',train_accA)

    # test accuracy
    test_predB = clf.predict(X_testB)
    test_predA = dlf.predict(X_testA)
    test_accB = round(accuracy_score(y_testB, test_predB)*100, 2)
    test_accA = round(accuracy_score(y_testA, test_predA)*100, 2)
    print('Test Accuracy:\t\t',test_accB,'\t\t\t',test_accA)

    # roc_auc_score
    # AUC is computed from the predicted probability of the positive class
    # (second column of predict_proba); hard labels would reduce the ROC
    # curve to a single operating point.
    test_scoreB = clf.predict_proba(X_testB)[:, 1]
    test_scoreA = dlf.predict_proba(X_testA)[:, 1]
    test_roc_aucB = round(roc_auc_score(y_testB, test_scoreB), 2)
    test_roc_aucA = round(roc_auc_score(y_testA, test_scoreA), 2)
    print('ROC AUC score:\t\t',test_roc_aucB,'\t\t\t',test_roc_aucA)

    # f1 score
    test_f1B = round(f1_score(y_testB, test_predB), 2)
    test_f1A = round(f1_score(y_testA, test_predA), 2)
    print('f1_score:\t\t',test_f1B,'\t\t\t',test_f1A)

    # precision
    test_precB = round(precision_score(y_testB, test_predB), 2)
    test_precA = round(precision_score(y_testA, test_predA), 2)
    print('Precision:\t\t',test_precB,'\t\t\t',test_precA)

    # recall
    test_recallB = round(recall_score(y_testB, test_predB), 2)
    test_recallA = round(recall_score(y_testA, test_predA), 2)
    print('Recall:\t\t\t',test_recallB,'\t\t\t',test_recallA)

    # Matthews correlation coefficient
    test_MCCB = round(matthews_corrcoef(y_testB, test_predB), 2)
    test_MCCA = round(matthews_corrcoef(y_testA, test_predA), 2)
    print('MCC:\t\t\t',test_MCCB,'\t\t\t',test_MCCA)

    # training time
    timeB = round(endB - startB, 2)
    timeA = round(endA - startA, 2)
    print('Train Time (in seconds):',timeB,'\t\t\t',timeA)

    # confusion matrix
    cm_resultB = confusion_matrix(y_testB, test_predB)
    cm_resultA = confusion_matrix(y_testA, test_predA)
    print('Confusion Matrix(Before):\n',cm_resultB)
    print('Confusion Matrix(after):\n',cm_resultA)

    print('---------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------')
    print('\n')

Fature Ranking information
---------------------------------------------------------------------------------------------
            feature  importance
0       R_t_mean_Ax    0.061716
1         C_f_Q1_Ay    0.045072
2       C_t_mean_Gx    0.042654
3       R_t_mean_Ay    0.032141
4       L_t_mean_Gz    0.031806
5        L_t_std_Gx    0.029716
6   C_t_variance_Ax    0.027469
7       R_t_mean_Gz    0.026112
8   C_t_skewness_Az    0.025879
9        L_t_std_Gy    0.018739
10  R_t_skewness_Az    0.017907
11      L_t_mean_Ax    0.017483
12      C_t_mean_Az    0.017329
13  R_t_skewness_Gz    0.017088
14       C_f_IQR_Az    0.016851
15  C_t_variance_Ay    0.016844
16  R_t_skewness_Gy    0.016700
17       C_t_std_Gx    0.016542
18  C_t_kurtosis_Gz    0.015260
19  L_f_variance_Gz    0.015008
20  L_t_skewness_Gx    0.014600
21  R_t_kurtosis_Gx    0.014318
22      L_t_mean_Az    0.013661
23  R_t_kurtosis_Ay    0.013387
24  C_t_kurtosis_Az    0.013190
25      R_t_mean_Gy    0.012960
26  R_t_kurtosi

# 9. CL 404

In [12]:
#Load dataset as pandas data frame
filename = 'CL_both_404.csv'
dataset = pd.read_csv(filename)

# Split data into input and output variable:
# every column except the last is a feature, the last column is the label.
X = dataset.iloc[:, :-1]
Y = dataset.iloc[:, -1]

# "B" frames = before feature selection (all columns kept),
# "A" frames = after feature selection (filtered below).
# The A frames start as copies of the B frames, which is what a second split
# with the same random_state would return.
X_trainB, X_testB, y_trainB, y_testB = train_test_split(X, Y, test_size=0.3, random_state=100)
X_trainA, X_testA = X_trainB.copy(), X_testB.copy()
y_trainA, y_testA = y_trainB.copy(), y_testB.copy()

# Every filter is fitted on the training split only and then applied to both
# splits. Columns are dropped by reassignment rather than inplace=True so the
# frames are never mutated behind another reference.

# Removing Constant features
constant_filter = VarianceThreshold()
constant_filter.fit(X_trainA)
constant_columns = list(X_trainA.columns[~constant_filter.get_support()])
X_trainA = X_trainA.drop(labels=constant_columns, axis=1)
X_testA = X_testA.drop(labels=constant_columns, axis=1)

# Removing Quasi-Constant features (variance threshold 0.01)
qconstant_filter = VarianceThreshold(threshold=0.01)
qconstant_filter.fit(X_trainA)
qconstant_columns = list(X_trainA.columns[~qconstant_filter.get_support()])
X_trainA = X_trainA.drop(labels=qconstant_columns, axis=1)
X_testA = X_testA.drop(labels=qconstant_columns, axis=1)

# Removing Correlated Features
# A column is dropped when its absolute Pearson correlation with any column
# to its left exceeds 0.4; only entries below the diagonal are inspected.
correlation_matrix = X_trainA.corr(method='pearson')
below_diagonal = np.tril(correlation_matrix.abs().values, k=-1)
correlated_features = set(correlation_matrix.columns[(below_diagonal > 0.4).any(axis=1)])
X_trainA = X_trainA.drop(labels=list(correlated_features), axis=1)
X_testA = X_testA.drop(labels=list(correlated_features), axis=1)

# feature ranking and selection
# feature_ranking_selection returns a bare DataFrame (not a tuple) when asked
# for more features than exist, which would break the unpacking below, so the
# request is capped at the number of remaining columns.
n_selected = min(15, X_trainA.shape[1])
ranking_info, selected_features = feature_ranking_selection(X_trainA, y_trainA, n_selected)
print('Fature Ranking information')
print('---------------------------------------------------------------------------------------------')
print(ranking_info)
print(list(ranking_info['feature']))
print('---------------------------------------------------------------------------------------------')
X_trainA = X_trainA[selected_features]
X_testA = X_testA[selected_features]

#names = ["Nearest Neighbors","Decision Tree","Naive Bayes"]
names = ["Decision Tree"]

# "Before" models: trained on every column.
classifiers = [
    #KNeighborsClassifier(5, n_jobs= -1 ),
    DecisionTreeClassifier(random_state = 100)
    #GaussianNB(),
   ]
# "After" models: trained on the selected columns.
# NOTE(review): this tree is also limited by max_depth / min_samples_leaf, so
# the two result columns differ in hyper-parameters as well as in features.
classifier2= [
    #KNeighborsClassifier(5, n_jobs= -1 ),
    DecisionTreeClassifier(max_depth = 5, min_samples_leaf = 50,random_state = 100)
    #GaussianNB(),
   ]
# Fitted estimators are collected here so they remain available after the loop.
clf_bef = list()
clf_aft = list()

for name, clf, dlf in zip(names, classifiers, classifier2):
    # clf: model for "before" feature selection, dlf: model for "after".

    # Before Feature Selection
    # perf_counter is a monotonic clock intended for measuring intervals.
    startB = time.perf_counter()
    clf.fit(X_trainB, y_trainB)
    endB = time.perf_counter()
    clf_bef.append(clf)

    # After Feature Selection
    startA = time.perf_counter()
    dlf.fit(X_trainA, y_trainA)
    endA = time.perf_counter()
    clf_aft.append(dlf)

    print('\t\t\t\tClassifier:',name.upper())
    print('---------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------')

    print('\t\tBefore Feature Selection\tAfter Feature Selection')
    print('No. of features:\t', X_trainB.shape[1],'\t\t\t',X_trainA.shape[1])

    # training accuracy
    train_predB = clf.predict(X_trainB)
    train_predA = dlf.predict(X_trainA)
    train_accB = round(accuracy_score(y_trainB, train_predB)*100, 2)
    train_accA = round(accuracy_score(y_trainA, train_predA)*100, 2)
    print('Train Accuracy:\t\t',train_accB,'\t\t\t',train_accA)

    # test accuracy
    test_predB = clf.predict(X_testB)
    test_predA = dlf.predict(X_testA)
    test_accB = round(accuracy_score(y_testB, test_predB)*100, 2)
    test_accA = round(accuracy_score(y_testA, test_predA)*100, 2)
    print('Test Accuracy:\t\t',test_accB,'\t\t\t',test_accA)

    # roc_auc_score
    # AUC is computed from the predicted probability of the positive class
    # (second column of predict_proba); hard labels would reduce the ROC
    # curve to a single operating point.
    test_scoreB = clf.predict_proba(X_testB)[:, 1]
    test_scoreA = dlf.predict_proba(X_testA)[:, 1]
    test_roc_aucB = round(roc_auc_score(y_testB, test_scoreB), 2)
    test_roc_aucA = round(roc_auc_score(y_testA, test_scoreA), 2)
    print('ROC AUC score:\t\t',test_roc_aucB,'\t\t\t',test_roc_aucA)

    # f1 score
    test_f1B = round(f1_score(y_testB, test_predB), 2)
    test_f1A = round(f1_score(y_testA, test_predA), 2)
    print('f1_score:\t\t',test_f1B,'\t\t\t',test_f1A)

    # precision
    test_precB = round(precision_score(y_testB, test_predB), 2)
    test_precA = round(precision_score(y_testA, test_predA), 2)
    print('Precision:\t\t',test_precB,'\t\t\t',test_precA)

    # recall
    test_recallB = round(recall_score(y_testB, test_predB), 2)
    test_recallA = round(recall_score(y_testA, test_predA), 2)
    print('Recall:\t\t\t',test_recallB,'\t\t\t',test_recallA)

    # Matthews correlation coefficient
    test_MCCB = round(matthews_corrcoef(y_testB, test_predB), 2)
    test_MCCA = round(matthews_corrcoef(y_testA, test_predA), 2)
    print('MCC:\t\t\t',test_MCCB,'\t\t\t',test_MCCA)

    # training time
    timeB = round(endB - startB, 2)
    timeA = round(endA - startA, 2)
    print('Train Time (in seconds):',timeB,'\t\t\t',timeA)

    # confusion matrix
    cm_resultB = confusion_matrix(y_testB, test_predB)
    cm_resultA = confusion_matrix(y_testA, test_predA)
    print('Confusion Matrix(Before):\n',cm_resultB)
    print('Confusion Matrix(after):\n',cm_resultA)

    print('---------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------')
    print('\n')

Fature Ranking information
---------------------------------------------------------------------------------------------
            feature  importance
0         C_f_Q1_Ay    0.046791
1   C_t_variance_Ay    0.032198
2       C_t_mean_Gx    0.031953
3       C_t_mean_Az    0.028110
4       L_t_mean_Gz    0.023688
5   C_t_skewness_Az    0.023446
6   L_t_skewness_Az    0.023240
7        L_t_std_Gx    0.021365
8        C_f_IQR_Az    0.021344
9   L_t_kurtosis_Gy    0.020369
10  C_t_variance_Ax    0.016207
11      L_t_mean_Az    0.014290
12       C_t_std_Gx    0.014111
13  L_t_skewness_Gx    0.013753
14  C_t_kurtosis_Gz    0.012685
15  L_f_variance_Gx    0.012466
16      L_t_mean_Ax    0.012236
17        L_f_Q1_Ay    0.012192
18      L_t_mean_Gy    0.011834
19      C_t_mean_Gz    0.011466
20  L_t_kurtosis_Az    0.011202
21  C_t_kurtosis_Az    0.010681
22  L_f_variance_Gz    0.010503
23      C_f_mean_Gz    0.009738
24        L_f_Q1_Ax    0.009472
25  C_t_kurtosis_Gy    0.008928
26      C_f_mea

# 10. CR 404

In [14]:
#Load dataset as pandas data frame
filename = 'CR_both_404.csv'
dataset = pd.read_csv(filename)

# Split data into input and output variable:
# every column except the last is a feature, the last column is the label.
X = dataset.iloc[:, :-1]
Y = dataset.iloc[:, -1]

# "B" frames = before feature selection (all columns kept),
# "A" frames = after feature selection (filtered below).
# The A frames start as copies of the B frames, which is what a second split
# with the same random_state would return.
X_trainB, X_testB, y_trainB, y_testB = train_test_split(X, Y, test_size=0.3, random_state=100)
X_trainA, X_testA = X_trainB.copy(), X_testB.copy()
y_trainA, y_testA = y_trainB.copy(), y_testB.copy()

# Every filter is fitted on the training split only and then applied to both
# splits. Columns are dropped by reassignment rather than inplace=True so the
# frames are never mutated behind another reference.

# Removing Constant features
constant_filter = VarianceThreshold()
constant_filter.fit(X_trainA)
constant_columns = list(X_trainA.columns[~constant_filter.get_support()])
X_trainA = X_trainA.drop(labels=constant_columns, axis=1)
X_testA = X_testA.drop(labels=constant_columns, axis=1)

# Removing Quasi-Constant features (variance threshold 0.01)
qconstant_filter = VarianceThreshold(threshold=0.01)
qconstant_filter.fit(X_trainA)
qconstant_columns = list(X_trainA.columns[~qconstant_filter.get_support()])
X_trainA = X_trainA.drop(labels=qconstant_columns, axis=1)
X_testA = X_testA.drop(labels=qconstant_columns, axis=1)

# Removing Correlated Features
# A column is dropped when its absolute Pearson correlation with any column
# to its left exceeds 0.4; only entries below the diagonal are inspected.
correlation_matrix = X_trainA.corr(method='pearson')
below_diagonal = np.tril(correlation_matrix.abs().values, k=-1)
correlated_features = set(correlation_matrix.columns[(below_diagonal > 0.4).any(axis=1)])
X_trainA = X_trainA.drop(labels=list(correlated_features), axis=1)
X_testA = X_testA.drop(labels=list(correlated_features), axis=1)

# feature ranking and selection
# feature_ranking_selection returns a bare DataFrame (not a tuple) when asked
# for more features than exist, which would break the unpacking below, so the
# request is capped at the number of remaining columns.
n_selected = min(15, X_trainA.shape[1])
ranking_info, selected_features = feature_ranking_selection(X_trainA, y_trainA, n_selected)
print('Fature Ranking information')
print('---------------------------------------------------------------------------------------------')
print(ranking_info)
print(list(ranking_info['feature']))
print('---------------------------------------------------------------------------------------------')
X_trainA = X_trainA[selected_features]
X_testA = X_testA[selected_features]

# names, classifiers and classifier2 are zipped together in the loop below, so
# they must stay the same length and in the same order. To compare more models
# (e.g. KNeighborsClassifier, GaussianNB) add one entry to each of the three lists.
names = ["Decision Tree"]

# Model fitted on the full feature set ("before feature selection").
classifiers = [
    DecisionTreeClassifier(random_state = 100)
   ]
# Model fitted on the selected features ("after feature selection").
# NOTE(review): this tree is also depth/leaf-size limited, so the before/after
# comparison reflects both the feature selection and the different hyper-parameters.
classifier2= [
    DecisionTreeClassifier(max_depth = 5, min_samples_leaf = 50,random_state = 100)
   ]
# Fitted models are kept so they can be inspected after the loop.
clf_bef = list()
clf_aft = list()

for name, clf, dlf in zip(names,classifiers,classifier2):
    # clf for before
    # dlf for after

    # Before Feature Selection
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # elapsed time; time.time() can jump if the system clock is adjusted.
    startB = time.perf_counter()
    clf.fit(X_trainB,y_trainB)
    endB = time.perf_counter()
    clf_bef.append(clf)

    # after Feature Selection
    startA = time.perf_counter()
    dlf.fit(X_trainA,y_trainA)
    endA = time.perf_counter()
    clf_aft.append(dlf)

    print('\t\t\t\tClassifier:',name.upper())
    print('---------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------')

    print('\t\tBefore Feature Selection\tAfter Feature Selection')
    print('No. of features:\t', X_trainB.shape[1],'\t\t\t',X_trainA.shape[1])

    # training accuracy
    train_predB = clf.predict(X_trainB)
    train_predA = dlf.predict(X_trainA)
    train_accB = round(accuracy_score(y_trainB,train_predB)*100, 2)
    train_accA = round(accuracy_score(y_trainA,train_predA)*100, 2)
    print('Train Accuracy:\t\t',train_accB,'\t\t\t',train_accA)

    # test accuracy
    test_predB = clf.predict(X_testB)
    test_predA = dlf.predict(X_testA)
    test_accB = round(accuracy_score(y_testB,test_predB)*100, 2)
    test_accA = round(accuracy_score(y_testA,test_predA)*100, 2)
    print('Test Accuracy:\t\t',test_accB,'\t\t\t',test_accA)

    # roc_auc_score
    # ROC AUC is a ranking metric: it needs a continuous score per sample, not the
    # thresholded class label. Column 1 of predict_proba is the probability of the
    # larger class label, which is the class roc_auc_score treats as positive.
    test_scoreB = clf.predict_proba(X_testB)[:, 1]
    test_scoreA = dlf.predict_proba(X_testA)[:, 1]
    test_roc_aucB = round(roc_auc_score(y_testB,test_scoreB), 2)
    test_roc_aucA = round(roc_auc_score(y_testA,test_scoreA), 2)
    print('ROC AUC score:\t\t',test_roc_aucB,'\t\t\t',test_roc_aucA)

    # f1 score
    test_f1B = round(f1_score(y_testB,test_predB),2)
    test_f1A = round(f1_score(y_testA,test_predA),2)
    print('f1_score:\t\t',test_f1B,'\t\t\t',test_f1A)

    # precision
    test_precB = round(precision_score(y_testB,test_predB),2)
    test_precA = round(precision_score(y_testA,test_predA),2)
    print('Precision:\t\t',test_precB,'\t\t\t',test_precA)

    # recall
    test_recallB = round(recall_score(y_testB,test_predB),2)
    test_recallA = round(recall_score(y_testA,test_predA),2)
    print('Recall:\t\t\t',test_recallB,'\t\t\t',test_recallA)

    # Matthews correlation coefficient
    test_MCCB = round(matthews_corrcoef(y_testB,test_predB),2)
    test_MCCA = round(matthews_corrcoef(y_testA,test_predA),2)
    print('MCC:\t\t\t',test_MCCB,'\t\t\t',test_MCCA)

    # training time (fit only; prediction time is not included)
    timeB = round(endB - startB, 2)
    timeA = round(endA - startA, 2)
    print('Train Time (in seconds):',timeB,'\t\t\t',timeA)

    # confusion matrix
    cm_resultB = confusion_matrix(y_testB,test_predB)
    cm_resultA = confusion_matrix(y_testA,test_predA)
    print('Confusion Matrix(Before):\n',cm_resultB)
    print('Confusion Matrix(after):\n',cm_resultA)

    print('---------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------')
    print('\n')

Fature Ranking information
---------------------------------------------------------------------------------------------
            feature  importance
0       R_t_mean_Ax    0.069289
1         C_f_Q1_Ay    0.042156
2       C_t_mean_Gx    0.035609
3   C_t_variance_Ax    0.030929
4   C_t_skewness_Az    0.030857
5       R_t_mean_Ay    0.026200
6       C_t_mean_Az    0.024570
7       R_t_mean_Gz    0.023939
8   R_t_skewness_Az    0.021900
9   C_t_kurtosis_Gz    0.020255
10       R_t_std_Gy    0.018617
11  C_t_variance_Ay    0.016914
12  R_t_kurtosis_Az    0.015552
13  R_t_skewness_Gy    0.015426
14        R_f_Q1_Ax    0.014831
15  R_f_variance_Gz    0.014493
16       C_f_IQR_Az    0.013719
17  C_t_kurtosis_Az    0.013691
18       C_t_std_Gx    0.013046
19        R_f_Q1_Ay    0.012886
20      C_f_mean_Ay    0.010258
21  C_t_kurtosis_Gy    0.009874
22  C_f_variance_Gy    0.009365
23  C_t_kurtosis_Ay    0.009164
24  R_t_kurtosis_Ay    0.009038
25      C_t_mean_Gz    0.008752
26  R_t_skewnes

# 11. LR 404

In [16]:
# Load 'LR_both_404.csv' as a pandas data frame.
filename = 'LR_both_404.csv'
dataset = pd.read_csv(filename)

# Split data into input and output variable: every column but the last is a
# feature, the last column is the target.
X = dataset.iloc[:, :-1]
Y = dataset.iloc[:, -1]

# "B" frames = before feature selection (all columns kept),
# "A" frames = after feature selection (filtered below).
X_trainB, X_testB, y_trainB, y_testB = train_test_split(X, Y, test_size=0.3, random_state=100)
# The "A" split holds exactly the same rows as the "B" split. Independent copies
# are taken so the filtering below can never touch the baseline frames.
X_trainA, X_testA = X_trainB.copy(), X_testB.copy()
y_trainA, y_testA = y_trainB.copy(), y_testB.copy()

# Removing Constant features.
# The filter is fitted on the training split only and the same column mask is
# then applied to the test split, so no test information leaks into selection.
constant_filter = VarianceThreshold()
constant_filter.fit(X_trainA)
keep_mask = constant_filter.get_support()
constant_columns = list(X_trainA.columns[~keep_mask])
X_trainA = X_trainA.loc[:, keep_mask]
X_testA = X_testA.loc[:, keep_mask]

# Removing Quasi-Constant features (training variance not above 0.01).
qconstant_filter = VarianceThreshold(0.01)
qconstant_filter.fit(X_trainA)
keep_mask = qconstant_filter.get_support()
qconstant_columns = list(X_trainA.columns[~keep_mask])
X_trainA = X_trainA.loc[:, keep_mask]
X_testA = X_testA.loc[:, keep_mask]

# Removing Correlated Features.
# A column is dropped when its absolute Pearson correlation with ANY column
# that appears earlier in the frame exceeds 0.4. Only the strict lower triangle
# is inspected so each pair is considered once and the diagonal is ignored.
correlation_matrix = X_trainA.corr(method='pearson')
high_corr = np.tril(correlation_matrix.abs().to_numpy() > 0.4, k=-1)
correlated_features = set(correlation_matrix.columns[high_corr.any(axis=1)])
X_trainA = X_trainA.drop(columns=list(correlated_features))
X_testA = X_testA.drop(columns=list(correlated_features))

# feature ranking and selection
# feature_ranking_selection returns a single DataFrame (not a 2-tuple) when asked
# for more features than exist, which would break the unpacking below; clamp the
# request to the number of columns that survived filtering.
n_selected = min(15, X_trainA.shape[1])
ranking_info, selected_features = feature_ranking_selection(X_trainA, y_trainA, n_selected)
print('Fature Ranking information')
print('---------------------------------------------------------------------------------------------')
print(ranking_info)
print(list(ranking_info['feature']))
print('---------------------------------------------------------------------------------------------')
X_trainA = X_trainA[selected_features]
X_testA = X_testA[selected_features]

# names, classifiers and classifier2 are zipped together in the loop below, so
# they must stay the same length and in the same order. To compare more models
# (e.g. KNeighborsClassifier, GaussianNB) add one entry to each of the three lists.
names = ["Decision Tree"]

# Model fitted on the full feature set ("before feature selection").
classifiers = [
    DecisionTreeClassifier(random_state = 100)
   ]
# Model fitted on the selected features ("after feature selection").
# NOTE(review): this tree is also depth/leaf-size limited, so the before/after
# comparison reflects both the feature selection and the different hyper-parameters.
classifier2= [
    DecisionTreeClassifier(max_depth = 5, min_samples_leaf = 50,random_state = 100)
   ]
# Fitted models are kept so they can be inspected after the loop.
clf_bef = list()
clf_aft = list()

for name, clf, dlf in zip(names,classifiers,classifier2):
    # clf for before
    # dlf for after

    # Before Feature Selection
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # elapsed time; time.time() can jump if the system clock is adjusted.
    startB = time.perf_counter()
    clf.fit(X_trainB,y_trainB)
    endB = time.perf_counter()
    clf_bef.append(clf)

    # after Feature Selection
    startA = time.perf_counter()
    dlf.fit(X_trainA,y_trainA)
    endA = time.perf_counter()
    clf_aft.append(dlf)

    print('\t\t\t\tClassifier:',name.upper())
    print('---------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------')

    print('\t\tBefore Feature Selection\tAfter Feature Selection')
    print('No. of features:\t', X_trainB.shape[1],'\t\t\t',X_trainA.shape[1])

    # training accuracy
    train_predB = clf.predict(X_trainB)
    train_predA = dlf.predict(X_trainA)
    train_accB = round(accuracy_score(y_trainB,train_predB)*100, 2)
    train_accA = round(accuracy_score(y_trainA,train_predA)*100, 2)
    print('Train Accuracy:\t\t',train_accB,'\t\t\t',train_accA)

    # test accuracy
    test_predB = clf.predict(X_testB)
    test_predA = dlf.predict(X_testA)
    test_accB = round(accuracy_score(y_testB,test_predB)*100, 2)
    test_accA = round(accuracy_score(y_testA,test_predA)*100, 2)
    print('Test Accuracy:\t\t',test_accB,'\t\t\t',test_accA)

    # roc_auc_score
    # ROC AUC is a ranking metric: it needs a continuous score per sample, not the
    # thresholded class label. Column 1 of predict_proba is the probability of the
    # larger class label, which is the class roc_auc_score treats as positive.
    test_scoreB = clf.predict_proba(X_testB)[:, 1]
    test_scoreA = dlf.predict_proba(X_testA)[:, 1]
    test_roc_aucB = round(roc_auc_score(y_testB,test_scoreB), 2)
    test_roc_aucA = round(roc_auc_score(y_testA,test_scoreA), 2)
    print('ROC AUC score:\t\t',test_roc_aucB,'\t\t\t',test_roc_aucA)

    # f1 score
    test_f1B = round(f1_score(y_testB,test_predB),2)
    test_f1A = round(f1_score(y_testA,test_predA),2)
    print('f1_score:\t\t',test_f1B,'\t\t\t',test_f1A)

    # precision
    test_precB = round(precision_score(y_testB,test_predB),2)
    test_precA = round(precision_score(y_testA,test_predA),2)
    print('Precision:\t\t',test_precB,'\t\t\t',test_precA)

    # recall
    test_recallB = round(recall_score(y_testB,test_predB),2)
    test_recallA = round(recall_score(y_testA,test_predA),2)
    print('Recall:\t\t\t',test_recallB,'\t\t\t',test_recallA)

    # Matthews correlation coefficient
    test_MCCB = round(matthews_corrcoef(y_testB,test_predB),2)
    test_MCCA = round(matthews_corrcoef(y_testA,test_predA),2)
    print('MCC:\t\t\t',test_MCCB,'\t\t\t',test_MCCA)

    # training time (fit only; prediction time is not included)
    timeB = round(endB - startB, 2)
    timeA = round(endA - startA, 2)
    print('Train Time (in seconds):',timeB,'\t\t\t',timeA)

    # confusion matrix
    cm_resultB = confusion_matrix(y_testB,test_predB)
    cm_resultA = confusion_matrix(y_testA,test_predA)
    print('Confusion Matrix(Before):\n',cm_resultB)
    print('Confusion Matrix(after):\n',cm_resultA)

    print('---------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------')
    print('\n')

Fature Ranking information
---------------------------------------------------------------------------------------------
            feature  importance
0       R_t_mean_Ax    0.067492
1       R_t_mean_Ay    0.028052
2   R_t_skewness_Az    0.026928
3        L_t_std_Gx    0.026367
4       L_t_mean_Gz    0.025095
5   R_t_kurtosis_Gy    0.021130
6       R_t_mean_Gz    0.020045
7   R_t_kurtosis_Gx    0.017984
8         L_f_Q1_Ay    0.017468
9   R_t_kurtosis_Az    0.015303
10      L_t_mean_Ax    0.015276
11  R_t_skewness_Gz    0.015097
12      R_t_mean_Az    0.014886
13  L_t_skewness_Az    0.014066
14  L_t_kurtosis_Gy    0.014054
15  R_f_variance_Gz    0.013357
16  L_t_kurtosis_Gz    0.013048
17      L_t_mean_Az    0.012387
18  L_t_skewness_Gx    0.012260
19        R_f_Q1_Ax    0.011180
20      R_t_mean_Gy    0.009919
21  R_t_skewness_Gy    0.009317
22       R_t_std_Gx    0.008905
23  L_f_variance_Gz    0.008889
24       R_t_std_Gy    0.008728
25  R_t_skewness_Ax    0.008038
26       L_t_st