In [24]:
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils import shuffle
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

import numpy as np
import pandas as pd

### Read dataset from file

In [2]:
# Parse ./datafile.txt into labels `y` and feature rows `x`.
# Each non-blank line is a label token followed by at least 50 "key:value"
# tokens; only the value part of the first 50 tokens is kept (as strings --
# the numeric conversion happens later via np.array(x, dtype="float64")).
y = []
x = []
with open("./datafile.txt") as dt:
    for line in dt:
        # split() with no argument tolerates tabs / repeated spaces and
        # returns [] for a blank line.
        fields = line.split()
        if not fields:
            # Skip blank lines (e.g. a trailing newline at end of file) instead
            # of recording a bogus label and then failing with IndexError.
            continue

        # Build the row first so that a malformed line raises before either
        # list is touched, keeping x and y the same length.
        row = [fields[i].split(":")[1] for i in range(1, 51)]

        # "+1" marks the positive class; any other label token is class 0.
        y.append(1 if fields[0] == "+1" else 0)
        x.append(row)

In [50]:
# Column labels for the 50 features, in file order. Names are grouped by
# prefix and numbered from 1 within each group:
# T1-T5, PE1-PE10, R1-R10, SS1-SS6, G1-G11, C1-C8.
feature_name = [
    prefix + str(number)
    for prefix, count in (("T", 5), ("PE", 10), ("R", 10), ("SS", 6), ("G", 11), ("C", 8))
    for number in range(1, count + 1)
]

### Feature selection

In [72]:
# Recursive feature elimination with cross-validation (RFECV), run once per
# base estimator. The result table has one row per feature and one boolean
# column per estimator (True = feature kept by that estimator's selector).
X = np.array(x, dtype="float64")
Y = np.array(y)

# (column label, estimator) pairs. The decision tree is seeded so that its
# selected-feature mask is reproducible across runs.
estimators = [
    ("LR", LogisticRegression()),
    ("DT", DecisionTreeClassifier(random_state=0)),
]

res = {}
for label, estimator in estimators:
    # step=1: drop one feature per elimination round; cv=10: 10-fold CV is
    # used to score each candidate feature count.
    selector = RFECV(estimator, step=1, cv=10, n_jobs=10)
    selector = selector.fit(X, Y)
    # support_ is aligned with the columns of X, i.e. with feature_name.
    res[label] = dict(zip(feature_name, selector.support_))
pd.DataFrame(res)

Unnamed: 0,DT,LR
C1,False,True
C2,False,True
C3,False,True
C4,False,True
C5,False,True
C6,False,True
C7,False,True
C8,False,True
G1,False,True
G10,True,True


### SVM

#### 10-Fold CV w/ SVM using linear kernel

In [73]:
# Shuffled 10-fold cross-validation of an SVM with a linear kernel.
X = np.array(x, dtype="float64")
Y = np.array(y)
# Seed the shuffle so the fold assignment -- and therefore every metric in
# `record` -- is reproducible across kernel restarts.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
# fold number (1-based) -> {"ACC", "PRE", "REC", "F1"}; "AVG" is added after the loop.
record = {}
for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    # A fresh classifier is fitted for every fold.
    clf = SVC(kernel="linear")
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    acc = accuracy_score(y_test, pred)
    pre = precision_score(y_test, pred)
    rec = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    record[fold] = {"ACC": acc, "PRE": pre, "REC": rec, "F1": f1}

print("10-Fold CV Finished!")
# The right-hand side is fully evaluated before "AVG" is stored in `record`,
# so each average covers the ten folds only.
record["AVG"] = {
    metric: np.mean([scores[metric] for scores in record.values()])
    for metric in ("ACC", "PRE", "REC", "F1")
}

10-Fold CV Finished!


In [74]:
# Display the metrics collected in `record` as a table: one column per fold
# plus the "AVG" column, one row per metric.
pd.DataFrame(record)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,AVG
ACC,0.7,0.7,0.75,0.6,0.8,0.65,0.65,0.8,0.55,0.7,0.69
F1,0.727273,0.727273,0.736842,0.6,0.8,0.695652,0.666667,0.866667,0.666667,0.75,0.723704
PRE,0.615385,0.666667,0.7,0.461538,0.727273,0.727273,0.538462,0.8125,0.692308,0.642857,0.658426
REC,0.888889,0.8,0.777778,0.857143,0.888889,0.666667,0.875,0.928571,0.642857,0.9,0.822579


#### 10-Fold Stratified CV w/ SVM using linear kernel

In [75]:
# Shuffled, stratified 10-fold cross-validation of an SVM with a linear kernel.
X = np.array(x, dtype="float64")
Y = np.array(y)
# Seed the shuffle so the fold assignment -- and therefore every metric in
# `record` -- is reproducible across kernel restarts.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
# fold number (1-based) -> {"ACC", "PRE", "REC", "F1"}; "AVG" is added after the loop.
record = {}
# Stratification needs the labels, hence split(X, Y).
for fold, (train_index, test_index) in enumerate(kf.split(X, Y), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    # A fresh classifier is fitted for every fold.
    clf = SVC(kernel="linear")
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    acc = accuracy_score(y_test, pred)
    pre = precision_score(y_test, pred)
    rec = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    record[fold] = {"ACC": acc, "PRE": pre, "REC": rec, "F1": f1}

print("10-Fold CV Finished!")
# The right-hand side is fully evaluated before "AVG" is stored in `record`,
# so each average covers the ten folds only.
record["AVG"] = {
    metric: np.mean([scores[metric] for scores in record.values()])
    for metric in ("ACC", "PRE", "REC", "F1")
}

10-Fold CV Finished!


In [76]:
# Display the metrics collected in `record` as a table: one column per fold
# plus the "AVG" column, one row per metric.
pd.DataFrame(record)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,AVG
ACC,0.714286,0.714286,0.85,0.75,0.75,0.6,0.7,0.6,0.526316,0.736842,0.694173
F1,0.769231,0.785714,0.857143,0.782609,0.782609,0.636364,0.769231,0.692308,0.608696,0.761905,0.744581
PRE,0.666667,0.647059,0.818182,0.692308,0.692308,0.583333,0.625,0.5625,0.538462,0.727273,0.655309
REC,0.909091,1.0,0.9,0.9,0.9,0.7,1.0,0.9,0.7,0.8,0.870909


### Logistic Regression

#### 10-Fold CV w/ Logistic Regression

In [7]:
# Shuffled 10-fold cross-validation of a logistic-regression classifier.
X = np.array(x, dtype="float64")
Y = np.array(y)
# Seed the shuffle so the fold assignment -- and therefore every metric in
# `record` -- is reproducible across kernel restarts.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
# fold number (1-based) -> {"ACC", "PRE", "REC", "F1"}; "AVG" is added after the loop.
record = {}
for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    # A fresh classifier is fitted for every fold.
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    acc = accuracy_score(y_test, pred)
    pre = precision_score(y_test, pred)
    rec = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    record[fold] = {"ACC": acc, "PRE": pre, "REC": rec, "F1": f1}

print("10-Fold CV Finished!")
# The right-hand side is fully evaluated before "AVG" is stored in `record`,
# so each average covers the ten folds only.
record["AVG"] = {
    metric: np.mean([scores[metric] for scores in record.values()])
    for metric in ("ACC", "PRE", "REC", "F1")
}

10-Fold CV Finished!


In [8]:
# Display the metrics collected in `record` as a table: one column per fold
# plus the "AVG" column, one row per metric.
pd.DataFrame(record)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,AVG
ACC,0.7,0.65,0.6,0.75,0.5,0.65,0.65,0.65,0.6,0.45,0.62
F1,0.785714,0.695652,0.692308,0.814815,0.5,0.695652,0.740741,0.695652,0.6,0.521739,0.674227
PRE,0.733333,0.727273,0.5625,0.785714,0.416667,0.615385,0.714286,0.571429,0.428571,0.4,0.595516
REC,0.846154,0.666667,0.9,0.846154,0.625,0.8,0.769231,0.888889,1.0,0.75,0.809209


#### 10-Fold Stratified CV w/ Logistic Regression

In [9]:
# Shuffled, stratified 10-fold cross-validation of a logistic-regression classifier.
X = np.array(x, dtype="float64")
Y = np.array(y)
# Seed the shuffle so the fold assignment -- and therefore every metric in
# `record` -- is reproducible across kernel restarts.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
# fold number (1-based) -> {"ACC", "PRE", "REC", "F1"}; "AVG" is added after the loop.
record = {}
# Stratification needs the labels, hence split(X, Y).
for fold, (train_index, test_index) in enumerate(kf.split(X, Y), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    # A fresh classifier is fitted for every fold.
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    acc = accuracy_score(y_test, pred)
    pre = precision_score(y_test, pred)
    rec = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    record[fold] = {"ACC": acc, "PRE": pre, "REC": rec, "F1": f1}

print("10-Fold CV Finished!")
# The right-hand side is fully evaluated before "AVG" is stored in `record`,
# so each average covers the ten folds only.
record["AVG"] = {
    metric: np.mean([scores[metric] for scores in record.values()])
    for metric in ("ACC", "PRE", "REC", "F1")
}

10-Fold CV Finished!


In [10]:
# Display the metrics collected in `record` as a table: one column per fold
# plus the "AVG" column, one row per metric.
pd.DataFrame(record)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,AVG
ACC,0.761905,0.666667,0.75,0.55,0.55,0.65,0.75,0.7,0.473684,0.473684,0.632594
F1,0.8,0.72,0.8,0.689655,0.571429,0.72,0.782609,0.727273,0.545455,0.545455,0.690187
PRE,0.714286,0.642857,0.666667,0.526316,0.545455,0.6,0.692308,0.666667,0.5,0.5,0.605455
REC,0.909091,0.818182,1.0,1.0,0.6,0.9,0.9,0.8,0.6,0.6,0.812727


### Random Forest

#### 10-Fold CV w/ Random Forest

In [78]:
# Shuffled 10-fold cross-validation of a random-forest classifier.
X = np.array(x, dtype="float64")
Y = np.array(y)
# Seed the shuffle as well as the forest: with only the classifier seeded the
# fold assignment (and hence every metric in `record`) still changed per run.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
# fold number (1-based) -> {"ACC", "PRE", "REC", "F1"}; "AVG" is added after the loop.
record = {}
for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    # A fresh forest (20 trees, depth <= 5) is fitted for every fold.
    clf = RandomForestClassifier(max_depth=5, n_estimators=20, random_state=0, n_jobs=5)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    acc = accuracy_score(y_test, pred)
    pre = precision_score(y_test, pred)
    rec = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    record[fold] = {"ACC": acc, "PRE": pre, "REC": rec, "F1": f1}

print("10-Fold CV Finished!")
# The right-hand side is fully evaluated before "AVG" is stored in `record`,
# so each average covers the ten folds only.
record["AVG"] = {
    metric: np.mean([scores[metric] for scores in record.values()])
    for metric in ("ACC", "PRE", "REC", "F1")
}

10-Fold CV Finished!


In [79]:
# Display the metrics collected in `record` as a table: one column per fold
# plus the "AVG" column, one row per metric.
pd.DataFrame(record)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,AVG
ACC,0.75,0.8,0.75,0.8,0.8,0.6,0.55,0.7,0.5,0.85,0.71
F1,0.814815,0.875,0.761905,0.818182,0.818182,0.692308,0.470588,0.75,0.285714,0.857143,0.714384
PRE,0.785714,0.823529,0.666667,0.692308,0.9,0.5625,0.4,0.692308,0.285714,0.75,0.655874
REC,0.846154,0.933333,0.888889,1.0,0.75,0.9,0.571429,0.818182,0.285714,1.0,0.79937


#### 10-Fold Stratified CV w/ Random Forest

In [80]:
# Shuffled, stratified 10-fold cross-validation of a random-forest classifier.
X = np.array(x, dtype="float64")
Y = np.array(y)
# Seed the shuffle as well as the forest: with only the classifier seeded the
# fold assignment (and hence every metric in `record`) still changed per run.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
# fold number (1-based) -> {"ACC", "PRE", "REC", "F1"}; "AVG" is added after the loop.
record = {}
# Stratification needs the labels, hence split(X, Y).
for fold, (train_index, test_index) in enumerate(kf.split(X, Y), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    # A fresh forest (20 trees, depth <= 5) is fitted for every fold.
    clf = RandomForestClassifier(max_depth=5, n_estimators=20, random_state=0, n_jobs=5)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    acc = accuracy_score(y_test, pred)
    pre = precision_score(y_test, pred)
    rec = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    record[fold] = {"ACC": acc, "PRE": pre, "REC": rec, "F1": f1}

print("10-Fold CV Finished!")
# The right-hand side is fully evaluated before "AVG" is stored in `record`,
# so each average covers the ten folds only.
record["AVG"] = {
    metric: np.mean([scores[metric] for scores in record.values()])
    for metric in ("ACC", "PRE", "REC", "F1")
}

10-Fold CV Finished!


In [81]:
# Display the metrics collected in `record` as a table: one column per fold
# plus the "AVG" column, one row per metric.
pd.DataFrame(record)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,AVG
ACC,0.714286,0.714286,0.7,0.7,0.75,0.8,0.8,0.55,0.736842,0.736842,0.720226
F1,0.769231,0.75,0.75,0.666667,0.782609,0.818182,0.8,0.608696,0.782609,0.782609,0.75106
PRE,0.666667,0.692308,0.642857,0.75,0.692308,0.75,0.8,0.538462,0.692308,0.692308,0.691722
REC,0.909091,0.818182,0.9,0.6,0.9,0.9,0.8,0.7,0.9,0.9,0.832727


### AdaBoost

#### 10-Fold CV w/ AdaBoost

In [34]:
# Shuffled 10-fold cross-validation of AdaBoost over logistic-regression base learners.
X = np.array(x, dtype="float64")
Y = np.array(y)
# Seed the shuffle so the fold assignment -- and therefore every metric in
# `record` -- is reproducible across kernel restarts.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
# fold number (1-based) -> {"ACC", "PRE", "REC", "F1"}; "AVG" is added after the loop.
record = {}
for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    est = LogisticRegression()
    # The base learner is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
    # 1.4), but it is the first positional parameter in both old and new APIs.
    clf = AdaBoostClassifier(est, learning_rate=0.01)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    acc = accuracy_score(y_test, pred)
    pre = precision_score(y_test, pred)
    rec = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    record[fold] = {"ACC": acc, "PRE": pre, "REC": rec, "F1": f1}

print("10-Fold CV Finished!")
# The right-hand side is fully evaluated before "AVG" is stored in `record`,
# so each average covers the ten folds only.
record["AVG"] = {
    metric: np.mean([scores[metric] for scores in record.values()])
    for metric in ("ACC", "PRE", "REC", "F1")
}

10-Fold CV Finished!


In [35]:
# Display the metrics collected in `record` as a table: one column per fold
# plus the "AVG" column, one row per metric.
pd.DataFrame(record)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,AVG
ACC,0.6,0.75,0.55,0.8,0.8,0.65,0.7,0.55,0.6,0.65,0.665
F1,0.666667,0.761905,0.571429,0.833333,0.866667,0.72,0.75,0.64,0.714286,0.758621,0.728291
PRE,0.5,0.615385,0.6,0.833333,0.764706,0.5625,0.642857,0.470588,0.588235,0.647059,0.622466
REC,1.0,1.0,0.545455,0.833333,1.0,1.0,0.9,1.0,0.909091,0.916667,0.910455


#### 10-Fold Stratified CV w/ AdaBoost

In [82]:
# Shuffled, stratified 10-fold cross-validation of AdaBoost over
# logistic-regression base learners.
X = np.array(x, dtype="float64")
Y = np.array(y)
# Seed the shuffle so the fold assignment -- and therefore every metric in
# `record` -- is reproducible across kernel restarts.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
# fold number (1-based) -> {"ACC", "PRE", "REC", "F1"}; "AVG" is added after the loop.
record = {}
# Stratification needs the labels, hence split(X, Y).
for fold, (train_index, test_index) in enumerate(kf.split(X, Y), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    est = LogisticRegression()
    # The base learner is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
    # 1.4), but it is the first positional parameter in both old and new APIs.
    clf = AdaBoostClassifier(est, learning_rate=0.01)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    acc = accuracy_score(y_test, pred)
    pre = precision_score(y_test, pred)
    rec = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    record[fold] = {"ACC": acc, "PRE": pre, "REC": rec, "F1": f1}

print("10-Fold CV Finished!")
# The right-hand side is fully evaluated before "AVG" is stored in `record`,
# so each average covers the ten folds only.
record["AVG"] = {
    metric: np.mean([scores[metric] for scores in record.values()])
    for metric in ("ACC", "PRE", "REC", "F1")
}

10-Fold CV Finished!


In [83]:
# Display the metrics collected in `record` as a table: one column per fold
# plus the "AVG" column, one row per metric.
pd.DataFrame(record)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,AVG
ACC,0.619048,0.761905,0.7,0.75,0.6,0.6,0.7,0.8,0.684211,0.578947,0.679411
F1,0.714286,0.8,0.727273,0.8,0.692308,0.692308,0.75,0.833333,0.75,0.714286,0.747379
PRE,0.588235,0.714286,0.666667,0.666667,0.5625,0.5625,0.642857,0.714286,0.642857,0.555556,0.631641
REC,0.909091,0.909091,0.8,1.0,0.9,0.9,0.9,1.0,0.9,1.0,0.921818


In [27]:
# Show the per-feature importances of whatever classifier `clf` currently holds.
# NOTE(review): this cell's execution count is out of order relative to the
# cells around it, so the displayed array was presumably produced by a
# different `clf` than the one the preceding cell leaves behind. That cell
# fits AdaBoost over LogisticRegression, which is not expected to expose
# feature_importances_ -- confirm which model this was meant to inspect and
# fit it explicitly here so the cell survives Restart & Run All.
clf.feature_importances_

array([0.01473701, 0.05516657, 0.26068903, 0.07875163, 0.05438041,
       0.08016524, 0.03463198, 0.        , 0.        , 0.        ,
       0.01178961, 0.        , 0.        , 0.00631586, 0.        ,
       0.        , 0.05663701, 0.02094207, 0.        , 0.        ,
       0.01105276, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.07979315,
       0.01913898, 0.        , 0.        , 0.        , 0.02873718,
       0.02110072, 0.        , 0.        , 0.        , 0.03403614,
       0.0469776 , 0.01842127, 0.        , 0.        , 0.        ,
       0.01473701, 0.        , 0.05179875, 0.        , 0.        ])