In [1]:
import pandas as pd
import numpy as np
import pickle
import warnings

from collections import Counter
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score, confusion_matrix

from IPython.core.interactiveshell import InteractiveShell
warnings.filterwarnings('ignore')
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
cat_feat = pickle.load(open('./data/01_cat_feat.pkl', 'rb'))
num_feat = pickle.load(open('./data/01_num_feat.pkl', 'rb'))
comb_feat = pickle.load(open('./data/01_comb_feat.pkl', 'rb'))
target = pickle.load(open('./data/01_target.pkl', 'rb'))

# Baseline model

## Validation with K-fold

In [3]:
def validation(X_all, y_all, models):
    
    res = []
    
    for model_class in models:
        model = model_class()
        model_name = model.__class__.__name__
        
        acc = []
        f1  = []
        precision = []
        recall = []
        
        skf = KFold(n_splits=10, shuffle=False, random_state=100)
        
        for train_idx, test_idx in skf.split(X_all, y_all):
            model = model_class()
            model.fit(X_all[train_idx], y_all[train_idx])
            
            pred = model.predict(X_all[test_idx])
            acc.append(accuracy_score(y_all[test_idx], pred))
            f1.append(f1_score(y_all[test_idx], pred))
            precision.append(precision_score(y_all[test_idx], pred))
            recall.append(recall_score(y_all[test_idx], pred))
        
        res.append({
            "model": model_name,
            "acc (cv=10)": np.mean(acc),
            "f1 (cv=10)": np.mean(f1),
            "precision (cv=10)": np.mean(precision),
            "recall (cv=10)": np.mean(recall)
        })
        
    return res

In [4]:
models = [SVC, XGBClassifier, LogisticRegression, KNeighborsClassifier, DecisionTreeClassifier, RandomForestClassifier, GradientBoostingClassifier]

* combine features

In [5]:
pd.DataFrame(validation(comb_feat, target, models))

Unnamed: 0,model,acc (cv=10),f1 (cv=10),precision (cv=10),recall (cv=10)
0,SVC,0.788387,0.529896,0.56875,0.503387
1,XGBClassifier,0.761828,0.512293,0.571429,0.471828
2,LogisticRegression,0.82129,0.538077,0.573333,0.512957
3,KNeighborsClassifier,0.771828,0.525706,0.56875,0.496828
4,DecisionTreeClassifier,0.696237,0.476899,0.564706,0.427903
5,RandomForestClassifier,0.765269,0.519482,0.56875,0.486935
6,GradientBoostingClassifier,0.761398,0.522102,0.573333,0.489731


* categorical features

In [6]:
pd.DataFrame(validation(cat_feat, target, models))

Unnamed: 0,model,acc (cv=10),f1 (cv=10),precision (cv=10),recall (cv=10)
0,SVC,0.788172,0.529274,0.56875,0.503172
1,XGBClassifier,0.77172,0.519006,0.571429,0.48172
2,LogisticRegression,0.81129,0.533461,0.573333,0.50629
3,KNeighborsClassifier,0.764731,0.544697,0.566667,0.534731
4,DecisionTreeClassifier,0.718817,0.500624,0.564706,0.463817
5,RandomForestClassifier,0.775054,0.517203,0.564706,0.490054
6,GradientBoostingClassifier,0.76172,0.522534,0.56875,0.493387


* numeric features

In [7]:
pd.DataFrame(validation(num_feat, target, models))

Unnamed: 0,model,acc (cv=10),f1 (cv=10),precision (cv=10),recall (cv=10)
0,SVC,0.672473,0.5073,0.563158,0.47914
1,XGBClassifier,0.616882,0.474233,0.573333,0.415215
2,LogisticRegression,0.643333,0.485049,0.570588,0.44
3,KNeighborsClassifier,0.606559,0.469896,0.5625,0.416559
4,DecisionTreeClassifier,0.593441,0.456584,0.564706,0.401774
5,RandomForestClassifier,0.626989,0.46759,0.566667,0.416989
6,GradientBoostingClassifier,0.607097,0.464587,0.570588,0.41043


In [8]:
X_train, X_test, y_train, y_test = train_test_split(comb_feat, target, test_size=0.2, random_state=100)

# Model tuning

In [9]:
# !pip install scikit-optimize

In [10]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

* Support vector machine

In [11]:
%%time

svc_opt = BayesSearchCV(
    SVC(),
    {
        'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
        'degree': Integer(1, 8),
        'kernel': Categorical(['linear', 'poly', 'rbf']),
    },
    n_iter=32,
    random_state=100,
#     verbose=1,
    n_jobs=-1
)
_ = svc_opt.fit(X_train, y_train)
print('Best params for SVC:', svc_opt.best_params_)

Best params for SVC: OrderedDict([('degree', 4), ('gamma', 0.09097412253862658), ('kernel', 'linear')])
Wall time: 17.5 s


In [12]:
acc = []
f1  = []
precision = []
recall = []

skf = KFold(n_splits=10, shuffle=False, random_state=100)

for train_idx, test_idx in skf.split(X_train, y_train):
    model = SVC(**svc_opt.best_params_)
    _ = model.fit(X_train[train_idx], y_train[train_idx])

    pred = model.predict(X_train[test_idx])
    acc.append(accuracy_score(y_train[test_idx], pred))
    f1.append(f1_score(y_train[test_idx], pred))
    precision.append(precision_score(y_train[test_idx], pred))
    recall.append(recall_score(y_train[test_idx], pred))

pd.DataFrame({
    "model": ['SVC'],
    "acc (cv=10)": [np.mean(acc)],
    "f1 (cv=10)": [np.mean(f1)],
    "precision (cv=10)": [np.mean(precision)],
    "recall (cv=10)": [np.mean(recall)]
})

pred = svc_opt.predict(X_test)

print("Accuracy on testset:", accuracy_score(y_test, pred))
print("F1 on testset:", f1_score(y_test, pred))
print("Precision on testset:", precision_score(y_test, pred))
print("Recall on testset:", recall_score(y_test, pred))
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

Unnamed: 0,model,acc (cv=10),f1 (cv=10),precision (cv=10),recall (cv=10)
0,SVC,0.817667,0.833096,0.807701,0.867596


Accuracy on testset: 0.9016393442622951
F1 on testset: 0.8999999999999999
Precision on testset: 0.84375
Recall on testset: 0.9642857142857143
              precision    recall  f1-score   support

           0       0.97      0.85      0.90        33
           1       0.84      0.96      0.90        28

    accuracy                           0.90        61
   macro avg       0.90      0.91      0.90        61
weighted avg       0.91      0.90      0.90        61

[[28  5]
 [ 1 27]]


* K-Nearest Neighbors

In [13]:
%%time

knn_opt = BayesSearchCV(
    KNeighborsClassifier(),
    {
        'n_neighbors': Integer(5, 10),
        'leaf_size': Integer(25, 35),
        'p': Integer(2, 10)
    },
    n_iter=32,
    random_state=100,
#     verbose=1,
    n_jobs=-1
)
_ = knn_opt.fit(X_train, y_train)
print('Best params for KNN:', knn_opt.best_params_)

Best params for KNN: OrderedDict([('leaf_size', 31), ('n_neighbors', 5), ('p', 8)])
Wall time: 11 s


In [14]:
acc = []
f1  = []
precision = []
recall = []

skf = KFold(n_splits=10, shuffle=False, random_state=100)

for train_idx, test_idx in skf.split(X_train, y_train):
    model = KNeighborsClassifier(**knn_opt.best_params_)
    _ = model.fit(X_train[train_idx], y_train[train_idx])

    pred = model.predict(X_train[test_idx])
    acc.append(accuracy_score(y_train[test_idx], pred))
    f1.append(f1_score(y_train[test_idx], pred))
    precision.append(precision_score(y_train[test_idx], pred))
    recall.append(recall_score(y_train[test_idx], pred))

pd.DataFrame({
    "model (cv=10)": ['KNeighborsClassifier'],
    "acc (cv=10)": [np.mean(acc)],
    "f1 (cv=10)": [np.mean(f1)],
    "precision (cv=10)": [np.mean(precision)],
    "recall (cv=10)": [np.mean(recall)]
})

pred = knn_opt.predict(X_test)

print("Accuracy on testset:", accuracy_score(y_test, pred))
print("F1 on testset:", f1_score(y_test, pred))
print("Precision on testset:", precision_score(y_test, pred))
print("Recall on testset:", recall_score(y_test, pred))
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

Unnamed: 0,model (cv=10),acc (cv=10),f1 (cv=10),precision (cv=10),recall (cv=10)
0,KNeighborsClassifier,0.817833,0.836632,0.830615,0.853248


Accuracy on testset: 0.8360655737704918
F1 on testset: 0.8437499999999999
Precision on testset: 0.75
Recall on testset: 0.9642857142857143
              precision    recall  f1-score   support

           0       0.96      0.73      0.83        33
           1       0.75      0.96      0.84        28

    accuracy                           0.84        61
   macro avg       0.85      0.85      0.84        61
weighted avg       0.86      0.84      0.84        61

[[24  9]
 [ 1 27]]


* Logistic Regression

In [15]:
%%time

lr_opt = BayesSearchCV(
    LogisticRegression(),
    {
        'C': Real(1e-6, 1e6, prior='log-uniform'),
        'max_iter': Integer(100, 200),
        'solver': Categorical(['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])
    },
    n_iter=32,
    random_state=100,
#     verbose=1,
    n_jobs=-1
)
_ = lr_opt.fit(X_train, y_train)
print('Best params for LR:', lr_opt.best_params_)

Best params for LR: OrderedDict([('C', 8.541524145010282), ('max_iter', 107), ('solver', 'saga')])
Wall time: 17.7 s


In [16]:
acc = []
f1  = []
precision = []
recall = []

skf = KFold(n_splits=10, shuffle=False, random_state=100)

for train_idx, test_idx in skf.split(X_train, y_train):
    model = LogisticRegression(**lr_opt.best_params_)
    _ = model.fit(X_train[train_idx], y_train[train_idx])

    pred = model.predict(X_train[test_idx])
    acc.append(accuracy_score(y_train[test_idx], pred))
    f1.append(f1_score(y_train[test_idx], pred))
    precision.append(precision_score(y_train[test_idx], pred))
    recall.append(recall_score(y_train[test_idx], pred))

pd.DataFrame({
    "model": ['LogisticRegression'],
    "acc (cv=10)": [np.mean(acc)],
    "f1 (cv=10)": [np.mean(f1)],
    "precision (cv=10)": [np.mean(precision)],
    "recall (cv=10)": [np.mean(recall)]
})

pred = lr_opt.predict(X_test)

print("Accuracy on testset:", accuracy_score(y_test, pred))
print("F1 on testset:", f1_score(y_test, pred))
print("Precision on testset:", precision_score(y_test, pred))
print("Recall on testset:", recall_score(y_test, pred))
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

Unnamed: 0,model,acc (cv=10),f1 (cv=10),precision (cv=10),recall (cv=10)
0,LogisticRegression,0.830167,0.841212,0.839939,0.851363


Accuracy on testset: 0.9016393442622951
F1 on testset: 0.8999999999999999
Precision on testset: 0.84375
Recall on testset: 0.9642857142857143
              precision    recall  f1-score   support

           0       0.97      0.85      0.90        33
           1       0.84      0.96      0.90        28

    accuracy                           0.90        61
   macro avg       0.90      0.91      0.90        61
weighted avg       0.91      0.90      0.90        61

[[28  5]
 [ 1 27]]


# Stacking models

In [17]:
def train(X, y, models):    
    res_model = []
    
    for model_class in models:
        model = model_class()
        _ = model.fit(X, y)
        
        res_model.append(model)
        
    return res_model

In [18]:
def stack_output(X, models):
    res = []
    for model in models:
        pred = model.predict(X).reshape((-1, 1))
        res.append(pred)
    return np.hstack(res)

def stack_output_v2(X, models):
    res = []
    for model in models:
        pred = model.predict(X).reshape((-1, 1))
        res.append(pred)
    return np.hstack(res + [X])

* Simple stacking

In [19]:
skf = StratifiedKFold(n_splits=10, shuffle=False, random_state=100)

In [20]:
acc = []
f1  = []
precision = []
recall = []

for train_idx, test_idx in skf.split(X_train, y_train):
    models = [SVC, LogisticRegression, KNeighborsClassifier]
    base_models = train(X_train[train_idx], y_train[train_idx], models)
    
    train_out_base = stack_output(X_train[train_idx], base_models)
    
    meta_model = SVC()
    _ = meta_model.fit(train_out_base, y_train[train_idx])
    
    test_out_base = stack_output(X_train[test_idx], base_models)
    
    test_set_base = stack_output(X_test, base_models)
    
    pred = meta_model.predict(test_out_base)
    
    acc.append(accuracy_score(y_train[test_idx], pred))
    f1.append(f1_score(y_train[test_idx], pred))
    precision.append(precision_score(y_train[test_idx], pred))
    recall.append(recall_score(y_train[test_idx], pred))
    
pd.DataFrame({
    "model": ["Stacking 1 meta-learner"],
    "acc (cv=10)": [np.mean(acc)],
    "f1 (cv=10)": [np.mean(f1)],
    "precision (cv=10)": [np.mean(precision)],
    "recall (cv=10)": [np.mean(recall)]
})

pred = meta_model.predict(test_set_base)

print("Accuracy on testset:", accuracy_score(y_test, pred))
print("F1 on testset:", f1_score(y_test, pred))
print("Precision on testset:", precision_score(y_test, pred))
print("Recall on testset:", recall_score(y_test, pred))
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

Unnamed: 0,model,acc (cv=10),f1 (cv=10),precision (cv=10),recall (cv=10)
0,Stacking 1 meta-learner,0.814,0.838989,0.823572,0.869231


Accuracy on testset: 0.819672131147541
F1 on testset: 0.8253968253968255
Precision on testset: 0.7428571428571429
Recall on testset: 0.9285714285714286
              precision    recall  f1-score   support

           0       0.92      0.73      0.81        33
           1       0.74      0.93      0.83        28

    accuracy                           0.82        61
   macro avg       0.83      0.83      0.82        61
weighted avg       0.84      0.82      0.82        61

[[24  9]
 [ 2 26]]


* majority vote

In [21]:
acc = []
f1  = []
precision = []
recall = []

for train_idx, test_idx in skf.split(X_train, y_train):
    models = [SVC, LogisticRegression, KNeighborsClassifier]
    base_models = train(X_train[train_idx], y_train[train_idx], models)
    
    train_out_base = stack_output(X_train[train_idx], base_models)
    
    test_out_base = stack_output(X_train[test_idx], base_models)
    
    test_set_base = stack_output(X_test, base_models)
    
    pred = []

    for row in test_out_base:
        pred.append(Counter(row).most_common(1)[0][0])

    pred = np.array(pred).reshape((-1, 1))
    
    acc.append(accuracy_score(y_train[test_idx], pred))
    f1.append(f1_score(y_train[test_idx], pred))
    precision.append(precision_score(y_train[test_idx], pred))
    recall.append(recall_score(y_train[test_idx], pred))
    
pd.DataFrame({
    "model": ["Ensemble majority vote"],
    "acc (cv=10)": [np.mean(acc)],
    "f1 (cv=10)": [np.mean(f1)],
    "precision (cv=10)": [np.mean(precision)],
    "recall (cv=10)": [np.mean(recall)]
})

pred = []

for row in test_set_base:
    pred.append(Counter(row).most_common(1)[0][0])
    
pred = np.array(pred).reshape((-1, 1))

print("Accuracy on testset:", accuracy_score(y_test, pred))
print("F1 on testset:", f1_score(y_test, pred))
print("Precision on testset:", precision_score(y_test, pred))
print("Recall on testset:", recall_score(y_test, pred))
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

Unnamed: 0,model,acc (cv=10),f1 (cv=10),precision (cv=10),recall (cv=10)
0,Ensemble majority vote,0.8265,0.85021,0.831846,0.875275


Accuracy on testset: 0.8688524590163934
F1 on testset: 0.8666666666666666
Precision on testset: 0.8125
Recall on testset: 0.9285714285714286
              precision    recall  f1-score   support

           0       0.93      0.82      0.87        33
           1       0.81      0.93      0.87        28

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.88      0.87      0.87        61

[[27  6]
 [ 2 26]]


* 3 meta and majorityy vote

In [22]:
acc = []
f1  = []
precision = []
recall = []

for train_idx, test_idx in skf.split(X_train, y_train):
    models = [SVC, LogisticRegression, KNeighborsClassifier]
    base_models = train(X_train[train_idx], y_train[train_idx], models)
    
    train_out_base1 = stack_output(X_train[train_idx], base_models)
    train_out_base2 = stack_output_v2(X_train[train_idx], base_models)
    
    test_out_base1 = stack_output(X_train[test_idx], base_models)
    test_out_base2 = stack_output_v2(X_train[test_idx], base_models)
    
    test_set_base1 = stack_output(X_test, base_models)
    test_set_base2 = stack_output_v2(X_test, base_models)
    
    meta_model1 = SVC()
    meta_model2 = GradientBoostingClassifier()
    meta_model3 = LogisticRegression()

    _ = meta_model1.fit(train_out_base1, y_train[train_idx])
    _ = meta_model2.fit(train_out_base2, y_train[train_idx])
    _ = meta_model3.fit(X_train[train_idx], y_train[train_idx])

    pred1 = meta_model1.predict(test_out_base1).reshape((-1, 1))
    pred2 = meta_model2.predict(test_out_base2).reshape((-1, 1))
    pred3 = meta_model3.predict(X_train[test_idx]).reshape((-1, 1))

    preds = np.hstack([pred1, pred2, pred3])
    
    pred = []

    for row in preds:
        pred.append(Counter(row).most_common(1)[0][0])

    pred = np.array(pred).reshape((-1, 1))
    
    acc.append(accuracy_score(y_train[test_idx], pred))
    f1.append(f1_score(y_train[test_idx], pred))
    precision.append(precision_score(y_train[test_idx], pred))
    recall.append(recall_score(y_train[test_idx], pred))
    
pd.DataFrame({
    "model": ["Stacking 3 meta-learner and Majority vote"],
    "acc (cv=10)": [np.mean(acc)],
    "f1 (cv=10)": [np.mean(f1)],
    "precision (cv=10)": [np.mean(precision)],
    "recall (cv=10)": [np.mean(recall)]
})

pred1 = meta_model1.predict(test_set_base1).reshape((-1, 1))
pred2 = meta_model2.predict(test_set_base2).reshape((-1, 1))
pred3 = meta_model3.predict(X_test).reshape((-1, 1))

preds = np.hstack([pred1, pred2, pred3])

pred = []

for row in preds:
    pred.append(Counter(row).most_common(1)[0][0])

pred = np.array(pred).reshape((-1, 1))

print("Accuracy on testset:", accuracy_score(y_test, pred))
print("F1 on testset:", f1_score(y_test, pred))
print("Precision on testset:", precision_score(y_test, pred))
print("Recall on testset:", recall_score(y_test, pred))
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

Unnamed: 0,model,acc (cv=10),f1 (cv=10),precision (cv=10),recall (cv=10)
0,Stacking 3 meta-learner and Majority vote,0.814,0.837879,0.828316,0.861538


Accuracy on testset: 0.8360655737704918
F1 on testset: 0.8387096774193549
Precision on testset: 0.7647058823529411
Recall on testset: 0.9285714285714286
              precision    recall  f1-score   support

           0       0.93      0.76      0.83        33
           1       0.76      0.93      0.84        28

    accuracy                           0.84        61
   macro avg       0.85      0.84      0.84        61
weighted avg       0.85      0.84      0.84        61

[[25  8]
 [ 2 26]]
