In [110]:
import pandas as pd
import numpy as np
import pickle
import warnings

from IPython.core.interactiveshell import InteractiveShell
warnings.filterwarnings('ignore')
InteractiveShell.ast_node_interactivity = 'all'

In [111]:
cat_feat = pickle.load(open('./data/01_cat_feat.pkl', 'rb'))
num_feat = pickle.load(open('./data/01_num_feat.pkl', 'rb'))
comb_feat = pickle.load(open('./data/01_comb_feat.pkl', 'rb'))
target = pickle.load(open('./data/01_target.pkl', 'rb'))

# Baseline model

In [112]:
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score

## validation

In [113]:
def validation(X_all, y_all, models):
    
    X_train, X_test, y_train, y_test = train_test_split(comb_feat, target, stratify=target, test_size=0.1, random_state=100)
    
    res = []
    
    for model_class in models:
        model = model_class()
        model_name = model.__class__.__name__
        
        cv_score = cross_val_score(model, X_all, y_all, cv=10)
        
        model = model_class()
        _ = model.fit(X_train, y_train)
        pred =model.predict(X_test)
        
        acc = accuracy_score(y_test, pred)
        f1 = f1_score(y_test, pred)
        precision = precision_score(y_test, pred)
        recall = recall_score(y_test, pred)
        
        res.append({
            "model": model_name,
            "cv_score": cv_score.mean(),
            "acc": acc,
            "f1": f1,
            "precision": precision,
            "recall": recall
        })
        
    return res

In [114]:
models = [SVC, XGBClassifier, LogisticRegression, KNeighborsClassifier, DecisionTreeClassifier, RandomForestClassifier, GradientBoostingClassifier]

* combine features

In [115]:
pd.DataFrame(validation(comb_feat, target, models))

Unnamed: 0,model,cv_score,acc,f1,precision,recall
0,SVC,0.854624,0.903226,0.914286,0.888889,0.941176
1,XGBClassifier,0.788065,0.83871,0.864865,0.8,0.941176
2,LogisticRegression,0.844624,0.903226,0.914286,0.888889,0.941176
3,KNeighborsClassifier,0.811828,0.806452,0.823529,0.823529,0.823529
4,DecisionTreeClassifier,0.71871,0.741935,0.777778,0.736842,0.823529
5,RandomForestClassifier,0.831398,0.83871,0.864865,0.8,0.941176
6,GradientBoostingClassifier,0.811183,0.870968,0.888889,0.842105,0.941176


* categorical features

In [116]:
pd.DataFrame(validation(cat_feat, target, models))

Unnamed: 0,model,cv_score,acc,f1,precision,recall
0,SVC,0.851075,0.903226,0.914286,0.888889,0.941176
1,XGBClassifier,0.831183,0.83871,0.864865,0.8,0.941176
2,LogisticRegression,0.834409,0.903226,0.914286,0.888889,0.941176
3,KNeighborsClassifier,0.82828,0.806452,0.823529,0.823529,0.823529
4,DecisionTreeClassifier,0.775269,0.83871,0.864865,0.8,0.941176
5,RandomForestClassifier,0.83828,0.806452,0.842105,0.761905,0.941176
6,GradientBoostingClassifier,0.827957,0.870968,0.888889,0.842105,0.941176


* numeric features

In [117]:
pd.DataFrame(validation(num_feat, target, models))

Unnamed: 0,model,cv_score,acc,f1,precision,recall
0,SVC,0.735699,0.903226,0.914286,0.888889,0.941176
1,XGBClassifier,0.646237,0.83871,0.864865,0.8,0.941176
2,LogisticRegression,0.729677,0.903226,0.914286,0.888889,0.941176
3,KNeighborsClassifier,0.669892,0.806452,0.823529,0.823529,0.823529
4,DecisionTreeClassifier,0.633871,0.709677,0.756757,0.7,0.823529
5,RandomForestClassifier,0.702903,0.83871,0.864865,0.8,0.941176
6,GradientBoostingClassifier,0.693333,0.870968,0.888889,0.842105,0.941176


# Stacking models

* simple stacking

In [118]:
def train(X, y, models):
    
    res_model = []
    
    for model_class in models:
        model = model_class()
        _ = model.fit(X_train, y_train)
        
        res_model.append(model)
        
    return res_model

In [119]:
X_train, X_test, y_train, y_test = train_test_split(comb_feat, target, stratify=target, test_size=0.1, random_state=100)

In [120]:
models = [SVC, LogisticRegression, GradientBoostingClassifier]

In [121]:
base_models = train(X_train, y_train, models)

In [122]:
def stack_output(X, models):
    res = []
    for model in models:
        pred = model.predict(X).reshape((-1, 1))
        res.append(pred)
    return np.hstack(res)

def stack_output_v2(X, models):
    res = []
    for model in models:
        pred = model.predict(X).reshape((-1, 1))
        res.append(pred)
    return np.hstack(res + [X])

In [123]:
train_out_base = stack_output(X_train, base_models)
train_out_base.shape

(272, 3)

In [124]:
meta_model = SVC()
_ = meta_model.fit(train_out_base, y_train)

In [125]:
test_out_base = stack_output(X_test, base_models)
test_out_base

array([[1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [0, 0, 0],
       [1, 1, 1],
       [0, 0, 0],
       [0, 0, 0],
       [0, 1, 1],
       [0, 0, 0],
       [1, 1, 1],
       [0, 0, 0],
       [0, 0, 0],
       [1, 1, 1],
       [1, 1, 1],
       [0, 0, 0],
       [1, 1, 1],
       [0, 0, 0],
       [1, 1, 1],
       [0, 0, 1],
       [1, 1, 1],
       [0, 0, 0],
       [1, 1, 1],
       [1, 1, 1],
       [0, 0, 0],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 0, 0],
       [1, 1, 1],
       [0, 0, 0]], dtype=int64)

In [126]:
meta_model.score(test_out_base, y_test)

0.8709677419354839

* majority vote

In [127]:
from collections import Counter

In [128]:
res = []

for row in test_out_base:
    res.append(Counter(row).most_common(1)[0][0])
    
res = np.array(res).reshape((-1, 1))
# res

In [129]:
accuracy_score(y_test, res)
print(classification_report(y_test, res))

0.9032258064516129

              precision    recall  f1-score   support

           0       0.92      0.86      0.89        14
           1       0.89      0.94      0.91        17

    accuracy                           0.90        31
   macro avg       0.91      0.90      0.90        31
weighted avg       0.90      0.90      0.90        31

