In [147]:
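# Prerequisite: imbalanced-learn must be installed (it provides the SMOTE
# oversampler imported below). Install it once in the kernel's environment:
#   pip install -U imbalanced-learn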

In [148]:
import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from pipelines.DataPipeline import DataPipeline
from transformers.Preprocessing import Preprocessing
from utils.Common import Config

In [149]:
# Path to the raw KSI CSV, relative to this notebook's working directory.
RAW_DATA_PATH = "../data/raw/KSI.csv"
# Load the raw file into a DataFrame; all later cells derive from `df`.
df = pd.read_csv(RAW_DATA_PATH)

In [150]:
# Build the modelling frame from the raw data.
# Per the original author's note, Preprocessing fills missing values, adds new
# columns and extracts the useful columns; its implementation is project-local
# and not visible in this notebook.
pc = Preprocessing(
    df,
    Config.binary_columns,
    Config.cat_attribs,
    Config.num_attribs,
    Config.label,
)
new_df = pc.getFrame()

In [151]:
# Separate the features from the label.
# Features are the categorical, numeric and binary column groups from Config,
# concatenated in that order; the label column(s) come from Config.label.
X = new_df[
    Config.cat_attribs + Config.num_attribs + Config.binary_columns
]
Y = new_df[Config.label]

In [152]:
# Run the selected features through the project DataPipeline, which is built
# from the numeric and categorical column lists. Per the original author's note
# this converts the features to numerical data.
# NOTE(review): X is rebound to the processed output, so re-running this cell on
# its own feeds already-processed data back into the pipeline. Re-run the
# feature/label selection cell first.
dp = DataPipeline(Config.num_attribs,Config.cat_attribs)
X = dp.process(X)

In [153]:
# Sanity check: total number of missing values left in the processed features.
# The recorded run shows 0.
X.isna().sum().sum()

0

In [154]:
# Label distribution as a plain list of counts. The recorded run shows a strong
# imbalance (14246 vs 2201), which is why the training split is oversampled
# with SMOTE below.
Y.value_counts().tolist()

[14246, 2201]

In [155]:
# Hold out a test partition. The split is stratified on the label so both
# partitions keep the same class ratio, and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=Config.test_size,
    stratify=Y,
    random_state=42,
)

In [156]:
# Oversample the minority class of the TRAINING split only. The test split is
# left untouched so evaluation is done on the original class distribution.
smote_minority = SMOTE(
    sampling_strategy="minority",
    random_state=42,
)
X_train_sm, y_train_sm = smote_minority.fit_resample(
    X_train,
    y_train,
)


In [157]:
# Load the previously saved base estimators that the stack is built from.
# Judging by the file names these are a logistic regression, a naive Bayes and
# a decision tree model; the actual object types are not visible here.
# NOTE(review): joblib.load unpickles arbitrary objects, so only load model
# files that come from a trusted source.
lg_clf = joblib.load('../models/best_model_logistic_regression.pkl')
nb_clf = joblib.load('../models/best_model_naivebayes.pkl')
dc_clf = joblib.load('../models/best_model_decision_tree.pkl')


In [158]:
# Pin the same seed on every loaded base model (assigned left to right).
# NOTE(review): this sets a plain attribute; whether it influences fitting
# depends on each loaded object's type, which is not visible here. Confirm
# that every one of these estimators actually takes a `random_state` parameter.
lg_clf.random_state = nb_clf.random_state = dc_clf.random_state = 42

In [159]:
# Stack the three loaded base models; a seeded logistic regression acts as the
# final estimator that combines their outputs.
sk_clf = StackingClassifier(
    estimators=[
        ('lg', lg_clf),
        ('nb', nb_clf),
        ('dc', dc_clf),
    ],
    final_estimator=LogisticRegression(random_state=42),
)

In [160]:
# Fit the stacking ensemble on the SMOTE-resampled training data.
sk_clf.fit(X_train_sm, y_train_sm)

In [161]:
# Score on the original (un-resampled) training split.
sk_clf.score(X_train,y_train)

0.8855362164627195

In [162]:
# Score on the SMOTE-resampled training split (the data the model was fit on).
sk_clf.score(X_train_sm,y_train_sm)

0.9135661635661636

In [163]:
# Score on the held-out test split, which was never resampled.
sk_clf.score(X_test,y_test)

0.8303951367781155

In [164]:
# Per-class precision / recall / F1 for each evaluation set. The three reports
# are printed in this order:
#   1. original training split
#   2. SMOTE-resampled training split
#   3. held-out test split
y_train_pred = sk_clf.predict(X_train)
y_train_sm_pred = sk_clf.predict(X_train_sm)
y_test_pred = sk_clf.predict(X_test)

print(classification_report(y_train, y_train_pred))
print(classification_report(y_train_sm, y_train_sm_pred))
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.95      0.91      0.93     11396
           1       0.56      0.70      0.62      1761

    accuracy                           0.89     13157
   macro avg       0.75      0.81      0.78     13157
weighted avg       0.90      0.89      0.89     13157

              precision    recall  f1-score   support

           0       0.91      0.91      0.91     11396
           1       0.91      0.91      0.91     11396

    accuracy                           0.91     22792
   macro avg       0.91      0.91      0.91     22792
weighted avg       0.91      0.91      0.91     22792

              precision    recall  f1-score   support

           0       0.92      0.88      0.90      2850
           1       0.39      0.49      0.44       440

    accuracy                           0.83      3290
   macro avg       0.66      0.69      0.67      3290
weighted avg       0.85      0.83      0.84      3290



In [165]:
# Persist the fitted stacking ensemble (`sk_clf`) trained in this notebook.
# `joblib` is already imported in the import cell at the top, so it is not
# re-imported here.
joblib.dump(sk_clf, '../models/best_model_stacking.pkl')


['../models/best_model_stacking.pkl']