In [1]:
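# Prerequisite: imbalanced-learn must be installed in the kernel's environment
# (it provides imblearn.over_sampling.SMOTE used below). If it is missing, run:
#   %pip install -U imbalanced-learn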

In [2]:
from pipelines.DataPipeline import DataPipeline
from transformers.Preprocessing import Preprocessing
from sklearn.model_selection import train_test_split
from utils.Common import Config
from sklearn.ensemble import StackingClassifier
from imblearn.over_sampling import SMOTE

import pandas as pd
import numpy as np
import joblib

In [3]:
# Location of the raw KSI CSV, relative to the notebook's working directory.
RAW_DATA_PATH = "../data/raw/KSI.csv"

# Read the whole file into a DataFrame; later cells derive everything from `df`.
df = pd.read_csv(filepath_or_buffer=RAW_DATA_PATH)

In [4]:
# Run the project's Preprocessing step on the raw frame. Per the original author's
# note it fills missing values, adds new columns and extracts the useful columns
# (the implementation lives in transformers.Preprocessing, not in this notebook).
pc = Preprocessing(
    df,
    Config.binary_columns,
    Config.cat_attribs,
    Config.num_attribs,
    Config.label,
)

# The cleaned frame that the remaining cells work from.
new_df = pc.getFrame()

In [5]:
# Separate the features from the label.
# Feature order: categorical, then numerical, then binary columns (as listed in Config).
feature_columns = Config.cat_attribs + Config.num_attribs + Config.binary_columns
X = new_df[feature_columns]

# Target column(s) named by Config.label.
Y = new_df[Config.label]

In [6]:
# Pass the features through the project DataPipeline to convert them to numerical data.
#
# The pipeline input is re-selected from `new_df` instead of being read from `X`:
# the original `X = dp.process(X)` overwrote its own input, so running this cell a
# second time pushed already-processed data through the pipeline again. Selecting
# the same columns from `new_df` gives the identical result on a fresh run and
# makes the cell safe to re-run.
#
# NOTE(review): the pipeline is applied to the full data set before the train/test
# split. If DataPipeline.process fits scalers/encoders, test rows influence them --
# confirm against the DataPipeline implementation.
dp = DataPipeline(Config.num_attribs, Config.cat_attribs)
X = dp.process(
    new_df[Config.cat_attribs + Config.num_attribs + Config.binary_columns]
)

In [7]:
# Sanity check: total number of missing values left in the processed features.
# First count NaNs per column, then add the per-column counts together.
nan_counts = X.isna().sum()
nan_counts.sum()

0

In [8]:
# Class balance of the label: frequency of each distinct value, most frequent first,
# shown as a plain Python list.
class_counts = Y.value_counts()
class_counts.tolist()

[14246, 2201]

In [9]:
# Hold out a test split, stratified on the label so both partitions keep the
# original class ratio.
# random_state pins the shuffle: without it every Restart & Run All draws a
# different split, so the scores reported further down could not be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=Config.test_size,
    stratify=Y,
    random_state=42,
)

In [10]:
# Oversample only the minority class, and only on the training split; the test
# split is left untouched so it still reflects the real class balance.
#
# - `n_jobs` is intentionally not passed: imbalanced-learn deprecated it on SMOTE
#   in 0.10 and scheduled it for removal, and this notebook installs the package
#   with `-U`, so a current release may reject the argument. If parallel neighbour
#   search is needed, pass a nearest-neighbours estimator with its own n_jobs via
#   `k_neighbors` instead.
# - `random_state` pins the synthetic sample generation so re-runs are reproducible.
smote_minority = SMOTE(sampling_strategy="minority", random_state=42)
X_train_sm, y_train_sm = smote_minority.fit_resample(X_train, y_train)




In [11]:
# Load the three persisted base estimators (logistic regression, naive Bayes,
# decision tree) in that order and bind each one to its own name.
# joblib.load unpickles the files, so they must come from a trusted source.
lg_clf, nb_clf, dc_clf = (
    joblib.load(model_path)
    for model_path in (
        '../models/best_model_logistic_regression.pkl',
        '../models/best_model_naivebayes.pkl',
        '../models/best_model_decision_tree.pkl',
    )
)


In [12]:

# NOTE(review): RandomForestClassifier is not used by any cell visible here;
# the import is kept in case a later cell relies on it.
from sklearn.ensemble import RandomForestClassifier

# (name, estimator) pairs that form the first layer of the stack.
base_estimators = [
    ('lg', lg_clf),
    ('nb', nb_clf),
    ('dc', dc_clf),
]

# Stacking ensemble over the three loaded models. No final_estimator or cv is
# given, so scikit-learn's defaults apply. With the default cv, fit() trains
# clones of the base estimators, so the loaded models contribute their
# hyper-parameters rather than their fitted state.
sk_hard_clf = StackingClassifier(estimators=base_estimators)

In [13]:
# Train the stacking ensemble on the SMOTE-resampled training split.
sk_hard_clf.fit(X_train_sm, y_train_sm)

In [14]:
# Mean accuracy on the resampled training data. This set includes the samples
# generated by SMOTE, so the value is not directly comparable with the score on
# the untouched test split.
sk_hard_clf.score(X_train_sm, y_train_sm)

0.9026851526851527

In [15]:
# Mean accuracy on the held-out test split (not resampled).
# NOTE(review): the label counts recorded in this notebook are 14246 vs 2201, i.e.
# the majority class is ~86.6% of the data, and the split is stratified. Accuracy
# alone is therefore a weak signal here; consider also reporting per-class
# precision/recall or a confusion matrix.
sk_hard_clf.score(X_test,y_test)

0.8209726443768997

In [16]:
# Persist the fitted stacking ensemble next to the base models.
# joblib is already imported in the notebook's import cell, so it is not
# re-imported here. joblib.dump returns the list of file names it wrote, which
# is what this cell displays.
joblib.dump(sk_hard_clf, '../models/best_model_stacking.pkl')


['../models/best_model_stacking.pkl']