In [59]:
# prerequisite
# pip install -U imbalanced-learn

In [60]:
from pipelines.DataPipeline import DataPipeline
from transformers.Preprocessing import Preprocessing
from sklearn.model_selection import train_test_split
from utils.Common import Config
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE

import pandas as pd
import numpy as np
import joblib

In [61]:
RAW_DATA_PATH = "../data/raw/KSI.csv"
df = pd.read_csv(RAW_DATA_PATH)

In [62]:
# fill missing values, adding new columns, extracting useful columns
pc = Preprocessing(df, Config.binary_columns, Config.cat_attribs, Config.num_attribs, Config.label)
new_df = pc.getFrame()

In [63]:
# seperate feature and label
X = new_df[Config.cat_attribs + Config.num_attribs+ Config.binary_columns]
Y = new_df[Config.label]

In [65]:
# pass feature to pipeline and convert it to numerical data
dp = DataPipeline(Config.num_attribs,Config.cat_attribs)
X = dp.process(X)

In [67]:
X.isna().sum().sum()

0

In [68]:
Y.value_counts().tolist()

[14246, 2201]

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=Config.test_size, stratify=Y)

In [70]:
smote_minority = SMOTE(n_jobs = -1, sampling_strategy = "minority")
X_train_sm, y_train_sm = smote_minority.fit_resample(X_train, y_train)




In [71]:


rf_clf = joblib.load('../models/best_model_random_forest.pkl')
svc_clf = joblib.load('../models/best_model_svc.pkl')
knn_clf = joblib.load('../models/best_model_knn.pkl')


In [72]:
rf_clf.fit(X_train_sm, y_train_sm)

In [73]:
rf_clf.score(X_train_sm, y_train_sm)

0.9504212004212004

In [74]:
rf_clf.score(X_test,y_test)

0.8641337386018237

In [75]:
svc_clf.fit(X_train_sm, y_train_sm)


In [76]:
svc_clf.score(X_train_sm, y_train_sm)

0.9999122499122499

In [77]:
svc_clf.score(X_test,y_test)

0.9097264437689969

In [78]:
knn_clf.fit(X_train_sm, y_train_sm)

In [79]:
knn_clf.score(X_train_sm, y_train_sm)

1.0

In [80]:
knn_clf.score(X_test,y_test)

0.896048632218845

In [81]:

vt_hard_clf = VotingClassifier(estimators= [
         ('rf', rf_clf), 
         ('svc', svc_clf), 
         ('knn', knn_clf)
         ]
, voting='hard')

In [82]:
vt_hard_clf.fit(X_train_sm, y_train_sm)

In [83]:
vt_hard_clf.score(X_train_sm, y_train_sm)

1.0

In [84]:
vt_hard_clf.score(X_test,y_test)

0.9179331306990881

In [85]:
vt_soft_clf = VotingClassifier(estimators= [
         ('rf', rf_clf), 
         ('svc', svc_clf), 
         ('knn', knn_clf)
         ]
, voting='soft')
vt_soft_clf.named_estimators['svc'].probability=True

In [86]:
vt_soft_clf.fit(X_train_sm, y_train_sm)

In [87]:
vt_soft_clf.score(X_train_sm, y_train_sm)

1.0

In [88]:
vt_soft_clf.score(X_test,y_test)

0.9267477203647416

In [89]:
from sklearn.ensemble import RandomForestClassifier


X_val_predictions = np.empty((len(X_train_sm), len(vt_soft_clf.estimators_)), dtype=np.float32)
for index, estimator in enumerate(vt_soft_clf.estimators_):
    X_val_predictions[:, index] = estimator.predict(X_train_sm)
rnd_forest_blender = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)
rnd_forest_blender.fit(X_val_predictions, y_train_sm)



In [90]:
rnd_forest_blender.oob_score_

1.0

In [91]:
import joblib

joblib.dump(vt_hard_clf,'../models/best_model_voting_hard.pkl')
joblib.dump(vt_soft_clf,'../models/best_model_voting_soft.pkl')


['../models/best_model_voting_soft.pkl']