In [105]:
# prerequisite
# pip install -U imbalanced-learn

In [106]:
from pipelines.DataPipeline import DataPipeline
from transformers.Preprocessing import Preprocessing
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from utils.Common import Config

import pandas as pd
import numpy as np
import joblib


In [107]:
RAW_DATA_PATH = "../data/raw/KSI.csv"
df = pd.read_csv(RAW_DATA_PATH)

In [108]:
# fill missing values, adding new columns, extracting useful columns
pc = Preprocessing(df, Config.binary_columns, Config.cat_attribs, Config.num_attribs, Config.label)
new_df = pc.getFrame()

In [109]:
# seperate feature and label
X = new_df[Config.cat_attribs + Config.num_attribs+ Config.binary_columns]
Y = new_df[Config.label]

In [110]:
# pass feature to pipeline and convert it to numerical data
X = DataPipeline(Config.num_attribs,Config.cat_attribs).process(X)

In [111]:
X.isna().sum().sum()

0

In [112]:
Y.value_counts().tolist()

[14246, 2201]

In [113]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=Config.test_size, stratify=Y, random_state=42)

In [114]:
smote_minority = SMOTE(sampling_strategy = "minority", random_state=42)
X_train_sm, y_train_sm = smote_minority.fit_resample(X_train, y_train)


In [115]:
# pip install xgboost

In [116]:

clf = XGBClassifier(learning_rate=0.01, n_estimators=600, objective='binary:logistic',
                 random_state = 42)
clf.fit(X_train_sm, y_train_sm)

In [117]:
clf.score(X_train,y_train)

0.8932127384662157

In [118]:
clf.score(X_train_sm, y_train_sm)

0.9282643032643033

In [119]:
clf.score(X_test,y_test)

0.8717325227963526

In [120]:
# Create param grid
param_grid = {
        'gamma': [0.5, 2.5, 5],
        'subsample': [0.6, 1.0],
        'colsample_bytree': [0.6,1.0],
        'max_depth': [3, 5]
        }

clf = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.01, n_estimators=600, objective='binary:logistic', random_state = 42
                 ),
    scoring="accuracy",
    param_grid=param_grid,
    cv=10,
    refit=True,
    verbose=3
)

best_clf = clf.fit(X_train_sm, y_train_sm)

best_clf.best_score_

Fitting 10 folds for each of 24 candidates, totalling 240 fits
[CV 1/10] END colsample_bytree=0.6, gamma=0.5, max_depth=3, subsample=0.6;, score=0.563 total time=   2.6s
[CV 2/10] END colsample_bytree=0.6, gamma=0.5, max_depth=3, subsample=0.6;, score=0.738 total time=   2.6s
[CV 3/10] END colsample_bytree=0.6, gamma=0.5, max_depth=3, subsample=0.6;, score=0.932 total time=   2.6s
[CV 4/10] END colsample_bytree=0.6, gamma=0.5, max_depth=3, subsample=0.6;, score=0.927 total time=   2.6s
[CV 5/10] END colsample_bytree=0.6, gamma=0.5, max_depth=3, subsample=0.6;, score=0.937 total time=   2.6s
[CV 6/10] END colsample_bytree=0.6, gamma=0.5, max_depth=3, subsample=0.6;, score=0.935 total time=   2.7s
[CV 7/10] END colsample_bytree=0.6, gamma=0.5, max_depth=3, subsample=0.6;, score=0.936 total time=   2.7s
[CV 8/10] END colsample_bytree=0.6, gamma=0.5, max_depth=3, subsample=0.6;, score=0.928 total time=   2.7s
[CV 9/10] END colsample_bytree=0.6, gamma=0.5, max_depth=3, subsample=0.6;, score

0.9084549433038498

In [121]:
best_model = best_clf.best_estimator_
best_model.score(X_test,y_test)

0.8683890577507599

In [122]:
best_model.score(X_train,y_train)

0.8841681234323934

In [125]:
from sklearn.metrics import classification_report

y_train_pred = best_model.predict(X_train)
y_train_sm_pred = best_model.predict(X_train_sm)
y_test_pred = best_model.predict(X_test)

print(classification_report(y_train,y_train_pred))
print(classification_report(y_train_sm,y_train_sm_pred))
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           0       0.89      0.98      0.94     11396
           1       0.68      0.25      0.36      1761

    accuracy                           0.88     13157
   macro avg       0.79      0.61      0.65     13157
weighted avg       0.87      0.88      0.86     13157

              precision    recall  f1-score   support

           0       0.87      0.98      0.93     11396
           1       0.98      0.86      0.92     11396

    accuracy                           0.92     22792
   macro avg       0.93      0.92      0.92     22792
weighted avg       0.93      0.92      0.92     22792

              precision    recall  f1-score   support

           0       0.88      0.98      0.93      2850
           1       0.49      0.15      0.23       440

    accuracy                           0.87      3290
   macro avg       0.69      0.56      0.58      3290
weighted avg       0.83      0.87      0.83      3290



In [123]:
best_clf.best_params_
# {'colsample_bytree': 1.0, 'gamma': 0.5, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.6}

{'colsample_bytree': 1.0, 'gamma': 0.5, 'max_depth': 5, 'subsample': 0.6}

In [124]:
import joblib

best_model = XGBClassifier(learning_rate=0.01, n_estimators=600, objective='binary:logistic',colsample_bytree= 1.0, gamma= 0.5, max_depth= 5, min_child_weight= 1, subsample= 0.6)
best_model.fit(X_train_sm, y_train_sm)
joblib.dump(best_model,'../models/best_model_xgboost.pkl')


['../models/best_model_xgboost.pkl']