In [16]:
# Prerequisite: imbalanced-learn must be installed in the kernel's environment, e.g.
#   %pip install -U imbalanced-learn

In [17]:
import matplotlib.pyplot as plt
from pipelines.DataPipeline import DataPipeline
from transformers.Preprocessing import Preprocessing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from utils.Common import Config

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [18]:
# Load the raw dataset; the path is relative to the notebook's working directory.
RAW_DATA_PATH = "../data/raw/KSI.csv"
df = pd.read_csv(RAW_DATA_PATH)

In [19]:
# Run the project's Preprocessing step on the raw frame. Per the original
# author's note it fills missing values, adds new columns and extracts the
# useful columns; the resulting frame is exposed through getFrame().
pc = Preprocessing(
    df,
    Config.binary_columns,
    Config.cat_attribs,
    Config.num_attribs,
    Config.label,
)
new_df = pc.getFrame()

In [20]:
# Separate the feature columns (categorical, numeric, binary) from the label.
feature_columns = Config.cat_attribs + Config.num_attribs + Config.binary_columns
X = new_df[feature_columns]
Y = new_df[Config.label]

In [21]:
# Pass the features through the data pipeline to convert them to numerical data.
#
# The pipeline input is re-selected from new_df instead of being read from X:
# X is overwritten on the last line, so reading it here would make a re-run of
# this cell apply the pipeline to its own output.
dp = DataPipeline(Config.num_attribs, Config.cat_attribs)
X = dp.process(new_df[Config.cat_attribs + Config.num_attribs + Config.binary_columns])

In [22]:
# Sanity check: total number of missing values left in the processed features.
X.isna().sum().sum()

0

In [23]:
# Number of rows per label value, most frequent first (shows the class imbalance).
Y.value_counts().tolist()

[14246, 2201]

In [24]:
# Hold out a test split, stratified on the label so both splits keep the
# original class proportions; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=Config.test_size,
    stratify=Y,
    random_state=42,
)

In [25]:
# Balance the training split only: SMOTE oversamples the minority class, and
# the test split is left untouched.
smote_minority = SMOTE(random_state=42, sampling_strategy="minority")
X_train_sm, y_train_sm = smote_minority.fit_resample(X_train, y_train)


In [26]:

# Grid search over gradient-boosting hyperparameters (10-fold CV, default
# accuracy scoring) on the SMOTE-balanced training split.
#
# NOTE(review): the data was oversampled before cross-validation, so validation
# folds can contain synthetic rows interpolated from training-fold rows and
# best_score_ is likely optimistic. Consider wrapping SMOTE and the classifier
# in an imblearn Pipeline so resampling is fit inside each fold — TODO confirm.
param_grid = {
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    # Floats are interpreted as a fraction of the training samples.
    # NOTE(review): fractions this large permit very few splits per tree
    # regardless of max_depth — confirm the range is intended.
    "min_samples_split": np.linspace(0.5, 1, 5),
    "max_depth": [3, 5, 8],
    "max_features": ["sqrt"],
    "criterion": ["friedman_mse", "squared_error"],
    "subsample": [0.8],
    "n_estimators": [10],
}

clf = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=param_grid,
    n_jobs=10,
    cv=10,
)
# fit() returns the search object itself, so best_clf and clf are the same object.
best_clf = clf.fit(X_train_sm, y_train_sm)

# Mean cross-validated score of the best parameter combination.
best_clf.best_score_


0.720392311955844

In [13]:
# Keep a handle on the estimator selected by the grid search for the
# evaluation cells that follow.
# NOTE(review): an assignment renders no output, so the number recorded under
# this cell appears to be left over from an earlier version — re-run to refresh.
best_model = best_clf.best_estimator_


0.6735562310030395

In [29]:
# Mean accuracy on the original (non-resampled) training split.
best_model.score(X_train, y_train)

0.7148286083453674

In [30]:
# Mean accuracy on the SMOTE-balanced training split the model was fitted on.
best_model.score(X_train_sm, y_train_sm)

0.7483766233766234

In [31]:
# Mean accuracy on the held-out test split.
# NOTE(review): with the recorded test support (2850 vs 440 rows) a model that
# always predicts the majority class would reach about 0.87 accuracy, so
# accuracy alone is not informative here; check the per-class report as well.
best_model.score(X_test, y_test)

0.7042553191489361

In [28]:
from sklearn.metrics import classification_report

# Predict once per split, in the order: original train, SMOTE-balanced train, test.
y_train_pred, y_train_sm_pred, y_test_pred = (
    best_model.predict(features) for features in (X_train, X_train_sm, X_test)
)

# Print the per-class precision/recall/F1 report for each split, in the same order.
for y_true, y_pred in (
    (y_train, y_train_pred),
    (y_train_sm, y_train_sm_pred),
    (y_test, y_test_pred),
):
    print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.76      0.82     11396
           1       0.22      0.45      0.30      1761

    accuracy                           0.71     13157
   macro avg       0.56      0.60      0.56     13157
weighted avg       0.81      0.71      0.75     13157

              precision    recall  f1-score   support

           0       0.74      0.76      0.75     11396
           1       0.75      0.74      0.75     11396

    accuracy                           0.75     22792
   macro avg       0.75      0.75      0.75     22792
weighted avg       0.75      0.75      0.75     22792

              precision    recall  f1-score   support

           0       0.89      0.75      0.81      2850
           1       0.20      0.40      0.26       440

    accuracy                           0.70      3290
   macro avg       0.54      0.57      0.54      3290
weighted avg       0.80      0.70      0.74      3290



In [14]:
# Parameter combination that achieved the best cross-validated score.
best_clf.best_params_
# Values recorded from an earlier run, kept for reference:
#{'criterion': 'friedman_mse', 'learning_rate': 0.2, 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_split': 0.5, 'n_estimators': 10, 'subsample': 0.8}

{'criterion': 'friedman_mse',
 'learning_rate': 0.2,
 'max_depth': 8,
 'max_features': 'sqrt',
 'min_samples_split': 0.5,
 'n_estimators': 10,
 'subsample': 0.8}

In [27]:
import os

import joblib

# Persist the estimator selected by the grid search.
#
# GridSearchCV (default refit=True) has already refitted best_estimator_ on the
# full X_train_sm / y_train_sm with the best parameters and random_state=42.
# Saving that object, instead of retraining an unseeded copy from hand-copied
# hyperparameters, keeps the saved model reproducible and in sync with the search.
MODEL_PATH = '../models/best_model_gradientboost.pkl'
best_model = best_clf.best_estimator_

# Create the output directory on a fresh checkout so the dump cannot fail on a
# missing folder.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump(best_model, MODEL_PATH)


['../models/best_model_gradientboost.pkl']