In [1]:
# prerequisite
# pip install -U imbalanced-learn

In [2]:
from pipelines.DataPipeline import DataPipeline
from transformers.Preprocessing import Preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from utils.Common import Config

import pandas as pd
import numpy as np
import joblib

In [3]:
RAW_DATA_PATH = "../data/raw/KSI.csv"
df = pd.read_csv(RAW_DATA_PATH)

In [4]:
# fill missing values, adding new columns, extracting useful columns
pc = Preprocessing(df, Config.binary_columns, Config.cat_attribs, Config.num_attribs, Config.label)
new_df = pc.getFrame()

In [5]:
# seperate feature and label
X = new_df[Config.cat_attribs + Config.num_attribs+ Config.binary_columns]
Y = new_df[Config.label]

In [6]:
# pass feature to pipeline and convert it to numerical data
dp = DataPipeline(Config.num_attribs,Config.cat_attribs)
X = dp.process(X)

In [7]:
X.isna().sum().sum()

0

In [8]:
Y.value_counts().tolist()

[14246, 2201]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=Config.test_size, stratify=Y, random_state=42)

In [10]:
smote_minority = SMOTE(sampling_strategy = "minority", random_state=42)
X_train_sm, y_train_sm = smote_minority.fit_resample(X_train, y_train)


In [11]:

# Create param grid

param_grid = {
              'criterion': ['gini', 'entropy'], 
              'min_samples_split':  range(5,50, 5),
              'min_samples_leaf': range(1, 15, 3),
              'max_depth': range(5, 20, 1),
              'splitter' : ['best','random']
              } 

clf = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    scoring="roc_auc",
    param_distributions=param_grid,
    cv=10,
    n_iter = 7,
    refit=True,
    verbose=3,
    random_state=42
)

best_clf = clf.fit(X_train_sm, y_train_sm)

best_clf.best_score_

Fitting 10 folds for each of 7 candidates, totalling 70 fits
[CV 1/10] END criterion=gini, max_depth=14, min_samples_leaf=7, min_samples_split=40, splitter=best;, score=0.827 total time=   0.1s
[CV 2/10] END criterion=gini, max_depth=14, min_samples_leaf=7, min_samples_split=40, splitter=best;, score=0.883 total time=   0.1s
[CV 3/10] END criterion=gini, max_depth=14, min_samples_leaf=7, min_samples_split=40, splitter=best;, score=0.952 total time=   0.1s
[CV 4/10] END criterion=gini, max_depth=14, min_samples_leaf=7, min_samples_split=40, splitter=best;, score=0.952 total time=   0.1s
[CV 5/10] END criterion=gini, max_depth=14, min_samples_leaf=7, min_samples_split=40, splitter=best;, score=0.954 total time=   0.1s
[CV 6/10] END criterion=gini, max_depth=14, min_samples_leaf=7, min_samples_split=40, splitter=best;, score=0.953 total time=   0.1s
[CV 7/10] END criterion=gini, max_depth=14, min_samples_leaf=7, min_samples_split=40, splitter=best;, score=0.952 total time=   0.1s
[CV 8/10

0.9442184695986688

In [12]:
best_model = best_clf.best_estimator_
best_model.score(X_test,y_test)

0.8443768996960487

In [13]:
from sklearn.metrics import classification_report

y_train_pred = best_model.predict(X_train)
y_train_sm_pred = best_model.predict(X_train_sm)
y_test_pred = best_model.predict(X_test)

print(classification_report(y_train,y_train_pred))
print(classification_report(y_train_sm,y_train_sm_pred))
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94     11396
           1       0.60      0.63      0.61      1761

    accuracy                           0.89     13157
   macro avg       0.77      0.78      0.78     13157
weighted avg       0.90      0.89      0.90     13157

              precision    recall  f1-score   support

           0       0.90      0.94      0.92     11396
           1       0.93      0.89      0.91     11396

    accuracy                           0.92     22792
   macro avg       0.92      0.92      0.92     22792
weighted avg       0.92      0.92      0.92     22792

              precision    recall  f1-score   support

           0       0.91      0.91      0.91      2850
           1       0.42      0.43      0.42       440

    accuracy                           0.84      3290
   macro avg       0.67      0.67      0.67      3290
weighted avg       0.85      0.84      0.85      3290



In [14]:
best_clf.best_params_
# {'splitter': 'best', 'min_samples_split': 45, 'min_samples_leaf': 4, 'max_depth': 19, 'criterion': 'gini'}

{'splitter': 'best',
 'min_samples_split': 45,
 'min_samples_leaf': 4,
 'max_depth': 19,
 'criterion': 'gini'}

In [15]:
import joblib

best_model = DecisionTreeClassifier(splitter = 'best'
                                    , min_samples_split = 45
                                    , min_samples_leaf = 4
                                    , max_depth = 19
                                    , criterion = 'gini'
                                    )
best_model.fit(X_train_sm, y_train_sm)

joblib.dump(best_model,'../models/best_model_decision_tree.pkl')


['../models/best_model_decision_tree.pkl']