In [32]:
# prerequisite
# pip install -U imbalanced-learn

In [33]:
from pipelines.DataPipeline import DataPipeline
from transformers.Preprocessing import Preprocessing
from sklearn.model_selection import train_test_split
from utils.Common import Config
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE

import pandas as pd
import numpy as np
import joblib

In [34]:
RAW_DATA_PATH = "../data/raw/KSI.csv"
df = pd.read_csv(RAW_DATA_PATH)

In [35]:
# fill missing values, adding new columns, extracting useful columns
pc = Preprocessing(df, Config.binary_columns, Config.cat_attribs, Config.num_attribs, Config.label)
new_df = pc.getFrame()

In [36]:
# seperate feature and label
X = new_df[Config.cat_attribs + Config.num_attribs+ Config.binary_columns]
Y = new_df[Config.label]

In [37]:
# pass feature to pipeline and convert it to numerical data
dp = DataPipeline(Config.num_attribs,Config.cat_attribs)
X = dp.process(X)

In [38]:
X.isna().sum().sum()

0

In [39]:
Y.value_counts().tolist()

[14246, 2201]

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=Config.test_size, stratify=Y)

In [41]:
smote_minority = SMOTE(n_jobs = -1, sampling_strategy = "minority")
X_train_sm, y_train_sm = smote_minority.fit_resample(X_train, y_train)




In [42]:

# Create param grid

from sklearn.model_selection import RandomizedSearchCV


param_grid = {
              'criterion': ['gini', 'entropy'], 
              'min_samples_split':  range(5,50, 5),
              'min_samples_leaf': range(1, 15, 3),
              'max_depth': range(5, 20, 1),
              'splitter' : ['best','random']
              } 

clf = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(),
    scoring="roc_auc",
    param_distributions=param_grid,
    cv=10,
    n_iter = 7,
    refit=True,
    verbose=3
)

best_clf = clf.fit(X_train_sm, y_train_sm)

best_clf.best_score_

Fitting 10 folds for each of 7 candidates, totalling 70 fits
[CV 1/10] END criterion=gini, max_depth=17, min_samples_leaf=13, min_samples_split=40, splitter=random;, score=0.811 total time=   0.0s
[CV 2/10] END criterion=gini, max_depth=17, min_samples_leaf=13, min_samples_split=40, splitter=random;, score=0.875 total time=   0.0s
[CV 3/10] END criterion=gini, max_depth=17, min_samples_leaf=13, min_samples_split=40, splitter=random;, score=0.941 total time=   0.0s
[CV 4/10] END criterion=gini, max_depth=17, min_samples_leaf=13, min_samples_split=40, splitter=random;, score=0.934 total time=   0.0s
[CV 5/10] END criterion=gini, max_depth=17, min_samples_leaf=13, min_samples_split=40, splitter=random;, score=0.929 total time=   0.0s
[CV 6/10] END criterion=gini, max_depth=17, min_samples_leaf=13, min_samples_split=40, splitter=random;, score=0.924 total time=   0.0s
[CV 7/10] END criterion=gini, max_depth=17, min_samples_leaf=13, min_samples_split=40, splitter=random;, score=0.923 total 

0.9442224420507858

In [43]:
best_model = best_clf.best_estimator_
best_model.score(X_test,y_test)

0.8434650455927052

In [44]:
best_clf.best_params_
# {'splitter': 'best', 'min_samples_split': 15, 'min_samples_leaf': 10, 'max_depth': 18, 'criterion': 'entropy'}

{'splitter': 'best',
 'min_samples_split': 15,
 'min_samples_leaf': 10,
 'max_depth': 18,
 'criterion': 'entropy'}

In [45]:
import joblib

best_model = DecisionTreeClassifier(splitter = 'best'
                                    , min_samples_split = 15
                                    , min_samples_leaf = 10
                                    , max_depth = 18
                                    , criterion = 'entropy'
                                    )
best_model.fit(X_train_sm, y_train_sm)

joblib.dump(best_model,'../models/best_model_decision_tree.pkl')


['../models/best_model_decision_tree.pkl']