In [14]:
# prerequisite
# pip install -U imbalanced-learn

In [15]:
from pipelines.DataPipeline import DataPipeline
from transformers.Preprocessing import Preprocessing
from sklearn.model_selection import train_test_split

from transformers.Rebalancer import Rebalancer

from utils.DataHelper import DataHelper
from utils.Common import Config
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import matplotlib.pyplot as plt


In [16]:
RAW_DATA_PATH = "../data/raw/KSI.csv"
df = pd.read_csv(RAW_DATA_PATH)

In [17]:
# fill missing values, adding new columns, extracting useful columns
pc = Preprocessing(df, Config.binary_columns, Config.cat_attribs, Config.num_attribs, Config.label)
new_df = pc.getFrame()

In [18]:
# seperate feature and label
X = new_df[Config.cat_attribs + Config.num_attribs+ Config.binary_columns]
Y = new_df[Config.label]

In [19]:
# pass feature to pipeline and convert it to numerical data
X = DataPipeline(Config.num_attribs,Config.cat_attribs).process(X)

In [20]:
X.isna().sum().sum()

0

In [21]:
Y.value_counts().tolist()

[14246, 2201]

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=Config.test_size, stratify=Y)

In [23]:
from imblearn.over_sampling import SMOTE
smote_minority = SMOTE(n_jobs = -1, sampling_strategy = "minority")

X_train_sm, y_train_sm = smote_minority.fit_resample(X_train, y_train)




In [29]:

# Create param grid

from sklearn.model_selection import RandomizedSearchCV


param_grid = {
              'criterion': ['gini', 'entropy'], 
              'min_samples_split':  range(5,50, 5),
              'min_samples_leaf': range(1, 15, 3),
              'max_depth': range(5, 20, 1)
              } 

clf = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    scoring="accuracy",
    param_distributions=param_grid,
    cv=10,
    n_iter = 7,
    refit=True,
    verbose=3
)

best_clf = clf.fit(X_train_sm, y_train_sm)

best_clf.best_score_

Fitting 10 folds for each of 7 candidates, totalling 70 fits
[CV 1/10] END criterion=entropy, max_depth=13, min_samples_leaf=4, min_samples_split=30;, score=0.682 total time=   1.4s
[CV 2/10] END criterion=entropy, max_depth=13, min_samples_leaf=4, min_samples_split=30;, score=0.808 total time=   1.3s
[CV 3/10] END criterion=entropy, max_depth=13, min_samples_leaf=4, min_samples_split=30;, score=0.955 total time=   1.4s
[CV 4/10] END criterion=entropy, max_depth=13, min_samples_leaf=4, min_samples_split=30;, score=0.957 total time=   1.4s
[CV 5/10] END criterion=entropy, max_depth=13, min_samples_leaf=4, min_samples_split=30;, score=0.947 total time=   1.4s
[CV 6/10] END criterion=entropy, max_depth=13, min_samples_leaf=4, min_samples_split=30;, score=0.961 total time=   1.3s
[CV 7/10] END criterion=entropy, max_depth=13, min_samples_leaf=4, min_samples_split=30;, score=0.952 total time=   1.3s
[CV 8/10] END criterion=entropy, max_depth=13, min_samples_leaf=4, min_samples_split=30;, sc

0.91910818071946

In [None]:
best_model = best_clf.best_estimator_
best_model.score(X_test,y_test)

0.9136778115501519

In [None]:
best_clf.best_params_
# (max_depth=14, min_samples_leaf=4, min_samples_split=25)

{'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}

In [32]:
import joblib

joblib.dump(best_clf.best_estimator_,'../models/best_model_random_forest.pkl')


['best_model_random_forest.pkl']