In [13]:
import matplotlib.pyplot as plt
from pipelines.DataPipeline import DataPipeline
from transformers.Preprocessing import Preprocessing

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier


from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from utils.Common import Config

from sklearn.metrics import classification_report

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [14]:
RAW_DATA_PATH = "../data/raw/KSI.csv"
df = pd.read_csv(RAW_DATA_PATH)
# fill missing values, adding new columns, extracting useful columns
pc = Preprocessing(df, Config.binary_columns, Config.cat_attribs, Config.num_attribs, Config.label)
new_df = pc.getFrame()
# seperate feature and label
X = new_df[Config.cat_attribs + Config.num_attribs+ Config.binary_columns]
Y = new_df[Config.label]
# pass feature to pipeline and convert it to numerical data
X = DataPipeline(Config.num_attribs,Config.cat_attribs).process(X)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=Config.test_size, stratify=Y, random_state=42)
smote_minority = SMOTE(sampling_strategy = "minority", random_state=42)
X_train_sm, y_train_sm = smote_minority.fit_resample(X_train, y_train)


In [17]:
clfs = [
    #DecisionTreeClassifier(random_state=42),
    #GradientBoostingClassifier(random_state=42),
    #KNeighborsClassifier(),
    #LogisticRegression(random_state=42),
    #GaussianNB(),
    RandomForestClassifier(random_state=42, n_estimators=200),
    StackingClassifier(estimators= [
        ('lg', LogisticRegression(random_state=42, solver='newton-cg')), 
        ('nb', GaussianNB()), 
        ('dc', LogisticRegression(random_state=42))
        ], 
         final_estimator=LogisticRegression(random_state=42)
    ),
    VotingClassifier(estimators= [
        ('rf', RandomForestClassifier(random_state=42, n_estimators=200)), 
        ('svc', SVC(random_state=42)), 
        ('knn', KNeighborsClassifier())
        ]
    , voting='hard'),
    SVC(random_state=42),
    XGBClassifier(learning_rate=0.01, n_estimators=600, objective='binary:logistic',
                 random_state = 42)
]

for clf in clfs:
    clf.fit(X_train_sm, y_train_sm)
    y_train_sm_pred = clf.predict(X_train_sm)
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)
    print(f'================={clf.__class__.__name__}=================')
    print(classification_report(y_train,y_train_pred))
    print(classification_report(y_train_sm,y_train_sm_pred))
    print(classification_report(y_test,y_test_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11396
           1       1.00      1.00      1.00      1761

    accuracy                           1.00     13157
   macro avg       1.00      1.00      1.00     13157
weighted avg       1.00      1.00      1.00     13157

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11396
           1       1.00      1.00      1.00     11396

    accuracy                           1.00     22792
   macro avg       1.00      1.00      1.00     22792
weighted avg       1.00      1.00      1.00     22792

              precision    recall  f1-score   support

           0       0.90      0.99      0.94      2850
           1       0.81      0.26      0.40       440

    accuracy                           0.89      3290
   macro avg       0.85      0.63      0.67      3290
weighted avg       0.88      0.89      0.87      3290



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

              precision    recall  f1-score   support

           0       0.93      0.68      0.79     11396
           1       0.25      0.68      0.36      1761

    accuracy                           0.68     13157
   macro avg       0.59      0.68      0.58     13157
weighted avg       0.84      0.68      0.73     13157

              precision    recall  f1-score   support

           0       0.69      0.68      0.69     11396
           1       0.69      0.70      0.69     11396

    accuracy                           0.69     22792
   macro avg       0.69      0.69      0.69     22792
weighted avg       0.69      0.69      0.69     22792

              precision    recall  f1-score   support

           0       0.92      0.68      0.78      2850
           1       0.23      0.64      0.34       440

    accuracy                           0.67      3290
   macro avg       0.58      0.66      0.56      3290
weighted avg       0.83      0.67      0.72      3290

              preci