In [2]:
# prerequisite
# pip install -U imbalanced-learn

In [3]:
from pipelines.DataPipeline import DataPipeline
from transformers.Preprocessing import Preprocessing
from sklearn.model_selection import train_test_split

from utils.Common import Config
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

import pandas as pd
import numpy as np


In [4]:
RAW_DATA_PATH = "../data/raw/KSI.csv"
df = pd.read_csv(RAW_DATA_PATH)

In [5]:
# fill missing values, adding new columns, extracting useful columns
pc = Preprocessing(df, Config.binary_columns, Config.cat_attribs, Config.num_attribs, Config.label)
new_df = pc.getFrame()

In [6]:
# seperate feature and label
X = new_df[Config.cat_attribs + Config.num_attribs+ Config.binary_columns]
Y = new_df[Config.label]

In [7]:
# pass feature to pipeline and convert it to numerical data
X = DataPipeline(Config.num_attribs,Config.cat_attribs).process(X)

In [8]:
X.isna().sum().sum()

0

In [9]:
Y.value_counts().tolist()

[14246, 2201]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=Config.test_size, stratify=Y)

In [11]:
from imblearn.over_sampling import SMOTE
smote_minority = SMOTE(n_jobs = -1, sampling_strategy = "minority")

X_train_sm, y_train_sm = smote_minority.fit_resample(X_train, y_train)




In [11]:

# Create param grid
param_grid = {
              'n_neighbors': [5,7,9,11,13,15],
              'weights': ['uniform', 'distance'],
              'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
               'metric' : ['minkowski','euclidean','manhattan']
            } 

clf = GridSearchCV(
    estimator=KNeighborsClassifier(),
    scoring="accuracy",
    param_grid=param_grid,
    cv=10,
    refit=True,
    verbose=3
)

best_clf = clf.fit(X_train_sm, y_train_sm)

best_clf.best_score_

Fitting 10 folds for each of 144 candidates, totalling 1440 fits
[CV 1/10] END algorithm=auto, metric=minkowski, n_neighbors=5, weights=uniform;, score=0.881 total time=   0.0s
[CV 2/10] END algorithm=auto, metric=minkowski, n_neighbors=5, weights=uniform;, score=0.899 total time=   0.1s
[CV 3/10] END algorithm=auto, metric=minkowski, n_neighbors=5, weights=uniform;, score=0.900 total time=   0.0s
[CV 4/10] END algorithm=auto, metric=minkowski, n_neighbors=5, weights=uniform;, score=0.885 total time=   0.0s
[CV 5/10] END algorithm=auto, metric=minkowski, n_neighbors=5, weights=uniform;, score=0.894 total time=   0.0s
[CV 6/10] END algorithm=auto, metric=minkowski, n_neighbors=5, weights=uniform;, score=0.886 total time=   0.0s
[CV 7/10] END algorithm=auto, metric=minkowski, n_neighbors=5, weights=uniform;, score=0.881 total time=   0.1s
[CV 8/10] END algorithm=auto, metric=minkowski, n_neighbors=5, weights=uniform;, score=0.881 total time=   0.1s
[CV 9/10] END algorithm=auto, metric=mi

0.9529216992679151

In [12]:
best_model = best_clf.best_estimator_
best_model.score(X_test,y_test)

0.8969604863221885

In [13]:
best_clf.best_params_
# {'algorithm': 'auto',  'metric': 'manhattan',  'n_neighbors': 5,  'weights': 'distance'}


{'algorithm': 'auto',
 'metric': 'manhattan',
 'n_neighbors': 5,
 'weights': 'distance'}

In [12]:
import joblib
from sklearn.neighbors import KNeighborsClassifier

best_model = KNeighborsClassifier(algorithm= 'auto',  metric= 'manhattan',  n_neighbors= 5,  weights= 'distance')
best_model.fit(X_train_sm, y_train_sm)

joblib.dump(best_model,'../models/best_model_knn.pkl')


['../models/best_model_knn.pkl']