# KNN Classification - Data Leakage Perspective

In [1]:
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_iris

# Problem Statement

Predict whether a patient is diabetic or not.

# Data Gathering

In [2]:
# Read the raw diabetes records from the CSV next to the notebook and
# display the resulting frame as the cell output.
df = pd.read_csv(filepath_or_buffer="diabetes.csv")
df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,50,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
2,183,64,0,0,23.3,0.672,52,1
3,150,66,23,94,28.1,0.167,21,0
4,150,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63,0
764,122,70,27,0,36.8,0.340,27,0
765,121,72,23,112,26.2,0.245,30,0
766,126,60,0,0,30.1,0.349,47,1


## Train Test Split

In [5]:
# Separate the target column from the predictors.
y = df['Outcome']
x = df.drop(columns='Outcome')

# Hold out 20% of the rows for testing. The split is seeded and stratified
# on the target so both partitions keep the same class proportions.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

# EDA

# Feature Engineering

### Feature Scaling

# 1. Normalization (0 to 1)

In [10]:
# Min-max scaling. The scaler is fitted on the training split only, so no
# statistics from the test split are used to choose the scaling range.
normal_scalar = MinMaxScaler()

array = normal_scalar.fit_transform(x_train)

# Keep x_train's original row labels. y_train still carries those labels, and
# a fresh 0..n-1 index would silently misalign any label-based pandas
# operation (concat, boolean masks built from y_train, joins, ...).
x_train_normal_df = pd.DataFrame(array, columns=x_train.columns, index=x_train.index)
x_train_normal_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.608040,0.472727,0.000000,0.000000,0.536513,0.018408,0.066667
1,0.884422,0.818182,0.343434,0.441176,0.502235,0.163955,0.616667
2,0.562814,0.727273,0.454545,0.194118,0.518629,0.056935,0.050000
3,0.562814,0.600000,0.000000,0.000000,0.563338,0.075771,0.333333
4,0.417085,0.781818,0.191919,0.000000,0.436662,0.099743,0.216667
...,...,...,...,...,...,...,...
609,0.683417,0.636364,0.323232,0.161765,0.552906,0.029538,0.366667
610,0.467337,0.545455,0.252525,0.135294,0.427720,0.191781,0.016667
611,0.788945,0.672727,0.353535,0.647059,0.587183,0.021404,0.150000
612,0.643216,0.618182,0.191919,0.264706,0.454545,0.559503,0.066667


# 2. Standardization (mean 0, standard deviation 1; values are not bounded, though most fall roughly within -3 to +3)

In [25]:
# Standardize each feature. The mean and standard deviation are learned from
# the training split only; the same fitted scaler is reused for the test split.
std_scalar = StandardScaler()

array = std_scalar.fit_transform(x_train)

# Keep x_train's original row labels. y_train still carries those labels, and
# a fresh 0..n-1 index would silently misalign any label-based pandas
# operation between the scaled features and the target.
x_train_std_df = pd.DataFrame(array, columns=x_train.columns, index=x_train.index)
x_train_std_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,-0.016101,-0.952795,-1.312783,-0.728229,0.496832,-1.047703,-0.700731
1,1.726607,1.122117,0.823777,2.032586,0.199293,-0.025033,2.134940
2,-0.301271,0.576088,1.515017,0.486530,0.341594,-0.776996,-0.786660
3,-0.301271,-0.188353,-1.312783,-0.728229,0.729688,-0.644651,0.674140
4,-1.220154,0.903705,-0.118823,-0.728229,-0.369911,-0.476211,0.072634
...,...,...,...,...,...,...,...
609,0.459183,0.030058,0.698097,0.284070,0.639133,-0.969499,0.845999
610,-0.903298,-0.515971,0.258217,0.118421,-0.447530,0.170478,-0.958519
611,1.124581,0.248470,0.886617,3.320966,0.936671,-1.026648,-0.271083
612,0.205698,-0.079148,-0.118823,0.928260,-0.214674,2.754225,-0.700731


# Model Building

### Model training

In [26]:
# Baseline model: KNN with the library's default hyperparameters, trained on
# the standardized training features.
knn_clf = KNeighborsClassifier()
knn_clf.fit(X=x_train_std_df, y=y_train)

### Model Evaluation

In [27]:
# Scale the test split with the scaler that was fitted on the training split
# (transform only, no re-fitting), so no test statistics leak into the scaling.
array = std_scalar.transform(x_test)

# Keep x_test's original row labels so the scaled features stay aligned with
# y_test for any label-based pandas operation (e.g. inspecting misclassified rows).
x_test_std_df = pd.DataFrame(array, columns=x_test.columns, index=x_test.index)
x_test_std_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,-0.713184,0.139264,-0.244503,-0.728229,-0.848560,-0.545392,-0.442942
1,0.680982,0.248470,0.069697,-0.166863,-0.770941,-0.659690,-1.044448
2,-1.030040,0.357676,0.823777,-0.387728,-0.124118,-0.852193,-0.872589
3,-0.079472,-3.792147,-1.312783,-0.728229,-0.900306,-0.801059,0.330422
4,0.142327,0.466882,0.383897,-0.525769,-0.331102,-0.109253,0.588211
...,...,...,...,...,...,...,...
149,-0.523071,0.139264,0.509577,2.262654,0.613260,-0.951452,-0.442942
150,0.680982,0.466882,-1.312783,-0.728229,1.661113,-0.858209,1.189716
151,-0.491385,0.030058,0.446737,0.514138,0.263975,-1.002586,-0.958519
152,1.568179,0.139264,0.760937,0.514138,0.147547,-0.831138,-0.786660


In [28]:
# Evaluate the fitted model on the held-out (test) split.

y_pred = knn_clf.predict(x_test_std_df)

# Compute every metric first, then report them in a fixed order.
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
accurracy = accuracy_score(y_true=y_test, y_pred=y_pred)
clf_report = classification_report(y_true=y_test, y_pred=y_pred)

print("Confusion Matrix -->\n", cnf_matrix)
print("*"*60)
print("Accuracy is -->", accurracy)
print("*"*60)
print("Classification report -->\n",clf_report)

Confusion Matrix -->
 [[84 16]
 [23 31]]
************************************************************
Accuracy is --> 0.7467532467532467
************************************************************
Classification report -->
               precision    recall  f1-score   support

           0       0.79      0.84      0.81       100
           1       0.66      0.57      0.61        54

    accuracy                           0.75       154
   macro avg       0.72      0.71      0.71       154
weighted avg       0.74      0.75      0.74       154



In [29]:
# Evaluate the fitted model on the split it was trained on, for comparison
# with the test-split scores (a large gap would suggest overfitting).

y_pred_train = knn_clf.predict(x_train_std_df)

# Compute every metric first, then report them in a fixed order.
cnf_matrix = confusion_matrix(y_true=y_train, y_pred=y_pred_train)
accurracy = accuracy_score(y_true=y_train, y_pred=y_pred_train)
clf_report = classification_report(y_true=y_train, y_pred=y_pred_train)

print("Confusion Matrix -->\n", cnf_matrix)
print("*"*60)
print("Accuracy is -->", accurracy)
print("*"*60)
print("Classification report -->\n",clf_report)

Confusion Matrix -->
 [[352  48]
 [ 68 146]]
************************************************************
Accuracy is --> 0.8110749185667753
************************************************************
Classification report -->
               precision    recall  f1-score   support

           0       0.84      0.88      0.86       400
           1       0.75      0.68      0.72       214

    accuracy                           0.81       614
   macro avg       0.80      0.78      0.79       614
weighted avg       0.81      0.81      0.81       614



# Hyperparameter Tuning

In [30]:
# Exhaustive grid search over the neighbour count and the Minkowski power
# parameter (p=1 Manhattan, p=2 Euclidean), scored with 5-fold cross-validation.
# NOTE(review): x_train_std_df was standardized with statistics from the whole
# training split before cross-validation, so each validation fold has
# influenced the scaling it is scored with. Wrapping the scaler and KNN in a
# Pipeline would avoid this — confirm whether that is acceptable here.
knn_clf = KNeighborsClassifier()

param_grid = {
    "n_neighbors": np.arange(3, 30),  # candidate k values: 3, 4, ..., 29
    "p": [1, 2],
}

gscv_knn_clf = GridSearchCV(estimator=knn_clf, param_grid=param_grid, cv=5)
gscv_knn_clf.fit(x_train_std_df, y_train)

# Show the estimator refitted with the best parameter combination.
gscv_knn_clf.best_estimator_


In [65]:
# Hyperparameter combination selected by the grid search.
gscv_knn_clf.best_params_

{'n_neighbors': 21, 'p': 2}

In [31]:
# Evaluate the tuned model (best estimator from the grid search) on the
# held-out test split.

knn_clf = gscv_knn_clf.best_estimator_

y_pred = knn_clf.predict(x_test_std_df)

# Compute every metric first, then report them in a fixed order.
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
accurracy = accuracy_score(y_true=y_test, y_pred=y_pred)
clf_report = classification_report(y_true=y_test, y_pred=y_pred)

print("Confusion Matrix -->\n", cnf_matrix)
print("*"*60)
print("Accuracy is -->", accurracy)
print("*"*60)
print("Classification report -->\n",clf_report)

Confusion Matrix -->
 [[86 14]
 [30 24]]
************************************************************
Accuracy is --> 0.7142857142857143
************************************************************
Classification report -->
               precision    recall  f1-score   support

           0       0.74      0.86      0.80       100
           1       0.63      0.44      0.52        54

    accuracy                           0.71       154
   macro avg       0.69      0.65      0.66       154
weighted avg       0.70      0.71      0.70       154



In [32]:
# Evaluate the current knn_clf on the training split, for comparison with its
# test-split scores.

y_pred_train = knn_clf.predict(x_train_std_df)

# Compute every metric first, then report them in a fixed order.
cnf_matrix = confusion_matrix(y_true=y_train, y_pred=y_pred_train)
accurracy = accuracy_score(y_true=y_train, y_pred=y_pred_train)
clf_report = classification_report(y_true=y_train, y_pred=y_pred_train)

print("Confusion Matrix -->\n", cnf_matrix)
print("*"*60)
print("Accuracy is -->", accurracy)
print("*"*60)
print("Classification report -->\n",clf_report)

Confusion Matrix -->
 [[370  30]
 [100 114]]
************************************************************
Accuracy is --> 0.7882736156351792
************************************************************
Classification report -->
               precision    recall  f1-score   support

           0       0.79      0.93      0.85       400
           1       0.79      0.53      0.64       214

    accuracy                           0.79       614
   macro avg       0.79      0.73      0.74       614
weighted avg       0.79      0.79      0.78       614

