# KNN Classification - Data Leakage Perspective

In [1]:
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_iris

# Problem Statement

Predict Patient is diabetic or Not.

# Data Gathering

In [2]:
df = pd.read_csv("diabetes.csv")
df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,50,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
2,183,64,0,0,23.3,0.672,52,1
3,150,66,23,94,28.1,0.167,21,0
4,150,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63,0
764,122,70,27,0,36.8,0.340,27,0
765,121,72,23,112,26.2,0.245,30,0
766,126,60,0,0,30.1,0.349,47,1


## Train Test Split

In [5]:
# regular Model
x = df.drop('Outcome', axis = 1)
y = df['Outcome']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1, stratify=y)

# EDA

# Feature Engineering

### Feature Scaling

# 1. Normalization (0 to 1)

In [10]:
# Instance
normal_scalar = MinMaxScaler()

array = normal_scalar.fit_transform(x_train)

x_train_normal_df = pd.DataFrame(array, columns=x_train.columns)
x_train_normal_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.608040,0.472727,0.000000,0.000000,0.536513,0.018408,0.066667
1,0.884422,0.818182,0.343434,0.441176,0.502235,0.163955,0.616667
2,0.562814,0.727273,0.454545,0.194118,0.518629,0.056935,0.050000
3,0.562814,0.600000,0.000000,0.000000,0.563338,0.075771,0.333333
4,0.417085,0.781818,0.191919,0.000000,0.436662,0.099743,0.216667
...,...,...,...,...,...,...,...
609,0.683417,0.636364,0.323232,0.161765,0.552906,0.029538,0.366667
610,0.467337,0.545455,0.252525,0.135294,0.427720,0.191781,0.016667
611,0.788945,0.672727,0.353535,0.647059,0.587183,0.021404,0.150000
612,0.643216,0.618182,0.191919,0.264706,0.454545,0.559503,0.066667


# Model Building

### Model training

In [11]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(x_train_normal_df, y_train)

### Model Evalution

In [15]:
array = normal_scalar.transform(x_test)

x_test_normal_df = pd.DataFrame(array, columns=x_test.columns)
x_test_normal_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.497487,0.654545,0.171717,0.000000,0.381520,0.089897,0.116667
1,0.718593,0.672727,0.222222,0.089706,0.390462,0.073630,0.000000
2,0.447236,0.690909,0.343434,0.054412,0.464978,0.046233,0.033333
3,0.597990,0.000000,0.000000,0.000000,0.375559,0.053510,0.266667
4,0.633166,0.709091,0.272727,0.032353,0.441133,0.151969,0.316667
...,...,...,...,...,...,...,...
149,0.527638,0.654545,0.292929,0.477941,0.549925,0.032106,0.116667
150,0.718593,0.709091,0.000000,0.000000,0.670641,0.045377,0.433333
151,0.532663,0.636364,0.282828,0.198529,0.509687,0.024829,0.016667
152,0.859296,0.654545,0.333333,0.198529,0.496274,0.049229,0.050000


In [16]:
# Testing Data

y_pred = knn_clf.predict(x_test_normal_df)

cnf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix -->\n", cnf_matrix)
print("*"*60)

accurracy = accuracy_score(y_test, y_pred)
print("Accuracy is -->", accurracy)
print("*"*60)

clf_report = classification_report(y_test, y_pred)
print("Classification report -->\n",clf_report)

Confusion Matrix -->
 [[83 17]
 [21 33]]
************************************************************
Accuracy is --> 0.7532467532467533
************************************************************
Classification report -->
               precision    recall  f1-score   support

           0       0.80      0.83      0.81       100
           1       0.66      0.61      0.63        54

    accuracy                           0.75       154
   macro avg       0.73      0.72      0.72       154
weighted avg       0.75      0.75      0.75       154



In [17]:
# Training Data

y_pred_train = knn_clf.predict(x_train_normal_df)

cnf_matrix = confusion_matrix(y_train, y_pred_train)
print("Confusion Matrix -->\n", cnf_matrix)
print("*"*60)

accurracy = accuracy_score(y_train, y_pred_train)
print("Accuracy is -->", accurracy)
print("*"*60)

clf_report = classification_report(y_train, y_pred_train)
print("Classification report -->\n",clf_report)

Confusion Matrix -->
 [[357  43]
 [ 67 147]]
************************************************************
Accuracy is --> 0.8208469055374593
************************************************************
Classification report -->
               precision    recall  f1-score   support

           0       0.84      0.89      0.87       400
           1       0.77      0.69      0.73       214

    accuracy                           0.82       614
   macro avg       0.81      0.79      0.80       614
weighted avg       0.82      0.82      0.82       614



# Hyperparameter Tunning

In [18]:
knn_clf = KNeighborsClassifier() # Estimator

param_grid = {"n_neighbors" : np.arange(3,30),
                  "p": [1,2]} # Param_grid

gscv_knn_clf = GridSearchCV(knn_clf, param_grid, cv=5)  # cv=5

gscv_knn_clf.fit(x_train_normal_df, y_train) 
gscv_knn_clf.best_estimator_

model_details.append("Hyperparameter Tuning")

ValueError: Invalid parameter 'cv' for estimator KNeighborsClassifier(). Valid parameters are: ['algorithm', 'leaf_size', 'metric', 'metric_params', 'n_jobs', 'n_neighbors', 'p', 'weights'].

In [65]:
(gscv_knn_clf.best_params_)

{'n_neighbors': 21, 'p': 2}

In [97]:
# Testing Accuracy

knn_clf = gscv_knn_clf.best_estimator_

y_pred = knn_clf.predict(x_test)

cnf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix -->\n", cnf_matrix)
print("*"*60)

accurracy = accuracy_score(y_test, y_pred)
print("Accuracy is -->", accurracy)
print("*"*60)

testing_accuract_list.append(accurracy)

clf_report = classification_report(y_test, y_pred)
print("Classification report -->\n",clf_report)

Confusion Matrix -->
 [[84 16]
 [28 26]]
************************************************************
Accuracy is --> 0.7142857142857143
************************************************************
Classification report -->
               precision    recall  f1-score   support

           0       0.75      0.84      0.79       100
           1       0.62      0.48      0.54        54

    accuracy                           0.71       154
   macro avg       0.68      0.66      0.67       154
weighted avg       0.70      0.71      0.70       154



In [98]:
# Training Data

y_pred_train = knn_clf.predict(x_train)

cnf_matrix = confusion_matrix(y_train, y_pred_train)
print("Confusion Matrix -->\n", cnf_matrix)
print("*"*60)

accurracy = accuracy_score(y_train, y_pred_train)
print("Accuracy is -->", accurracy)
print("*"*60)

training_accuracy_list.append(accurracy)

clf_report = classification_report(y_train, y_pred_train)
print("Classification report -->\n",clf_report)

Confusion Matrix -->
 [[364  36]
 [ 86 128]]
************************************************************
Accuracy is --> 0.8013029315960912
************************************************************
Classification report -->
               precision    recall  f1-score   support

           0       0.81      0.91      0.86       400
           1       0.78      0.60      0.68       214

    accuracy                           0.80       614
   macro avg       0.79      0.75      0.77       614
weighted avg       0.80      0.80      0.79       614

