# KNN Algorithm - Classification

In [70]:
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_iris

In [71]:
model_details = []
testing_accuract_list = []
training_accuracy_list = []

# Problem Statement

Predict Patient is diabetic or Not.

# Data Gathering

In [72]:
df = pd.read_csv("diabetes.csv")
df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,50,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
2,183,64,0,0,23.3,0.672,52,1
3,150,66,23,94,28.1,0.167,21,0
4,150,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63,0
764,122,70,27,0,36.8,0.340,27,0
765,121,72,23,112,26.2,0.245,30,0
766,126,60,0,0,30.1,0.349,47,1


# EDA

In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Glucose                   768 non-null    int64  
 1   BloodPressure             768 non-null    int64  
 2   SkinThickness             768 non-null    int64  
 3   Insulin                   768 non-null    int64  
 4   BMI                       768 non-null    float64
 5   DiabetesPedigreeFunction  768 non-null    float64
 6   Age                       768 non-null    int64  
 7   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 48.1 KB


In [74]:
df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

# Feature Engineering

### Feature Scaling

# 1. Normalization (0 to 1)

In [82]:
x_df = df.drop('Outcome', axis = 1)

x_df.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,148,50,35,0,33.6,0.627,50
1,85,66,29,0,26.6,0.351,31
2,183,64,0,0,23.3,0.672,52
3,150,66,23,94,28.1,0.167,21
4,150,40,35,168,43.1,2.288,33


In [83]:
normal_scalar = MinMaxScaler()

array = normal_scalar.fit_transform(x_df)

x_normal_df = pd.DataFrame(array, columns=x_df.columns)
x_normal_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.743719,0.409836,0.353535,0.000000,0.500745,0.234415,0.483333
1,0.427136,0.540984,0.292929,0.000000,0.396423,0.116567,0.166667
2,0.919598,0.524590,0.000000,0.000000,0.347243,0.253629,0.516667
3,0.753769,0.540984,0.232323,0.111111,0.418778,0.038002,0.000000
4,0.753769,0.327869,0.353535,0.198582,0.642325,0.943638,0.200000
...,...,...,...,...,...,...,...
763,0.507538,0.622951,0.484848,0.212766,0.490313,0.039710,0.700000
764,0.613065,0.573770,0.272727,0.000000,0.548435,0.111870,0.100000
765,0.608040,0.590164,0.232323,0.132388,0.390462,0.071307,0.150000
766,0.633166,0.491803,0.000000,0.000000,0.448584,0.115713,0.433333


In [47]:
x_85 = (85 - 0)/(199 - 0)
x_85

0.4271356783919598

# 2. Standardization (-3 to +3)

In [91]:
std_scalar = StandardScaler()

array = std_scalar.fit_transform(x_df)

x_std_df = pd.DataFrame(array, columns=x_df.columns)
x_std_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.845787,-0.985618,0.907270,-0.692891,0.204013,0.468492,1.426022
1,-1.136319,-0.158966,0.530902,-0.692891,-0.684422,-0.365061,-0.190927
2,1.946957,-0.262298,-1.288212,-0.692891,-1.103255,0.604397,1.596227
3,0.908711,-0.158966,0.154533,0.123302,-0.494043,-0.920763,-1.041953
4,0.908711,-1.502276,0.907270,0.765836,1.409746,5.484909,-0.020722
...,...,...,...,...,...,...,...
763,-0.632927,0.357691,1.722735,0.870031,0.115169,-0.908682,2.532356
764,0.027775,0.047697,0.405445,-0.692891,0.610154,-0.398282,-0.531337
765,-0.003687,0.151028,0.154533,0.279594,-0.735190,-0.685193,-0.276029
766,0.153623,-0.468961,-1.288212,-0.692891,-0.240205,-0.371101,1.170715


# Model Building

### Train Test Split

In [75]:
# regular Model
x = df.drop('Outcome', axis = 1)
y = df['Outcome']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1, stratify=y)

model_details.append("Regular Model")

In [84]:
# For Normalization
x = x_normal_df.copy()
y = df['Outcome']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1, stratify=y)

model_details.append("Normalization Model")

In [92]:
# For Standardization
x = x_std_df.copy()
y = df['Outcome']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1, stratify=y)

model_details.append("Standardization Model")

### Model training

In [93]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(x_train, y_train)

### Model Evalution

In [94]:
# Testing Data

y_pred = knn_clf.predict(x_test)

cnf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix -->\n", cnf_matrix)
print("*"*60)

accurracy = accuracy_score(y_test, y_pred)
print("Accuracy is -->", accurracy)
print("*"*60)

testing_accuract_list.append(accurracy) # Testing Accuracy Appended

clf_report = classification_report(y_test, y_pred)
print("Classification report -->\n",clf_report)

Confusion Matrix -->
 [[84 16]
 [24 30]]
************************************************************
Accuracy is --> 0.7402597402597403
************************************************************
Classification report -->
               precision    recall  f1-score   support

           0       0.78      0.84      0.81       100
           1       0.65      0.56      0.60        54

    accuracy                           0.74       154
   macro avg       0.71      0.70      0.70       154
weighted avg       0.73      0.74      0.73       154



In [95]:
# Training Data

y_pred_train = knn_clf.predict(x_train)

cnf_matrix = confusion_matrix(y_train, y_pred_train)
print("Confusion Matrix -->\n", cnf_matrix)
print("*"*60)

accurracy = accuracy_score(y_train, y_pred_train)
print("Accuracy is -->", accurracy)
print("*"*60)

training_accuracy_list.append(accurracy) # Training accuracy appended

clf_report = classification_report(y_train, y_pred_train)
print("Classification report -->\n",clf_report)

Confusion Matrix -->
 [[353  47]
 [ 66 148]]
************************************************************
Accuracy is --> 0.8159609120521173
************************************************************
Classification report -->
               precision    recall  f1-score   support

           0       0.84      0.88      0.86       400
           1       0.76      0.69      0.72       214

    accuracy                           0.82       614
   macro avg       0.80      0.79      0.79       614
weighted avg       0.81      0.82      0.81       614



# Hyperparameter Tunning

In [96]:
knn_clf = KNeighborsClassifier()

param_grid = {"n_neighbors" : np.arange(3,30),
                  "p": [1,2]} # Param_grid

gscv_knn_clf = GridSearchCV(knn_clf, param_grid, cv=5)  # cv=5
gscv_knn_clf.fit(x_train, y_train) 
gscv_knn_clf.best_estimator_

model_details.append("Hyperparameter Tuning")

In [65]:
(gscv_knn_clf.best_params_)

{'n_neighbors': 21, 'p': 2}

In [97]:
# Testing Accuracy

knn_clf = gscv_knn_clf.best_estimator_

y_pred = knn_clf.predict(x_test)

cnf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix -->\n", cnf_matrix)
print("*"*60)

accurracy = accuracy_score(y_test, y_pred)
print("Accuracy is -->", accurracy)
print("*"*60)

testing_accuract_list.append(accurracy)

clf_report = classification_report(y_test, y_pred)
print("Classification report -->\n",clf_report)

Confusion Matrix -->
 [[84 16]
 [28 26]]
************************************************************
Accuracy is --> 0.7142857142857143
************************************************************
Classification report -->
               precision    recall  f1-score   support

           0       0.75      0.84      0.79       100
           1       0.62      0.48      0.54        54

    accuracy                           0.71       154
   macro avg       0.68      0.66      0.67       154
weighted avg       0.70      0.71      0.70       154



In [98]:
# Training Data

y_pred_train = knn_clf.predict(x_train)

cnf_matrix = confusion_matrix(y_train, y_pred_train)
print("Confusion Matrix -->\n", cnf_matrix)
print("*"*60)

accurracy = accuracy_score(y_train, y_pred_train)
print("Accuracy is -->", accurracy)
print("*"*60)

training_accuracy_list.append(accurracy)

clf_report = classification_report(y_train, y_pred_train)
print("Classification report -->\n",clf_report)

Confusion Matrix -->
 [[364  36]
 [ 86 128]]
************************************************************
Accuracy is --> 0.8013029315960912
************************************************************
Classification report -->
               precision    recall  f1-score   support

           0       0.81      0.91      0.86       400
           1       0.78      0.60      0.68       214

    accuracy                           0.80       614
   macro avg       0.79      0.75      0.77       614
weighted avg       0.80      0.80      0.79       614



# Overall Comparison

In [99]:
comparison_df = pd.DataFrame({"Models": model_details, "Testing Accuracy": testing_accuract_list, "Training Accuracy": training_accuracy_list})
comparison_df

Unnamed: 0,Models,Testing Accuracy,Training Accuracy
0,Regular Model,0.701299,0.809446
1,Hyperparameter Tuning,0.74026,0.781759
2,Normalization Model,0.733766,0.819218
3,Hyperparameter Tuning,0.733766,0.791531
4,Standardization Model,0.74026,0.815961
5,Hyperparameter Tuning,0.714286,0.801303


In [102]:
comparison_df["Difference_in_%"] = (comparison_df['Training Accuracy'] - comparison_df['Testing Accuracy']) *100
comparison_df

Unnamed: 0,Models,Testing Accuracy,Training Accuracy,Difference_in_%
0,Regular Model,0.701299,0.809446,10.814755
1,Hyperparameter Tuning,0.74026,0.781759,4.149922
2,Normalization Model,0.733766,0.819218,8.545201
3,Hyperparameter Tuning,0.733766,0.791531,5.776471
4,Standardization Model,0.74026,0.815961,7.570117
5,Hyperparameter Tuning,0.714286,0.801303,8.701722


In [None]:
# Create Functions
normalization

Standardization()

train_test_split()

model_training()

model_evaluation_testing()

model_evaluation_training()

get_best_hyperparameter()

get_hyp_testing_accuracy()

get_hyp_training_accuracy()