#  KNN - Predicts whether a person will have diabetes or not

Tutorial and dataset: https://www.youtube.com/watch?v=4HKqjENq9OU

#### First, import the required modules / packages

In [1]:
#import required libraries
import math
from pathlib import Path

import numpy as np
import pandas as pd

# for splitting the dataset into train and test set
from sklearn.model_selection import train_test_split 

#for standardizing features
from sklearn.preprocessing import StandardScaler

#for KNN model
from sklearn.neighbors import KNeighborsClassifier

#for evaluating model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

#### Load the dataset:

In [2]:
# Locate the diabetes CSV. The original absolute path stays the first choice so
# the existing setup keeps working; when that file is not present (for example
# on another machine) fall back to "diabetes.csv" in the current working
# directory instead of failing on a machine-specific path.
local_dataset_file = r"D:\Machine Learning Projects\KNN-20210919T103054Z-001\KNN\diabetes.csv"
dataset_file = local_dataset_file if Path(local_dataset_file).exists() else "diabetes.csv"

# Read the CSV into a DataFrame, print its (rows, columns) shape and display
# the first five rows as the cell output.
dataset = pd.read_csv(dataset_file)
print(dataset.shape)
dataset.head()

(768, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
# A minimum of 0 in Glucose, BloodPressure, SkinThickness, Insulin and BMI is
# what motivates the zero-imputation step further down.
dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [4]:
# Pairwise correlation between all columns; the Outcome row/column shows how
# strongly each feature is correlated with the label.
dataset.corr()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
Glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
BloodPressure,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
SkinThickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
Insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
BMI,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
DiabetesPedigreeFunction,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
Age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
Outcome,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


#### Impute zero values by replacing them with the column mean

In [5]:
# Columns whose zero entries are going to be imputed (a zero is not accepted as a value there)
zero_not_accepted = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']

# First, count how many zero values each of these columns contains
zero_counts = dataset[zero_not_accepted].eq(0).sum()
for column, zero_count in zero_counts.items():
    print("{}: {} zero values".format(column, zero_count))

Glucose: 5 zero values
BloodPressure: 35 zero values
SkinThickness: 227 zero values
Insulin: 374 zero values
BMI: 11 zero values


#### ***TODO:*** Maybe I should check whether the data is skewed before using the mean?

In [6]:
# Replace the zeros in each listed column with that column's mean, one column at a time.
for column in zero_not_accepted:
    # The mean is taken over the non-zero entries only, so the zeros that are
    # about to be replaced do not drag it down.
    non_zero_values = dataset.loc[dataset[column] != 0, column]
    # int() truncates the mean to a whole number (this also affects the
    # fractional BMI column).
    # NOTE(review): the mean is computed on the full dataset, before the
    # train/test split below, so test rows influence the imputed value — confirm
    # this is acceptable.
    mean = int(non_zero_values.mean(skipna=True))
    # Assign the result back to the column instead of calling
    # replace(..., inplace=True) on dataset[column]: in-place methods on a
    # column selection are deprecated in pandas and do not update the parent
    # DataFrame when Copy-on-Write is active.
    dataset[column] = dataset[column].replace(0, mean)

In [7]:
# Count the zeros again to confirm that the imputation left none behind
remaining_zeros = (dataset[zero_not_accepted] == 0).sum()
for column in zero_not_accepted:
    print("{}: {} zero values".format(column, remaining_zeros[column]))

Glucose: 0 zero values
BloodPressure: 0 zero values
SkinThickness: 0 zero values
Insulin: 0 zero values
BMI: 0 zero values


#### Split the dataset for training and testing

In [8]:
# Columns 0-7 hold the features; column 8 (the last one) holds the label
X = dataset.iloc[:, :8]
Y = dataset.iloc[:, 8]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

#### Standardizing the data

In [9]:
# Learn the scaling parameters from the training set only ...
scaler = StandardScaler()
scaler.fit(X_train)

# ... then apply that same scaling to both the training and the testing set
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#### Create the model

In [10]:
# First estimate a value for K.
# Rule of thumb used here: K is about the square root of the number of data
# points (the training labels are counted), rounded to an odd number.
# `math` is imported in the imports cell at the top of the notebook rather than
# in the middle of the analysis.
print(math.sqrt(len(Y_train)))


24.779023386727733


In [11]:
# n_neighbors is K, the number of nearest training points consulted for each prediction.
# p is the power parameter of the Minkowski distance (p=2 corresponds to the
# Euclidean distance); it is NOT the number of outcome classes. The metric is
# also set to "euclidean" explicitly.
# NOTE(review): the square-root heuristic printed in the previous cell gives
# about 24.8, yet K=13 is used here — confirm which K is intended.
kn = KNeighborsClassifier(n_neighbors=13, p=2, metric="euclidean") 
# Fit on the standardized training data; the bare expression displays the fitted estimator.
kn.fit(X_train, Y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=13)

#### Predict the data

In [12]:
# Predict a label for every row of the standardized test set.
Y_predict = kn.predict(X_test)
# Bare last expression so the notebook displays the predicted labels.
Y_predict

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

#### Evaluate the model

In [13]:
# Confusion matrix of the true test labels against the predicted labels
# (scikit-learn places the true classes on the rows and the predicted classes on the columns)
cm = confusion_matrix(y_true=Y_test, y_pred=Y_predict)
print(cm)

[[95 12]
 [16 31]]


In [14]:
# F1 score of the predictions measured against the held-out test labels.
print(f1_score(Y_test,Y_predict))

0.6888888888888888


In [15]:
# Accuracy: the fraction of test rows whose predicted label equals the true label.
print(accuracy_score(Y_test,Y_predict))

0.8181818181818182
