In [1]:
#import packages
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
import warnings 
warnings.filterwarnings('ignore')

#import data
diabetes_data = pd.read_csv('02_DiabetesDataset.csv')

In [2]:
#outcome column is either 0 or 1 so use Classification model
diabetes_data 

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [28]:
#KNN Classifier and training the KNN Classifier model

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
KNN_clss= KNeighborsClassifier(n_neighbors=3)
X,y=diabetes_data.iloc[:,:-1], diabetes_data.iloc[:,-1]
X_train, X_test, y_train, y_test= train_test_split(X,y,random_state=0, test_size=.2)
KNN_clss.fit(X_train,y_train)
KNN_clss.score(X_test, y_test)

0.7207792207792207

In [5]:
#checking X for correct rows and columns 
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [6]:
#checking y for correct rows and columns 
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [7]:
#standardize the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
X = pd.DataFrame(X)

In [8]:
#check for standardized data
X

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.639947,0.848324,0.149641,0.907270,-0.692891,0.204013,0.468492,1.425995
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672
2,1.233880,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549
4,-1.141852,0.504055,-1.504687,0.907270,0.765836,1.409746,5.484909,-0.020496
...,...,...,...,...,...,...,...,...
763,1.827813,-0.622642,0.356432,1.722735,0.870031,0.115169,-0.908682,2.532136
764,-0.547919,0.034598,0.046245,0.405445,-0.692891,0.610154,-0.398282,-0.531023
765,0.342981,0.003301,0.149641,0.154533,0.279594,-0.735190,-0.685193,-0.275760
766,-0.844885,0.159787,-0.470732,-1.288212,-0.692891,-0.240205,-0.371101,1.170732


In [9]:
#Run KNN Classifier w Standardized data
X_train, X_test, y_train, y_test= train_test_split(X, y, random_state=0, test_size=.2)
KNN_clss.fit(X_train, y_train)
KNN_clss.score(X_test, y_test)

0.7467532467532467

In [10]:
#Train a Classifier for different values of K

results=[]
for K in range(1,25):
    KNN_cls_K = KNeighborsClassifier(n_neighbors=K)
    KNN_cls_K.fit(X_train, y_train)
    results.append({'K':K, 'accuracy_train': KNN_cls_K.score(X_train, y_train), 'accuracy_test':KNN_cls_K.score(X_test, y_test)})

#Convert results to a pandas data frame 
results = pd.DataFrame(results)
print (results)

     K  accuracy_train  accuracy_test
0    1        1.000000       0.720779
1    2        0.830619       0.766234
2    3        0.850163       0.746753
3    4        0.812704       0.785714
4    5        0.812704       0.805195
5    6        0.783388       0.798701
6    7        0.788274       0.779221
7    8        0.768730       0.792208
8    9        0.785016       0.772727
9   10        0.778502       0.772727
10  11        0.791531       0.779221
11  12        0.775244       0.779221
12  13        0.768730       0.785714
13  14        0.763844       0.792208
14  15        0.755700       0.811688
15  16        0.755700       0.818182
16  17        0.770358       0.811688
17  18        0.763844       0.811688
18  19        0.776873       0.805195
19  20        0.768730       0.805195
20  21        0.780130       0.805195
21  22        0.768730       0.792208
22  23        0.776873       0.792208
23  24        0.763844       0.792208


In [27]:
#Run KNN Classifier with improved K value

KNN_clss= KNeighborsClassifier(n_neighbors=5)
X,y=diabetes_data.iloc[:,:-1], diabetes_data.iloc[:,-1]
X_train, X_test, y_train, y_test= train_test_split(X,y,random_state=0, test_size=.15)
KNN_clss.fit(X_train,y_train)
KNN_clss.score(X_test, y_test)

0.7758620689655172

In [24]:
#predict observations with array P
KNN_clss.predict([[3,150,80,22,10,40,2.3,66]])

array([1], dtype=int64)

In [None]:
#Based on the model above, the prediction for the patient with array P is Outcome 1. 