In [29]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [30]:
#Load Dataset
dataset = pd.read_csv('Downloads/diabetes.csv')

In [32]:
print(dataset)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50        1  
1                  

In [31]:
print(dataset.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [33]:
print(dataset['BMI'])

0      33.6
1      26.6
2      23.3
3      28.1
4      43.1
       ... 
763    32.9
764    36.8
765    26.2
766    30.1
767    30.4
Name: BMI, Length: 768, dtype: float64


In [34]:
# Replacing Zeros
zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for column in zero_not_accepted:
    dataset[column] = dataset[column].replace(0, np.NAN) # replacing 0 into NA
    mean = int(dataset[column].mean(skipna= True))
    dataset[column] = dataset[column].replace(np.NAN, mean)

In [35]:
print(dataset['Insulin'])

0      155.0
1      155.0
2      155.0
3       94.0
4      168.0
       ...  
763    180.0
764    155.0
765    112.0
766    155.0
767    155.0
Name: Insulin, Length: 768, dtype: float64


In [36]:
#Split data
x = dataset.iloc[:,0:8]
y = dataset.iloc[:,8]

print(x.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6    148.0           72.0           35.0    155.0  33.6   
1            1     85.0           66.0           29.0    155.0  26.6   
2            8    183.0           64.0           29.0    155.0  23.3   
3            1     89.0           66.0           23.0     94.0  28.1   
4            0    137.0           40.0           35.0    168.0  43.1   

   DiabetesPedigreeFunction  Age  
0                     0.627   50  
1                     0.351   31  
2                     0.672   32  
3                     0.167   21  
4                     2.288   33  


In [37]:
#Split dataset for testing and training ...
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0, test_size=0.2)

In [38]:
# Feature Scaling
sc_X = StandardScaler() # All the data can be located from -1 to +1 for keeping it as standard form
x_train = sc_X.fit_transform(x_train)
x_test = sc_X.fit_transform(x_test)

In [39]:
#Define the model
classifer = KNeighborsClassifier(n_neighbors=11, p=2, metric='euclidean') #p -> power

#Fit the model
classifer.fit(x_train,y_train)

In [40]:
#Predict the test set result
y_pred = classifer.predict(x_test)
y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [41]:
#Evaluate the model
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[95 12]
 [18 29]]


In [23]:
print(f1_score(y_test,y_pred)) # Range -> 0 to +1 (0 is worst case and +1 is the best case)

0.6590909090909092


In [24]:
print(accuracy_score(y_test,y_pred))

0.8051948051948052


**Hyperparameter Tuning**

In [43]:
#List Hyperparameter that we want to tune
leaf_size = list(range(1,50))
n_neighbors = list(range(1,30))
p = [1,2]

#Convert to dictionary
hyperparameters = dict(leaf_size= leafe_size, n_neighbors = n_neighbors, p=p)

#Define the model
knn = KNeighborsClassifier()

#Use Grid search
clf = GridSearchCV(knn, hyperparameters, cv=10) #cv -> cross validation

#Fit the model
best_model = clf.fit(x_train, y_train)

#Print the best Hyperparameters
print('Best leafe size : ', best_model.best_estimator_.get_params()['leaf_size'])
print('Best p : ', best_model.best_estimator_.get_params()['p'])
print('Best n_neigbours : ', best_model.best_estimator_.get_params()['n_neighbors'])

Best leafe size :  1
Best p :  1
Best n_neigbours :  11


In [45]:
y_pred = clf.predict(x_test)
y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [46]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[94 13]
 [16 31]]


In [47]:
print(f1_score(y_test, y_pred))

0.6813186813186813


In [48]:
print(accuracy_score(y_test, y_pred))

0.8116883116883117
