# KNN - Predict whether a person will have diabetes or not

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [26]:
# Load the diabetes records from the CSV file in the working directory
dataset = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [27]:
# Preview the first five rows. A notebook cell only echoes its final
# expression, so a bare `len(dataset)` placed before this line would be
# evaluated and thrown away; use print(len(dataset)) if the row count is needed.
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [28]:
# Columns in which a recorded 0 is treated as a missing value and replaced
zero_not_accepted = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'BMI',
    'Insulin',
]

In [29]:
# Impute zeros in the listed columns: a 0 is treated as a missing reading and
# is replaced by the column mean computed over the remaining (non-zero) values.
for column in zero_not_accepted:
    # np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
    observed = dataset[column].replace(0, np.nan)
    # Series.mean() skips NaN by default; int() truncates the mean toward zero.
    fill_value = int(observed.mean())
    dataset[column] = observed.fillna(fill_value)
# NOTE(review): the mean is taken over the whole dataset, i.e. before the
# train/test split, so test rows influence the imputed values.

In [30]:
# Split dataset: the first eight columns are the predictors, the ninth is the
# target; 20% of the rows are held out for testing with a fixed seed.
X, y = dataset.iloc[:, :8], dataset.iloc[:, 8]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [31]:
# Row count of each split, one per line: X_train, y_train, X_test, y_test
for split_part in (X_train, y_train, X_test, y_test):
    print(len(split_part))


614
614
154
154


In [32]:
# Feature scaling: learn the scaling statistics from the training rows only,
# then apply that same fitted scaler to both splits.
sc_X = StandardScaler().fit(X_train)
X_train, X_test = sc_X.transform(X_train), sc_X.transform(X_test)

In [33]:
# Define the model: k-NN classifier with 11 neighbours and Euclidean distance
classifier = KNeighborsClassifier(
    n_neighbors=11,
    p=2,
    metric='euclidean',
)

In [34]:
# Fit the classifier on the training split
classifier.fit(X=X_train, y=y_train)

In [35]:
# Predict labels for the held-out rows and echo them as the cell output
y_pred = classifier.predict(X=X_test)
y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [36]:
# Evaluate the model: confusion matrix first, then the F1 score
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
print(f1_score(y_true=y_test, y_pred=y_pred))

[[94 13]
 [15 32]]
0.6956521739130436


In [37]:
# Fraction of test rows whose predicted label matches the true label
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8181818181818182


# Creating a new model with tuned hyperparameters

In [38]:
# Candidate values for each k-NN hyperparameter explored by the grid search
param_grid = dict(
    n_neighbors=[5, 10, 15],
    weights=['uniform', 'distance'],
    p=[1, 2],
)

In [39]:
# NOTE(review): this import belongs with the other imports in the first cell.
from sklearn.model_selection import GridSearchCV

# Grid search over param_grid, using the F1 score to rank the candidates
knn = KNeighborsClassifier()
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='f1',
)
grid_search.fit(X=X_train, y=y_train)

In [40]:

# Report the winning hyperparameter combination and the score it achieved
for label, value in (
    ("Best parameters: ", grid_search.best_params_),
    ("Best f1 score: ", grid_search.best_score_),
):
    print(label, value)

Best parameters:  {'n_neighbors': 10, 'p': 2, 'weights': 'distance'}
Best f1 score:  0.6241081229012263


In [41]:

# Train a fresh model with the best hyperparameters. Unpacking best_params_
# forwards every tuned setting, so a hyperparameter added to param_grid later
# is not silently left at its default here.
best_knn = KNeighborsClassifier(**grid_search.best_params_)
best_knn.fit(X_train, y_train)

In [42]:
# Evaluate the tuned model on the test set.
# Note: y_pred and cm are reassigned here, replacing the values computed for
# the first classifier in the earlier evaluation cell.
y_pred = best_knn.predict(X=X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion matrix: ", cm)
print("F1 score: ", f1_score(y_true=y_test, y_pred=y_pred))
print("Accuracy score: ", accuracy_score(y_true=y_test, y_pred=y_pred))


Confusion matrix:  [[95 12]
 [14 33]]
F1 score:  0.7173913043478262
Accuracy score:  0.8311688311688312
