## Classify if a given patient has diabetes or not

Utilizing Pima Indians Diabetes Database from kaggle, utilize suprvised learning KNN model to predict if a given patient has diabetes or not.


In [3]:
import pandas as pd

# read diabetes dataset
input_data_path = '/Users/mmichalski/dev/ml/scikit-projects/data/diabetes.csv'
df = pd.read_csv(input_data_path)

df.sample(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
699,4,118,70,0,0,44.5,0.904,26,0
692,2,121,70,32,95,39.1,0.886,23,0
285,7,136,74,26,135,26.0,0.647,51,0
90,1,80,55,0,0,19.1,0.258,21,0
549,4,189,110,31,0,28.5,0.68,37,0


In [4]:
df.shape

(768, 9)

In [5]:
# create training data frame dropping outcome column that specifies if a patient has diebetes
X = df.drop(columns=['Outcome'])
X.head(5)

# define the target variable data
y = df['Outcome'].values

In [6]:
from sklearn.model_selection import train_test_split

# Split the data into train/test into 80/20 size and stratify the training outcome data by 25/75
# 25% that have diabetes and 75% that don't 
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)


In [7]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)

# Fit the Knearest neighbors classifier to the data
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [8]:
# run the prediction on test data set of first 5 records
knn.predict(X_test)[0:5]

array([0, 0, 0, 0, 1])

In [9]:
# measure the accuracy of the model

test_acc = knn.score(X_test, y_test)
print(test_acc)

0.6688311688311688


In [10]:
# Add k-fold (5 fold) cross validation to determine a mean score and see how the
# model will perform in the wild on the onseen data
from sklearn.model_selection import cross_val_score
import numpy as np

cv_scores = cross_val_score(knn, X, y, cv=5)

#print each cv score (accuracy) and average them
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))

[0.68181818 0.69480519 0.75324675 0.75163399 0.68627451]
cv_scores mean:0.7135557253204311


In [16]:
# Hypertune the KNN model using grid search and random search
import time
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV


# construct the set of hyperparameters to tune
# distance between nearest neighbors metric to use
params = {"n_neighbors": np.arange(1, 35, 2), 
          "metric": ["euclidean", "manhattan", "minkowski"]}

print('Grid searching through following parameters: {}'.format(params))

grid = GridSearchCV(knn, params)
start = time.time()
grid.fit(X, y)

# evaluate the best grid searched model on the testing data
print("[INFO] grid search took {:.2f} seconds".format(
	time.time() - start))
grid_acc = grid.score(X_test, y_test)
print("[INFO] grid search accuracy: {:.2f}%".format(grid_acc * 100))
print("[INFO] grid search best parameters: {}".format(
	grid.best_params_))


# Now highperparm search using random search method
random_search = RandomizedSearchCV(knn, params)
start = time.time()
random_search.fit(X, y)
 
# evaluate the best randomized searched model on the testing
# data
print("[INFO] randomized search took {:.2f} seconds".format(
	time.time() - start))
random_acc = random_search.score(X_test, y_test)
print("[INFO] random search accuracy: {:.2f}%".format(random_acc * 100))
print("[INFO] randomized search best parameters: {}".format(
	random_search.best_params_))

if grid_acc >= random_acc:
    print('gird hyperparameter search wins with following params: {}'.format(grid.best_params_))
else:
    print('random hyperparameter search wins with following params: {}'.format(random_search.best_params_))   
                                                                               
                                                                               

Grid searching through following parameters: {'n_neighbors': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33]), 'metric': ['euclidean', 'manhattan', 'minkowski']}




[INFO] grid search took 1.69 seconds
[INFO] grid search accuracy: 77.92%
[INFO] grid search best parameters: {'metric': 'manhattan', 'n_neighbors': 15}




[INFO] randomized search took 0.33 seconds
[INFO] random search accuracy: 76.62%
[INFO] randomized search best parameters: {'n_neighbors': 13, 'metric': 'manhattan'}
gird hyperparameter search wins with following params: {'metric': 'manhattan', 'n_neighbors': 15}
