## Find clusters of diabetes patients 

Utilizing Pima Indians Diabetes Database from kaggle, utilize suprvised learning KNN model to find clusters of diabetes patients based on various input features from the data.


In [43]:
import pandas as pd

# read diabetes dataset
input_data_path = '/Users/mmichalski/dev/ml/scikit-projects/data/diabetes.csv'
df = pd.read_csv(input_data_path)

df.sample(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
491,2,89,90,30,0,33.5,0.292,42,0
716,3,173,78,39,185,33.8,0.97,31,1
563,6,99,60,19,54,26.9,0.497,32,0
230,4,142,86,0,0,44.0,0.645,22,1
189,5,139,80,35,160,31.6,0.361,25,1


In [44]:
df.shape

(768, 9)

In [45]:
# create training data frame dropping outcome column that specifies if a patient has diebetes
X = df.drop(columns=['Outcome'])
X.head(5)

# define the target variable data
y = df['Outcome'].values

In [46]:
from sklearn.model_selection import train_test_split

# Split the data into train/test into 80/20 size and stratify the training outcome data by 25/75
# 25% that have diabetes and 75% that don't 
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)


In [47]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)

# Fit the Knearest neighbors classifier to the data
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [48]:
# run the prediction on test data set of first 5 records
knn.predict(X_test)[0:5]

array([0, 0, 0, 0, 1])

In [49]:
# measure the accuracy of the model

test_acc = knn.score(X_test, y_test)
print(test_acc)

0.6688311688311688


In [50]:
# Add k-fold (5 fold) cross validation to determine a mean score and see how the
# model will perform in the wild on the onseen data
from sklearn.model_selection import cross_val_score
import numpy as np

cv_scores = cross_val_score(knn, X, y, cv=5)

#print each cv score (accuracy) and average them
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))

[0.68181818 0.69480519 0.75324675 0.75163399 0.68627451]
cv_scores mean:0.7135557253204311


In [76]:
# Hypertune the KNN model using grid search and random search
import time
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV


# construct the set of hyperparameters to tune
# distance between nearest neighbors metric to use
params = {"n_neighbors": np.arange(1, 35, 2), 
          "metric": ["euclidean", "manhattan", "minkowski"]}

print('Grid searching through following parameters: {}'.format(params))

grid = GridSearchCV(knn, params)
start = time.time()
grid.fit(X, Y)

# evaluate the best grid searched model on the testing data
print("[INFO] grid search took {:.2f} seconds".format(
	time.time() - start))
grid_acc = grid.score(X_test, y_test)
print("[INFO] grid search accuracy: {:.2f}%".format(grid_acc * 100))
print("[INFO] grid search best parameters: {}".format(
	grid.best_params_))


# Now highperparm search using random search method
random_search = RandomizedSearchCV(knn, params)
start = time.time()
random_search.fit(X, y)
 
# evaluate the best randomized searched model on the testing
# data
print("[INFO] randomized search took {:.2f} seconds".format(
	time.time() - start))
random_acc = random_search.score(X_test, y_test)
print("[INFO] grid search accuracy: {:.2f}%".format(random_acc * 100))
print("[INFO] randomized search best parameters: {}".format(
	random_search.best_params_))

if grid_acc >= random_acc:
    print('gird hyperparameter search wins with following params: {}'.format(grid.best_params_))
else:
    print('random hyperparameter search wins with following params: {}'.format(random_search.best_params_))   
                                                                               
                                                                               

Grid searching through following parameters: {'n_neighbors': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33]), 'metric': ['euclidean', 'manhattan', 'minkowski']}




[INFO] grid search took 1.75 seconds
[INFO] grid search accuracy: 77.92%
[INFO] grid search best parameters: {'metric': 'manhattan', 'n_neighbors': 15}




[INFO] randomized search took 0.34 seconds
[INFO] grid search accuracy: 76.62%
[INFO] randomized search best parameters: {'n_neighbors': 13, 'metric': 'manhattan'}
gird hyperparameter search wins with following params: {'metric': 'manhattan', 'n_neighbors': 15}


In [77]:
#manually search through hyperparam space
k_range = range(1, 31)

# list of scores from k_range
k_scores = []

# 1. we will loop through reasonable values of k
for k in k_range:
    # 2. run KNeighborsClassifier with k neighbours
    knn = KNeighborsClassifier(n_neighbors=k)
    # 3. obtain cross_val_score for KNeighborsClassifier with k neighbours
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    # 4. append mean of scores for k neighbors to k_scores list
    k_scores.append(scores.mean())
print(k_scores)

[0.6796650717703349, 0.7122351332877648, 0.7030587833219413, 0.7187115516062884, 0.7213773069036227, 0.7357142857142858, 0.7396274777853726, 0.7383116883116883, 0.7383458646616542, 0.7434723171565277, 0.7369446343130555, 0.7473684210526316, 0.7422077922077922, 0.7539131920710869, 0.7448051948051948, 0.7526144907723855, 0.7552973342447027, 0.7552802460697198, 0.7474709501025291, 0.7461893369788107, 0.7500683526999316, 0.7501196172248804, 0.7475222146274778, 0.7435919343814081, 0.7462064251537937, 0.7331681476418319, 0.7370813397129188, 0.7305365686944635, 0.7318523581681476, 0.7253588516746412]


In [72]:
import matplotlib.pyplot as plt
%matplotlib inline

