In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Apply the ggplot style sheet to every matplotlib figure drawn below.
plt.style.use('ggplot')

# Read the diabetes data; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../input/diabetes.csv')

In [None]:
# Separate the label column from the features; both end up as plain NumPy arrays.
target_col = 'Outcome'
y = df[target_col].values
X = df.drop(labels=[target_col], axis=1).values

In [None]:
#importing train_test_split
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 40% of the rows as a test split. The split is stratified on y and
# uses a fixed seed so it is reproducible across runs.
split = train_test_split(
    X, y,
    test_size=0.4,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-NN classifier with a fixed k of 4, trained on the training split.
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train, y_train)

# A cell only displays its last expression, so the training accuracy used to be
# computed and silently dropped. Show both accuracies side by side instead.
pd.Series(
    {
        'train': knn.score(X_train, y_train),
        'test': knn.score(X_test, y_test),
    },
    name='accuracy',
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Only k is searched here. A wider grid (weights and metric as well) for reference:
#grid_params={'n_neighbors':np.arange(1,20),'weights':['uniform','distance'],'metric':['euclidean','manhattan']}
grid_params = {'n_neighbors': np.arange(1, 20), 'weights': ['uniform'], 'metric': ['euclidean']}

# return_train_score=True is needed for cv_results_ to contain "mean_train_score",
# which the grid-search plot further down reads; current scikit-learn leaves it
# off by default.
gs_kwargs = dict(verbose=1, cv=5, n_jobs=-1, return_train_score=True)

# 5-fold cross-validated search on the training split.
gs = GridSearchCV(KNeighborsClassifier(), grid_params, **gs_kwargs)
gs_results = gs.fit(X_train, y_train)

# fit() returns the search object itself, so fitting `gs` a second time on the
# test split would overwrite the training results and leave gs_results and
# gs_test pointing at the very same object. Use an independent search instead,
# so `gs` / `gs_results` keep describing the training-split search.
# NOTE(review): running a hyper-parameter search on the test split is kept only
# because later cells print gs_test; it should not be used to pick k.
gs_test = GridSearchCV(KNeighborsClassifier(), grid_params, **gs_kwargs).fit(X_test, y_test)

In [None]:
# Tabulate the grid-search results dictionary: one row per candidate setting,
# one column per entry of cv_results_.
cv_results = pd.DataFrame.from_dict(gs.cv_results_)

In [None]:
from matplotlib import pyplot as plt

# Mean cross-validated accuracy as a function of k, from the grid-search table.
fig, ax = plt.subplots(figsize=(16, 6))
ax.plot(cv_results["param_n_neighbors"], cv_results["mean_test_score"], label='test score')

# "mean_train_score" only exists when the search was run with
# return_train_score=True; skip that curve rather than fail with a KeyError.
if "mean_train_score" in cv_results.columns:
    ax.plot(cv_results["param_n_neighbors"], cv_results["mean_train_score"], label='train score')

ax.set_xlabel('number of neighbors')
ax.set_ylabel('accuracy')
# The x-axis sweeps n_neighbors, so the previous title ("... Features") was wrong.
ax.set_title("Optimal Number of Neighbors")
ax.legend(loc='upper left')
plt.show()

In [None]:
# For each search object (gs_results first, then gs_test) print, in order, the
# best mean CV score, the estimator refit with the best setting, and that setting.
for search in (gs_results, gs_test):
    for attr in ('best_score_', 'best_estimator_', 'best_params_'):
        print(getattr(search, attr))

In [None]:
# GridSearchCV no longer has grid_scores_ (accessing it raises AttributeError);
# the same information is available through cv_results_.
search_table = gs_results.cv_results_

# First candidate in the grid: its parameter setting, the validation score on
# each CV fold, and the mean of those scores.
print(search_table['params'][0])
first_candidate_fold_scores = np.array([
    search_table['split{}_test_score'.format(fold)][0]
    for fold in range(gs_results.n_splits_)
])
print(first_candidate_fold_scores)
print(search_table['mean_test_score'][0])

# Mean validation score of every candidate, in grid order.
grid_mean_scores = search_table['mean_test_score'].tolist()
print(grid_mean_scores)

In [None]:
# k-NN classifier used for the sweep below
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbor counts: 1 through 8
neighbors = np.arange(1,9)

# Fit one classifier per k and record (train accuracy, test accuracy) pairs.
# `knn` is left bound to the last classifier fitted, as a plain loop would do.
accuracy_pairs = []
for n_nbrs in neighbors:
    knn = KNeighborsClassifier(n_neighbors=n_nbrs).fit(X_train, y_train)
    accuracy_pairs.append((knn.score(X_train, y_train), knn.score(X_test, y_test)))

# Unzip the pairs into two float arrays aligned with `neighbors`.
train_accuracy, test_accuracy = np.array(accuracy_pairs).T

In [None]:
# Draw accuracy against k for the test split and the training split.
curves = (
    (test_accuracy, 'Testing Accuracy'),
    (train_accuracy, 'Training accuracy'),
)

plt.title('k-NN Varying number of neighbors')
for accuracy, curve_label in curves:
    plt.plot(neighbors, accuracy, label=curve_label)
plt.legend()
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')
plt.show()

In [None]:
# Per the plot above, testing accuracy peaks at k=7, so the classifier used
# from here on is built with 7 neighbors.
best_k = 7
knn = KNeighborsClassifier(n_neighbors=best_k)

In [None]:
# Train the classifier on the training split.
knn.fit(X=X_train, y=y_train)

In [None]:
# Accuracy on the test split (for classifiers, score() reports accuracy).
knn.score(X=X_test, y=y_test)

In [None]:
#import confusion_matrix
from sklearn.metrics import confusion_matrix

In [None]:
# Predicted labels for the test split, from the classifier fitted above.
y_pred = knn.predict(X=X_test)

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# The same true-vs-predicted counts as a labelled table with row/column totals.
pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)

In [None]:
#import classification_report
from sklearn.metrics import classification_report

In [None]:
# Text summary of the test-split predictions, built first and then printed.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
#Area under ROC curve
from sklearn.metrics import roc_auc_score

# y_pred_proba was never assigned anywhere in the notebook, so this cell raised
# a NameError. Compute it here: column 1 of predict_proba is the probability of
# knn.classes_[1], the greater label, which is the score roc_auc_score expects
# for a binary target.
y_pred_proba = knn.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_pred_proba)