In [1]:
import pandas as pd

# Column labels to apply to the CSV, which is read as having no header row.
cols = [
    'pregnancy', 'glucose', 'bp', 'skinthickness',
    'insulin', 'bmi', 'pedigree', 'age',
    'diabetic',
]
# Path to the dataset file (relative to the notebook's working directory).
url = "data/pima-indians-diabetes.csv"

# header=None makes pandas treat the first line as data; names= supplies the labels.
data = pd.read_csv(url, names=cols, header=None)
# Preview the first five rows as the cell's rich output.
data.head()

Unnamed: 0,pregnancy,glucose,bp,skinthickness,insulin,bmi,pedigree,age,diabetic
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
# Feature matrix: the first eight columns, copied so X does not share memory with `data`.
X = data.iloc[:, :8].copy()
# Target vector: the `diabetic` column.
y = data['diabetic']

# Sanity-check the dimensions of both objects (one shape per line).
print(X.shape, y.shape, sep="\n")

(768, 8)
(768,)


In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics

# Split into train/test partitions; random_state=0 keeps the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit a 17-nearest-neighbours classifier on the training partition
# (fit() returns the estimator itself), then predict the held-out rows.
knn = KNeighborsClassifier(n_neighbors=17).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Overall accuracy and the 2x2 confusion matrix for the test partition.
accuracy = metrics.accuracy_score(y_test, y_pred)
confusion = metrics.confusion_matrix(y_test, y_pred)
print(accuracy, confusion, sep="\n")
# Confusion matrix layout (rows = actual class, columns = predicted class):
#               Predicted
#                 0    1
# Actual 0   - [[116  14]
# Actual 1   - [ 31  31]]

# First 20 true labels followed by the first 20 predictions, for a side-by-side look.
print(y_test.values[:20])
print(y_pred[:20])
# Expected output:
#[1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0]
#[1 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 1 0 0 0]

# Unpack the four cells of the 2x2 confusion matrix.
# Rows are actual classes and columns are predicted classes, so flattening
# row by row yields: [0,0]=TN, [0,1]=FP, [1,0]=FN, [1,1]=TP.
# (FN was previously read from [1,1], the true-positive cell; the results only
# looked right because both cells happen to hold 31 for this particular split.)
TN, FP, FN, TP = confusion.ravel()

0.765625
[[116  14]
 [ 31  31]]
[1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0]
[1 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 1 0 0 0]


In [4]:
# Classification accuracy: the fraction of all test samples predicted correctly.
# Computed by hand from the confusion-matrix counts, then cross-checked with scikit-learn.
print((TP + TN) / (TP + TN + FP + FN))
print(metrics.accuracy_score(y_test, y_pred))

0.765625
0.765625


In [5]:
# Classification error (misclassification rate): the fraction of all test samples
# predicted incorrectly. Computed by hand, then as the complement of accuracy.
print((FP + FN) / (TP + TN + FP + FN))
print(1 - metrics.accuracy_score(y_test, y_pred))

0.234375
0.234375


In [6]:
# Sensitivity (recall): when the actual value is 1, how often did the classifier
# predict 1? Ideal value: 1. Computed by hand, then cross-checked with scikit-learn.
print(TP / (FN + TP))
print(metrics.recall_score(y_test, y_pred))

0.5
0.5


In [7]:
# Specificity: when the actual value is 0, how often did the classifier predict 0?
# Ideal value: 1.
print(TN / (FP + TN))

0.8923076923076924


In [10]:
# False positive rate: when the actual value is 0, how often did the classifier
# predict 1? Uses the same denominator as specificity (all actual negatives).
print(FP / (FP + TN))

0.1076923076923077


In [11]:
# Precision: when the classifier predicts 1, how often is that prediction correct?
print(TP / (FP + TP))

0.6888888888888889
