### Importing the Libraries


In [95]:
import pandas as pd, scipy, numpy as np
import sklearn.preprocessing

### Loading the iris dataset

In [96]:
headernames = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age','Outcome'] #Assigning the headers

In [97]:
ds = pd.read_csv('diabetes.csv', names = headernames)
ds.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [98]:
print(ds)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
5              5      116             74              0        0  25.6   
6              3       78             50             32       88  31.0   
7             10      115              0              0        0  35.3   
8              2      197             70             45      543  30.5   
9              8      125             96              0        0   0.0   
10             4      110             92              0        0  37.6   
11            10      168             74              0        0  38.0   
12            10      139             

### Splitting up in feature attributes and class variable

In [99]:
x = ds.iloc[:, :-1].values

In [100]:
y=ds.iloc[:, 8].values

In [101]:
print(ds)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
5              5      116             74              0        0  25.6   
6              3       78             50             32       88  31.0   
7             10      115              0              0        0  35.3   
8              2      197             70             45      543  30.5   
9              8      125             96              0        0   0.0   
10             4      110             92              0        0  37.6   
11            10      168             74              0        0  38.0   
12            10      139             

### Train and Test Split
Next, we will divide the data into train and test split. Following code will split the dataset into 60% training data and 40% of testing data

In [102]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.40)

### Data Scaling using the StandardScaler

In [103]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Training a KNN Classifier

In [104]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 7)
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')

### Making the Predictions

In [105]:
y_pred = classifier.predict(X_test)

### Output

In [106]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
result = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, y_pred)
print("Classification Report:",)
print (result1)
result2 = accuracy_score(y_test,y_pred)
print("Accuracy:",result2)

Confusion Matrix:
[[175  27]
 [ 55  51]]
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.87      0.81       202
           1       0.65      0.48      0.55       106

    accuracy                           0.73       308
   macro avg       0.71      0.67      0.68       308
weighted avg       0.72      0.73      0.72       308

Accuracy: 0.7337662337662337


In [107]:
Y_pred = classifier.predict(X_test)
#print(Y_pred)

### K-fold Crossvalidation
K-Folds cross-validator provides train/test indices to split data in train/test sets. Split dataset into k consecutive folds (without shuffling by default). Each fold is then used once as a validation while the k - 1 remaining folds form the training set.

more information on https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html

In [108]:
from sklearn.model_selection import KFold
# prepare cross validation
kfold = KFold(2, True) # value of K and shuffle? 
# enumerate splits
for train, test in kfold.split(X_train):
    print('train: %s, test: %s' % (X_train[train], X_train[test]))
   

train: [[ 1.16735096 -0.93780954  0.66327222 ...  0.44676103  0.31596679
   2.92873113]
 [ 0.87740418 -0.56339115 -0.14790816 ...  0.88484207 -0.40214247
  -0.20926093]
 [ 1.74724453 -1.43703407  0.66327222 ...  0.1588792  -0.64151222
   0.38441325]
 ...
 [ 0.58745739 -0.71939882 -0.55349836 ...  0.24649541 -0.14799681
   0.80846623]
 [ 1.74724453  0.12304257  0.05488693 ... -0.11648602 -0.81291279
   0.63884503]
 [-0.57232974  0.02943797 -0.857691   ...  0.52186063  0.9927035
  -0.46369272]], test: [[-0.86227652  0.99668548  0.66327222 ...  1.07259108  0.611485
  -0.8877457 ]
 [-1.1522233  -0.43858502  0.35907958 ...  1.66087133  0.60852982
  -0.8029351 ]
 [ 0.29751061 -0.18897276  1.47445261 ...  2.61213301 -0.80109206
  -0.46369272]
 ...
 [ 0.87740418 -0.75060035  0.35907958 ...  1.11014089  1.15523852
  -0.12445033]
 [ 0.87740418  2.30714985  0.05488693 ... -0.86748208 -0.93703044
   1.82619338]
 [-0.86227652 -1.2810264  -0.70559468 ... -1.61847815 -0.65628813
  -1.05736689]]
train

In [109]:
for train, test in kfold.split(X_train):
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    result = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(result)
    result1 = classification_report(y_test, y_pred)
    print("Classification Report:",)
    print (result1)
    result2 = accuracy_score(y_test,y_pred)
    print("Accuracy:",result2)

Confusion Matrix:
[[175  27]
 [ 55  51]]
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.87      0.81       202
           1       0.65      0.48      0.55       106

    accuracy                           0.73       308
   macro avg       0.71      0.67      0.68       308
weighted avg       0.72      0.73      0.72       308

Accuracy: 0.7337662337662337
Confusion Matrix:
[[175  27]
 [ 55  51]]
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.87      0.81       202
           1       0.65      0.48      0.55       106

    accuracy                           0.73       308
   macro avg       0.71      0.67      0.68       308
weighted avg       0.72      0.73      0.72       308

Accuracy: 0.7337662337662337
