In [1]:
# Load libraries
import pandas as pd
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.preprocessing import StandardScaler # feature scaling to the training and test set of independent variables
from sklearn.naive_bayes import GaussianNB # Training the Naive Bayes model on the training set
from sklearn.metrics import confusion_matrix,accuracy_score

In [2]:
col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
# load dataset
pima = pd.read_csv("diabetes.csv", header=None, names=col_names)

In [3]:
pima.head()


Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
1,6,148,72,35,0,33.6,0.627,50,1
2,1,85,66,29,0,26.6,0.351,31,0
3,8,183,64,0,0,23.3,0.672,32,1
4,1,89,66,23,94,28.1,0.167,21,0


In [4]:
#split dataset in features and target variable
feature_cols = ['pregnant', 'insulin', 'bmi', 'age','glucose','bp','pedigree']
X = pima[feature_cols] # Features
y = pima.label # Target variable
X = X[1:]
X
y = y[1:]

In [5]:
X

Unnamed: 0,pregnant,insulin,bmi,age,glucose,bp,pedigree
1,6,0,33.6,50,148,72,0.627
2,1,0,26.6,31,85,66,0.351
3,8,0,23.3,32,183,64,0.672
4,1,94,28.1,21,89,66,0.167
5,0,168,43.1,33,137,40,2.288
...,...,...,...,...,...,...,...
764,10,180,32.9,63,101,76,0.171
765,2,0,36.8,27,122,70,0.34
766,5,112,26.2,30,121,72,0.245
767,1,0,30.1,47,126,60,0.349


In [6]:
y

1      1
2      0
3      1
4      0
5      1
      ..
764    0
765    0
766    0
767    1
768    0
Name: label, Length: 768, dtype: object

In [7]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
X_test


Unnamed: 0,pregnant,insulin,bmi,age,glucose,bp,pedigree
286,7,135,26,51,136,74,0.647
102,1,0,26.1,22,151,60,0.179
582,6,0,25,27,109,60,0.206
353,3,0,34.4,46,61,82,0.243
727,1,180,36.1,25,116,78,0.496
...,...,...,...,...,...,...,...
242,4,88,33.1,22,91,70,0.446
600,1,120,23.1,26,109,38,0.407
651,1,100,25.2,23,91,54,0.234
12,10,0,38,34,168,74,0.537


In [8]:
X_test

Unnamed: 0,pregnant,insulin,bmi,age,glucose,bp,pedigree
286,7,135,26,51,136,74,0.647
102,1,0,26.1,22,151,60,0.179
582,6,0,25,27,109,60,0.206
353,3,0,34.4,46,61,82,0.243
727,1,180,36.1,25,116,78,0.496
...,...,...,...,...,...,...,...
242,4,88,33.1,22,91,70,0.446
600,1,120,23.1,26,109,38,0.407
651,1,100,25.2,23,91,54,0.234
12,10,0,38,34,168,74,0.537


In [9]:
X_train

Unnamed: 0,pregnant,insulin,bmi,age,glucose,bp,pedigree
89,15,110,37.1,43,136,70,0.153
468,0,100,36.8,25,97,64,0.6
551,1,0,27.4,21,116,70,0.204
148,2,119,30.5,34,106,64,1.4
482,0,0,35.2,29,123,88,0.197
...,...,...,...,...,...,...,...
646,2,440,39.4,30,157,74,0.134
716,7,392,33.9,34,187,50,0.826
73,13,0,43.4,42,126,90,0.583
236,4,0,43.6,26,171,72,0.479


In [10]:
# Next, we are doing feature scaling to the training and test set of independent variables
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [11]:
X_train

array([[ 3.3536084 ,  0.28859739,  0.68611992, ...,  0.48047259,
         0.05216975, -0.94690147],
       [-1.12101735,  0.19979565,  0.64699564, ..., -0.76891126,
        -0.2463932 ,  0.39612987],
       [-0.82270897, -0.68822177, -0.5788985 , ..., -0.16023707,
         0.05216975, -0.7936697 ],
       ...,
       [ 2.75699163, -0.68822177,  1.50772982, ...,  0.16011776,
         1.04737957,  0.34505262],
       [ 0.07221618, -0.68822177,  1.53381267, ...,  1.60171451,
         0.15169073,  0.03258   ],
       [ 1.5637581 , -0.68822177,  0.13837998, ..., -0.60873384,
         0.3507327 ,  0.59142526]])

In [12]:
X_test

array([[ 0.96714133,  0.51060175, -0.76147848, ...,  0.48047259,
         0.25121172,  0.53734346],
       [-0.82270897, -0.68822177, -0.74843705, ...,  0.96100484,
        -0.44543516, -0.86878331],
       [ 0.66883295, -0.68822177, -0.89189275, ..., -0.38448546,
        -0.44543516, -0.78766061],
       ...,
       [-0.82270897,  0.19979565, -0.8658099 , ..., -0.96112415,
        -0.74399811, -0.70353337],
       [ 1.86206648, -0.68822177,  0.80349276, ...,  1.50560806,
         0.25121172,  0.20684358],
       [ 1.5637581 ,  0.86580872,  0.30791853, ..., -0.28837901,
         0.64929565, -0.62541522]])

In [14]:
#Training the Naive Bayes model on the training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB()

In [15]:
#Predict the response for test dataset
y_pred = classifier.predict(X_test)

In [16]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7748917748917749


In [17]:
from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test,y_pred)

In [18]:
ac

0.7748917748917749

In [19]:
cm

array([[128,  18],
       [ 34,  51]], dtype=int64)

In [38]:
# KNN Algorithm - Training and Prediction
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)

KNeighborsClassifier()

In [39]:
#Predict the response for test dataset
y_pred = classifier.predict(X_test)

In [40]:
from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test,y_pred)

In [41]:
ac

0.7835497835497836

In [42]:
cm

array([[129,  17],
       [ 33,  52]], dtype=int64)