## Importing libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
diabetes = pd.read_csv('diabetes.csv')

## Getting informations about the dataset

In [None]:
diabetes.shape

(768, 9)

In [None]:
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
diabetes.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
diabetes.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [None]:
diabetes['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

## Checking how the features affect the target variable

In [None]:
diabetes.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


## Checking if there is any missing values in each column of the dataset

In [None]:
diabetes.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

## The number of unique values in each column of the dataset

In [None]:
diabetes.nunique()

Pregnancies                  17
Glucose                     136
BloodPressure                47
SkinThickness                51
Insulin                     186
BMI                         248
DiabetesPedigreeFunction    517
Age                          52
Outcome                       2
dtype: int64

## The independent variables and the dependent variable

In [None]:
X = diabetes.iloc[:, :-1].values
y = diabetes.iloc[:, -1].values

In [None]:
print(X.shape)
print(y.shape)

(768, 8)
(768,)


## Spliting the dataset into the training set and the test set

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2)

## Feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Evaluating models

## 1- Logistic Regression model

In [None]:
# 1- Training the model on the training set
# 2- Evaluating the model on the test set
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
classifier1 = LogisticRegression()
classifier1.fit(X_train, y_train)
y_pred1 = classifier1.predict(X_test)
cm1 = confusion_matrix(y_test, y_pred1)
score1 = accuracy_score(y_test, y_pred1)
print(cm1)
print(score1)

[[89 11]
 [26 28]]
0.7597402597402597


In [None]:
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuarcies = cross_val_score(estimator=classifier1, X=X_train, y=y_train, cv=10)
print(f"The average accuracy: {accuarcies.mean():.2f}")
print(f"Standard Deviation: {accuarcies.std() * 100:.2f}%")

The average accuracy: 0.78
Standard Deviation: 4.58%


## 2- K-nearest neighbors (KNN) model

In [None]:
# 1- Training the model on the training set
# 2- Evaluating the model on the test set
from sklearn.neighbors import KNeighborsClassifier
classifier2 = KNeighborsClassifier(n_neighbors=5, metric='minkowski')
classifier2.fit(X_train, y_train)
y_pred2 = classifier2.predict(X_test)
cm2 = confusion_matrix(y_test, y_pred2)
score2 = accuracy_score(y_test, y_pred2)
print(cm2)
print(score2)

[[87 13]
 [31 23]]
0.7142857142857143


In [None]:
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuarcies = cross_val_score(estimator=classifier2, X=X_train, y=y_train, cv=10)
print(f"The average accuracy: {accuarcies.mean():.2f}")
print(f"Standard Deviation: {accuarcies.std() * 100:.2f}%")

The average accuracy: 0.73
Standard Deviation: 4.38%


## 3- Support vector machine (SVM) model

In [None]:
# 1- Training the model on the training set
# 2- Evaluating the model on the test set
from sklearn.svm import SVC
classifier3 = SVC(kernel='linear')
classifier3.fit(X_train, y_train)
y_pred3 = classifier3.predict(X_test)
cm3 = confusion_matrix(y_test, y_pred3)
score3 = accuracy_score(y_test, y_pred3)
print(cm3)
print(score3)

[[91  9]
 [26 28]]
0.7727272727272727


In [None]:
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuarcies = cross_val_score(estimator=classifier3, X=X_train, y=y_train, cv=10)
print(f"The average accuracy: {accuarcies.mean():.2f}")
print(f"Standard Deviation: {accuarcies.std() * 100:.2f}%")

The average accuracy: 0.77
Standard Deviation: 5.29%


## 4- Naive bayes model

In [None]:
# 1- Training the model on the training set
# 2- Evaluating the model on the test set
from sklearn.naive_bayes import GaussianNB
classifier4 = GaussianNB()
classifier4.fit(X_train, y_train)
y_pred4 = classifier4.predict(X_test)
cm4 = confusion_matrix(y_test, y_pred4)
score4 = accuracy_score(y_test, y_pred4)
print(cm4)
print(score4)

[[88 12]
 [23 31]]
0.7727272727272727


In [None]:
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuarcies = cross_val_score(estimator=classifier4, X=X_train, y=y_train, cv=10)
print(f"The average accuracy: {accuarcies.mean():.2f}")
print(f"Standard Deviation: {accuarcies.std() * 100:.2f}%")

The average accuracy: 0.75
Standard Deviation: 4.04%


## 5- Decision tree classification model

In [None]:
# 1- Training the model on the training set
# 2- Evaluating the model on the test set
from sklearn.tree import DecisionTreeClassifier
classifier5 = DecisionTreeClassifier()
classifier5.fit(X_train, y_train)
y_pred5 = classifier5.predict(X_test)
cm5 = confusion_matrix(y_test, y_pred5)
score5 = accuracy_score(y_test, y_pred5)
print(cm5)
print(score5)

[[83 17]
 [29 25]]
0.7012987012987013


In [None]:
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuarcies = cross_val_score(estimator=classifier5, X=X_train, y=y_train, cv=10)
print(f"The average accuracy: {accuarcies.mean():.2f}")
print(f"Standard Deviation: {accuarcies.std() * 100:.2f}%")

The average accuracy: 0.69
Standard Deviation: 5.39%


## 6- Random Forest Classification model

In [None]:
# 1- Training the model on the training set
# 2- Evaluating the model on the test set
from sklearn.ensemble import RandomForestClassifier
classifier6 = RandomForestClassifier(n_estimators=10)
classifier6.fit(X_train, y_train)
y_pred6 = classifier6.predict(X_test)
cm6 = confusion_matrix(y_test, y_pred6)
score6 = accuracy_score(y_test, y_pred6)
print(cm6)
print(score6)

[[90 10]
 [34 20]]
0.7142857142857143


In [None]:
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuarcies = cross_val_score(estimator=classifier6, X=X_train, y=y_train, cv=10)
print(f"The average accuracy: {accuarcies.mean():.2f}")
print(f"Standard Deviation: {accuarcies.std() * 100:.2f}%")

The average accuracy: 0.73
Standard Deviation: 4.12%


## Predicting if a person is diabetic or not using the SVM model

In [46]:
example = np.array([4,103,60,33,192,24,0.966,33]).reshape(1,-1)
y_pred = classifier3.predict(sc.transform(example))
print(y_pred)

[0]
