## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

## Importing the dataset

In [2]:
data = pd.read_csv('diabetes.csv')

## Getting some informations about the dataset

In [3]:
data.shape

(768, 9)

In [4]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [6]:
# Getting some statistical measures about the dataset
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [7]:
# The count of the target variable
data['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

## Getting the indpendent and the dependent variables from the dataset

In [8]:
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

In [9]:
print(X.shape)
print(y.shape)

(768, 8)
(768,)


## Splitting the dataset into the training set and the test set

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=0)

In [11]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(614, 8)
(154, 8)
(614,)
(154,)


# Model Selection

In [12]:
# Defining a scores dictionary
scores = {
    'training_score': [],
    'test_score': []
}

## 1- The logistic regression model

In [13]:
from sklearn.linear_model import LogisticRegression
classifier1 = LogisticRegression()
classifier1.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
from sklearn.metrics import accuracy_score
training_score1 = accuracy_score(y_train, classifier1.predict(X_train))
test_score1 = accuracy_score(y_test, classifier1.predict(X_test))
print(f"The training score: {training_score1}")
print(f"The test score: {test_score1}")
scores['training_score'].append(training_score1)
scores['test_score'].append(test_score1)

The training score: 0.7719869706840391
The test score: 0.7987012987012987


## 2- The K-Nearest neighbors model

In [15]:
from sklearn.neighbors import KNeighborsClassifier
classifier2 = KNeighborsClassifier()
classifier2.fit(X_train,y_train)

In [16]:
training_score2 = accuracy_score(y_train, classifier2.predict(X_train))
test_score2 = accuracy_score(y_test, classifier2.predict(X_test))
print(f"The training score: {training_score2}")
print(f"The test score: {test_score2}")
scores['training_score'].append(training_score2)
scores['test_score'].append(test_score2)

The training score: 0.7882736156351792
The test score: 0.7857142857142857


## 3- The support vector machine model

In [17]:
from sklearn.svm import SVC
classifier3 = SVC()
classifier3.fit(X_train,y_train)

In [18]:
training_score3 = accuracy_score(y_train, classifier3.predict(X_train))
test_score3 = accuracy_score(y_test, classifier3.predict(X_test))
print(f"The training score: {training_score3}")
print(f"The test score: {test_score3}")
scores['training_score'].append(training_score3)
scores['test_score'].append(test_score3)

The training score: 0.7687296416938111
The test score: 0.7662337662337663


## 4- The naive bayes model

In [19]:
from sklearn.naive_bayes import GaussianNB
classifier4 = GaussianNB()
classifier4.fit(X_train,y_train)

In [20]:
training_score4 = accuracy_score(y_train, classifier4.predict(X_train))
test_score4 = accuracy_score(y_test, classifier4.predict(X_test))
print(f"The training score: {training_score4}")
print(f"The test score: {test_score4}")
scores['training_score'].append(training_score4)
scores['test_score'].append(test_score4)

The training score: 0.754071661237785
The test score: 0.7727272727272727


## 5- The decision tree classification model

In [21]:
from sklearn.tree import DecisionTreeClassifier
classifier5 = DecisionTreeClassifier()
classifier5.fit(X_train,y_train)

In [22]:
training_score5 = accuracy_score(y_train, classifier5.predict(X_train))
test_score5 = accuracy_score(y_test, classifier5.predict(X_test))
print(f"The training score: {training_score5}")
print(f"The test score: {test_score5}")
scores['training_score'].append(training_score5)
scores['test_score'].append(test_score5)

The training score: 1.0
The test score: 0.6948051948051948


## 6- The random forest classification model

In [23]:
from sklearn.ensemble import RandomForestClassifier
classifier6 = RandomForestClassifier()
classifier6.fit(X_train,y_train)

In [24]:
training_score6 = accuracy_score(y_train, classifier6.predict(X_train))
test_score6 = accuracy_score(y_test, classifier6.predict(X_test))
print(f"The training score: {training_score6}")
print(f"The test score: {test_score6}")
scores['training_score'].append(training_score6)
scores['test_score'].append(test_score6)

The training score: 1.0
The test score: 0.8051948051948052


## Comparing the results given by each model

In [25]:
indexes = ['The logistic regression model',
           'The K-Nearest neighbors model',
           'The support vector machine model',
           'The naive bayes model',
           'The decision tree classification model',
           'The random forest classification model']
df_scores = pd.DataFrame(scores, index=indexes)

In [26]:
df_scores

Unnamed: 0,training_score,test_score
The logistic regression model,0.771987,0.798701
The K-Nearest neighbors model,0.788274,0.785714
The support vector machine model,0.76873,0.766234
The naive bayes model,0.754072,0.772727
The decision tree classification model,1.0,0.694805
The random forest classification model,1.0,0.805195


# Selecting the K-Nearest neighbors model as the best model

## Building a predictive system using the K-Nearest neighbors model

In [27]:
input_data = np.array([[9,171,110,24,240,45.4,0.721,54]])
y_pred = classifier2.predict(input_data)
print(y_pred)
if y_pred[0] == 1:
  print(f"This person is diabetic!!")
else:
  print(f"This person is not diabetic!!")

[1]
This person is diabetic!!


## Saving the trained model (The K-Nearest neighbors model)

In [28]:
import pickle
pickle.dump(classifier2, open('diabetes_disease_model.sav','wb'))