In [1]:
import numpy as np
import pandas as pd

Dataset : https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database

Description of the dataset:

The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

In [2]:
# Load the Pima Indians diabetes dataset; the relative path means the CSV is
# looked up in the current working directory.
df = pd.read_csv('diabetes.csv')

In [3]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Size of the DataFrame as a (rows, columns) tuple.
df.shape

(768, 9)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
# NOTE(review): the minimum of 0 reported for Glucose, BloodPressure,
# SkinThickness, Insulin and BMI looks like missing values encoded as zero;
# confirm against the dataset documentation before trusting these columns.
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
# Class balance of the target: number of rows for each Outcome value.
df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [7]:
# Mean of every feature per Outcome class, to compare the two groups.
df.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [8]:
# Separate the predictors from the target.
# Features: every column except the last one, as a NumPy array.
x = df.iloc[:, :-1].to_numpy()
# Labels: the 'Outcome' column, kept as a pandas Series.
y = df['Outcome']


### Splitting the dataset

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on the labels
# and uses a fixed seed so that it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [10]:
# Inspect the training feature matrix.
print(x_train)

[[1.00e+00 9.00e+01 6.20e+01 ... 2.72e+01 5.80e-01 2.40e+01]
 [5.00e+00 1.26e+02 7.80e+01 ... 2.96e+01 4.39e-01 4.00e+01]
 [2.00e+00 1.05e+02 5.80e+01 ... 3.49e+01 2.25e-01 2.50e+01]
 ...
 [1.00e+00 9.70e+01 7.00e+01 ... 3.81e+01 2.18e-01 3.00e+01]
 [1.00e+01 1.11e+02 7.00e+01 ... 2.75e+01 1.41e-01 4.00e+01]
 [4.00e+00 1.44e+02 5.80e+01 ... 2.95e+01 2.87e-01 3.70e+01]]


In [11]:
# Inspect the training labels.
print(y_train)

353    0
711    0
373    0
46     0
682    0
      ..
451    1
113    0
556    0
667    1
107    0
Name: Outcome, Length: 614, dtype: int64


### Feature Scaling

In [12]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transformation to the test split.
# NOTE: this cell overwrites x_train and x_test, so re-running it without
# re-running the split cell would fit the scaler on already-scaled data.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [13]:
# Inspect the training feature matrix again.
print(x_train)

[[-0.85135507 -0.98013068 -0.40478372 ... -0.60767846  0.31079384
  -0.79216928]
 [ 0.35657564  0.16144422  0.46536842 ... -0.30213902 -0.11643851
   0.56103382]
 [-0.5493724  -0.50447447 -0.62232176 ...  0.3725939  -0.76486207
  -0.70759409]
 ...
 [-0.85135507 -0.75815778  0.03029235 ...  0.77997981 -0.78607218
  -0.28471812]
 [ 1.86648903 -0.31421198  0.03029235 ... -0.56948603 -1.01938346
   0.56103382]
 [ 0.05459296  0.73223168 -0.62232176 ... -0.31486983 -0.57700104
   0.30730824]]


In [14]:
# Inspect the test feature matrix.
print(x_test)

[[ 0.96054099  1.20788789 -0.29601471 ... -0.58221684 -0.55579092
   0.56103382]
 [ 1.86648903 -1.67775979  1.98813468 ...  0.44897876 -0.58306107
   1.15306018]
 [-0.5493724   0.03460257  0.3565994  ...  0.499902    0.01688223
  -0.6230189 ]
 ...
 [-0.5493724  -1.23381399 -0.94862882 ... -0.44217793  3.70138246
  -0.70759409]
 [ 0.05459296  2.00064824  0.46536842 ...  0.6399409  -0.64669142
  -0.20014293]
 [-0.85135507 -1.58262854  0.46536842 ...  0.15617013 -0.16794879
  -1.04589487]]


In [15]:
# Sanity check: shapes of the full, training and test feature matrices.
print(x.shape, x_train.shape, x_test.shape)

(768, 8) (614, 8) (154, 8)


# Support Vector Classifier

In [28]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, fitted on the training split.
svc_classifier = SVC(
    kernel='linear',
    random_state=42,
)
svc_classifier.fit(x_train, y_train)

### Predicting the Test set results

In [29]:
# Predict the test-set labels with the linear SVC.
y_pred = svc_classifier.predict(x_test)

In [30]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)


0.7207792207792207

# Logistic Regression Model

Training the model


In [31]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with a fixed seed and otherwise default settings,
# fitted on the training split.
lr_classifier = LogisticRegression(random_state=0)
lr_classifier.fit(x_train, y_train)


predicting test set results

In [32]:
# Predict the test-set labels with logistic regression and report the accuracy.
y_pred = lr_classifier.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7142857142857143

# K Nearest Neighbours

In [33]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier using the 5 closest training points under a
# Minkowski distance of order p.
# NOTE(review): p=8 is an unusual order; p=2 (Euclidean) is the common
# choice. Confirm that 8 is intentional and not a typo.
knn_classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=8,
)
knn_classifier.fit(x_train, y_train)


In [34]:
# Predict the test-set labels with k-nearest neighbours and report the accuracy.
y_pred = knn_classifier.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6818181818181818

# Kernel SVM

In [35]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel, fitted on the training split.
ksvc_classifier = SVC(
    kernel='rbf',
    random_state=17,
)
ksvc_classifier.fit(x_train, y_train)


In [36]:
# Predict the test-set labels with the RBF-kernel SVC and report the accuracy.
y_pred = ksvc_classifier.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7532467532467533

# Naive Bayes

In [38]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes classifier with default settings, fitted on the
# training split.
nb_classifier = GaussianNB()
nb_classifier.fit(x_train, y_train)

In [39]:
# Predict the test-set labels with Gaussian Naive Bayes and report the accuracy.
y_pred = nb_classifier.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7077922077922078

# Decision Tree Classification

In [40]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree that chooses splits by the entropy criterion; the seed is
# fixed so the fitted tree is repeatable.
dt_classifier = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
)
dt_classifier.fit(x_train, y_train)

In [41]:
# Predict the test-set labels with the decision tree and report the accuracy.
y_pred = dt_classifier.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6818181818181818

# Random Forest Classification

In [42]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 10 entropy-criterion trees, fitted on the training split
# with a fixed seed.
rf_classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=45,
)
rf_classifier.fit(x_train, y_train)

In [43]:
# Predict the test-set labels with the random forest and report the accuracy.
y_pred = rf_classifier.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7597402597402597

The best test-set accuracy among all the models above was 75.97%, achieved by the random forest classification model.

Predictive System based on the Random Forest Classification Model

In [46]:
# Interactive single-patient prediction with the fitted random forest.
#
# Reads the eight diagnostic measurements from the user, in the same column
# order as the feature matrix built from the dataset (Pregnancies, Glucose,
# BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age),
# applies the fitted scaler `sc`, and prints the predicted class.
# int()/float() raise ValueError if the user types a non-numeric value.

Preg = int(input("Enter the number of Pregnancies of the Patient:"))
glucose = int(input("Enter the number of the Glucose level of the patient:"))
# Fix: convert the blood pressure to a number like every other field. Left as
# a raw string, it turned the whole feature array below into a string array.
blood_pressure = int(input("Enter the blood pressure of the patient:"))
skinThickness = int(input("enter the SkinThickness of the patient:"))
insulin = int(input("Enter the insulin level of the patient:"))
bmi = float(input('Enter the BMI of the patient:'))
diabetes_pedigree_function = float(input("Enter the value of Diabetes Pedigree Fucntion of the patient:"))
age = int(input("Enter the age of the patient"))

input_data = (Preg, glucose, blood_pressure, skinThickness, insulin, bmi, diabetes_pedigree_function, age)

# Convert to a NumPy array; the explicit float dtype guarantees a numeric
# array no matter how the individual fields were parsed.
input_data_array = np.array(input_data, dtype=float)

# The scaler and the model expect a 2-D input: one row per patient.
input_data_reshaped = input_data_array.reshape(1, -1)

# Apply the already fitted scaler `sc`, so the input goes through the same
# standardization as the data used to fit the model.
std_data = sc.transform(input_data_reshaped)

print()

prediction = rf_classifier.predict(std_data)

# predict() returns one label per input row; there is exactly one row here.
if prediction[0] == 0:
    print("The person is NOT diabetic")
else:
    print("The person is diabetic")

Enter the number of Pregnancies of the Patient:4
Enter the number of the Glucose level of the patient:110
Enter the blood pressure of the patient:92
enter the SkinThickness of the patient:0
Enter the insulin level of the patient:0
Enter the BMI of the patient:37.6
Enter the value of Diabetes Pedigree Fucntion of the patient:0.191
Enter the age of the patient30

The person is NOT diabetic


The input data was given as:          

input_data = (4, 110, 92, 0, 0, 37.6, 0.191, 30) and the outcome was 0

i.e. the patient is not diabetic.

Hence our model predicted the correct result, which was "The person is NOT diabetic".