# Diabetes Prediction System:

The main aim of this project is to check whether the particular person is having diabetes or not based on its health reports.

# 1. Get the Data

In [997]:
# import all the necessary libraries 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [998]:
# Read and load the dataset

data = pd.read_csv('diabetes.csv')

In [999]:
# print the dataset

print(data)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50        1  
1                  

In [1000]:
# load the first five rows of the dataset

print(data.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [1001]:
# load the last five rows of the dataset

print(data.tail())

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
763                     0.171   63        0  
764                     0.340   27        0  
765                     0.245   30        0  
766                     0.349   47        1  
767                     0.315   23        0  


# 2. Process the Data

In [1002]:
# determine the shape of the dataset

print(data.shape)

(768, 9)


In [1003]:
# determine the data types of the dataset 
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None


In [1004]:
# determine the summary statistics of the dataset

print(data.describe())

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000                  

In [1005]:
# Checking if the dataset has null values in it

data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

# OBSERVATION:

(i) The dataset has no NULL Values in the dataset.

In [1006]:
data['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [1007]:
data.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


# 3. Divide the dataset into independent and dependent features

In [1008]:
X = data.drop(columns = 'Outcome')

Y = data['Outcome']

In [1009]:
print(X) 

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [1010]:
print(Y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


# 4. Standardization


It is the process of keeping all the input values in the same range so that the data can be trained very easily.

In [1011]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

data_x = scaler.fit_transform(X)

print(data_x)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


In [1012]:
print(Y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


# OBSERVATION

(i)  Now all the input values are in the same range using the Standaization.

# 5. TRAIN_TEST_SPLIT

In [1013]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=25, random_state=1)

# 6. TRAIN THE MODEL USING LOGISTIC REGRESSION



In [1014]:
from sklearn.linear_model import LogisticRegression

log = LogisticRegression()

In [1015]:
# train the model

log.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [1016]:
# Predict the model

pred_train_y = log.predict(X_train)

In [1017]:
pred_test_y = log.predict(X_test)

# 7.  Accuracy of the Model

In [1018]:
from sklearn.metrics import accuracy_score
print('Accuracy score is:', accuracy_score(Y_train, pred_train_y)*100.0)

Accuracy score is: 78.06191117092867


In [1019]:
print('Accuracy score is:', accuracy_score(Y_test, pred_test_y)*100.0)

Accuracy score is: 84.0


# 8. Prediction System

In [1020]:
input_data = (1 , 189, 60, 23, 846, 30.1, 0.398, 59)
input_data_nparray = np.asarray(input_data)
print(input_data_nparray)
reshaped_input_data = input_data_nparray.reshape(1,-1)
print(reshaped_input_data)




[1.00e+00 1.89e+02 6.00e+01 2.30e+01 8.46e+02 3.01e+01 3.98e-01 5.90e+01]
[[1.00e+00 1.89e+02 6.00e+01 2.30e+01 8.46e+02 3.01e+01 3.98e-01 5.90e+01]]


In [1021]:
prediction = log.predict(reshaped_input_data)
print(prediction)



[1]




In [1022]:
if prediction == 1:
    print('this person has a diabetes')
else:
    print("this person has not diabetes")

this person has a diabetes


# 9. TRAIN THE MODEL USING SUPPORT VECTOR MACHINES

In [1023]:
# import the library for support vector machines

from sklearn import svm

In [1024]:
# call the object for support vector machines

model = svm.SVC(kernel='linear')

In [1025]:
# train the model using svm

model.fit(X_train, Y_train)

In [1026]:
# predict the model for support vector machines 

pred_y_train_svm = model.predict(X_train)

In [1027]:
print(pred_y_train_svm)

[0 0 1 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 1 0 1 0 0 0 1 0
 1 0 0 0 0 0 1 1 1 1 1 0 1 0 1 0 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0 1
 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0
 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 1 0 0 0 0 0 1 1 1 1 1 0
 0 1 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 1 0 0 0 1
 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 1 0 0 0 0 0 0
 0 1 1 0 0 0 1 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0
 1 0 0 0 1 1 1 1 1 0 0 0 1 1 0 1 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 1 0 1 1 0 0
 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0
 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 1 0 0 0
 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0
 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 1 0 0 1 1 0 0
 0 0 1 1 0 0 0 0 0 1 0 1 

In [1028]:
pred_y_test_svm = model.predict(X_test)

In [1029]:
print(pred_y_test_svm)

[0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0]


In [1030]:
# Accuracy of the model using support vector machines

from sklearn.metrics import accuracy_score

ac_train_svm = accuracy_score(pred_y_train_svm, Y_train)

print('Accuracy of the training data :', (ac_train_svm)*100.0)

Accuracy of the training data : 77.11978465679678


In [1031]:
# Accuracy of the model using support vector machines

from sklearn.metrics import accuracy_score

ac_test_svm = accuracy_score(pred_y_test_svm, Y_test)

print('Accuracy of the training data :', (ac_test_svm)*100.0)

Accuracy of the training data : 84.0


In [1032]:
# Prediction System

input_data = (1 , 189, 60, 23, 846, 30.1, 0.398, 59)
#(3	, 78, 50, 32, 88, 31, 0.248, 26)
# convert the input data into numpy array
input_data_nparray = np.asarray(input_data)

print(input_data_nparray)

# Reshape the input data into 1-D 

reshape_data_nparray = input_data_nparray.reshape(1,-1)

print(reshape_data_nparray)




[1.00e+00 1.89e+02 6.00e+01 2.30e+01 8.46e+02 3.01e+01 3.98e-01 5.90e+01]
[[1.00e+00 1.89e+02 6.00e+01 2.30e+01 8.46e+02 3.01e+01 3.98e-01 5.90e+01]]


In [1033]:
# predict

prediction = model.predict(reshape_data_nparray)

print(prediction)

[1]




In [1034]:
if prediction == 1:
    print('this person has a diabetes')
else:
    print("this person has not diabetes")

this person has a diabetes
