In [1]:
#importing Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [2]:
data = pd.read_csv("pima-data.csv")  #Reading the file

In [3]:
data.head()

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,skin,diabetes
0,6,148,72,35,0,33.6,0.627,50,1.379,True
1,1,85,66,29,0,26.6,0.351,31,1.1426,False
2,8,183,64,0,0,23.3,0.672,32,0.0,True
3,1,89,66,23,94,28.1,0.167,21,0.9062,False
4,0,137,40,35,168,43.1,2.288,33,1.379,True


In [4]:
data.shape

(768, 10)

In [5]:
data.isnull().sum()  #Checking if there are any NULL values

num_preg        0
glucose_conc    0
diastolic_bp    0
thickness       0
insulin         0
bmi             0
diab_pred       0
age             0
skin            0
diabetes        0
dtype: int64

In [6]:
data.dtypes  #Datatypes of each column 

num_preg          int64
glucose_conc      int64
diastolic_bp      int64
thickness         int64
insulin           int64
bmi             float64
diab_pred       float64
age               int64
skin            float64
diabetes           bool
dtype: object

In [7]:
diabetes_map = {True: 1, False: 0}     #changing boolean values to 0 and 1

In [8]:
data['diabetes'] = data['diabetes'].map(diabetes_map)

In [9]:
data.head()

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,skin,diabetes
0,6,148,72,35,0,33.6,0.627,50,1.379,1
1,1,85,66,29,0,26.6,0.351,31,1.1426,0
2,8,183,64,0,0,23.3,0.672,32,0.0,1
3,1,89,66,23,94,28.1,0.167,21,0.9062,0
4,0,137,40,35,168,43.1,2.288,33,1.379,1


In [10]:
#checking how many zeroes in each column
print("total number of rows : {0}".format(len(data)))
print("number of rows missing glucose_conc: {0}".format(len(data.loc[data['glucose_conc'] == 0])))
print("number of rows missing diastolic_bp: {0}".format(len(data.loc[data['diastolic_bp'] == 0])))
print("number of rows missing insulin: {0}".format(len(data.loc[data['insulin'] == 0])))
print("number of rows missing bmi: {0}".format(len(data.loc[data['bmi'] == 0])))
print("number of rows missing thickness: {0}".format(len(data.loc[data['thickness'] == 0])))
print("number of rows missing diab_pred: {0}".format(len(data.loc[data['diab_pred'] == 0])))
print("number of rows missing age: {0}".format(len(data.loc[data['age'] == 0])))
print("number of rows missing skin: {0}".format(len(data.loc[data['skin'] == 0])))

total number of rows : 768
number of rows missing glucose_conc: 5
number of rows missing diastolic_bp: 35
number of rows missing insulin: 374
number of rows missing bmi: 11
number of rows missing thickness: 227
number of rows missing diab_pred: 0
number of rows missing age: 0
number of rows missing skin: 227


In [11]:
#Replacing Zeroes with mean of that column
data['glucose_conc']=data['glucose_conc'].replace(0,data['glucose_conc'].mean())
data['diastolic_bp']=data['diastolic_bp'].replace(0,data['diastolic_bp'].mean())
data['insulin']=data['insulin'].replace(0,data['insulin'].mean())
data['bmi']=data['bmi'].replace(0,data['bmi'].mean())
data['skin']=data['skin'].replace(0,data['skin'].mean())
data['thickness']=data['thickness'].replace(0,data['thickness'].mean())

In [12]:
data.head()

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,skin,diabetes
0,6,148.0,72.0,35.0,79.799479,33.6,0.627,50,1.379,1
1,1,85.0,66.0,29.0,79.799479,26.6,0.351,31,1.1426,0
2,8,183.0,64.0,20.536458,79.799479,23.3,0.672,32,0.809136,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0.9062,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1.379,1


In [13]:
target_name='diabetes'
q=data[target_name]
p=data.drop(target_name,axis=1)

In [14]:
p.head()

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,skin
0,6,148.0,72.0,35.0,79.799479,33.6,0.627,50,1.379
1,1,85.0,66.0,29.0,79.799479,26.6,0.351,31,1.1426
2,8,183.0,64.0,20.536458,79.799479,23.3,0.672,32,0.809136
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0.9062
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1.379


In [15]:
q.head()

0    1
1    0
2    1
3    0
4    1
Name: diabetes, dtype: int64

In [16]:
#preprocessing the data
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(p)
SSX=scaler.transform(p)

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
x_tr,x_te,y_tr,y_te = train_test_split(p,q,test_size=0.30)  #Splitting the data into Train and Test data
print(x_tr.shape)
print(x_te.shape)
print(y_tr.shape)
print(y_te.shape)

(537, 9)
(231, 9)
(537,)
(231,)


In [19]:
from sklearn.ensemble import RandomForestClassifier  #Implementing Random Forest Classification

In [20]:
m_4=RandomForestClassifier(n_estimators=80,criterion='entropy',max_depth=6,min_samples_split=15)
m_4.fit(x_tr,y_tr)

RandomForestClassifier(criterion='entropy', max_depth=6, min_samples_split=15,
                       n_estimators=80)

In [21]:
ypred_m_4 = m_4.predict(x_te)
print(ypred_m_4)
print('Training score',m_4.score(x_tr,y_tr))
print('Testing score',m_4.score(x_te,y_te)) 

[1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0
 0 1 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 1 1 1 1 0 1 1 1 1 0 0 0 0 0 1 0 0 0 0 1
 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0
 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1
 0 0 0 1 0 0 0 1 0]
Training score 0.8603351955307262
Testing score 0.7662337662337663


In [23]:
from sklearn.metrics import confusion_matrix, classification_report

In [24]:
cm_m_4 = confusion_matrix(y_te,ypred_m_4)
print(cm_m_4)
print(classification_report(y_te,ypred_m_4))

[[133  16]
 [ 38  44]]
              precision    recall  f1-score   support

           0       0.78      0.89      0.83       149
           1       0.73      0.54      0.62        82

    accuracy                           0.77       231
   macro avg       0.76      0.71      0.73       231
weighted avg       0.76      0.77      0.76       231

