In [25]:
# all imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings("ignore")

# Read the Pima Indians diabetes CSV and shuffle its rows once.
# The fixed random_state keeps the row order the same on every run, and
# reset_index(drop=True) renumbers the shuffled rows from 0.
data = (
    pd.read_csv('../data/Pima_Indian_diabetes.csv')
    .sample(frac=1, random_state=15)
    .reset_index(drop=True)
)
# Quick sanity check: (rows, columns) followed by the column names.
print(data.shape, data.columns, sep='\n')

(768, 9)
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [26]:
# Impute missing values in every feature column with that column's median.
# The 'Outcome' label column is deliberately left untouched.
#
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on `data[col]`: with pandas Copy-on-Write, `data[col]`
# is a temporary object, so an in-place fill on it never reaches `data`.
# pandas 2.x only emits a FutureWarning for that pattern, and this notebook
# silences warnings, so the imputation could be skipped without any sign.
for col in data.columns:
    if col != 'Outcome':
        data[col] = data[col].fillna(data[col].median())

In [27]:
# Summary statistics of every column after imputation.
# Check the `min` row in particular: the recorded output shows negative
# minima for Pregnancies, BloodPressure, SkinThickness and BMI.
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.837263,119.88347,68.886078,20.38694,79.799479,31.715288,0.471876,33.643542,0.348958
std,3.424073,32.033292,19.427448,15.750158,115.244002,8.483365,0.331329,12.166668,0.476951
min,-5.412815,0.0,-3.496455,-11.94552,0.0,-16.288921,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.275,0.24375,24.0,0.0
50%,3.0,116.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,139.0,80.0,32.0,127.25,36.425,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [28]:
# Keep only rows that satisfy all four constraints: a non-negative
# Pregnancies count and strictly positive BloodPressure, SkinThickness and BMI.
# The surviving rows keep their original index labels (the index is not reset).
# NOTE(review): Glucose and Insulin are not constrained here, so zero readings
# in those columns are kept — confirm that is intended.
data = data.loc[
    data['Pregnancies'].ge(0)
    & data['BloodPressure'].gt(0)
    & data['SkinThickness'].gt(0)
    & data['BMI'].gt(0)
]
# Re-check the summary statistics now that the rows have been filtered.
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,537.0,537.0,537.0,537.0,537.0,537.0,537.0,537.0,537.0
mean,3.542454,118.958363,71.40607,28.735203,113.216015,32.867959,0.500972,32.000917,0.337058
std,3.283201,32.909968,12.499787,10.509377,123.165198,6.955179,0.344106,11.105318,0.473146
min,0.0,0.0,15.372031,7.0,0.0,9.639841,0.085,21.0,0.0
25%,1.0,98.0,64.0,22.0,0.0,27.9,0.259,24.0,0.0
50%,3.0,115.0,71.604032,28.0,88.0,32.8,0.412,28.0,0.0
75%,5.0,139.0,80.0,36.0,165.0,36.9,0.658,38.0,1.0
max,17.0,199.0,110.0,99.0,846.0,67.1,2.42,81.0,1.0


In [29]:
# Separate the label from the features: `Y` is the 'Outcome' column and `x`
# is every column except the last one (which is 'Outcome' in this dataset).
Y = data.loc[:, 'Outcome']
x = data.loc[:, data.columns[:-1]]

In [30]:
# Display the dtype of each feature column (float64 is expected throughout).
# This is an inspection only — nothing here enforces or converts the dtypes.
x.dtypes

Pregnancies                 float64
Glucose                     float64
BloodPressure               float64
SkinThickness               float64
Insulin                     float64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                         float64
dtype: object

In [31]:
# Scale every feature column into [0, 1] with a min-max scaler; the columns
# have different units, so this puts them on a common range.
# NOTE(review): the scaler is fit on all rows before the train/test split
# further down, so test-set minima and maxima influence the scaling — consider
# fitting on the training rows only.
cols = x.columns
mm_scaler = MinMaxScaler()
# fit_transform returns a bare ndarray, so both the column names and the row
# index are passed back explicitly. Without `index=x.index` the rebuilt frame
# would get a fresh 0..n-1 index while `Y` keeps the original row labels, and
# any later index-aligned operation on the features and the labels
# (concat, boolean masks) would silently pair up the wrong rows.
x = pd.DataFrame(mm_scaler.fit_transform(x), columns=cols, index=x.index)
x.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,537.0,537.0,537.0,537.0,537.0,537.0,537.0,537.0
mean,0.20838,0.597781,0.592151,0.236252,0.133825,0.404247,0.178146,0.183349
std,0.193129,0.165377,0.132094,0.114232,0.145585,0.121044,0.147369,0.185089
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.058824,0.492462,0.513886,0.163043,0.0,0.317788,0.074518,0.05
50%,0.176471,0.577889,0.594243,0.228261,0.104019,0.403065,0.140043,0.116667
75%,0.294118,0.698492,0.682969,0.315217,0.195035,0.474418,0.245396,0.283333
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [32]:
# Hold out 30% of the rows as the test set and train on the remaining 70%.
# Rows are shuffled before splitting, with a fixed seed so the split is the
# same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    Y,
    test_size=0.3,
    random_state=15,
    shuffle=True,
)