In [40]:
# all imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings("ignore")

# load the dataset and shuffle the rows with a fixed seed so the order is reproducible
data = (
    pd.read_csv('../data/Pima_Indian_diabetes.csv')
    .sample(frac=1, random_state=15)
    .reset_index(drop=True)
)
print(data.shape, data.columns, sep='\n')

# label is the 'Outcome' column; features are every column before the last one
Y = data['Outcome']
x = data.iloc[:, :-1]
# same feature set minus the first column (Pregnancies)
x_without_preg = data.iloc[:, 1:-1]
data.head()

(768, 9)
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,5.0,122.0,86.0,0.0,0.0,34.7,0.29,33.0,0
1,2.0,175.0,88.0,0.0,0.0,22.9,0.326,22.0,0
2,4.0,129.0,86.0,,270.0,35.1,0.231,23.0,0
3,12.0,,62.0,7.0,258.0,27.6,0.926,44.0,1
4,3.0,102.0,44.0,20.0,94.0,30.8,0.4,26.0,0


In [41]:
# Replace missing values with the median of their own column.
# One vectorised fillna rebinds `x` to a new, fully imputed frame. The previous
# per-column `x[col].fillna(..., inplace=True)` acted on a temporary Series:
# that chained in-place pattern is deprecated in pandas 2.x and leaves `x`
# unchanged under Copy-on-Write, so the NaNs could silently survive.
x = x.fillna(x.median())

In [42]:
# preview the first rows of x to eyeball the result of the median imputation
x.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,5.0,122.0,86.0,0.0,0.0,34.7,0.29,33.0
1,2.0,175.0,88.0,0.0,0.0,22.9,0.326,22.0
2,4.0,129.0,86.0,23.0,270.0,35.1,0.231,23.0
3,12.0,116.0,62.0,7.0,258.0,27.6,0.926,44.0
4,3.0,102.0,44.0,20.0,94.0,30.8,0.4,26.0


In [43]:
# confirm that the datatype of every feature column is float64
# (this only displays the dtypes; nothing is enforced here)
x.dtypes

Pregnancies                 float64
Glucose                     float64
BloodPressure               float64
SkinThickness               float64
Insulin                     float64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                         float64
dtype: object

In [44]:
# Rescale every feature column with a min-max scaler: the columns are measured
# in different units, and this maps each of them onto the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set minima/maxima influence the scaling; consider fitting on the
# training rows only.
cols = x.columns
mm_scaler = MinMaxScaler()
# fit_transform returns a plain array, so wrap it back into a DataFrame with the original column names
x = pd.DataFrame(mm_scaler.fit_transform(x), columns=cols)
x.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.464592,0.613065,0.713139,0.10767,0.0,0.611459,0.090521,0.2
1,0.33074,0.879397,0.729076,0.10767,0.0,0.469954,0.105892,0.016667
2,0.419975,0.648241,0.713139,0.314979,0.319149,0.616256,0.065329,0.033333
3,0.776913,0.582915,0.521899,0.170764,0.304965,0.526316,0.362084,0.383333
4,0.375357,0.512563,0.378468,0.287939,0.111111,0.56469,0.137489,0.083333


In [45]:
# summary statistics of the scaled features; each column should now have min 0 and max 1
x.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,0.412714,0.602429,0.57677,0.291426,0.094326,0.575667,0.168179,0.210726
std,0.152773,0.160971,0.154805,0.141963,0.136222,0.101733,0.141473,0.202778
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.286123,0.497487,0.521899,0.10767,0.0,0.522419,0.070773,0.05
50%,0.375357,0.582915,0.601582,0.314979,0.036052,0.579081,0.125747,0.133333
75%,0.509209,0.698492,0.665329,0.3961,0.150414,0.632145,0.234095,0.333333
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [46]:
# hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    x,
    Y,
    test_size=0.3,
    random_state=15,
)