# Data normalization (Pima Indians Diabetes Database)

In [14]:
import pandas as pd
import numpy as np

In [15]:
df = pd.read_csv("data_task_3/diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [16]:
df.isin([0]).sum()

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64

### As we can see, a lot of values in our dataset missing values (zeros). To fix this, we can fill the missing values with the mean of columns.

In [17]:
cols_to_fill = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

for col in cols_to_fill:
    mean = df[col].mean()
    df.loc[df[col] == 0, col] = mean

In [18]:
df.isin([0]).sum()

Pregnancies                   0
Glucose                       0
BloodPressure                 0
SkinThickness                 0
Insulin                       0
BMI                           0
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64

### Now, we can also normalize the data using min-max normalization

In [21]:
df = (df - df.mean()) / (df.max() - df.min())
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.099951,0.169796,-0.0026,0.091234,-0.046708,0.023501,0.066236,0.279319,0.651042
1,-0.212549,-0.236656,-0.063825,0.026017,-0.046708,-0.119648,-0.051612,-0.037348,-0.348958
2,0.224951,0.395603,-0.084233,-0.065978,-0.046708,-0.187133,0.08545,-0.020681,0.651042
3,-0.212549,-0.210849,-0.063825,-0.039201,-0.02964,-0.088974,-0.130178,-0.204015,-0.348958
4,-0.034733,0.098828,-0.329131,0.091234,0.059303,0.217775,0.775458,-0.004015,0.651042
