In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Binarizer

In [2]:
# Load the Pima Indians diabetes dataset.
# A forward slash is used as the separator: it works on every OS (Windows
# included) and avoids "\d" being parsed as an invalid string escape sequence.
pima_df = pd.read_csv("raw_data/diabetes.csv")

In [3]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
pima_df.shape

(768, 9)

In [4]:
# List the column labels; 'Outcome' is the column later used as the target.
pima_df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [5]:
# Peek at the first rows to eyeball the raw (unscaled) values.
pima_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Separate the label column from the predictor columns.
# (No scaling happens in this cell; the scalers are applied in later cells.)

# y: the 'Outcome' label; X: every remaining column.
y = pima_df['Outcome']
X = pima_df.drop(columns='Outcome')

In [7]:
# Confirm that 'Outcome' is no longer among the feature columns.
X.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [10]:
# Display the target Series (pandas abbreviates the middle of long output).
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [11]:
# Min-max scaling: map every feature column onto the [0, 1] interval.
# The scaler learns per-column min/max from X, then applies them to X.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(X)
rescaled_X = scaler.transform(X)

In [15]:
# Preview the first five rows (all columns) of the rescaled array.
print(rescaled_X[:5])

[[0.35294118 0.74371859 0.59016393 0.35353535 0.         0.50074516
  0.23441503 0.48333333]
 [0.05882353 0.42713568 0.54098361 0.29292929 0.         0.39642325
  0.11656704 0.16666667]
 [0.47058824 0.91959799 0.52459016 0.         0.         0.34724292
  0.25362938 0.18333333]
 [0.05882353 0.44723618 0.54098361 0.23232323 0.11111111 0.41877794
  0.03800171 0.        ]
 [0.         0.68844221 0.32786885 0.35353535 0.19858156 0.64232489
  0.94363792 0.2       ]]


In [19]:
# Standardization: shift/scale each feature column to mean 0 and stdev 1.
# NOTE: this reassigns `scaler` and `rescaled_X`, replacing the min-max results.
scaler = StandardScaler()
rescaled_X = scaler.fit_transform(X)

# Show the first five standardized rows, printed with two decimals.
# (set_printoptions changes NumPy's global print settings.)
np.set_printoptions(precision=2)
print(rescaled_X[:5])

[[ 0.64  0.85  0.15  0.91 -0.69  0.2   0.47  1.43]
 [-0.84 -1.12 -0.16  0.53 -0.69 -0.68 -0.37 -0.19]
 [ 1.23  1.94 -0.26 -1.29 -0.69 -1.1   0.6  -0.11]
 [-0.84 -1.   -0.16  0.15  0.12 -0.49 -0.92 -1.04]
 [-1.14  0.5  -1.5   0.91  0.77  1.41  5.48 -0.02]]


In [22]:
# Normalization: rescale each row (sample) to a length of 1
# (Normalizer's default is the L2 norm).
# NOTE: `scaler` is reassigned here as well.
scaler = Normalizer()
normalized_X = scaler.fit_transform(X)

# Show the first five normalized rows, printed with two decimals.
np.set_printoptions(precision=2)
print(normalized_X[:5])

[[0.03 0.83 0.4  0.2  0.   0.19 0.   0.28]
 [0.01 0.72 0.56 0.24 0.   0.22 0.   0.26]
 [0.04 0.92 0.32 0.   0.   0.12 0.   0.16]
 [0.01 0.59 0.44 0.15 0.62 0.19 0.   0.14]
 [0.   0.6  0.17 0.15 0.73 0.19 0.01 0.14]]


In [24]:
# Binarization (thresholding): values above the threshold become 1,
# everything else becomes 0. With threshold=0.0 this flags non-zero entries.
binarizer = Binarizer(threshold=0.0)
binary_X = binarizer.fit_transform(X)

# Show the first five binarized rows.
np.set_printoptions(precision=2)
print(binary_X[:5])

[[1. 1. 1. 1. 0. 1. 1. 1.]
 [1. 1. 1. 1. 0. 1. 1. 1.]
 [1. 1. 1. 0. 0. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 1. 1. 1. 1. 1. 1. 1.]]
