<a href="https://colab.research.google.com/github/marcelounb/ML-Mastery-with-Python-Course/blob/master/chap7_Prepare_Your_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import cv2
from cv2 import cvtColor
from google.colab.patches import cv2_imshow
import csv

from pandas import read_csv 
from numpy import set_printoptions 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Binarizer
from numpy import set_printoptions

In [0]:
# Load the diabetes CSV. Because explicit column names are supplied,
# read_csv treats every row of the file (including the first) as data.
filename = '/content/diabetes_moddd.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
dataframe = read_csv(filename, names=names)
array = dataframe.values

# Split the array: the first eight columns are the inputs (X) and the
# ninth column ('class') is the output (Y).
X, Y = array[:, :8], array[:, 8]

In [7]:
# Display the input feature matrix.
X

array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]])

# Rescale data

In [0]:
# When attributes are measured on very different scales, many machine
# learning algorithms benefit from bringing them onto a common scale.
# This is often called normalization; here every column is mapped into
# the range 0 to 1.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)
rescaledX = scaler.transform(X)
# Print arrays with three decimal places when summarizing the result.
set_printoptions(precision=3)

In [10]:
# Display the min-max rescaled features.
rescaledX 

array([[0.353, 0.744, 0.59 , ..., 0.501, 0.234, 0.483],
       [0.059, 0.427, 0.541, ..., 0.396, 0.117, 0.167],
       [0.471, 0.92 , 0.525, ..., 0.347, 0.254, 0.183],
       ...,
       [0.294, 0.608, 0.59 , ..., 0.39 , 0.071, 0.15 ],
       [0.059, 0.633, 0.492, ..., 0.449, 0.116, 0.433],
       [0.059, 0.467, 0.574, ..., 0.453, 0.101, 0.033]])

# Standardize data

In [14]:
# Standardization transforms attributes that follow a Gaussian distribution
# with differing means and standard deviations into a standard Gaussian
# (mean 0, standard deviation 1).
# It suits techniques that assume Gaussian inputs and work better with
# rescaled data, such as linear regression, logistic regression and
# linear discriminant analysis.
scaler = StandardScaler()
# NOTE: this reuses the name rescaledX, replacing the min-max result.
rescaledX = scaler.fit_transform(X)
# Print arrays with three decimal places when summarizing the result.
set_printoptions(precision=3)
# Echo the original, untransformed inputs.
X

array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]])

In [15]:
# Show the first five rows of the standardized data.
rescaledX[0:5,:]

array([[ 0.64 ,  0.848,  0.15 ,  0.907, -0.693,  0.204,  0.468,  1.426],
       [-0.845, -1.123, -0.161,  0.531, -0.693, -0.684, -0.365, -0.191],
       [ 1.234,  1.944, -0.264, -1.288, -0.693, -1.103,  0.604, -0.106],
       [-0.845, -0.998, -0.161,  0.155,  0.123, -0.494, -0.921, -1.042],
       [-1.142,  0.504, -1.505,  0.907,  0.766,  1.41 ,  5.485, -0.02 ]])

# Normalize data

In [17]:
 # Normalizing rescales each observation (row) so that it has a length of 1
 # (a unit norm, i.e. a vector of length 1 in linear algebra).
 # This pre-processing step can help with sparse datasets (lots of zeros)
 # whose attributes vary in scale, when using algorithms that weight input
 # values (such as neural networks) or rely on distance measures (such as
 # k-Nearest Neighbors).
scaler = Normalizer()
normalizedX = scaler.fit_transform(X)
# Print arrays with three decimal places when summarizing the result.
set_printoptions(precision=3)
# Echo the original, untransformed inputs.
X

array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]])

In [18]:
# Show the first five rows of the row-normalized data.
normalizedX[0:5,:]

array([[0.034, 0.828, 0.403, 0.196, 0.   , 0.188, 0.004, 0.28 ],
       [0.008, 0.716, 0.556, 0.244, 0.   , 0.224, 0.003, 0.261],
       [0.04 , 0.924, 0.323, 0.   , 0.   , 0.118, 0.003, 0.162],
       [0.007, 0.588, 0.436, 0.152, 0.622, 0.186, 0.001, 0.139],
       [0.   , 0.596, 0.174, 0.152, 0.731, 0.188, 0.01 , 0.144]])

# Binarize data

In [19]:
# Binarizing applies a threshold to the data: every value above the
# threshold becomes 1, and every value equal to or below it becomes 0.
# This is handy for turning probabilities into crisp values, and for
# feature engineering when a new indicator feature is wanted.
binarizer = Binarizer(threshold=0.0)
binaryX = binarizer.fit_transform(X)
# Print arrays with three decimal places when summarizing the result.
set_printoptions(precision=3)
# Echo the original, untransformed inputs.
X

array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]])

In [20]:
# Show the first five rows of the binarized data.
binaryX[0:5,:]

array([[1., 1., 1., 1., 0., 1., 1., 1.],
       [1., 1., 1., 1., 0., 1., 1., 1.],
       [1., 1., 1., 0., 0., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1.],
       [0., 1., 1., 1., 1., 1., 1., 1.]])