In [1]:
import numpy as np
import scipy as sp
import pandas as pd

In [64]:
import plotly.express as xp
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# 1. Rescale data

In [2]:
from sklearn.preprocessing import MinMaxScaler

In [3]:
data = np.random.randint(0,100,size = (20,4))

In [5]:
X = data[:,:3]
Y = data[:,-1]

In [6]:
X[:5,:]

array([[91, 22, 62],
       [46,  3, 70],
       [20,  9, 38],
       [87, 47, 69],
       [54, 88, 22]])

In [10]:
Y[:5]

array([67, 19, 77, 70, 47])

In [12]:
scaler = MinMaxScaler(feature_range=(0, 1))

1. Compute the minimum and maximum to be used for later scaling.

In [14]:
scaler.fit(X,Y)

2. Fit to data, then transform it

In [16]:
rescaled_X = scaler.fit_transform(X,Y)

In [30]:
fig = make_subplots(rows = 1, cols = 2, x_title= "Data", y_title= "Dispression",
                    column_titles=['Data','Scaled Data'], specs = [[{'type':'xy'}, {'type':'xy'}]])

fig.add_scatter(y = X[:,0], name = "Row Data X[:,0]", mode='markers', row=1, col = 1)
fig.add_scatter(y = X[:,1], name = "Row Data X[:,1]", mode='markers', row=1, col = 1)
fig.add_scatter(y = X[:,2], name = "Row Data X[:,2]", mode='markers', row=1, col = 1)
fig.add_scatter(y = rescaled_X[:,0], name = 'Scaled Data X[:,0]', mode='markers', row=1, col = 2)
fig.add_scatter(y = rescaled_X[:,1], name = 'Scaled Data X[:,1]', mode='markers', row=1, col = 2)
fig.add_scatter(y = rescaled_X[:,2], name = 'Scaled Data X[:,2]', mode='markers', row=1, col = 2)

fig.update_layout(dict(title = "Data Rescaling" ))

fig.show()

In [35]:
np.mean(X, axis = 0)

array([55.8 , 48.9 , 44.65])

In [36]:
np.mean(rescaled_X, axis = 0)

array([0.55106383, 0.51      , 0.5077381 ])

In [39]:
np.std(X, axis = 0, ddof = 1)

array([32.65545816, 29.80974761, 27.54188198])

In [40]:
np.std(rescaled_X, axis =0, ddof = 1)

array([0.34739849, 0.33121942, 0.32787955])

# 2. Standardize Data

In [41]:
from sklearn.preprocessing import StandardScaler

In [42]:
scaler = StandardScaler()

In [43]:
scaler.fit(X,Y)

In [44]:
scaler.mean_

array([55.8 , 48.9 , 44.65])

In [45]:
scaler.var_

array([1013.06  ,  844.19  ,  720.6275])

In [46]:
std_X = scaler.fit_transform(X,Y)

In [48]:
fig = make_subplots(rows = 1, cols = 2, x_title= "Data", y_title= "Dispression",
                    column_titles=['Data','Standardized Data'], specs = [[{'type':'xy'}, {'type':'xy'}]])

fig.add_scatter(y = X[:,0], name = "Row Data X[:,0]", mode='markers', row=1, col = 1)
fig.add_scatter(y = X[:,1], name = "Row Data X[:,1]", mode='markers', row=1, col = 1)
fig.add_scatter(y = X[:,2], name = "Row Data X[:,2]", mode='markers', row=1, col = 1)
fig.add_scatter(y = std_X[:,0], name = 'Standardized Data X[:,0]', mode='markers', row=1, col = 2)
fig.add_scatter(y = std_X[:,1], name = 'Standardized Data X[:,1]', mode='markers', row=1, col = 2)
fig.add_scatter(y = std_X[:,2], name = 'Standardized Data X[:,2]', mode='markers', row=1, col = 2)

fig.update_layout(dict(title = "Data Standardization" ))

fig.show()

In [49]:
np.mean(std_X, axis = 0)

array([1.04083409e-16, 4.44089210e-17, 5.55111512e-17])

In [50]:
np.std(std_X, axis = 0, ddof = 1)

array([1.02597835, 1.02597835, 1.02597835])

# 3. Normalize Data

In [51]:
from sklearn.preprocessing import Normalizer

In [65]:
scaler = Normalizer()

In [66]:
scaler.fit(X,Y)

In [67]:
norm_X = scaler.fit_transform(X,Y)

In [68]:
fig = make_subplots(rows = 1, cols = 2, x_title= "Data", y_title= "Dispression",
                    column_titles=['Data','Normalized Data'], specs = [[{'type':'xy'}, {'type':'xy'}]])

fig.add_scatter(y = X[:,0], name = "Row Data X[:,0]", mode='markers', row=1, col = 1)
fig.add_scatter(y = X[:,1], name = "Row Data X[:,1]", mode='markers', row=1, col = 1)
fig.add_scatter(y = X[:,2], name = "Row Data X[:,2]", mode='markers', row=1, col = 1)
fig.add_scatter(y = norm_X[:,0], name = 'Normalized Data X[:,0]', mode='markers', row=1, col = 2)
fig.add_scatter(y = norm_X[:,1], name = 'Normalized  Data X[:,1]', mode='markers', row=1, col = 2)
fig.add_scatter(y = norm_X[:,2], name = 'Normalized Data X[:,2]', mode='markers', row=1, col = 2)

fig.update_layout(dict(title = "Data Normalization" ))

fig.show()

In [69]:
np.mean(norm_X, axis = 0)

array([0.56120724, 0.51023092, 0.45779968])

In [70]:
np.std(norm_X, axis = 0, ddof = 1)

array([0.2545493 , 0.30004355, 0.26764055])

# 4. Binarize Data

In [58]:
from sklearn.preprocessing import Binarizer

In [59]:
scaler = Binarizer()

In [60]:
scaler.fit(X,Y)

In [62]:
bin_X = scaler.fit_transform(X,Y)

In [63]:
bin_X

array([[1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1]])