Normalization

In [5]:
import sys, os
lib_path = os.path.abspath('../libs')
sys.path.append(lib_path)

In [6]:
from InputOptimization import InputOptimization

In [7]:
io = InputOptimization()

In [8]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [9]:
df = pd.read_csv('../00_Data/05_Encoded.csv')
df.head()

Unnamed: 0,Gender,Generation,Term Sub Reason,Employee Grade,Location,Career Bucket,Age Bucket,Manager ID,Talent,Department,active_months
0,1,2,1,1,1,1,2,102292,5,2,59.0
1,1,2,1,2,1,2,2,100411,5,3,4.0
2,0,2,1,1,1,1,2,102304,5,5,26.0
3,0,3,1,3,1,3,4,102324,5,7,126.0
4,0,2,11,2,1,2,3,100619,5,7,126.0


x + 2y + z + p + q + r + 2a + 102292b + 5c + 2d = 59.0

Scaling:

1. Standard Scaler | Z Scaler
2. Min-Max Scaler 
3. Robust Scaler

1. Standard Scaler

In [10]:
df['Manager ID'].values.reshape(-1, 1)

array([[102292],
       [100411],
       [102304],
       [102324],
       [100619],
       [100846],
       [102324],
       [100598],
       [100726],
       [100667],
       [101059],
       [101321],
       [101379],
       [102350],
       [100188],
       [100416],
       [100439],
       [101040],
       [100254],
       [101571],
       [100543],
       [100620],
       [100667],
       [100754],
       [101059],
       [101379],
       [101444],
       [101569],
       [102366],
       [100296],
       [101540],
       [100282],
       [102367],
       [102368],
       [100794],
       [100439],
       [100650],
       [102387],
       [101288],
       [101377],
       [101444],
       [101451],
       [100442],
       [100186],
       [100277],
       [102383],
       [101058],
       [101076],
       [101162],
       [101322],
       [101495],
       [101437],
       [101540],
       [100105],
       [100191],
       [100186],
       [100170],
       [100780],
       [100794

In [11]:
def std_scale(df:pd.DataFrame, columns:list):
    """Scale the numerical column using Standard Scaler"""

    df_std = df.copy()
    scaler = StandardScaler()

    for column in columns:
        df_std[column] = scaler.fit_transform(df_std[column].values.reshape(-1, 1))
        df_std[column] = df_std[column].apply(lambda x: round(x, 1))

    return df_std

In [12]:
df_std = std_scale(df, ['Manager ID'])
df_std.head()

Unnamed: 0,Gender,Generation,Term Sub Reason,Employee Grade,Location,Career Bucket,Age Bucket,Manager ID,Talent,Department,active_months
0,1,2,1,1,1,1,2,1.9,5,2,59.0
1,1,2,1,2,1,2,2,-1.0,5,3,4.0
2,0,2,1,1,1,1,2,1.9,5,5,26.0
3,0,3,1,3,1,3,4,1.9,5,7,126.0
4,0,2,11,2,1,2,3,-0.7,5,7,126.0


2. Min-Max Normalization

In [13]:
def min_max_scale(df:pd.DataFrame, columns:list):
    """Scaling numeric columns using Min-Max Normalization"""

    df_minmax = df.copy()
    scaler = MinMaxScaler()

    for column in columns:
        df_minmax[column] = scaler.fit_transform(df_minmax[column].values.reshape(-1, 1))
        df_minmax[column] = df_minmax[column].apply(lambda x: round(x, 1))

    return df_minmax


In [14]:
df_minmax = min_max_scale(df, ['Manager ID'])
df_minmax.head()

Unnamed: 0,Gender,Generation,Term Sub Reason,Employee Grade,Location,Career Bucket,Age Bucket,Manager ID,Talent,Department,active_months
0,1,2,1,1,1,1,2,0.8,5,2,59.0
1,1,2,1,2,1,2,2,0.1,5,3,4.0
2,0,2,1,1,1,1,2,0.8,5,5,26.0
3,0,3,1,3,1,3,4,0.9,5,7,126.0
4,0,2,11,2,1,2,3,0.2,5,7,126.0


3. Robust Scaler

In [15]:
def robust_scaler(df:pd.DataFrame, columns:list):
    """Scaling numeric columns using Robust Normalization"""

    df_robust = df.copy()
    scaler = RobustScaler()

    for column in columns:
        df_robust[column] = scaler.fit_transform(df_robust[column].values.reshape(-1, 1))
        df_robust[column] = df_robust[column].apply(lambda x: round(x, 1))

    return df_robust

In [16]:
df_robust = robust_scaler(df, ['Manager ID'])
df_robust.head()

Unnamed: 0,Gender,Generation,Term Sub Reason,Employee Grade,Location,Career Bucket,Age Bucket,Manager ID,Talent,Department,active_months
0,1,2,1,1,1,1,2,1.3,5,2,59.0
1,1,2,1,2,1,2,2,-0.7,5,3,4.0
2,0,2,1,1,1,1,2,1.3,5,5,26.0
3,0,3,1,3,1,3,4,1.3,5,7,126.0
4,0,2,11,2,1,2,3,-0.5,5,7,126.0


General Validation

In [17]:
# Profile Report
io.generate_profile_report(df_std, '../01_EDA/reports/', 'Normalized')

Summarize dataset: 100%|██████████| 56/56 [00:01<00:00, 37.76it/s, Completed]                               
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  1.54it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.04it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 240.15it/s]


In [18]:
# Validate Type
io.validate_types(df_std)

Unnamed: 0,Column,Data Type,Unique Values
10,active_months,numerical,130
9,Department,numerical,58
7,Manager ID,numerical,38
2,Term Sub Reason,numerical,16
6,Age Bucket,numerical,7
4,Location,numerical,6
8,Talent,numerical,5
1,Generation,numerical,4
3,Employee Grade,numerical,3
5,Career Bucket,numerical,3


In [19]:
df_std

Unnamed: 0,Gender,Generation,Term Sub Reason,Employee Grade,Location,Career Bucket,Age Bucket,Manager ID,Talent,Department,active_months
0,1,2,1,1,1,1,2,1.9,5,2,59.0
1,1,2,1,2,1,2,2,-1.0,5,3,4.0
2,0,2,1,1,1,1,2,1.9,5,5,26.0
3,0,3,1,3,1,3,4,1.9,5,7,126.0
4,0,2,11,2,1,2,3,-0.7,5,7,126.0
...,...,...,...,...,...,...,...,...,...,...,...
547,0,2,1,1,1,2,2,1.7,5,58,53.0
548,0,3,11,2,6,3,5,2.5,5,15,133.0
549,0,2,1,1,1,2,2,0.9,5,21,39.0
550,0,2,1,1,1,2,2,0.7,5,21,48.0
