## Feature Scaling - Normalization & Standardization

In [1]:
import pandas as pd

In [2]:
# Toy dataset: two numeric features on very different scales.
data = dict(
    Age=[25, 40, 65, 30, 55],
    Salary=[50000, 80000, 150000, 60000, 120000],
)
df = pd.DataFrame(data)
df

Unnamed: 0,Age,Salary
0,25,50000
1,40,80000
2,65,150000
3,30,60000
4,55,120000


## Normalization
- Compute the min and max of the column, then rescale every value into the range [0, 1]:

$$x_{norm} = \frac{x - x_{min}}{x_{max} - x_{min}}$$

In [3]:
# Range endpoints of the Age column, needed by the min-max formula.
age_min, age_max = df['Age'].min(), df['Age'].max()
(age_min, age_max)

(25, 65)

In [4]:
# Min-max normalize Age: shift by the minimum, then divide by the range,
# so the smallest age maps to 0 and the largest to 1.
df['Age_norm'] = df['Age'].sub(age_min).div(age_max - age_min)
df

Unnamed: 0,Age,Salary,Age_norm
0,25,50000,0.0
1,40,80000,0.375
2,65,150000,1.0
3,30,60000,0.125
4,55,120000,0.75


In [6]:
# Min-max normalize Salary with the same recipe used for Age.
salary_min, salary_max = df['Salary'].min(), df['Salary'].max()
df['Salary_norm'] = df['Salary'].sub(salary_min).div(salary_max - salary_min)
df

Unnamed: 0,Age,Salary,Age_norm,Salary_norm
0,25,50000,0.0,0.0
1,40,80000,0.375,0.3
2,65,150000,1.0,1.0
3,30,60000,0.125,0.1
4,55,120000,0.75,0.7


In [7]:
! pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.2-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--


In [8]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# fit_transform learns each column's min/max and rescales it in a single call.
# The result is stored beside the manually computed columns for comparison.
scaled_values = scaler.fit_transform(df[['Age', 'Salary']])
df[['Age_norm_sk', 'Salary_norm_sk']] = scaled_values
df

Unnamed: 0,Age,Salary,Age_norm,Salary_norm,Age_norm_sk,Salary_norm_sk
0,25,50000,0.0,0.0,0.0,0.0
1,40,80000,0.375,0.3,0.375,0.3
2,65,150000,1.0,1.0,1.0,1.0
3,30,60000,0.125,0.1,0.125,0.1
4,55,120000,0.75,0.7,0.75,0.7


## Standardization
Subtract the column mean and divide by the column standard deviation (z-score):

$$z = \frac{x - \mu}{\sigma}$$

In [9]:
# Rebuild the frame from the raw columns so this section starts without
# the normalization columns added earlier.
data = {
    'Age': [25, 40, 65, 30, 55],
    'Salary': [50_000, 80_000, 150_000, 60_000, 120_000],
}
df = pd.DataFrame(data)
df

Unnamed: 0,Age,Salary
0,25,50000
1,40,80000
2,65,150000
3,30,60000
4,55,120000


In [11]:
# Mean of the raw Age column. Computed directly from df so this cell does not
# depend on `age_mean`, which is only assigned in a later cell (a fresh
# Restart & Run All would otherwise fail here with NameError).
df['Age'].mean()

43.0

In [10]:
# Standardize Age: centre on the mean, then scale by the standard deviation.
# Note: pandas' .std() defaults to the sample standard deviation (ddof=1).
age_mean, age_std = df['Age'].mean(), df['Age'].std()
df['Age_std'] = df['Age'].sub(age_mean).div(age_std)
df

Unnamed: 0,Age,Salary,Age_std
0,25,50000,-1.070935
1,40,80000,-0.178489
2,65,150000,1.308921
3,30,60000,-0.773453
4,55,120000,0.713957


In [12]:
# Mean before vs. after standardization; the standardized column should be centred on 0.
tuple(df[col].mean() for col in ('Age', 'Age_std'))

(43.0, 0.0)

In [13]:
# Standardize Salary: centre on the mean, then scale by the standard deviation
# (pandas' .std() defaults to the sample standard deviation, ddof=1).
Salary_mean, Salary_std = df['Salary'].mean(), df['Salary'].std()
df['Salary_std'] = df['Salary'].sub(Salary_mean).div(Salary_std)
df

Unnamed: 0,Age,Salary,Age_std,Salary_std
0,25,50000,-1.070935,-0.998304
1,40,80000,-0.178489,-0.28523
2,65,150000,1.308921,1.37861
3,30,60000,-0.773453,-0.760612
4,55,120000,0.713957,0.665536


In [14]:
# Mean before vs. after standardization; the standardized column should be (numerically) 0.
tuple(df[col].mean() for col in ('Salary', 'Salary_std'))

(92000.0, -2.2204460492503132e-17)

In [None]:
# -2.2204460492503132e-17 = -0.00000000000000002220446049
# This is floating-point rounding error: the mean of Salary_std is effectively 0.

In [16]:
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()

# StandardScaler divides by the population standard deviation (ddof=0), while the
# manual Age_std / Salary_std columns used pandas' .std() (ddof=1). That is why
# the *_sk values below are slightly larger in magnitude than the manual ones.
standardized_values = std_scaler.fit_transform(df[['Age', 'Salary']])
df[['Age_std_sk', 'Salary_std_sk']] = standardized_values
df

Unnamed: 0,Age,Salary,Age_std,Salary_std,Age_std_sk,Salary_std_sk
0,25,50000,-1.070935,-0.998304,-1.197342,-1.116137
1,40,80000,-0.178489,-0.28523,-0.199557,-0.318896
2,65,150000,1.308921,1.37861,1.463418,1.541333
3,30,60000,-0.773453,-0.760612,-0.864747,-0.85039
4,55,120000,0.713957,0.665536,0.798228,0.744092


In [17]:
# Mean of the raw column vs. the scikit-learn standardized column (expected to be 0).
tuple(df[col].mean() for col in ('Salary', 'Salary_std_sk'))

(92000.0, 0.0)

[Skewness Transformation](https://anatomisebiostats.com/biostatistics-blog/transforming-skewed-data/)

In [19]:
import math

# Negated square root of 5.
-math.sqrt(5)

-2.23606797749979