## Normalization (min-max scaling) and standardization (z-score scaling)

### sample dataset
#### Using built-in functions

In [1]:
# imports and sample dataset
import pandas as pd

# Small hand-made sample: seven rows, one list of values per column.
data = dict(
    Height=[150, 160, 165, 170, 175, 180, 185],
    weight=[50, 55, 60, 65, 70, 75, 80],
)

# Build the frame from the column -> values mapping and show it.
df = pd.DataFrame.from_dict(data)
print(df)

   Height  weight
0     150      50
1     160      55
2     165      60
3     170      65
4     175      70
5     180      75
6     185      80


In [3]:
# Only the first two columns of df are scaled; the derived columns are
# appended to df next to them.
source_columns = list(df.columns[:2])

# Normalizing using min-max: (x - min) / (max - min).
# The extremes are stored as col_min / col_max rather than min / max so the
# Python builtins min() and max() stay usable in every later cell.
for column in source_columns:
    col_min = df[column].min()
    col_max = df[column].max()

    df[f"normalized- {column}"] = (df[column] - col_min) / (col_max - col_min)

# Standardizing (z-score) using mean and standard deviation: (x - mean) / std.
# pandas' Series.std() defaults to the sample standard deviation (ddof=1), so
# these values differ slightly from scikit-learn's StandardScaler, which
# divides by the population standard deviation (ddof=0).
for column in source_columns:
    mean = df[column].mean()
    std = df[column].std()

    df[f"standardized- {column}"] = (df[column] - mean) / std

print(df)

   Height  weight  normalized- Height  normalized- weight  \
0     150      50            0.000000            0.000000   
1     160      55            0.285714            0.166667   
2     165      60            0.428571            0.333333   
3     170      65            0.571429            0.500000   
4     175      70            0.714286            0.666667   
5     180      75            0.857143            0.833333   
6     185      80            1.000000            1.000000   

   standardized- Height  standardized- weight  
0             -1.600278              -1.38873  
1             -0.770504              -0.92582  
2             -0.355617              -0.46291  
3              0.059270               0.00000  
4              0.474156               0.46291  
5              0.889043               0.92582  
6              1.303930               1.38873  


#### Using functions from the scikit-learn library

In [4]:
from sklearn.preprocessing import StandardScaler

# Standardize all rows of the first two columns of df.
leading_columns = df.columns[:2]
scaler = StandardScaler()
standardized = scaler.fit_transform(df[leading_columns])

# Put the scaled values back into a DataFrame under the same column labels.
df_standardized = pd.DataFrame(data=standardized, columns=leading_columns)
print("\nStandardized DataFrame:", df_standardized, sep="\n")


Standardized DataFrame:
     Height  weight
0 -1.728498    -1.5
1 -0.832240    -1.0
2 -0.384111    -0.5
3  0.064018     0.0
4  0.512148     0.5
5  0.960277     1.0
6  1.408406     1.5


In [5]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale all rows of the first two columns of df.
leading_columns = df.columns[:2]
normalizer = MinMaxScaler()
normalized = normalizer.fit_transform(df[leading_columns])

# Put the scaled values back into a DataFrame under the same column labels.
df_normalized = pd.DataFrame(data=normalized, columns=leading_columns)
print("\nNormalized DataFrame:", df_normalized, sep="\n")


Normalized DataFrame:
     Height    weight
0  0.000000  0.000000
1  0.285714  0.166667
2  0.428571  0.333333
3  0.571429  0.500000
4  0.714286  0.666667
5  0.857143  0.833333
6  1.000000  1.000000


### Open-source dataset (Boston housing dataset)

In [31]:
import pandas as pd

# Read the Boston housing CSV from the ./datasets directory.
df = pd.read_csv(filepath_or_buffer="./datasets/BostonHousing.csv")

# Preview the first five rows.
preview = df.head(n=5)
print(preview)

      crim    zn  indus  chas    nox     rm   age     dis  rad  tax  ptratio  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222     18.7   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222     18.7   

        b  lstat  medv  
0  396.90   4.98  24.0  
1  396.90   9.14  21.6  
2  392.83   4.03  34.7  
3  394.63   2.94  33.4  
4  396.90   5.33  36.2  


In [32]:
from sklearn.preprocessing import StandardScaler

# Fit a fresh z-score scaler on every column of df and scale it in one step.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df)

# Re-label the scaled values with the original column names.
df_standardized = pd.DataFrame(data=scaled_values, columns=df.columns)

# Show only the first rows.
print("\nStandardized DataFrame:", df_standardized.head(), sep="\n")


Standardized DataFrame:
       crim        zn     indus      chas       nox        rm       age  \
0 -0.419782  0.284830 -1.287909 -0.272599 -0.144217  0.413672 -0.120013   
1 -0.417339 -0.487722 -0.593381 -0.272599 -0.740262  0.194274  0.367166   
2 -0.417342 -0.487722 -0.593381 -0.272599 -0.740262  1.282714 -0.265812   
3 -0.416750 -0.487722 -1.306878 -0.272599 -0.835284  1.016303 -0.809889   
4 -0.412482 -0.487722 -1.306878 -0.272599 -0.835284  1.228577 -0.511180   

        dis       rad       tax   ptratio         b     lstat      medv  
0  0.140214 -0.982843 -0.666608 -1.459000  0.441052 -1.075562  0.159686  
1  0.557160 -0.867883 -0.987329 -0.303094  0.441052 -0.492439 -0.101524  
2  0.557160 -0.867883 -0.987329 -0.303094  0.396427 -1.208727  1.324247  
3  1.077737 -0.752922 -1.106115  0.113032  0.416163 -1.361517  1.182758  
4  1.077737 -0.752922 -1.106115  0.113032  0.441052 -1.026501  1.487503  


In [33]:
from sklearn.preprocessing import MinMaxScaler

# Fit a fresh min-max scaler on every column of df and scale it in one step.
normalizer = MinMaxScaler()
scaled_values = normalizer.fit_transform(df)

# Re-label the scaled values with the original column names.
df_normalized = pd.DataFrame(data=scaled_values, columns=df.columns)

# Show only the first rows.
print("\nNormalized DataFrame:", df_normalized.head(), sep="\n")


Normalized DataFrame:
       crim    zn     indus  chas       nox        rm       age       dis  \
0  0.000000  0.18  0.067815   0.0  0.314815  0.577505  0.641607  0.269203   
1  0.000236  0.00  0.242302   0.0  0.172840  0.547998  0.782698  0.348962   
2  0.000236  0.00  0.242302   0.0  0.172840  0.694386  0.599382  0.348962   
3  0.000293  0.00  0.063050   0.0  0.150206  0.658555  0.441813  0.448545   
4  0.000705  0.00  0.063050   0.0  0.150206  0.687105  0.528321  0.448545   

        rad       tax   ptratio         b     lstat      medv  
0  0.000000  0.208015  0.287234  1.000000  0.089680  0.422222  
1  0.043478  0.104962  0.553191  1.000000  0.204470  0.368889  
2  0.043478  0.104962  0.553191  0.989737  0.063466  0.660000  
3  0.086957  0.066794  0.648936  0.994276  0.033389  0.631111  
4  0.086957  0.066794  0.648936  1.000000  0.099338  0.693333  
