In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the competition training set; the "id" column becomes the row index
# so it is not treated as a feature further down.
train = pd.read_csv(
    "/kaggle/input/playground-series-s4e3/train.csv",
    index_col="id",
)

The idea of this notebook is to check whether rescaling with standardization (mean centering and variance rescaling) or normalization (rescaling to the 0-1 range) changes the way features relate to each other (correlation).

Actually, due to rounding operations with floats, there are some slight differences that might lead to different models and results.

In [3]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Compare the feature correlation matrix before and after rescaling (float64).
# Both scalers apply a positive affine transformation to each column, and
# Pearson correlation is invariant to that, so any non-zero difference
# measured here is floating-point rounding error.
sscaler = StandardScaler()
mmscaler = MinMaxScaler()

original = train.values.astype(np.float64)
standardized = sscaler.fit_transform(original)
normalized = mmscaler.fit_transform(original)

# rowvar=False: columns are the variables (features), rows are observations.
mat_original = np.corrcoef(original, rowvar=False)
mat_std = np.corrcoef(standardized, rowvar=False)
mat_norm = np.corrcoef(normalized, rowvar=False)

# Take the absolute value before the max: a signed max would ignore every
# entry where the rescaled correlation is larger than the original one and
# could understate the largest deviation.
delta_std = np.max(np.abs(mat_original - mat_std))
delta_norm = np.max(np.abs(mat_original - mat_norm))

print(f"Max difference in correlation due to standardization: {delta_std}")
print(f"Max difference in correlation due to normalization: {delta_norm}")

Max difference in correlation due to standardization: 2.220446049250313e-15
Max difference in correlation due to normalization: 1.2212453270876722e-15


If we downcast the data to float32 before rescaling, the max change is even larger.

In [4]:
# Same comparison as the float64 cell, but with the data downcast to float32
# before rescaling. StandardScaler and MinMaxScaler are already imported in
# the earlier float64 cell, so the import is not repeated here.
sscaler = StandardScaler()
mmscaler = MinMaxScaler()

original = train.values.astype(np.float32)
standardized = sscaler.fit_transform(original)
normalized = mmscaler.fit_transform(original)

# rowvar=False: columns are the variables (features), rows are observations.
mat_original = np.corrcoef(original, rowvar=False)
mat_std = np.corrcoef(standardized, rowvar=False)
mat_norm = np.corrcoef(normalized, rowvar=False)

# Take the absolute value before the max: a signed max would ignore every
# entry where the rescaled correlation is larger than the original one and
# could understate the largest deviation.
delta_std = np.max(np.abs(mat_original - mat_std))
delta_norm = np.max(np.abs(mat_original - mat_norm))

print(f"Max difference in correlation due to standardization: {delta_std}")
print(f"Max difference in correlation due to normalization: {delta_norm}")

Max difference in correlation due to standardization: 1.4132742143235788e-08
Max difference in correlation due to normalization: 4.93559638714558e-08
