In [12]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler

In [2]:
# Fetch the California housing dataset, then pull out the feature matrix (x)
# and the target vector (y) used by the rest of the notebook.
housing = fetch_california_housing()
x = housing.data
y = housing.target

In [3]:
# Column-wise (per-feature) mean and standard deviation of the unscaled data,
# shown as a baseline for the scaled versions below.
print(f"Mean: {np.mean(x, axis=0)}")
print(f"Std: {np.std(x, axis=0)}")

Mean: [ 3.87067100e+00  2.86394864e+01  5.42899974e+00  1.09667515e+00
  1.42547674e+03  3.07065516e+00  3.56318614e+01 -1.19569704e+02]
Std: [1.89977569e+00 1.25852527e+01 2.47411320e+00 4.73899376e-01
 1.13243469e+03 1.03857980e+01 2.13590065e+00 2.00348319e+00]


**Standardization**
- Rescales each feature to mean 0 and standard deviation 1.

In [4]:
# Standardize every feature: learn the scaling statistics from x, then apply
# them to x.
scaler = StandardScaler()
scaler.fit(x)
x_scaled = scaler.transform(x)

# Sanity check on the per-feature statistics of the standardized data.
print(f"Mean: {np.mean(x_scaled, axis=0)}")  # expected: approximately 0
print(f"Std: {np.std(x_scaled, axis=0)}")    # expected: approximately 1

Mean: [ 5.50808322e-17  4.40646658e-17  7.71131651e-17 -1.00522519e-16
 -1.10161664e-17  0.00000000e+00  2.24729795e-15 -8.60362599e-15]
Std: [1. 1. 1. 1. 1. 1. 1. 1.]


**Normalization**
- Scale to [0, 1] range.

In [5]:
# Min-max normalization: fit the scaler on x and transform x in one chain.
scaler = MinMaxScaler()
x_normalized = scaler.fit(x).transform(x)

# Per-feature extremes of the normalized data.
print(f"Min: {np.min(x_normalized, axis=0)}")  # expected: 0 for every feature
print(f"Max: {np.max(x_normalized, axis=0)}")  # expected: 1 for every feature

Min: [0. 0. 0. 0. 0. 0. 0. 0.]
Max: [1. 1. 1. 1. 1. 1. 1. 1.]


**Robust Scaling**
- Centers each feature on its median and scales it by its interquartile range (IQR). Best for data with outliers.

In [6]:
# Robust scaling: RobustScaler centers each feature on its median and scales
# it by its interquartile range (IQR).
scaler = RobustScaler()
x_robust = scaler.fit_transform(x)

# The values printed here are the per-feature mean and standard deviation, so
# they are labelled as such (they were previously mislabelled "Min"/"Max").
# Because centering uses the median rather than the mean, these are not
# expected to be exactly 0 and 1.
print(f"Mean: {x_robust.mean(axis=0)}")
print(f"Std: {x_robust.std(axis=0)}")

Mean: [ 0.15407987 -0.0189744   0.12401522  0.51253279  0.27662766  0.29622715
  0.36292631 -0.28488244]
Std: [ 0.87151671  0.66238172  1.53512897  5.07131607  1.20728645 12.18247159
  0.56505308  0.52862353]


**MaxAbs Scaling**
- Divides each feature by its maximum absolute value, so every value falls in the [-1, 1] range.

In [7]:
# Max-abs scaling: fit the scaler on x and transform x in one chain.
scaler = MaxAbsScaler()
x_maxabs = scaler.fit(x).transform(x)

# Per-feature extremes of the scaled data; only the first five features are
# printed, so the remaining columns are not shown here.
print(f"Min: {np.min(x_maxabs, axis=0)[:5]}")
print(f"Max: {np.max(x_maxabs, axis=0)[:5]}")

Min: [3.33264445e-02 1.92307692e-02 5.96264722e-03 9.78473581e-03
 8.40760047e-05]
Max: [1. 1. 1. 1. 1.]


**Scaling Comparison**

In [16]:
# Compare the 5-fold cross-validated error of a linear regression trained on
# the raw features versus features passed through each scaler.
scalers = {
    'No Scaling': None,
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'RobustScaler': RobustScaler()
}
results = {}  # scaler name -> mean cross-validated score

for name, scaler in scalers.items():
    # Keep the scaler inside a pipeline so cross_val_score re-fits it on the
    # training portion of every fold. Fitting the scaler on the full dataset
    # before cross-validation would leak statistics from the held-out fold
    # into preprocessing.
    if scaler is None:
        model = LinearRegression()
    else:
        model = make_pipeline(scaler, LinearRegression())

    # The scorer returns the *negated* RMSE (scikit-learn's "higher is better"
    # convention), so values closer to 0 are better.
    scores = cross_val_score(model, x, y, cv=5, scoring='neg_root_mean_squared_error')
    results[name] = scores.mean()
    # Ordinary least squares with an intercept is unaffected by per-feature
    # affine rescaling, so near-identical scores across scalers are expected.
    print(f"{name:20s}: {scores.mean():.3f} (+/- {scores.std():.3f})")

No Scaling          : -0.746 (+/- 0.044)
StandardScaler      : -0.746 (+/- 0.044)
MinMaxScaler        : -0.746 (+/- 0.044)
RobustScaler        : -0.746 (+/- 0.044)
