**Student:** Michele Cristina Otta

# Regression with Structured Data
Data Science Track -> Solve the California Housing Prices

In [102]:
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold, cross_val_score
import numpy as np

# load california housing prices dataset
california_housing = fetch_california_housing(as_frame=True)

x = california_housing.data # features
y = california_housing.target # target

print(x)
print(y)

       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
...       ...       ...       ...        ...         ...       ...       ...   
20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48   
20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49   
20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43   
20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43   
20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37   

       Longitude  
0        -122.23  
1

In [103]:
# use 5 fold cross-validation
k_folds = KFold(n_splits = 5)

In [104]:
comparison = {}
comparison['models'] = []
comparison['MSE'] = []
comparison['MAE'] = []

## Linear Regression and Decision Tree models



In [105]:
linear_regression = LinearRegression()
tree_regression = DecisionTreeRegressor()
models = {
    "Linear Regression": linear_regression,
    "Decision Tree": tree_regression}

for name, model in models.items():
  mse = []
  mae = []

  for train_index, test_index in k_folds.split(x, y):
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(x_train, y_train)
    predict_scores = model.predict(x_test)

    mse.append(mean_squared_error(y_test, predict_scores))
    mae.append(mean_absolute_error(y_test, predict_scores))

  print(f"\n======== {name} ========")
  print(f"Mean Squared Error (MSE): {np.mean(mse):.2f} -> {mse}")
  print(f"Mean Absolute Error (MAE): {np.mean(mae):.2f} -> {mae}")

  comparison['models'].append(name)
  comparison['MSE'].append(np.mean(mse))
  comparison['MAE'].append(np.mean(mae))


Mean Squared Error (MSE): 0.56 -> [0.4848585674569719, 0.6224973867350158, 0.6462104728579943, 0.5431995961544829, 0.49468483563880783]
Mean Absolute Error (MAE): 0.55 -> [0.5459943859754653, 0.566178204708811, 0.5765495192042878, 0.5319061446661371, 0.5168526993787032]

Mean Squared Error (MSE): 0.80 -> [0.7936397176394138, 0.6925734895643169, 0.8343677688544816, 0.8918081731882994, 0.812175424781153]
Mean Absolute Error (MAE): 0.62 -> [0.6249870518410852, 0.5729455305232558, 0.6270194113372092, 0.6373912621124032, 0.6338807315891474]


## Linear Regression and Decision Tree with Power Transformers

In [106]:
# using power transform
from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer()

print(x)
pt_x = pt.fit_transform(x)
print(pt_x)

       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
...       ...       ...       ...        ...         ...       ...       ...   
20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48   
20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49   
20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43   
20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43   
20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37   

       Longitude  
0        -122.23  
1

In [107]:
linear_regression = LinearRegression()
tree_regression = DecisionTreeRegressor()
models = {
    "Linear Regression + Power Transformer": linear_regression,
    "Decision Tree + Power Transformer": tree_regression}

for name, model in models.items():
  mse = []
  mae = []

  for train_index, test_index in k_folds.split(pt_x, y):
    x_train, x_test = pt_x[train_index], pt_x[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(x_train, y_train)
    predict_scores = model.predict(x_test)

    mse.append(mean_squared_error(y_test, predict_scores))
    mae.append(mean_absolute_error(y_test, predict_scores))

  print(f"\n======== {name} ========")
  print(f"Mean Squared Error (MSE): {np.mean(mse):.2f} -> {mse}")
  print(f"Mean Absolute Error (MAE): {np.mean(mae):.2f} -> {mae}")

  comparison['models'].append(name)
  comparison['MSE'].append(np.mean(mse))
  comparison['MAE'].append(np.mean(mae))


Mean Squared Error (MSE): 0.61 -> [0.5406814643245412, 0.5795036219601548, 0.6030139774453241, 0.6662908352657437, 0.6629242682498596]
Mean Absolute Error (MAE): 0.60 -> [0.581840907592451, 0.5593360245679709, 0.5805013686094037, 0.6592601423173615, 0.5974551484695337]

Mean Squared Error (MSE): 0.79 -> [0.8042189745927809, 0.6548308530083091, 0.8040464056709302, 0.9213804524857557, 0.7679365081194768]
Mean Absolute Error (MAE): 0.62 -> [0.6259728439922481, 0.5551138154069767, 0.630334777131783, 0.6613791472868216, 0.6169703779069767]


## Linear Regression and Decision Tree with Z-Scale

In [108]:
# using z-scale
from sklearn.preprocessing import StandardScaler

z_scaler = StandardScaler()

print(x)
z_x = z_scaler.fit_transform(x)
print(z_x)

       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
...       ...       ...       ...        ...         ...       ...       ...   
20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48   
20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49   
20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43   
20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43   
20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37   

       Longitude  
0        -122.23  
1

In [109]:
linear_regression = LinearRegression()
tree_regression = DecisionTreeRegressor()
models = {
    "Linear Regression + Z-scale": linear_regression,
    "Decision Tree + Z-scale": tree_regression}

for name, model in models.items():
  mse = []
  mae = []

  for train_index, test_index in k_folds.split(z_x, y):
    x_train, x_test = z_x[train_index], z_x[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(x_train, y_train)
    predict_scores = model.predict(x_test)

    mse.append(mean_squared_error(y_test, predict_scores))
    mae.append(mean_absolute_error(y_test, predict_scores))

  print(f"\n======== {name} ========")
  print(f"Mean Squared Error (MSE): {np.mean(mse):.2f} -> {mse}")
  print(f"Mean Absolute Error (MAE): {np.mean(mae):.2f} -> {mae}")

  comparison['models'].append(name)
  comparison['MSE'].append(np.mean(mse))
  comparison['MAE'].append(np.mean(mae))


Mean Squared Error (MSE): 0.56 -> [0.48485856745697514, 0.6224973867350154, 0.6462104728579944, 0.5431995961544843, 0.494684835638808]
Mean Absolute Error (MAE): 0.55 -> [0.5459943859754685, 0.5661782047088115, 0.5765495192042885, 0.5319061446661392, 0.5168526993787046]

Mean Squared Error (MSE): 0.83 -> [0.7750602750668604, 0.6832982415263081, 0.8217852197125969, 1.019164623741521, 0.8354064951106347]
Mean Absolute Error (MAE): 0.62 -> [0.6148116763565891, 0.5681938129844962, 0.6229000387596899, 0.6758620397286821, 0.6403024878875969]


## Linear Regression and Decision Tree with Min-Max Scale

In [110]:
# using min-max scale
from sklearn.preprocessing import MinMaxScaler

mm_scaler = MinMaxScaler()

print(x)
mm_x = mm_scaler.fit_transform(x)
print(mm_x)

       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
...       ...       ...       ...        ...         ...       ...       ...   
20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48   
20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49   
20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43   
20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43   
20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37   

       Longitude  
0        -122.23  
1

In [111]:
linear_regression = LinearRegression()
tree_regression = DecisionTreeRegressor()
models = {
    "Linear Regression + Min-Max Scale": linear_regression,
    "Decision Tree + Min-Max Scale": tree_regression}

for name, model in models.items():
  mse = []
  mae = []

  for train_index, test_index in k_folds.split(mm_x, y):
    x_train, x_test = mm_x[train_index], mm_x[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(x_train, y_train)
    predict_scores = model.predict(x_test)

    mse.append(mean_squared_error(y_test, predict_scores))
    mae.append(mean_absolute_error(y_test, predict_scores))

  print(f"\n======== {name} ========")
  print(f"Mean Squared Error (MSE): {np.mean(mse):.2f} -> {mse}")
  print(f"Mean Absolute Error (MAE): {np.mean(mae):.2f} -> {mae}")

  comparison['models'].append(name)
  comparison['MSE'].append(np.mean(mse))
  comparison['MAE'].append(np.mean(mae))


Mean Squared Error (MSE): 0.56 -> [0.4848585674569737, 0.6224973867350136, 0.6462104728579886, 0.5431995961544843, 0.4946848356388081]
Mean Absolute Error (MAE): 0.55 -> [0.5459943859754675, 0.5661782047088127, 0.5765495192042872, 0.5319061446661437, 0.5168526993787047]

Mean Squared Error (MSE): 0.83 -> [0.7877766668886144, 0.7089209262796753, 0.7755831482054748, 1.0115795885770105, 0.8531999641269864]
Mean Absolute Error (MAE): 0.62 -> [0.6159802858527132, 0.5737163880813954, 0.6066507315891474, 0.6741428270348837, 0.6436673013565891]


## Final comparison and analysis

In [112]:
import pandas as pd
df = pd.DataFrame(comparison)
df.set_index('models', inplace=True)
display(df)

Unnamed: 0_level_0,MSE,MAE
models,Unnamed: 1_level_1,Unnamed: 2_level_1
Linear Regression,0.55829,0.547496
Decision Tree,0.804913,0.619245
Linear Regression + Power Transformer,0.610483,0.595679
Decision Tree + Power Transformer,0.790483,0.617954
Linear Regression + Z-scale,0.55829,0.547496
Decision Tree + Z-scale,0.826943,0.624414
Linear Regression + Min-Max Scale,0.55829,0.547496
Decision Tree + Min-Max Scale,0.827412,0.622832


In [114]:
import pandas as pd
df = pd.DataFrame(comparison)
df.set_index('models', inplace=True)
display(df)

Unnamed: 0_level_0,MSE,MAE
models,Unnamed: 1_level_1,Unnamed: 2_level_1
Linear Regression,0.55829,0.547496
Decision Tree,0.804913,0.619245
Linear Regression + Power Transformer,0.610483,0.595679
Decision Tree + Power Transformer,0.790483,0.617954
Linear Regression + Z-scale,0.55829,0.547496
Decision Tree + Z-scale,0.826943,0.624414
Linear Regression + Min-Max Scale,0.55829,0.547496
Decision Tree + Min-Max Scale,0.827412,0.622832


### Conclusion
**Linear Regression**
* With Z-scale and Min-Max Scaling the results remained similar to the original one, so possibly the original dataset was already scaled for Linear Regression.
* However, with Power Transformer, there was a significant deterioration in the results.
  * MSE = 0.56 -> 0.61
  * MAE = 0.55 -> 0.60

**Decision Tree**
* Decision Tree showed slightly improvement in MSE results with Power Transformer.
  * MSE = 0.80 -> 0.79
* With Z-scale and Min-Max Scaling there were minor differences (worsening the performance).

# References

* [The California housing dataset](https://inria.github.io/scikit-learn-mooc/python_scripts/datasets_california_housing.html)
* [PowerTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html)
* [PowerTransformer in scikit-learn](https://www.geeksforgeeks.org/powertransformer-in-scikit-learn/)
* [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)
