In [13]:
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import pandas as pd

Preliminary:

- Import the California Housing dataset and split it into a train set and a test set (10% of the data). Fit a linear regression on the dataset. _The goal is to focus on the metrics, which is why the code to fit the Linear Regression is given._


In [14]:
# Fetch the California Housing dataset and pull out the feature matrix and target vector.
housing = fetch_california_housing()
X = housing.data
y = housing.target

# Hold out 10% of the rows for testing; shuffling with a fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=43
)


1. Create 5 pipelines, each with a different model as its final estimator (keep the imputer and scaler unchanged):
   1. Linear Regression
   2. SVM
   3. Decision Tree (set `random_state=43`)
   4. Random Forest (set `random_state=43`)
   5. Gradient Boosting (set `random_state=43`)

In [15]:
def build_pipeline(step_name, estimator):
    """Return an unfitted pipeline: median imputation, standard scaling, then ``estimator``.

    The final step is registered in the pipeline under ``step_name``.
    """
    return Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        (step_name, estimator),
    ])


# Candidate regressors keyed by display name; the three estimators that are
# given random_state=43 are seeded so repeated runs produce the same fit.
models = dict([
    ("Linear Regression", LinearRegression()),
    ("SVM", SVR()),
    ("Decision Tree", DecisionTreeRegressor(random_state=43)),
    ("Random Forest", RandomForestRegressor(random_state=43)),
    ("Gradient Boosting", GradientBoostingRegressor(random_state=43)),
])

# Pipeline.fit returns the fitted pipeline itself, so the result can be stored directly.
fitted_pipelines = {
    label: build_pipeline(label, regressor).fit(X_train, y_train)
    for label, regressor in models.items()
}

In [16]:
def score_pipeline(label, pipeline):
    """Build one results row for ``pipeline``: R2, MSE and MAE on the train and test splits.

    Predictions are computed once per split and reused for all three metrics.
    """
    pred_train = pipeline.predict(X_train)
    pred_test = pipeline.predict(X_test)
    return {
        "Model": label,
        "Train R2": r2_score(y_train, pred_train),
        "Test R2": r2_score(y_test, pred_test),
        "Train MSE": mean_squared_error(y_train, pred_train),
        "Test MSE": mean_squared_error(y_test, pred_test),
        "Train MAE": mean_absolute_error(y_train, pred_train),
        "Test MAE": mean_absolute_error(y_test, pred_test),
    }


# One row per fitted pipeline, in the order the pipelines were stored.
results = [
    score_pipeline(label, pipeline)
    for label, pipeline in fitted_pipelines.items()
]

# Tabulate the rows and print them as plain text without the index column.
df_results = pd.DataFrame(results)
print(df_results.to_string(index=False))

            Model  Train R2  Test R2    Train MSE  Test MSE    Train MAE  Test MAE
Linear Regression  0.605413 0.612896 5.273648e-01  0.497612 5.330920e-01  0.519642
              SVM  0.749611 0.729508 3.346448e-01  0.347710 3.835645e-01  0.389768
    Decision Tree  1.000000 0.641135 9.287461e-32  0.461311 4.212345e-17  0.433923
    Random Forest  0.974142 0.812747 3.455860e-02  0.240709 1.198903e-01  0.319358
Gradient Boosting  0.804209 0.789508 2.616749e-01  0.270582 3.565654e-01  0.364554
