# Escolha de modelos e análise de erros

Nesta fase vamos fazer o seguinte:

- Construir uma *pipeline* de processamento dos dados;
- Elencar alguns modelos a serem analisados;
- Fazer mais uma separação de dados. Desta vez vamos dividir o conjunto de treino em dois: treino e validação;
- Escolher o melhor modelo baseado no erro no conjunto de validação;
- Analisar o *resíduo*: a diferença entre o valor predito e o valor real. Será que tem algum padrão residual aqui?

Primeiro, vamos carregar os dados pré-processados:

In [1]:
from pathlib import Path

import pandas as pd
from lab01.config import DATA_DIR
from lab01.dataloader import load_preprocessed_data

In [2]:
def load_X_y(data_dir: Path) -> tuple[pd.DataFrame, pd.Series]:
    """Load the preprocessed dataset and separate features from the target.

    Args:
        data_dir: Directory passed on to ``load_preprocessed_data``.

    Returns:
        A pair ``(X, y)``: ``y`` is the ``'log_median_house_value'`` column
        and ``X`` holds every remaining column.
    """
    target_col = 'log_median_house_value'
    # Honour the argument instead of silently reading the global DATA_DIR.
    data = load_preprocessed_data(data_dir)
    X = data.drop(columns=[target_col])
    y = data[target_col]
    return X, y


# Features (X) and target (y) for the whole preprocessed dataset.
X, y = load_X_y(DATA_DIR)

Vamos agora dividir os dados em treino e teste, e dividir novamente em treino (final) e validação.

In [3]:
from sklearn.model_selection import train_test_split


def split_data(
    X: pd.DataFrame,
    y: pd.Series,
    test_size: float = 0.25,
    random_seed: int = 42,
) -> tuple[
        pd.DataFrame,
        pd.DataFrame,
        pd.Series,
        pd.Series,
]:
    """Randomly split features and target into two aligned partitions.

    Args:
        X: Feature table.
        y: Target values, row-aligned with ``X``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_seed: Forwarded to ``train_test_split`` as ``random_state``,
            so repeated calls on the same data give the same split.

    Returns:
        ``(X_first, X_second, y_first, y_second)`` where the "second"
        partition is the held-out one selected by ``test_size``.
    """
    X_first, X_second, y_first, y_second = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_seed,
    )
    # train_test_split returns a list; return a real tuple so the value
    # matches the declared return type.
    return X_first, X_second, y_first, y_second


# First split: set aside the final test partition.
X_train, X_test, y_train, y_test = split_data(X, y)
# Second split, applied to the training partition only. Despite the
# `_test_val` suffix, X_test_val / y_test_val are the validation partition
# used for model selection below.
X_train_val, X_test_val, y_train_val, y_test_val = split_data(X_train, y_train)

In [4]:
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (OneHotEncoder, PolynomialFeatures,
                                   StandardScaler)

In [5]:
# Coordinate columns, routed to the geographic sub-pipeline.
geo_cols = ['longitude', 'latitude']

# Numeric columns: the raw median age plus the `log_`-prefixed features.
numerical_cols = ['housing_median_age'] + [
    f'log_{feature}'
    for feature in (
        'households',
        'median_income',
        'rooms_per_household',
        'population_per_household',
        'bedrooms_per_room',
    )
]

# Columns that get one-hot encoded.
categorical_cols = ['ocean_proximity']

# Geographic features: fill missing coordinates with the median, then run
# KMeans with 50 clusters as the final (transforming) step.
# random_state is pinned so that refitting on the same data reproduces the
# same clusters; without it every fit produces different geo features.
geo_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('cluster', KMeans(n_clusters=50, random_state=42)),
])

# Numeric features: median imputation, degree-3 polynomial expansion
# (no constant column), then standardisation of the expanded features.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('poly', PolynomialFeatures(degree=3, include_bias=False)),
        ('scaler', StandardScaler()),
    ],
)

# Categorical features: dense one-hot encoding.
# handle_unknown='ignore' encodes a category that was absent from the fitted
# data as all zeros instead of raising; this matters when a rare category
# ends up only in the held-out part of a split or cross-validation fold.
cat_pipeline = Pipeline([
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Route each column group to its sub-pipeline; any column not listed in a
# group is passed through unchanged.
preprocessing_pipe = ColumnTransformer(
    [
        ('geo', geo_pipeline, geo_cols),
        ('num', num_pipeline, numerical_cols),
        ('cat', cat_pipeline, categorical_cols),
    ],
    remainder='passthrough',
)

# Render the (unfitted) transformer.
preprocessing_pipe

In [6]:
# Fit on the reduced training split so the transformer can be inspected in
# its fitted state. The model pipelines run their own fit during training.
preprocessing_pipe.fit(X_train_val)

preprocessing_pipe

## Escolha de modelos

In [7]:
from sklearn.base import clone
from sklearn.linear_model import LinearRegression

# Linear regression candidate. The preprocessing step is an unfitted clone:
# Pipeline.fit fits its steps in place, so embedding `preprocessing_pipe`
# itself would let any other pipeline that holds the same object overwrite
# the fitted transformer this regressor was trained with.
lin_reg = Pipeline([
    ('preprocessing', clone(preprocessing_pipe)),
    ('regression', LinearRegression()),
])

In [8]:
from sklearn.base import clone
from sklearn.tree import DecisionTreeRegressor

# Decision-tree candidate (seeded). The preprocessing step is an unfitted
# clone: Pipeline.fit fits its steps in place, so embedding
# `preprocessing_pipe` itself would let any other pipeline that holds the
# same object overwrite the fitted transformer this regressor relies on.
tree_reg = Pipeline([
    ('preprocessing', clone(preprocessing_pipe)),
    ('regression', DecisionTreeRegressor(random_state=42)),
])

In [9]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor

# Random-forest candidate (seeded, n_jobs=-1). The preprocessing step is an
# unfitted clone: Pipeline.fit fits its steps in place, so embedding
# `preprocessing_pipe` itself would let any other pipeline that holds the
# same object overwrite the fitted transformer this regressor relies on.
forest_reg = Pipeline([
    ('preprocessing', clone(preprocessing_pipe)),
    (
        'regression',
        RandomForestRegressor(
            random_state=42,
            n_jobs=-1,
        ),
    ),
])

In [10]:
from sklearn.base import clone
from sklearn.ensemble import HistGradientBoostingRegressor

# Histogram gradient-boosting candidate (seeded). The preprocessing step is
# an unfitted clone: Pipeline.fit fits its steps in place, so embedding
# `preprocessing_pipe` itself would let any other pipeline that holds the
# same object overwrite the fitted transformer this regressor relies on.
hist_reg = Pipeline([
    ('preprocessing', clone(preprocessing_pipe)),
    (
        'regression',
        HistGradientBoostingRegressor(
            random_state=42,
        ),
    ),
])

In [11]:
from sklearn.base import clone
from sklearn.ensemble import ExtraTreesRegressor

# Extra-trees candidate (seeded, n_jobs=-1). The preprocessing step is an
# unfitted clone: Pipeline.fit fits its steps in place, so embedding
# `preprocessing_pipe` itself would let any other pipeline that holds the
# same object overwrite the fitted transformer this regressor relies on.
et_reg = Pipeline([
    ('preprocessing', clone(preprocessing_pipe)),
    (
        'regression',
        ExtraTreesRegressor(
            random_state=42,
            n_jobs=-1,
        ),
    ),
])

In [12]:
from sklearn.base import clone
from sklearn.dummy import DummyRegressor

# Mean-predicting baseline. The preprocessing step is an unfitted clone:
# Pipeline.fit fits its steps in place, so embedding `preprocessing_pipe`
# itself would make fitting this baseline overwrite the fitted transformer
# of any other pipeline that holds the same object.
dummy_reg = Pipeline([
    ('preprocessing', clone(preprocessing_pipe)),
    (
        'regression', DummyRegressor(strategy='mean'),
    ),
])

In [13]:
from typing import Any

# Registry of candidate models. Each entry starts with a display name and
# the pipeline to evaluate; the evaluation loop adds timing and RMSE keys.
experiments: list[dict[str, Any]] = [
    {'name': label, 'model': estimator}
    for label, estimator in (
        ('Linear Regression', lin_reg),
        ('Decision Tree', tree_reg),
        ('Random Forest', forest_reg),
        ('Histogram Gradient Boosting', hist_reg),
        ('Extra Trees', et_reg),
        ('Dummy', dummy_reg),
    )
]

#### Escolha de modelo: separação treino/validação (`X_train_val` e `X_test_val`)

In [14]:
from time import perf_counter

from sklearn.metrics import root_mean_squared_error

# Fit every candidate on the reduced training split, predict on both that
# split and the validation split, and store timings and RMSE values back
# into the candidate's own experiment dict.
for experiment in experiments:
    name, model = experiment['name'], experiment['model']

    print(f'Model: {name}')

    # --- training ---
    print("\tTraining...")
    start_time = perf_counter()
    model.fit(X_train_val, y_train_val)
    end_time = perf_counter()
    experiment['elapsed_time_training'] = elapsed_time = end_time - start_time
    print(f"\tElapsed time: {elapsed_time:.2f}s for training")

    # --- prediction (both splits are timed together) ---
    print("\n\tEvaluating...")
    start_time = perf_counter()
    y_pred_train = model.predict(X_train_val)
    y_pred_test = model.predict(X_test_val)
    end_time = perf_counter()
    experiment['elapsed_time_predict'] = elapsed_time = end_time - start_time
    print(f"\tElapsed time: {elapsed_time:.2f}s for prediction (train and test)")

    # --- scoring ---
    rmse_train = root_mean_squared_error(y_train_val, y_pred_train)
    rmse_test = root_mean_squared_error(y_test_val, y_pred_test)
    experiment.update(rmse_train=rmse_train, rmse_test=rmse_test)

    print(f"\n\tRMSE (train): {rmse_train:.5f}")
    print(f"\tRMSE (test): {rmse_test:.5f}")

    print("\n\n")

Model: Linear Regression
	Training...
	Elapsed time: 0.12s for training

	Evaluating...
	Elapsed time: 0.04s for prediction (train and test)

	RMSE (train): 0.09573
	RMSE (test): 0.10374



Model: Decision Tree
	Training...
	Elapsed time: 2.43s for training

	Evaluating...
	Elapsed time: 0.11s for prediction (train and test)

	RMSE (train): 0.00000
	RMSE (test): 0.12290



Model: Random Forest
	Training...
	Elapsed time: 11.74s for training

	Evaluating...
	Elapsed time: 0.39s for prediction (train and test)

	RMSE (train): 0.03314
	RMSE (test): 0.08705



Model: Histogram Gradient Boosting
	Training...
	Elapsed time: 0.87s for training

	Evaluating...
	Elapsed time: 0.09s for prediction (train and test)

	RMSE (train): 0.06509
	RMSE (test): 0.08493



Model: Extra Trees
	Training...
	Elapsed time: 2.64s for training

	Evaluating...
	Elapsed time: 0.39s for prediction (train and test)

	RMSE (train): 0.00000
	RMSE (test): 0.08927



Model: Dummy
	Training...
	Elapsed time: 0.06s for tr

In [15]:
# One summary row per experiment: the recorded measurements copied as-is,
# plus each RMSE re-expressed as a percentage via 100 * (10**rmse - 1).
results = [
    {
        key: experiment[key]
        for key in (
            'name',
            'elapsed_time_training',
            'elapsed_time_predict',
            'rmse_train',
            'rmse_test',
        )
    }
    | {
        'rmse_train_percent': 100.0 * (10.0**experiment['rmse_train'] - 1.0),
        'rmse_test_percent': 100.0 * (10.0**experiment['rmse_test'] - 1.0),
    }
    for experiment in experiments
]

results_df = pd.DataFrame(data=results)

In [16]:
# Show the summary with one column per model and one row per metric,
# rounded to four decimal places.
results_df.round(4).set_index('name').transpose()

name,Linear Regression,Decision Tree,Random Forest,Histogram Gradient Boosting,Extra Trees,Dummy
elapsed_time_training,0.1176,2.4346,11.7411,0.8715,2.6361,0.0634
elapsed_time_predict,0.0419,0.1106,0.3858,0.0898,0.3891,0.0358
rmse_train,0.0957,0.0,0.0331,0.0651,0.0,0.2272
rmse_test,0.1037,0.1229,0.0871,0.0849,0.0893,0.2296
rmse_train_percent,24.6604,0.0,7.9284,16.1687,0.0,68.7355
rmse_test_percent,26.9807,32.7097,22.195,21.6001,22.8209,69.6591


#### Escolha de modelo: Validação cruzada

In [17]:
import numpy as np
from sklearn.model_selection import cross_val_score

In [18]:
cv = 5

# Per-fold RMSE values, one entry per experiment in `experiments` order.
results = []
for experiment in experiments:
    name = experiment['name']
    model = experiment['model']

    print(f'Model: {name}')
    print("\tCross-validating...")
    start_time = perf_counter()

    # Run the cross-validation. Scorers follow a "greater is better"
    # convention, so the built-in RMSE scorer returns negated values;
    # asking for RMSE directly avoids a hand-rolled sqrt(-MSE) per fold.
    scores = cross_val_score(
        model,
        X_train,
        y_train,
        scoring='neg_root_mean_squared_error',
        cv=cv,
    )

    end_time = perf_counter()
    elapsed_time = end_time - start_time
    print(f"\tElapsed time: {elapsed_time:.2f}s for cross-validation")

    # Flip the sign back to get plain per-fold RMSE as a NumPy array.
    rmse = -scores
    # Print plain rounded floats rather than np.float64(...) reprs.
    print(f"\tRMSE scores: {rmse.round(5).tolist()}")
    print(f"\tMean RMSE: {rmse.mean():.5f}")
    print(f"\tStd RMSE: {rmse.std():.5f}")

    results.append(rmse)


Model: Linear Regression
	Cross-validating...
	Elapsed time: 0.62s for cross-validation
	RMSE scores: [np.float64(0.10657807022922046), np.float64(0.09706909654744904), np.float64(0.09764733065028684), np.float64(0.09862732812466382), np.float64(0.09567583767049576)]
	Mean RMSE: 0.09912
	Std RMSE: 0.00385
Model: Decision Tree
	Cross-validating...
	Elapsed time: 13.21s for cross-validation
	RMSE scores: [np.float64(0.12632749930100345), np.float64(0.12397684254556364), np.float64(0.1251910695480279), np.float64(0.12282840091078552), np.float64(0.12198062290389793)]
	Mean RMSE: 0.12406
	Std RMSE: 0.00157
Model: Random Forest
	Cross-validating...
	Elapsed time: 66.98s for cross-validation
	RMSE scores: [np.float64(0.09170015656590169), np.float64(0.08734720928122862), np.float64(0.08710425849708554), np.float64(0.08729001869117073), np.float64(0.08650985538998258)]
	Mean RMSE: 0.08799
	Std RMSE: 0.00188
Model: Histogram Gradient Boosting
	Cross-validating...
	Elapsed time: 4.33s for cross

In [19]:
# Convert the per-fold RMSE values to percentages: 100 * (10**rmse - 1).
# NOTE(review): this presumes the target is a base-10 logarithm - confirm
# against the preprocessing step.
# NOTE: `results` is overwritten with its own transform, so re-running this
# cell without re-running the cross-validation applies the conversion twice.
results = 100*(10**np.array(results)  - 1)

In [20]:
# `results` holds one row per model; transpose it so that each model becomes
# a column, labelled with the experiment's display name.
results_df = pd.DataFrame(
    data=results.T,
    columns=[entry['name'] for entry in experiments],
)

In [21]:
# Per-fold percentage errors, one column per model.
results_df

Unnamed: 0,Linear Regression,Decision Tree,Random Forest,Histogram Gradient Boosting,Extra Trees,Dummy
0,27.813895,33.760382,23.509441,22.384155,23.91364,69.180556
1,25.045796,33.038348,22.277685,21.175349,22.803639,69.855782
2,25.212397,33.410825,22.209301,21.135821,22.581775,68.4121
3,25.495262,32.687008,22.261584,21.521196,22.594281,69.593315
4,24.64528,32.428245,22.042152,21.559513,22.493037,67.834872


In [22]:
# Summary statistics across folds, one row per model.
results_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Linear Regression,5.0,25.642526,1.252211,24.64528,25.045796,25.212397,25.495262,27.813895
Decision Tree,5.0,33.064961,0.536702,32.428245,32.687008,33.038348,33.410825,33.760382
Random Forest,5.0,22.460033,0.594008,22.042152,22.209301,22.261584,22.277685,23.509441
Histogram Gradient Boosting,5.0,21.555207,0.502122,21.135821,21.175349,21.521196,21.559513,22.384155
Extra Trees,5.0,22.877274,0.590449,22.493037,22.581775,22.594281,22.803639,23.91364
Dummy,5.0,68.975325,0.83921,67.834872,68.4121,69.180556,69.593315,69.855782


In [23]:
from sklearn.dummy import DummyRegressor

# Base pipeline for the grid search. The 'regression' step starts as a
# mean-predicting dummy; `param_grid` lists the estimators tried in its place.
pipe = Pipeline(
    steps=[
        ('preprocessing', preprocessing_pipe),
        ('regression', DummyRegressor(strategy='mean')),
    ],
)

In [24]:
# Baseline: dummy regressor predicting the mean or the median.
dummy_grid = {
    'regression': [DummyRegressor()],
    'regression__strategy': ['mean', 'median'],
}

# Single decision tree at several depth limits (None = unlimited).
tree_grid = {
    'regression': [DecisionTreeRegressor(random_state=42)],
    'regression__max_depth': [5, 10, 15, 20, None],
}

# Histogram gradient boosting: boosting iterations crossed with the number
# of KMeans clusters used by the geographic preprocessing step.
hist_grid = {
    'regression': [HistGradientBoostingRegressor(random_state=42)],
    'regression__max_iter': [50, 100, 150],
    'preprocessing__geo__cluster__n_clusters': [10, 20, 30, 40, 50],
}

# Each dict is searched as an independent sub-grid.
param_grid = [dummy_grid, tree_grid, hist_grid]

In [25]:
from sklearn.model_selection import GridSearchCV

cv = 5

# Exhaustive search over `param_grid` with `cv`-fold cross-validation,
# scored by negated MSE and run with n_jobs=-1.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
)


In [26]:
# Run the search on the training partition (X_train / y_train).
grid_search.fit(X_train, y_train)

In [27]:
# Parameter combination with the best mean cross-validated score.
grid_search.best_params_

{'preprocessing__geo__cluster__n_clusters': 50,
 'regression': HistGradientBoostingRegressor(random_state=42),
 'regression__max_iter': 150}

In [28]:
# Best score as a percentage error. The search is scored with negated MSE,
# so this is the square root of the mean MSE across folds (not the mean of
# per-fold RMSEs), pushed through the 100 * (10**rmse - 1) conversion.
100*(10**np.sqrt(-grid_search.best_score_) - 1)

np.float64(21.254114645225997)