# Escolha de modelos e análise de erros

Nesta fase vamos fazer o seguinte:

- Construir uma *pipeline* de processamento dos dados;
- Elencar alguns modelos a serem analisados;
- Fazer mais uma separação de dados. Desta vez vamos dividir o conjunto de treino em dois: treino e validação;
- Escolher o melhor modelo baseado no erro no conjunto de validação;
- Analisar o *resíduo*: a diferença entre o valor predito e o valor real. Será que tem algum padrão residual aqui?

Primeiro, vamos carregar os dados pré-processados:

In [6]:
from pathlib import Path

import pandas as pd
from lab01.config import DATA_DIR
from lab01.dataloader import load_preprocessed_data

In [7]:
def load_X_y(data_dir: Path) -> tuple[pd.DataFrame, pd.Series]:
    """Load the preprocessed dataset and separate features from the target.

    Args:
        data_dir: Directory passed on to ``load_preprocessed_data``.

    Returns:
        A ``(X, y)`` pair where ``X`` is the loaded data without the
        ``log_median_house_value`` column and ``y`` is that column.
    """
    target_col = 'log_median_house_value'
    # Honor the caller-supplied directory instead of the global DATA_DIR.
    data = load_preprocessed_data(data_dir)
    X = data.drop(columns=[target_col])
    y = data[target_col]
    return X, y


X, y = load_X_y(DATA_DIR)

Vamos agora dividir os dados em treino e teste, e dividir novamente em treino (final) e validação.

In [8]:
from sklearn.model_selection import train_test_split


def split_data(
    X,
    y,
    test_size=0.25,
    random_seed=42,
) -> tuple[
        pd.DataFrame,
        pd.DataFrame,
        pd.Series,
        pd.Series,
]:
    """Split features and target into two parts via ``train_test_split``.

    Args:
        X: Feature table.
        y: Target values aligned with ``X``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_seed: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        The result of ``train_test_split(X, y, ...)``, unpacked by callers
        as ``X_train, X_test, y_train, y_test``.
    """
    split_options = {'test_size': test_size, 'random_state': random_seed}
    return train_test_split(X, y, **split_options)


# First split: set the test portion aside.
X_train, X_test, y_train, y_test = split_data(X, y)
# Second split: carve the validation portion out of the training portion.
X_train_val, X_test_val, y_train_val, y_test_val = split_data(X_train, y_train)

In [9]:
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (OneHotEncoder, PolynomialFeatures,
                                   StandardScaler)

In [10]:
# Columns routed to each preprocessing branch.
geo_cols = [
    'longitude',
    'latitude',
]

numerical_cols = [
    'housing_median_age',
    'log_households',
    'log_median_income',
    'log_rooms_per_household',
    'log_population_per_household',
    'log_bedrooms_per_room',
]

categorical_cols = [
    'ocean_proximity',
]

# Geographic branch: impute, then replace coordinates with the KMeans
# transform output. The seed is fixed, like every other estimator in this
# notebook, so the cluster features are reproducible between runs.
geo_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('cluster', KMeans(n_clusters=50, random_state=42)),
])

# Numerical branch: impute, expand to degree-3 polynomial terms, standardize.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('poly', PolynomialFeatures(degree=3, include_bias=False)),
    ('scaler', StandardScaler()),
])

# Categorical branch: dense one-hot encoding. Categories that were absent
# from the fitting split are ignored instead of raising at predict time,
# which matters because the data is split twice.
cat_pipeline = Pipeline([
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Columns not listed above are passed through unchanged.
preprocessing_pipe = ColumnTransformer(
    transformers=[
        ('geo', geo_pipeline, geo_cols),
        ('num', num_pipeline, numerical_cols),
        ('cat', cat_pipeline, categorical_cols),
    ],
    remainder='passthrough',
)

# Bare expression: displayed as the cell output in a notebook.
preprocessing_pipe

In [11]:
# Fit the preprocessing on the training portion of the train/validation
# split only.
preprocessing_pipe.fit(X_train_val)

# Bare expression: displayed as the cell output in a notebook.
preprocessing_pipe

## Escolha de modelos

In [12]:
from sklearn.base import clone
from sklearn.linear_model import LinearRegression

# Linear regression on top of the preprocessing. The preprocessing is cloned
# so this pipeline owns its own fitted state: Pipeline.fit fits its steps in
# place, so one shared ColumnTransformer instance would be overwritten every
# time another model is fitted.
lin_reg = Pipeline([
    ('preprocessing', clone(preprocessing_pipe)),
    ('regression', LinearRegression()),
])

In [13]:
from sklearn.base import clone
from sklearn.tree import DecisionTreeRegressor

# Decision tree on top of the preprocessing. The preprocessing is cloned so
# this pipeline owns its own fitted state: Pipeline.fit fits its steps in
# place, so one shared ColumnTransformer instance would be overwritten every
# time another model is fitted.
tree_reg = Pipeline([
    ('preprocessing', clone(preprocessing_pipe)),
    ('regression', DecisionTreeRegressor(random_state=42)),
])

In [14]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor

# Random forest on top of the preprocessing. The preprocessing is cloned so
# this pipeline owns its own fitted state: Pipeline.fit fits its steps in
# place, so one shared ColumnTransformer instance would be overwritten every
# time another model is fitted.
forest_reg = Pipeline([
    ('preprocessing', clone(preprocessing_pipe)),
    (
        'regression',
        RandomForestRegressor(
            random_state=42,
            n_jobs=-1,
        ),
    ),
])

In [15]:
from sklearn.base import clone
from sklearn.ensemble import HistGradientBoostingRegressor

# Histogram gradient boosting on top of the preprocessing. The preprocessing
# is cloned so this pipeline owns its own fitted state: Pipeline.fit fits its
# steps in place, so one shared ColumnTransformer instance would be
# overwritten every time another model is fitted.
hist_reg = Pipeline([
    ('preprocessing', clone(preprocessing_pipe)),
    (
        'regression',
        HistGradientBoostingRegressor(
            random_state=42,
        ),
    ),
])

In [16]:
from sklearn.base import clone
from sklearn.ensemble import ExtraTreesRegressor

# Extra trees on top of the preprocessing. The preprocessing is cloned so
# this pipeline owns its own fitted state: Pipeline.fit fits its steps in
# place, so one shared ColumnTransformer instance would be overwritten every
# time another model is fitted.
et_reg = Pipeline([
    ('preprocessing', clone(preprocessing_pipe)),
    (
        'regression',
        ExtraTreesRegressor(
            random_state=42,
            n_jobs=-1,
        ),
    ),
])

In [17]:
from typing import Any

# Candidate models, in evaluation order. Each entry pairs a display name
# with the pipeline to fit; later cells add score keys to these dicts.
experiments: list[dict[str, Any]] = [
    {'name': label, 'model': pipeline}
    for label, pipeline in (
        ('Linear Regression', lin_reg),
        ('Decision Tree', tree_reg),
        ('Random Forest', forest_reg),
        ('Histogram Gradient Boosting', hist_reg),
        ('Extra Trees', et_reg),
    )
]

ATIVIDADE DE AULA COMEÇA AQUI

NÍVEL 2:

In [None]:
# Record the validation performance of each model.
from sklearn.metrics import root_mean_squared_error, mean_squared_error
# NOTE: timing each fit (e.g. with the time module) would also help compare
# the models by speed.

# Maps model name -> RMSE on the validation portion.
desempenho = {}
for experiment in experiments:
    nome = experiment['name']
    modelo = experiment['model']

    # Fit on the training portion, score on the validation portion.
    modelo.fit(X_train_val, y_train_val)
    y_pred = modelo.predict(X_test_val)
    # The metric's signature is (y_true, y_pred).
    rmse = root_mean_squared_error(y_test_val, y_pred)

    desempenho[nome] = rmse

# Name of the model with the smallest validation RMSE.
menor_erro = min(desempenho, key=desempenho.get)
print(f'Modelo com menor erro: {menor_erro} com RMSE = {desempenho[menor_erro]:.4f}')


Modelo com menor erro: Histogram Gradient Boosting com RMSE = 0.0852


MODELO ESCOLHIDO: Histogram Gradient Boosting

NÍVEL 1:

In [19]:
# Refit the selected model on the training set and score it on the test set.
for experiment in experiments:
    if experiment['name'] == menor_erro:
        best_model = experiment['model']
        best_model.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)
        # The metric's signature is (y_true, y_pred).
        rmse = root_mean_squared_error(y_test, y_pred)
        print(f'RMSE = {rmse:.4f}')
        # The selected model has been handled; no need to keep scanning.
        break

RMSE = 0.0848


NÍVEL 0:

In [20]:
# Refit the selected model on the whole dataset.
# NOTE: the RMSE printed here is computed on the same rows the model was
# fitted on, so it is an in-sample error, not a generalization estimate.
for experiment in experiments:
    if experiment['name'] == menor_erro:
        best_model = experiment['model']
        best_model.fit(X, y)
        y_pred = best_model.predict(X)
        # The metric's signature is (y_true, y_pred).
        rmse = root_mean_squared_error(y, y_pred)
        print(f'RMSE = {rmse:.4f}')
        # The selected model has been handled; no need to keep scanning.
        break

RMSE = 0.0700


DEPLOY

In [None]:
# Persist the selected model with joblib.
from joblib import dump

# Look the selected model up by name instead of by a hard-coded position, so
# the saved artifact always matches the model chosen on the validation set.
final_model = next(
    experiment['model']
    for experiment in experiments
    if experiment['name'] == menor_erro
)
# Train on all available data before saving.
final_model.fit(X, y)
dump(final_model, 'modelo_task_02.joblib')  # Save

['modelo_task_02.joblib']

FIM ATIVIDADE DE AULA

In [None]:
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.metrics import root_mean_squared_error


def train_and_evaluate_model(
    model: BaseEstimator,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
) -> tuple[float, float]:
    """Fit ``model`` and report its RMSE on both data portions.

    The model is fitted in place on the training data.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train: Features used for fitting.
        y_train: Targets used for fitting.
        X_test: Features of the held-out portion.
        y_test: Targets of the held-out portion.

    Returns:
        ``(rmse_train, rmse_test)``: the root mean squared error of the
        fitted model on the training and on the held-out data.
    """
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    rmse_train = root_mean_squared_error(y_train, y_train_pred)
    rmse_test = root_mean_squared_error(y_test, y_test_pred)

    return rmse_train, rmse_test

In [None]:
from collections import defaultdict
from time import perf_counter

# Column name -> list of values, one entry appended per experiment.
results = defaultdict(list)

for experiment in experiments:
    name, model = experiment['name'], experiment['model']

    print(f'Running experiment: {name}')

    # Time the whole fit + predict + scoring round trip.
    start_time = perf_counter()
    rmse_train, rmse_test = train_and_evaluate_model(
        model, X_train_val, y_train_val, X_test_val, y_test_val,
    )
    end_time = perf_counter()
    elapsed_time = end_time - start_time

    # Express the RMSE as a percentage via 10**rmse - 1.
    # NOTE(review): presumably the target is a base-10 logarithm - confirm.
    percentage_error_train = 100.0 * (10.0**rmse_train - 1.0)
    percentage_error_test = 100.0 * (10.0**rmse_test - 1.0)

    # Keep the scores on the experiment entry for later model selection.
    experiment.update(rmse_train=rmse_train, rmse_test=rmse_test)

    row = {
        'name': name,
        'rmse_train': rmse_train,
        'rmse_test': rmse_test,
        'percentage_error_train': percentage_error_train,
        'percentage_error_test': percentage_error_test,
        'elapsed_time_seconds': elapsed_time,
    }
    for column, value in row.items():
        results[column].append(value)

print('Done!')

In [None]:
# Turn the collected metric lists into a table.
results_df = pd.DataFrame(results)
# Bare expression: displayed as the cell output, rounded to two decimals.
results_df.round(2)

In [None]:
# Pick the experiment entry with the lowest 'rmse_test' score.
# Here best_model is the whole experiment dict, not the pipeline itself.
best_model = min(experiments, key=lambda x: x['rmse_test'])
print(f'\nBest model: {best_model["name"]}')

In [None]:
# Pipeline of the selected experiment.
model = best_model['model']

# Refit on the training portion of the train/validation split.
model.fit(X_train_val, y_train_val)

# Predictions on both portions; the plotting cells below reuse them.
y_train_pred = model.predict(X_train_val)
y_test_pred = model.predict(X_test_val)

rmse_train = root_mean_squared_error(y_train_val, y_train_pred)
rmse_test = root_mean_squared_error(y_test_val, y_test_pred)

# The "Test" figure is computed on the validation portion (X_test_val).
print(f'\nTraining RMSE: {rmse_train:.4f}')
print(f'Test RMSE: {rmse_test:.4f}')

In [None]:
import matplotlib.pyplot as plt

# True vs. predicted target on the training portion.
# The red line is the diagonal where prediction equals truth.
target_range = [min(y_train_val), max(y_train_val)]

plt.scatter(y_train_val, y_train_pred, alpha=0.05)
plt.plot(target_range, target_range, color='red')
plt.xlabel('True log median house value')
plt.ylabel('Predicted log median house value')
plt.title('True vs. predicted log median house value')
plt.axis('equal')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# True vs. predicted target on the validation portion.
# The red line is the diagonal where prediction equals truth.
target_range = [min(y_test_val), max(y_test_val)]

plt.scatter(y_test_val, y_test_pred, alpha=0.1)
plt.plot(target_range, target_range, color='red')
plt.xlabel('True log median house value')
plt.ylabel('Predicted log median house value')
plt.title('True vs. predicted log median house value')
plt.axis('equal')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Residual (predicted minus true) against the true target, training portion.
# The red line marks a residual of zero.
residuals = y_train_pred - y_train_val
zero_line_x = [min(y_train_val), max(y_train_val)]

plt.scatter(y_train_val, residuals, alpha=0.05)
plt.plot(zero_line_x, [0, 0], color='red')
plt.xlabel('True log median house value')
plt.ylabel('Residual')
plt.title('Residual plot')
plt.axis('equal')
plt.show()

## Avaliação