In [None]:
import pandas as pd

# Modelo linear

## Lendo o dataset

In [None]:
from sklearn.datasets import load_diabetes

data = load_diabetes(as_frame=True, scaled=False)

In [None]:
print(data.DESCR)

In [None]:
data.frame.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test partition. The fixed seed makes the
# split repeatable across kernel restarts.
train_dataset, test_dataset = train_test_split(data.frame, test_size=0.2, random_state=42)

In [None]:
train_dataset.info()

In [None]:
train_dataset.describe().round(2).transpose()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Annotated heatmap of the pairwise correlations between all columns of the
# training partition (features and target), colour scale centred on zero.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    train_dataset.corr(),
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Separate each partition into a feature DataFrame (every column except
# 'target') and a NumPy vector holding the 'target' column.
X_train, y_train = (
    train_dataset.drop(columns='target'),
    train_dataset['target'].values,
)
X_test, y_test = (
    test_dataset.drop(columns='target'),
    test_dataset['target'].values,
)


## Um modelo linear simples e alguns mais "chiques"

### Uma mega-função para rodar o experimento completo para cada modelo

In [None]:
from sklearn.metrics import root_mean_squared_error

import matplotlib.pyplot as plt


def run_experiment(
    model,
    get_model_params_fn,
    X_train,
    y_train,
    X_test,
    y_test,
):
    """Fit a model once, report its RMSE, print its formula and plot the fit.

    Steps, in order: fit ``model`` on the training data, predict on both
    partitions, print train/test RMSE, print the linear formula obtained
    from ``get_model_params_fn`` and show a true-vs-predicted scatter plot.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` (returning the fitted estimator)
        and ``predict(X)``. It is fitted in place.
    get_model_params_fn
        Callable receiving the fitted model and returning
        ``(intercept, coefs)``, where ``coefs`` is iterable.
    X_train, y_train
        Data used to fit the model and to compute the train RMSE.
    X_test, y_test
        Data used only for evaluation. ``y_train`` and ``y_test`` must
        provide ``.min()`` and ``.max()`` (e.g. NumPy arrays).

    Returns
    -------
    None
        All results are printed or plotted.
    """

    def compute_predictions(fitted_model, X_train, X_test):
        # Predictions for both partitions, in (train, test) order.
        return fitted_model.predict(X_train), fitted_model.predict(X_test)

    def print_rmse(label, rmse):
        print(f'{label} RMSE: {rmse:.2f}')

    def print_linear_model_formula(intercept, coefs):
        # X_i follows the order of the coefficients returned by
        # get_model_params_fn; this helper only prints, it never refits.
        print('y_pred = ')
        print(f'      ({intercept: 8.2f})')
        for i, coef in enumerate(coefs, start=1):
            print(f'    + ({coef: 8.2f}) * X_{i}')
        print()

    def plot_residuals(y_train, y_train_pred, y_test, y_test_pred):

        def plot_scatter_and_residuals(
            y,
            y_pred,
            color,
            label,
        ):
            plt.scatter(
                y,
                y_pred,
                color=color,
                marker='o',
                label=label,
                alpha=0.5,
            )
            # One thin vertical segment per point, from the diagonal
            # (perfect prediction) to the actual prediction: the residual.
            for y_value, y_pred_value in zip(y, y_pred):
                plt.plot(
                    [y_value, y_value],
                    [y_value, y_pred_value],
                    color=color,
                    linestyle='--',
                    lw=0.5,
                    alpha=0.5,
                )

        plt.figure(figsize=(12, 6))

        plot_scatter_and_residuals(y_train, y_train_pred, 'blue', 'Train')
        plot_scatter_and_residuals(y_test, y_test_pred, 'orange', 'Test')

        # Both axes share the range of the true targets so that the
        # diagonal is the line of perfect prediction.
        min_y = min(y_train.min(), y_test.min())
        max_y = max(y_train.max(), y_test.max())
        plt.plot(
            [min_y, max_y],
            [min_y, max_y],
            'k--',
            lw=2,
            label='Perfect prediction',
        )
        plt.xlim(min_y, max_y)
        plt.ylim(min_y, max_y)
        plt.gca().set_aspect('equal', adjustable='box')
        plt.grid()
        plt.xlabel('True values')
        plt.ylabel('Predicted values')
        plt.title('True vs Predicted values')
        plt.legend()
        plt.tight_layout()
        plt.show()

    # The model is fitted exactly once per experiment.
    fitted_model = model.fit(X_train, y_train)

    y_train_pred, y_test_pred = compute_predictions(
        fitted_model,
        X_train,
        X_test,
    )

    train_rmse = root_mean_squared_error(y_train, y_train_pred)
    test_rmse = root_mean_squared_error(y_test, y_test_pred)

    print_rmse('Train', train_rmse)
    print_rmse('Test', test_rmse)

    intercept, coefs = get_model_params_fn(fitted_model)
    print_linear_model_formula(intercept, coefs)

    plot_residuals(
        y_train,
        y_train_pred,
        y_test,
        y_test_pred,
    )


### Definindo os modelos

#### O modelo mais simples possível

In [None]:
from sklearn.linear_model import LinearRegression


def create_simple_linear_model():
    """Return a new, unfitted ``LinearRegression`` with default settings."""
    plain_regressor = LinearRegression()
    return plain_regressor


def get_simple_model_params(model):
    """Return ``(intercept, coefficients)`` read directly from ``model``."""
    intercept = model.intercept_
    coefs = model.coef_
    return intercept, coefs

#### Um modelo simples mas precedido de uma pipeline de pré-processamento

- O código da pipeline de pré-processamento

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler


def create_pipeline_model_with_preprocessing(model):
    """Wrap ``model`` in a pipeline that preprocesses features first.

    The returned pipeline has two top-level steps:

    - ``'preprocessing'``: a nested pipeline with the steps ``'imputer'``,
      ``'poly'`` and ``'scaler'``, applied in that order;
    - ``'model'``: the estimator passed in by the caller.

    These step names are the ones used to address parameters elsewhere,
    e.g. ``preprocessing__poly__degree``.
    """
    preprocessing_steps = [
        # Replace missing values with the mean of the column.
        ('imputer', SimpleImputer(strategy='mean')),
        # Create polynomial features of degree 2 (include_bias=False).
        ('poly', PolynomialFeatures(degree=2, include_bias=False)),
        # Standardize the features (mean=0, variance=1).
        ('scaler', StandardScaler()),
    ]

    # Final pipeline used for both training and prediction.
    return Pipeline([
        ('preprocessing', Pipeline(preprocessing_steps)),
        ('model', model),
    ])


def get_pipeline_model_params(model):
    """Return ``(intercept, coefficients)`` of a pipeline's ``'model'`` step.

    ``model`` must expose ``named_steps`` containing a ``'model'`` entry
    whose value has ``intercept_`` and ``coef_`` attributes.
    """
    final_estimator = model.named_steps['model']
    return final_estimator.intercept_, final_estimator.coef_

- O modelo simples com pré-processamento

In [None]:
def create_linear_model_with_preprocessing():
    """Return the preprocessing pipeline ending in a ``LinearRegression``."""
    # Ordinary least squares, no regularization, as the final stage.
    return create_pipeline_model_with_preprocessing(LinearRegression())

#### Um modelo de regressão Ridge com pré-processamento

In [None]:
from sklearn.linear_model import Ridge

def create_ridge_model_with_preprocessing():
    """Return the preprocessing pipeline ending in ``Ridge(alpha=5.0)``."""
    # Ridge-regularized regression as the final stage.
    ridge_regressor = Ridge(alpha=5.0)
    return create_pipeline_model_with_preprocessing(ridge_regressor)

#### Um modelo de regressão Lasso com pré-processamento

In [None]:
from sklearn.linear_model import Lasso

def create_lasso_model_with_preprocessing():
    """Return the preprocessing pipeline ending in ``Lasso(alpha=5.0)``."""
    # Lasso-regularized regression as the final stage.
    lasso_regressor = Lasso(alpha=5.0)
    return create_pipeline_model_with_preprocessing(lasso_regressor)

## Cria os modelos

In [None]:
simple_linear_model = create_simple_linear_model()
simple_linear_model

In [None]:
linear_model = create_linear_model_with_preprocessing()
linear_model

In [None]:
ridge_model = create_ridge_model_with_preprocessing()
ridge_model

In [None]:
lasso_model = create_lasso_model_with_preprocessing()
lasso_model

## Escolha de modelo

### Versão simples: train-test-val split

In [None]:
# Carve a validation subset (20%) out of the training data only, with a
# fixed seed, so model comparison never touches X_test / y_test.
X_train_val, X_test_val, y_train_val, y_test_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [None]:
run_experiment(
    simple_linear_model,
    get_simple_model_params,
    X_train_val,
    y_train_val,
    X_test_val,
    y_test_val,
)

In [None]:
run_experiment(
    linear_model,
    get_pipeline_model_params,
    X_train_val,
    y_train_val,
    X_test_val,
    y_test_val,
)

In [None]:
run_experiment(
    ridge_model,
    get_pipeline_model_params,
    X_train_val,
    y_train_val,
    X_test_val,
    y_test_val,
)

In [None]:
run_experiment(
    lasso_model,
    get_pipeline_model_params,
    X_train_val,
    y_train_val,
    X_test_val,
    y_test_val,
)

### Versão mais "sofisticada": validação cruzada 

In [None]:
from sklearn.model_selection import cross_val_score

cv = 5


def _cv_neg_rmse(estimator):
    """Cross-validate ``estimator`` on the training data.

    Returns the per-fold scores for the 'neg_root_mean_squared_error'
    scorer, using ``cv`` folds and all available cores.
    """
    return cross_val_score(
        estimator,
        X_train,
        y_train,
        cv=cv,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
    )


# Same protocol for every candidate so the scores are directly comparable.
simple_model_cv = _cv_neg_rmse(simple_linear_model)
linear_model_cv = _cv_neg_rmse(linear_model)
ridge_model_cv = _cv_neg_rmse(ridge_model)
lasso_model_cv = _cv_neg_rmse(lasso_model)

In [None]:
def print_cv_results(model_name, cv_results):
    """Print per-fold RMSE values followed by their mean and spread.

    ``cv_results`` holds negated RMSE scores (one per fold); the sign is
    flipped before printing so the reported numbers are positive RMSEs.
    """
    rmse_per_fold = -cv_results

    print(f'{model_name} CV results:')
    print(rmse_per_fold)

    # Summary line: mean RMSE +/- standard deviation across folds.
    print(f'RMSE: {rmse_per_fold.mean():.2f} +/- {rmse_per_fold.std():.2f}')
    print()

print_cv_results('Simple Linear Model', simple_model_cv)
print_cv_results('Linear Model', linear_model_cv)
print_cv_results('Ridge Model', ridge_model_cv)
print_cv_results('Lasso Model', lasso_model_cv)

### Escolha de hiperparâmetros com validação cruzada

- Modelo simples não tem nada para escolher, o desempenho é aquele mesmo.

- Os demais modelos têm escolhas a serem feitas no estágio de pré-processamento.

- Os modelos "ridge" e "lasso" têm ainda a escolha do hiperparâmetro `alpha`.

- Desempenho do modelo linear simples é obtido simplesmente com `cross_val_score`, como feito acima, não precisa repetir:

In [None]:
print_cv_results('Simple Linear Model', simple_model_cv)

In [None]:
import numpy as np

# Search space for the preprocessing stage. Each key follows the
# `<step>__<sub-step>__<parameter>` naming of the nested pipeline
# ('preprocessing' -> 'poly' / 'scaler').
param_grid_preprocessing = {
    'preprocessing__poly__degree': [1, 2],
    'preprocessing__scaler__with_mean': [True, False],
    'preprocessing__scaler__with_std': [True, False],
}

# Search space for the `alpha` parameter of the pipeline's 'model' step.
# NOTE(review): np.logspace(-1, 5, 6) produces 6 values with exponents
# -1, 0.2, 1.4, 2.6, 3.8 and 5, i.e. not whole decades. If one value per
# decade (1e-1 ... 1e5) was intended, the count should be 7 - confirm.
param_grid_model = {
    'model__alpha': np.logspace(-1, 5, 6),
}

- Calcula o desempenho do modelo linear com pré-processamento

In [None]:
from sklearn.model_selection import GridSearchCV

grid_linear_model = GridSearchCV(
    linear_model,
    param_grid_preprocessing,
    cv=cv,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
grid_linear_model.fit(X_train, y_train)

In [None]:
print('Best parameters for linear model:')
print(grid_linear_model.best_params_)

In [None]:
best_linear_model = grid_linear_model.best_estimator_
best_linear_model

In [None]:
# Best mean cross-validated score found by the grid search over the
# linear (non-regularized) pipeline; the label must name that model.
print('Best score for linear model:')
print(grid_linear_model.best_score_.round(2))

In [None]:
pd.DataFrame(grid_linear_model.cv_results_) \
    .sort_values(by='rank_test_score') \
    .iloc[:10, :] \
    .loc[:, ['params', 'mean_test_score', 'std_test_score']] \
    .round(2)


- Calcula o desempenho do modelo Ridge

In [None]:
# Note o truque de Python para concatenar dicionários.
# O operador ** é usado para expandir o dicionário.
# Isso é útil para combinar os parâmetros de pré-processamento e do modelo.
param_grid_ridge = {
    **param_grid_preprocessing,
    **param_grid_model,
}

grid_ridge_model = GridSearchCV(
    ridge_model,
    param_grid_ridge,
    cv=cv,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
grid_ridge_model.fit(X_train, y_train)

In [None]:
print('Best parameters for ridge model:')
print(grid_ridge_model.best_params_)

In [None]:
print('Best score for ridge model:')
print(grid_ridge_model.best_score_.round(2))

In [None]:
pd.DataFrame(grid_ridge_model.cv_results_) \
    .sort_values(by='rank_test_score') \
    .iloc[:10, :] \
    .loc[:, ['params', 'mean_test_score', 'std_test_score']] \
    .round(2)


- Calcula o desempenho do modelo Lasso

In [None]:
# Note o truque de Python para concatenar dicionários.
# O operador ** é usado para expandir o dicionário.
# Isso é útil para combinar os parâmetros de pré-processamento e do modelo.
param_grid_lasso = {
    **param_grid_preprocessing,
    **param_grid_model,
}

grid_lasso_model = GridSearchCV(
    lasso_model,
    param_grid_lasso,
    cv=cv,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
grid_lasso_model.fit(X_train, y_train)

In [None]:
print('Best parameters for lasso model:')
print(grid_lasso_model.best_params_)

In [None]:
print('Best score for lasso model:')
print(grid_lasso_model.best_score_.round(2))

In [None]:
pd.DataFrame(grid_lasso_model.cv_results_) \
    .sort_values(by='rank_test_score') \
    .iloc[:10, :] \
    .loc[:, ['params', 'mean_test_score', 'std_test_score']] \
    .round(2)


- Sumário dos melhores escores:

In [None]:
print(f'Simple linear model: {simple_model_cv.mean().round(2)}')
print(f'Linear model: {grid_linear_model.best_score_.round(2)}')
print(f'Ridge: {grid_ridge_model.best_score_.round(2)}')
print(f'Lasso: {grid_lasso_model.best_score_.round(2)}')

## Atividades

- Verifique os coeficientes do melhor modelo LASSO para ver quais *features* são mais relevantes.

- Treine um modelo linear sem regularização usando apenas as *features* escolhidas pelo LASSO e compare-o com o modelo linear completo.
