In [None]:
# ======================================================
# Introdução
# ======================================================
# Este notebook foi preparado para ser executado no Google Colab.
# Objetivo: treinar um modelo de Machine Learning (ex.: Rede Neural)
# para prever o valor mediano de casas na Califórnia.
#
# Dataset: California Housing (scikit-learn)
#
# Fluxo:
#   1. Carregar dataset
#   2. Dividir em treino/teste
#   3. Pré-processamento com ColumnTransformer
#   4. Baseline com DummyRegressor
#   5. Treinar seu modelo de ML (ex.: Rede Neural MLP)
#   6. Avaliar métricas (MAE, RMSE, R²)
#   7. Adicionar PCA ao código e treinar novamente o modelo de Rede Neural

In [10]:
# ======================================================
# 1) Introdução
# ======================================================
# Prevendo o valor mediano de casas na Califórnia com Scikit-Learn.
# Fluxo: baseline -> modelo MLP -> comparação com e sem PCA.

import os
import random

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_california_housing
from sklearn.decomposition import PCA
from sklearn.dummy import DummyRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Single seed shared by every stochastic component, for reproducible runs.
RANDOM_SEED = 42
# Seed NumPy's global RNG first, then the stdlib one.
for seed_global_rng in (np.random.seed, random.seed):
    seed_global_rng(RANDOM_SEED)


In [11]:
# ======================================================
# 2) Funções Auxiliares
# ======================================================

def load_california_housing():
    """Fetch the California Housing data and separate features from target.

    Returns:
        A 4-tuple ``(X, y, numeric_features, categorical_features)``:
        ``X`` is the dataset frame without the ``"MedHouseVal"`` column,
        ``y`` is that column, ``numeric_features`` lists every column name
        of ``X`` and ``categorical_features`` is always an empty list.
    """
    # Work on a copy so the frame held by the fetched bunch is left intact.
    features = fetch_california_housing(as_frame=True).frame.copy()
    # pop() removes the target column from the copy and hands it back.
    target = features.pop("MedHouseVal")
    return features, target, list(features.columns), []


def build_preprocessor(numeric_features, categorical_features):
    """Build the (unfitted) column-wise preprocessing transformer.

    Args:
        numeric_features: Column names that are median-imputed and then
            standardized.
        categorical_features: Column names that are imputed with the most
            frequent value and one-hot encoded. May be empty or ``None``, in
            which case only the numeric branch is created.

    Returns:
        A ``ColumnTransformer`` with a ``"num"`` branch and, when categorical
        columns were given, a ``"cat"`` branch.
    """
    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    transformers = [("num", numeric_transformer, numeric_features)]

    # Previously this argument was ignored, so any listed categorical column
    # was silently discarded by the ColumnTransformer's default remainder="drop".
    if categorical_features:
        categorical_transformer = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            # Categories unseen during fit are encoded as all zeros, not an error.
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ])
        transformers.append(("cat", categorical_transformer, categorical_features))

    # sparse_threshold=0 forces a dense result even with the one-hot branch,
    # so downstream steps that need dense input keep working. The numeric-only
    # output is dense either way.
    preprocessor = ColumnTransformer(transformers=transformers, sparse_threshold=0)
    return preprocessor


def evaluate(model, X_test, y_test, label="Modelo"):
    """Score ``model`` on held-out data, print a summary line and return the metrics.

    Args:
        model: Fitted object exposing ``predict``.
        X_test: Features handed to ``model.predict``.
        y_test: True target values the predictions are compared against.
        label: Name shown in brackets at the start of the printed line.

    Returns:
        Dict with the keys ``"MAE"``, ``"RMSE"`` and ``"R2"``.
    """
    y_pred = model.predict(X_test)
    scores = {
        "MAE": mean_absolute_error(y_test, y_pred),
        # RMSE is obtained as the square root of the mean squared error.
        "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
        "R2": r2_score(y_test, y_pred),
    }
    print(f"[{label}]  MAE={scores['MAE']:.4f} | RMSE={scores['RMSE']:.4f} | R²={scores['R2']:.4f}")
    return scores


In [12]:
# ======================================================
# 3) Carregar Dados e Split
# ======================================================

# Features, target and the column-name lists consumed by the preprocessor.
X, y, num_feats, cat_feats = load_california_housing()
print(f"Dataset carregado. X shape={X.shape} | y shape={y.shape}")

# Only constructed here; nothing is fitted in this cell.
preprocessor = build_preprocessor(num_feats, cat_feats)

# Seeded hold-out split: 20% of the rows are reserved for the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_SEED, test_size=0.20
)


Dataset carregado. X shape=(20640, 8) | y shape=(20640,)


In [13]:
# ======================================================
# 4) Baseline (DummyRegressor)
# ======================================================

# Reference model: DummyRegressor(strategy="mean") ignores the features, so
# any useful model has to beat these scores.
baseline = Pipeline(steps=[
    # clone() gives this pipeline its own unfitted copy of the preprocessor.
    # A pipeline fits the step objects it is handed, so passing the shared
    # `preprocessor` instance would let every other pipeline holding it refit
    # this one's transformer.
    ("pre", clone(preprocessor)),
    ("reg", DummyRegressor(strategy="mean"))
])

baseline.fit(X_train, y_train)
# Last expression of the cell: the returned metrics dict is displayed.
evaluate(baseline, X_test, y_test, label="DummyRegressor (Baseline)")


[DummyRegressor (Baseline)]  MAE=0.9061 | RMSE=1.1449 | R²=-0.0002


{'MAE': 0.9060685490007149,
 'RMSE': 1.1448563543099792,
 'R2': -0.00021908714592466794}

In [14]:
# ======================================================
# 5) Modelo MLP sem PCA
# ======================================================

# MLP regressor trained directly on the standardized features (no PCA step).
model_no_pca = Pipeline(steps=[
    # clone() gives this pipeline its own unfitted copy of the preprocessor.
    # A pipeline fits the step objects it is handed, so passing the shared
    # `preprocessor` instance would let every other pipeline holding it refit
    # this one's transformer.
    ("pre", clone(preprocessor)),
    ("reg", MLPRegressor(
        hidden_layer_sizes=(12, 6),  # two hidden layers: 12 units, then 6
        activation="relu",
        learning_rate_init=0.001,
        max_iter=800,
        random_state=RANDOM_SEED  # seeded for reproducible runs
    ))
])

model_no_pca.fit(X_train, y_train)
# Last expression of the cell: the returned metrics dict is displayed.
evaluate(model_no_pca, X_test, y_test, label="Rede Neural (MLP) sem PCA")


[Rede Neural (MLP) sem PCA]  MAE=0.3675 | RMSE=0.5428 | R²=0.7752


{'MAE': 0.36752018639875844,
 'RMSE': 0.54278293994409,
 'R2': 0.7751745258264024}

In [15]:
# ======================================================
# 6) Modelo MLP com PCA
# ======================================================

# Same MLP configuration as the non-PCA model, with a PCA step inserted
# between preprocessing and the regressor.
model_pca = Pipeline(steps=[
    # clone() gives this pipeline its own unfitted copy of the preprocessor.
    # A pipeline fits the step objects it is handed, so passing the shared
    # `preprocessor` instance would let every other pipeline holding it refit
    # this one's transformer.
    ("pre", clone(preprocessor)),
    # A float n_components keeps as many components as needed to explain
    # that fraction (95%) of the variance.
    ("pca", PCA(n_components=0.95, random_state=RANDOM_SEED)),
    ("reg", MLPRegressor(
        hidden_layer_sizes=(12, 6),  # two hidden layers: 12 units, then 6
        activation="relu",
        learning_rate_init=0.001,
        max_iter=800,
        random_state=RANDOM_SEED  # seeded for reproducible runs
    ))
])

# 5-fold cross-validation on the training split only; the test split is not
# touched until the final evaluation below.
cv_scores = cross_val_score(model_pca, X_train, y_train, cv=5, scoring="r2")
print(f"Validação Cruzada R² (média ± desvio): {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Final fit on the whole training split, then score on the held-out test set.
# `metrics` is kept because the artifact-saving cell writes it to disk.
model_pca.fit(X_train, y_train)
metrics = evaluate(model_pca, X_test, y_test, label="Rede Neural (MLP) com PCA")


Validação Cruzada R² (média ± desvio): 0.6853 ± 0.0035
[Rede Neural (MLP) com PCA]  MAE=0.4609 | RMSE=0.6455 | R²=0.6821


In [16]:
# ======================================================
# 7) Salvando Artefatos
# ======================================================

# Output directory, relative to the current working directory.
artifacts_dir = "artifacts"
os.makedirs(artifacts_dir, exist_ok=True)

# Persist the fitted PCA pipeline.
# NOTE(review): in the recorded outputs model_pca scored a lower test R² than
# model_no_pca — confirm this is the model intended for export.
joblib.dump(model_pca, os.path.join(artifacts_dir, "model_california.joblib"))

# One "name: value" line per entry of `metrics` (the dict returned by
# evaluate() for model_pca). The encoding is explicit so the file does not
# depend on the platform's locale default.
with open(os.path.join(artifacts_dir, "metrics.txt"), "w", encoding="utf-8") as f:
    for k, v in metrics.items():
        f.write(f"{k}: {v}\n")

print("✅ Artefatos salvos em ./artifacts")


✅ Artefatos salvos em ./artifacts
