In [15]:
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Location of the Ames Housing CSV. Set the AMES_HOUSING_CSV environment
# variable to run the notebook against a copy stored elsewhere; when it is
# unset, the original hard-coded local path is used, so existing runs behave
# exactly as before.
DATA_PATH = os.environ.get(
    "AMES_HOUSING_CSV",
    "/home/mgabriel4/Documentos/GitHub/machine-learning/data/AmesHousing.csv",
)

# Load the raw dataset into a DataFrame.
df = pd.read_csv(DATA_PATH)


Primeiro, precisamos definir X e y, sendo X o conjunto de features e y a variável-alvo (target).

In [16]:
# Name of the column the model predicts.
target_col = "SalePrice"

# Target vector.
y = df[target_col]

# Feature matrix: everything except the target and the "Order" / "PID"
# columns, which the author treats as useless for prediction
# (presumably row identifiers -- TODO confirm against the data dictionary).
X = df.drop(columns=[target_col, "Order", "PID"])

Logo após a separação, temos o pré-processamento das variáveis.

In [17]:
# Split the feature columns by dtype. Selecting on "number" (instead of only
# int64/float64) and sending every remaining column to the categorical branch
# guarantees that each column of X is handled by exactly one transformer.
# This matters because ColumnTransformer discards any column that no
# transformer lists (its default is remainder="drop"), so a column with an
# unlisted dtype (int32, bool, category, a dedicated string dtype, ...) would
# otherwise disappear from the model without any warning.
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(exclude="number").columns

# Numeric branch: fill missing values with the column median.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
    ]
)

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown="ignore" lets transform() accept categories
# that were not present when the encoder was fitted.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Apply each branch to its own column subset and concatenate the results.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)

In [19]:
# Gradient-boosting regressor; only the random seed is set here.
# Hyperparameters you can tune: n_estimators, learning_rate, max_depth, etc.
gb_model = GradientBoostingRegressor(random_state=42)

# 6) Assemble the full pipeline: preprocessing followed by the regressor
pipe = Pipeline(steps=[("preprocess", preprocess), ("gb", gb_model)])

# 7) Train-test split: 20% of the rows are held out, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 8) Train (fits the preprocessing steps and the model on the training rows)
pipe.fit(X_train, y_train)

# 9) Predict on the held-out rows and evaluate
y_pred = pipe.predict(X_test)

# RMSE is the square root of the mean squared error.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:,.2f}", f"R²:   {r2:.3f}", sep="\n")


RMSE: 26,071.65
R²:   0.915
