# 02 Modelação - Apartment (Full)

Objetivo: comparar LightGBM e XGBoost com randomized search simples (RandomizedSearchCV).


## Setup e leitura


In [None]:
import json
import numpy as np
import pandas as pd
import joblib
from datetime import datetime
from config import paths

In [None]:
# Locations of the curated train / validation splits.
train_path = paths.CURATED / 'apartment_train.parquet'
valid_path = paths.CURATED / 'apartment_valid.parquet'

# Fail fast when either split is missing; the message names the notebook to run first.
if not (train_path.exists() and valid_path.exists()):
    raise FileNotFoundError(
        'Dados não encontrados. Executa notebooks/eda/01_data_understanding_apartment.ipynb.'
    )

df_train, df_valid = pd.read_parquet(train_path), pd.read_parquet(valid_path)

# Quick sanity check: (rows, columns) of each split.
df_train.shape, df_valid.shape


((32853, 10), (8214, 10))

## Preparar features

In [4]:
def prepare_xy(df):
    """Split a listings frame into model inputs and a log-scale price target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Price`` column and every column named in the
        feature list built below. The frame itself is not modified.

    Returns
    -------
    X : pandas.DataFrame
        Independent copy of ``df`` restricted to the feature columns.
    y : pandas.Series
        ``log1p(Price)``, named ``'log_price'`` and sharing ``df``'s index.
    features : list of str
        Feature column names, in the column order of ``X``.
    cat_cols : list of str
        The subset of ``features`` treated as categorical.

    Raises
    ------
    KeyError
        If ``Price`` or any feature column is absent from ``df``.
    """
    cat_cols = ['District', 'City', 'Town']
    numeric_cols = [
        'LivingArea',
        'TotalArea',
        'NumberOfBathrooms',
        'Parking',
        'PropertyAge',
    ]
    features = numeric_cols + cat_cols

    # Derive the target straight from the input so the caller's frame is never touched.
    y = np.log1p(df['Price']).rename('log_price')
    X = df[features].copy()

    return X, y, features, cat_cols

# Training inputs; FEATURES and cat_cols are reused by the modelling and saving cells.
X_train, y_train, FEATURES, cat_cols = prepare_xy(df_train)

# Only X and y are needed for validation. Slicing the returned tuple avoids
# assigning to `_`, which IPython stops refreshing as its "last output"
# shortcut once user code has bound that name.
X_valid, y_valid = prepare_xy(df_valid)[:2]


## LightGBM - Randomized Search


In [5]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV

# Cast the categorical columns to pandas 'category' dtype in one step.
# Rebinding through DataFrame.astype (instead of assigning column by column)
# keeps the cell idempotent when it is re-run.
cat_dtypes = {c: 'category' for c in cat_cols}
X_train = X_train.astype(cat_dtypes)
X_valid = X_valid.astype(cat_dtypes)

# subsample_freq=1 switches row bagging on. With LightGBM's default of 0,
# bagging is disabled and the 'subsample' values searched below would be
# ignored, wasting search iterations on identical models.
lgbm = lgb.LGBMRegressor(random_state=42, subsample_freq=1)
param_dist = {
    'n_estimators': [400, 800, 1200],
    'learning_rate': [0.03, 0.05, 0.1],
    'num_leaves': [31, 63, 127],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
}

# 12 random draws from the grid above, scored by 3-fold CV RMSE on the log target.
lgbm_search = RandomizedSearchCV(
    lgbm,
    param_distributions=param_dist,
    n_iter=12,
    cv=3,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    n_jobs=-1,
)

# categorical_feature is forwarded by the search to every LGBMRegressor.fit call.
lgbm_search.fit(X_train, y_train, categorical_feature=cat_cols)
lgbm_search.best_params_


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000450 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1370
[LightGBM] [Info] Number of data points in the train set: 32853, number of used features: 8
[LightGBM] [Info] Start training from score 12.623518


{'subsample': 1.0,
 'num_leaves': 127,
 'n_estimators': 800,
 'learning_rate': 0.1,
 'colsample_bytree': 0.8}

In [None]:
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

best_lgbm = lgbm_search.best_estimator_

# Predict once on the log scale, then map predictions and targets back to
# the price scale so every metric below is expressed in price units.
pred_log = best_lgbm.predict(X_valid)
pred = np.expm1(pred_log)
true = np.expm1(y_valid)

# Validation metrics on the price scale.
lgbm_rmse = root_mean_squared_error(true, pred)
lgbm_mae = mean_absolute_error(true, pred)
# Cast to a plain float so all metrics share one type (the pandas mean is np.float64).
lgbm_mape = float((np.abs(true - pred) / true).mean() * 100)
lgbm_r2 = r2_score(true, pred)

# Collected metrics, saved alongside the model in a later cell.
lgbm_metrics = {
    'rmse': lgbm_rmse,
    'mae': lgbm_mae,
    'mape': lgbm_mape,
    'r2': lgbm_r2
}

lgbm_metrics


{'rmse': 101381.06293115053,
 'mae': 44678.9764268849,
 'mape': np.float64(12.081724726128225),
 'r2': 0.9061849648504406}

## XGBoost - Randomized Search


In [None]:
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

# Every feature that is not categorical is passed through unchanged.
num_cols = [name for name in FEATURES if name not in cat_cols]

# One-hot encode the categorical columns; categories unseen at fit time are
# ignored at transform time instead of raising.
preprocess = ColumnTransformer(
    [
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('num', 'passthrough', num_cols),
    ]
)

xgb = XGBRegressor(
    tree_method='hist',
    objective='reg:squarederror',
    random_state=42,
)

# Encoding lives inside the pipeline, so it is re-fitted on each CV training fold.
xgb_pipe = Pipeline([('preprocess', preprocess), ('model', xgb)])

# The 'model__' prefix routes each parameter to the XGBRegressor pipeline step.
xgb_param_dist = {
    'model__n_estimators': [400, 800, 1200],
    'model__learning_rate': [0.03, 0.05, 0.1],
    'model__max_depth': [4, 6, 8],
    'model__subsample': [0.7, 0.8, 1.0],
    'model__colsample_bytree': [0.7, 0.8, 1.0],
}

# 12 random draws from the grid above, scored by 3-fold CV RMSE on the log target.
xgb_search = RandomizedSearchCV(
    xgb_pipe,
    param_distributions=xgb_param_dist,
    scoring='neg_root_mean_squared_error',
    n_iter=12,
    cv=3,
    n_jobs=-1,
    random_state=42,
)

xgb_search.fit(X_train, y_train)
xgb_search.best_params_

{'model__subsample': 0.8,
 'model__n_estimators': 800,
 'model__max_depth': 8,
 'model__learning_rate': 0.1,
 'model__colsample_bytree': 1.0}

In [None]:
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

best_xgb = xgb_search.best_estimator_

# Predict on the log scale, then map predictions and targets back to the
# price scale so every metric below is expressed in price units.
pred_log = best_xgb.predict(X_valid)
pred = np.expm1(pred_log)
true = np.expm1(y_valid)

# Validation metrics on the price scale.
xgb_rmse = root_mean_squared_error(true, pred)
xgb_mae = mean_absolute_error(true, pred)
# Cast to a plain float so all metrics share one type (the pandas mean is np.float64).
xgb_mape = float((np.abs(true - pred) / true).mean() * 100)
xgb_r2 = r2_score(true, pred)

# Collected metrics, saved alongside the model in a later cell.
xgb_metrics = {
    'rmse': xgb_rmse,
    'mae': xgb_mae,
    'mape': xgb_mape,
    'r2': xgb_r2
}

xgb_metrics

{'rmse': 110170.83680803323,
 'mae': 49964.85676070389,
 'mape': np.float64(13.506915619014121),
 'r2': 0.8892121686143524}

## Comparação e guardar melhor modelo


In [16]:
# One row per model, built from the metric dicts computed above and ordered
# so that the lowest validation RMSE comes first.
results = pd.DataFrame(
    [
        {'model': 'lgbm', **lgbm_metrics},
        {'model': 'xgb', **xgb_metrics},
    ]
).sort_values('rmse', ascending=True)

# Display the comparison table (the cell previously produced no output).
results



In [17]:
# Timestamp taken once per execution of this cell; save_run reads it as a global,
# so every model saved by the same execution shares one run id.
run_id = datetime.now().strftime('%Y%m%d_%H%M%S')

def save_run(model_name, model_obj, metrics, params):
    """Persist a fitted model together with its metrics and hyper-parameters.

    Parameters
    ----------
    model_name : str
        Short model identifier; used in directory and file names and stored
        under the payload's ``'type'`` key.
    model_obj : object
        Fitted estimator to pickle.
    metrics : dict
        JSON-serialisable metrics, written to ``metrics.json``.
    params : dict
        JSON-serialisable hyper-parameters, written to ``params.json``.

    Returns
    -------
    pathlib.Path
        The run directory under ``paths.EXPERIMENTS`` holding ``model.pkl``,
        ``metrics.json`` and ``params.json``. A second copy of the pickled
        payload is written to ``paths.MODELS``.
    """
    # The payload bundles the estimator with the column metadata stored next to it.
    payload = {'model': model_obj, 'features': FEATURES, 'cat_cols': cat_cols, 'type': model_name}

    run_dir = paths.EXPERIMENTS / 'apartment' / model_name / f'run_{run_id}'
    run_dir.mkdir(parents=True, exist_ok=True)
    joblib.dump(payload, run_dir / 'model.pkl')

    for filename, content in (('metrics.json', metrics), ('params.json', params)):
        (run_dir / filename).write_text(json.dumps(content, indent=2), encoding='utf-8')

    # Also write the payload to a fixed per-model path outside the run directory.
    root_model_path = paths.MODELS / f'apartment_{model_name}.pkl'
    root_model_path.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(payload, root_model_path)

    return run_dir

# Persist both tuned models, each with its validation metrics and best parameters.
lgbm_run_dir = save_run('lgbm', best_lgbm, lgbm_metrics, lgbm_search.best_params_)
xgb_run_dir = save_run('xgb', best_xgb, xgb_metrics, xgb_search.best_params_)

# `results` is sorted by ascending RMSE, so its first row names the winning model.
best_row = results.iloc[0]['model']
best_model, best_type = (best_lgbm, 'lgbm') if best_row == 'lgbm' else (best_xgb, 'xgb')
payload = {'model': best_model, 'features': FEATURES, 'cat_cols': cat_cols, 'type': best_type}

# Write the winner to a fixed path.
root_best_path = paths.MODELS / 'apartment_best.pkl'
root_best_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(payload, root_best_path)
root_best_path


WindowsPath('C:/denv_testes/pt-real-estate-deal-engine/models/apartment_best.pkl')