In [1]:
# 1) Instalar (en el kernel actual)
%pip install -q --upgrade pip
%pip install lightgbm catboost

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
# =========================================
# 1) Cargar datos y objetivo (Housing.csv)
# =========================================
import os, json, warnings, platform, datetime
import numpy as np
import pandas as pd
import joblib
warnings.filterwarnings("ignore")

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

DATA_FILE = "Housing.csv"   # <-- nuestro archivo
TARGET    = "price"         # variable objetivo continua (regresión)
assert os.path.exists(DATA_FILE), f"No se encuentra {DATA_FILE}"

df = pd.read_csv(DATA_FILE)

print("\n=== Info inicial del dataset ===")
df.info()

# --------- CONFIG DE UMBRAL PARA price_clase ---------
USE_MANUAL_THRESHOLD = False   # True => usar un umbral fijo
MANUAL_THRESHOLD     = 3_000_000  # cambia este número si activas el manual

if USE_MANUAL_THRESHOLD:
    threshold = float(MANUAL_THRESHOLD)
    threshold_source = "MANUAL"
else:
    threshold = float(df[TARGET].median())
    threshold_source = "MEDIANA"

# Crear columna auxiliar binaria: MENOR/MAYOR según umbral
df["price_clase"] = np.where(df[TARGET] >= threshold, "MAYOR", "MENOR").astype(str)

print(f"\n=== price_clase creado con umbral [{threshold_source}] = {threshold:,.2f} ===")
print("Regla: price >= umbral => 'MAYOR'; price < umbral => 'MENOR'")

print("\nConteo de price_clase:")
print(df["price_clase"].value_counts())

print("\nResumen de precios por clase (min/median/max/count):")
print(
    df.groupby("price_clase")[TARGET]
      .agg(["min","median","max","count"])
      .sort_index()
      .to_string()
)

# (Chequeo rápido)
min_mayor = df.loc[df["price_clase"]=="MAYOR", TARGET].min()
max_menor = df.loc[df["price_clase"]=="MENOR", TARGET].max()
print(f"\nChequeo: min(MAYOR)={min_mayor:,.2f}  vs  max(MENOR)={max_menor:,.2f}")

# Definir y (target) y X (features)
y = df[TARGET]
X = df.drop(columns=[TARGET])

print("\nShape:", X.shape,
      "| y(mean):", round(y.mean(), 4),
      "| y(std):",  round(y.std(), 4),
      "| y[min,max]:", (round(y.min(), 4), round(y.max(), 4)))



=== Info inicial del dataset ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB

=== price_clase creado con umbral [MEDIANA] = 4,340,000.00 ===
Regla: price >= umbral => 'MAYOR'; price < umbral => 'ME

In [2]:
X , y

(     area  bedrooms  bathrooms  stories mainroad guestroom basement  \
 0    7420         4          2        3      yes        no       no   
 1    8960         4          4        4      yes        no       no   
 2    9960         3          2        2      yes        no      yes   
 3    7500         4          2        2      yes        no      yes   
 4    7420         4          1        2      yes       yes      yes   
 ..    ...       ...        ...      ...      ...       ...      ...   
 540  3000         2          1        1      yes        no      yes   
 541  2400         3          1        1       no        no       no   
 542  3620         2          1        1      yes        no       no   
 543  2910         3          1        1       no        no       no   
 544  3850         3          1        2      yes        no       no   
 
     hotwaterheating airconditioning  parking prefarea furnishingstatus  \
 0                no             yes        2      yes     

In [3]:
# =========================================
# 2) Split temprano (80/20)
# =========================================
from sklearn.model_selection import train_test_split
import pandas as pd

# Evitar fuga de datos: quitar cualquier columna derivada del target
LEAKAGE_COLS = ["price_clase"]  # auxiliar EDA, no debe ir a features
X_noleak = X.drop(columns=[c for c in LEAKAGE_COLS if c in X.columns], errors="ignore")

# (Opcional) estratificación por la clase auxiliar para mantener distribución
USE_STRATIFY = True and ("price_clase" in df.columns)
stratify_vec = df["price_clase"] if USE_STRATIFY else None

if stratify_vec is not None:
    X_train, X_test, y_train, y_test = train_test_split(
        X_noleak, y, test_size=0.20, random_state=RANDOM_STATE, stratify=stratify_vec
    )
else:
    X_train, X_test, y_train, y_test = train_test_split(
        X_noleak, y, test_size=0.20, random_state=RANDOM_STATE
    )

print(f"Train: {X_train.shape} | Test: {X_test.shape}")
print("¿'price_clase' en X_train?", "price_clase" in X_train.columns)

# (Diagnóstico opcional de la estratificación)
if "price_clase" in df.columns:
    train_idx = X_train.index
    test_idx  = X_test.index
    print("\nDistribución price_clase (proporción):")
    print("Train:\n", df.loc[train_idx, "price_clase"].value_counts(normalize=True).round(3))
    print("Test:\n",  df.loc[test_idx,  "price_clase"].value_counts(normalize=True).round(3))


Train: (436, 12) | Test: (109, 12)
¿'price_clase' en X_train? False

Distribución price_clase (proporción):
Train:
 price_clase
MAYOR    0.505
MENOR    0.495
Name: proportion, dtype: float64
Test:
 price_clase
MAYOR    0.505
MENOR    0.495
Name: proportion, dtype: float64


In [4]:
# =========================================
# 3) Preprocesamiento (en pipeline)
# =========================================
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import VarianceThreshold

# Fallback si no tienes imblearn instalado
try:
    from imblearn.pipeline import Pipeline as ImbPipeline  # imblearn solo por consistencia de API
except Exception:
    from sklearn.pipeline import Pipeline as ImbPipeline
    print("[AVISO] imblearn no disponible; usando sklearn.Pipeline como fallback.")

# Detectar tipos desde el split ya hecho (X_train)
cat_features = X_train.select_dtypes(include=["object","category"]).columns.tolist()
num_features = X_train.select_dtypes(include=["number","bool"]).columns.tolist()

# OneHotEncoder compatible (con fallback segun versión de sklearn)
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_features),
        ("cat", ohe,              cat_features),
    ],
    remainder="drop",
)

def build_pipe(model):
    # Nota: en regresión NO se usa SMOTE
    return ImbPipeline([
        ("prep", preprocessor),
        ("var0", VarianceThreshold(0.0)),  # limpia columnas constantes tras OHE
        ("model", model),
    ])

print(f"Features numéricas ({len(num_features)}): {num_features}")
print(f"Features categóricas ({len(cat_features)}): {cat_features}")


Features numéricas (5): ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']
Features categóricas (7): ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']


In [7]:
# =========================================
# 4) Modelos candidatos (REGRESIÓN) - sin prints de training
# =========================================
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

HAVE_XGB = True
HAVE_LGB = True
HAVE_CAT = True

try:
    from xgboost import XGBRegressor
except Exception:
    HAVE_XGB = False

try:
    from lightgbm import LGBMRegressor
except Exception:
    HAVE_LGB = False

try:
    from catboost import CatBoostRegressor
except Exception:
    HAVE_CAT = False

candidates = [
    ("LR",  LinearRegression()),
    ("RG",  Ridge()),  # Ridge no recibe random_state
    ("LS",  Lasso(max_iter=5000)),
    ("EN",  ElasticNet(max_iter=5000)),
    ("KNR", KNeighborsRegressor()),
    ("DTR", DecisionTreeRegressor(random_state=RANDOM_STATE)),
    ("RFR", RandomForestRegressor(n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1)),
    ("MLP", MLPRegressor(hidden_layer_sizes=(64,), max_iter=800, random_state=RANDOM_STATE)),
]

if HAVE_XGB:
    candidates.append(("XGB", XGBRegressor(
        tree_method="hist", random_state=RANDOM_STATE,
        n_estimators=400, learning_rate=0.05, max_depth=6,
        subsample=0.9, colsample_bytree=0.9, n_jobs=-1,
        verbosity=0               # << silencia logs de XGBoost
    )))

if HAVE_LGB:
    candidates.append(("LGB", LGBMRegressor(
        n_estimators=500, learning_rate=0.05, max_depth=-1,
        subsample=0.9, colsample_bytree=0.9,
        random_state=RANDOM_STATE, n_jobs=-1,
        verbosity=-1,             # << silencia logs de LightGBM
        force_row_wise=True       # << evita el mensaje de auto-choosing
    )))

if HAVE_CAT:
    candidates.append(("CAT", CatBoostRegressor(
        iterations=600, learning_rate=0.05, depth=6,
        random_state=RANDOM_STATE, l2_leaf_reg=3.0,
        verbose=False,            # << silencia logs de CatBoost
        allow_writing_files=False,
        thread_count=-1
    )))

# (comprobación de entorno/candidatos - útil para debug)
import importlib.util
print("xgboost :", importlib.util.find_spec("xgboost")  is not None)
print("lightgbm:", importlib.util.find_spec("lightgbm") is not None)
print("catboost:", importlib.util.find_spec("catboost") is not None)
print("candidatos:", [n for n,_ in candidates], "=>", len(candidates))


xgboost : True
lightgbm: True
catboost: True
candidatos: ['LR', 'RG', 'LS', 'EN', 'KNR', 'DTR', 'RFR', 'MLP', 'XGB', 'LGB', 'CAT'] => 11


In [8]:
# =========================================
# 5) Baseline con CV (sin tuning) - adaptado
# =========================================
from sklearn.model_selection import KFold, cross_validate
import pandas as pd

cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
scoring = {
    "rmse": "neg_root_mean_squared_error",
    "mae":  "neg_mean_absolute_error",
    "r2":   "r2",
}

rows = []
for name, model in candidates:
    pipe = build_pipe(model)  # usa el preprocessor del Paso 3
    scores = cross_validate(pipe, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)
    row = {
        "model": name,
        "rmse": -scores["test_rmse"].mean(),  # pasar a positivo
        "mae":  -scores["test_mae"].mean(),
        "r2":    scores["test_r2"].mean(),
    }
    rows.append(row)
    print(f"{name:>3} | RMSE {row['rmse']:.3f} | MAE {row['mae']:.3f} | R² {row['r2']:.3f}")

baseline_df = pd.DataFrame(rows).sort_values("rmse").reset_index(drop=True)

# display para notebooks, fallback a print si no existe display()
try:
    display(baseline_df)
except Exception:
    print("\nBaseline CV:")
    print(baseline_df.to_string(index=False))

baseline_best_name  = baseline_df.iloc[0]["model"]
baseline_best_model = dict(candidates)[baseline_best_name]
print(f">>> Baseline ganador: {baseline_best_name}")


 LR | RMSE 1082081.838 | MAE 792713.122 | R² 0.656
 RG | RMSE 1081743.895 | MAE 792140.372 | R² 0.656
 LS | RMSE 1082081.859 | MAE 792712.631 | R² 0.656
 EN | RMSE 1146338.654 | MAE 822021.674 | R² 0.622
KNR | RMSE 1181777.394 | MAE 807823.280 | R² 0.593
DTR | RMSE 1729343.472 | MAE 1161767.259 | R² 0.010
RFR | RMSE 1139928.003 | MAE 825509.582 | R² 0.614




MLP | RMSE 5126149.309 | MAE 4770977.038 | R² -6.736
XGB | RMSE 1140259.901 | MAE 817866.062 | R² 0.611
LGB | RMSE 1196549.497 | MAE 848169.670 | R² 0.560
CAT | RMSE 1085242.809 | MAE 768188.571 | R² 0.649


Unnamed: 0,model,rmse,mae,r2
0,RG,1081744.0,792140.4,0.656369
1,LR,1082082.0,792713.1,0.656086
2,LS,1082082.0,792712.6,0.656086
3,CAT,1085243.0,768188.6,0.649025
4,RFR,1139928.0,825509.6,0.614322
5,XGB,1140260.0,817866.1,0.611027
6,EN,1146339.0,822021.7,0.621653
7,KNR,1181777.0,807823.3,0.59341
8,LGB,1196549.0,848169.7,0.559828
9,DTR,1729343.0,1161767.0,0.009863


>>> Baseline ganador: RG


In [9]:
# =========================================
# 6) Tuning con CV y elección del ganador (rápido) - silencioso
# =========================================
import tempfile, shutil, numpy as np
from sklearn.model_selection import KFold, RandomizedSearchCV
from scipy.stats import randint, uniform
try:
    from scipy.stats import loguniform
except Exception:
    from sklearn.utils.fixes import loguniform

# CV "ligero" para modelos simples, "pesado" para tree/boosters
cv_light = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
cv_heavy = KFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

# Espacios de búsqueda (usamos el prefijo 'model__' porque en build_pipe el estimador se llama 'model')
param_spaces = {
    "RG":  {"model__alpha": loguniform(1e-3, 1e3)},
    "LS":  {"model__alpha": loguniform(1e-3, 1e2)},
    "EN":  {"model__alpha": loguniform(1e-3, 1e2), "model__l1_ratio": uniform(0.0, 1.0)},
    "KNR": {"model__n_neighbors": randint(2, 50), "model__weights": ["uniform","distance"], "model__p":[1,2]},
    "DTR": {"model__max_depth": randint(3, 16), "model__min_samples_leaf": randint(1, 10)},
    "RFR": {"model__n_estimators": randint(200, 600), "model__max_depth": randint(4, 16),
            "model__min_samples_split": randint(2, 20), "model__min_samples_leaf": randint(1, 10),
            "model__max_features": ["sqrt","log2", None], "model__bootstrap": [True, False]},
    "MLP": {"model__alpha": loguniform(1e-4, 1e-1), "model__learning_rate_init": loguniform(1e-4, 1e-2)},
    "XGB": {"model__n_estimators": randint(250, 600), "model__learning_rate": loguniform(5e-3, 2e-1),
            "model__max_depth": randint(3, 9), "model__subsample": uniform(0.7, 0.3),
            "model__colsample_bytree": uniform(0.7, 0.3), "model__min_child_weight": randint(1, 6)},
    "LGB": {"model__n_estimators": randint(300, 800), "model__learning_rate": loguniform(5e-3, 2e-1),
            "model__num_leaves": randint(16, 128), "model__max_depth": randint(-1, 12),
            "model__min_child_samples": randint(10, 50), "model__subsample": uniform(0.7, 0.3),
            "model__colsample_bytree": uniform(0.7, 0.3), "model__reg_lambda": loguniform(1e-3, 10)},
    "CAT": {"model__iterations": randint(300, 700), "model__learning_rate": loguniform(5e-3, 2e-1),
            "model__depth": randint(4, 10), "model__l2_leaf_reg": loguniform(1e-2, 30),
            "model__border_count": randint(32, 255)},
}

# Elegimos qué modelos tunear (puedes ajustar esta lista; dejo los más relevantes)
to_tune = [
    ("RG",  Ridge()),  # Ridge no tiene random_state
    ("EN",  ElasticNet(random_state=RANDOM_STATE, max_iter=5000)),
    ("RFR", RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=1)),
]

# Añadimos boosters si están disponibles (según flags del Paso 4)
if 'HAVE_XGB' in globals() and HAVE_XGB:
    to_tune.append(("XGB", XGBRegressor(
        tree_method="hist", random_state=RANDOM_STATE, n_jobs=1, verbosity=0
    )))
if 'HAVE_LGB' in globals() and HAVE_LGB:
    to_tune.append(("LGB", LGBMRegressor(
        random_state=RANDOM_STATE, n_jobs=1, verbosity=-1, force_row_wise=True
    )))
if 'HAVE_CAT' in globals() and HAVE_CAT:
    to_tune.append(("CAT", CatBoostRegressor(
        random_state=RANDOM_STATE, verbose=False, allow_writing_files=False, thread_count=1
    )))

refit_metric = "rmse"  # minimizamos RMSE
scoring = {"rmse": "neg_root_mean_squared_error", "mae": "neg_mean_absolute_error", "r2": "r2"}

best_models = []
cache_dir = tempfile.mkdtemp(prefix="skcache_")
try:
    for name, base_model in to_tune:
        pipe = build_pipe(base_model)
        # intentar cachear transformaciones (si fuera sklearn.Pipeline); en imblearn no aplica
        try:
            pipe.set_params(memory=cache_dir)
        except Exception:
            pass

        heavy = name in ["RFR", "XGB", "LGB", "CAT"]
        search = RandomizedSearchCV(
            pipe, param_spaces[name],
            n_iter=(15 if heavy else 12),
            cv=(cv_heavy if heavy else cv_light),
            scoring=scoring, refit=refit_metric,
            n_jobs=-1, random_state=RANDOM_STATE, verbose=0,
            error_score=np.nan, return_train_score=False
        )
        search.fit(X_train, y_train)
        # Guardamos: nombre, mejor pipeline, RMSE CV positivo, y params ganadores
        best_models.append((name, search.best_estimator_, -search.best_score_, search.best_params_))

    # ordenamos por menor RMSE CV
    best_models.sort(key=lambda x: x[2])
    best_name, final_pipe_opt, best_cv_rmse, best_params = best_models[0]
    # print mínimo (puedes comentar esta línea si quieres 100% silencio)
    print(f">>> GANADOR OPTIMIZADO: {best_name} (RMSE CV={best_cv_rmse:.3f})")
finally:
    shutil.rmtree(cache_dir, ignore_errors=True)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


>>> GANADOR OPTIMIZADO: EN (RMSE CV=1080759.666)


In [10]:
# =========================================
# 7) Comparación justa (solo CV) - baseline vs ganador
# =========================================
from sklearn.model_selection import KFold, cross_validate

same_cv = KFold(n_splits=5, shuffle=True, random_state=123)

pipe_baseline_best = build_pipe(baseline_best_model)  # del Paso 5
pipe_tuned_best    = final_pipe_opt                   # del Paso 6

def cv_rmse(pipe, name):
    s = cross_validate(
        pipe, X_train, y_train, cv=same_cv,
        scoring={"rmse":"neg_root_mean_squared_error"},
        n_jobs=-1
    )
    rmse = -s["test_rmse"].mean()
    print(f"{name}: RMSE {rmse:.4f}")
    return rmse

rmse_base = cv_rmse(pipe_baseline_best, f"Baseline({baseline_best_name})")
rmse_tune = cv_rmse(pipe_tuned_best,   f"Tuned({best_name})")

# Regla: si la mejora < 1% del RMSE base, nos quedamos con el baseline (más simple)
if (rmse_base - rmse_tune) / rmse_base >= 0.01:
    winner_name, winner_pipe = best_name, pipe_tuned_best
else:
    winner_name, winner_pipe = baseline_best_name, pipe_baseline_best

print(f">>> Modelo seleccionado para TEST: {winner_name}")


Baseline(RG): RMSE 1074381.5158
Tuned(EN): RMSE 1073456.8307
>>> Modelo seleccionado para TEST: RG


In [11]:
# =========================================
# 8) Política de decisión (mínima)
# =========================================
POLICY = {
    "clip_to_train_range": True,   # recorta predicciones al rango visto en TRAIN
    "round_to_int": False,         # para 'price' normalmente False; cámbialo si quisieras enteros
    "lower": float(y_train.min()),
    "upper": float(y_train.max()),
}
print("Política:", POLICY)

def postprocess_preds(yhat, policy=POLICY):
    # yhat puede ser np.array o pd.Series
    ypp = np.array(yhat, copy=True)
    if policy.get("clip_to_train_range", False):
        ypp = np.clip(ypp, policy["lower"], policy["upper"])
    if policy.get("round_to_int", False):
        ypp = np.rint(ypp).astype(int)
    return ypp


Política: {'clip_to_train_range': True, 'round_to_int': False, 'lower': 1750000.0, 'upper': 13300000.0}


In [15]:
# =========================================
# 9) Evaluación final en TEST
# =========================================
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# winner_pipe viene del Paso 7 (RG en tu corrida)
winner_pipe.fit(X_train, y_train)

# Predicción y postproceso según POLICY
y_pred = winner_pipe.predict(X_test)
y_pp   = postprocess_preds(y_pred, POLICY)

# Métricas en TEST
rmse = mean_squared_error(y_test, y_pp, squared=False)
mae  = mean_absolute_error(y_test, y_pp)
r2   = r2_score(y_test, y_pp)

print(f"TEST → RMSE: {rmse:.4f} | MAE: {mae:.4f} | R²: {r2:.4f}")

# Vistazo rápido (primeros 10)
import pandas as pd
formatters = {
    "y_true": lambda v: f"{v:,.0f}",
    "y_pred": lambda v: f"{v:,.0f}",   # sin postproceso (por si quieres comparar)
    "y_pp":   lambda v: f"{v:,.0f}",      # con postproceso/policy aplicada
}
print(preview.head(10).to_string(index=False, formatters=formatters))


TEST → RMSE: 1141352.9511 | MAE: 866833.4836 | R²: 0.5501
   y_true    y_pred      y_pp
3,010,000 3,342,386 3,342,386
3,570,000 2,553,527 2,553,527
4,340,000 6,379,273 6,379,273
2,450,000 3,537,262 3,537,262
5,950,000 6,633,327 6,633,327
3,010,000 3,165,220 3,165,220
5,110,000 5,647,763 5,647,763
4,550,000 3,633,258 3,633,258
3,850,000 5,536,154 5,536,154
5,320,000 6,053,505 6,053,505


In [17]:
import numpy as np, pandas as pd
np.set_printoptions(suppress=True)
pd.options.display.float_format = "{:,.2f}".format
# =========================================
# 10) Interpretabilidad + breve error analysis (mínimo, FIX)
# =========================================
import numpy as np
import pandas as pd
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_absolute_error

# --- 10.1 ¿Cuánto recorta la política? ---
raw_pred = winner_pipe.predict(X_test)
clip_low  = (raw_pred < POLICY["lower"]).mean()
clip_high = (raw_pred > POLICY["upper"]).mean()
print(f"[Policy] clipped_low: {clip_low:.3%} | clipped_high: {clip_high:.3%}")

# --- 10.2 Importancias por Permutación (sobre columnas ORIGINALES) ---
# Se permutan columnas de X_test tal cual las ve el preprocesador del pipeline.
r = permutation_importance(
    estimator=winner_pipe,   # pipeline completa (prep + var0 + model)
    X=X_test, y=y_test,
    n_repeats=10,
    random_state=RANDOM_STATE,
    scoring="neg_root_mean_squared_error"
)

feat_names = X_test.columns  # nombres originales de entrada
imp = (pd.DataFrame({
        "feature": feat_names,
        "importance": r.importances_mean,
        "std": r.importances_std
     })
     .sort_values("importance", ascending=False)
     .head(15)
)
print("\nTop-15 importancias (perm, columnas originales):")
print(imp.to_string(index=False))

# --- 10.3 Errores: resumen + peores casos ---
y_hat = winner_pipe.predict(X_test)
y_pp  = postprocess_preds(y_hat, POLICY)
res   = pd.DataFrame({
    "y_true": y_test.reset_index(drop=True),
    "y_pred": pd.Series(y_pp)
})
res["abs_err"] = (res["y_true"] - res["y_pred"]).abs()

print("\nResumen de |error|:")
print(res["abs_err"].describe(percentiles=[.1,.25,.5,.75,.9]).to_string())

print("\nPeores 10 casos (|error| alto):")
top_bad_idx = res["abs_err"].nlargest(10).index
print(pd.concat([res.loc[top_bad_idx], X_test.reset_index(drop=True).loc[top_bad_idx]], axis=1)
      .to_string(index=False))

# --- 10.4 Métricas por subgrupos (usando price_clase binaria que definimos en Paso 1) ---
if "price_clase" in df.columns:
    by_cls = (pd.concat([df.loc[X_test.index, "price_clase"].reset_index(drop=True).rename("price_clase"), res], axis=1)
              .groupby("price_clase")["abs_err"]
              .agg(["count","mean","median"])
              .sort_index())
    print("\nMAE por price_clase:")
    print(by_cls.to_string())

[Policy] clipped_low: 0.000% | clipped_high: 0.000%

Top-15 importancias (perm, columnas originales):
         feature  importance       std
            area  182,344.97 31,838.60
       bathrooms  138,846.63 37,007.56
         stories   94,767.63 35,128.32
        prefarea   65,560.80 22,681.02
 airconditioning   60,723.48 53,230.88
         parking   52,479.10 18,063.67
        basement   36,486.61  9,352.47
 hotwaterheating   28,435.66 13,993.86
       guestroom   18,102.43  8,385.11
        mainroad   17,821.57 15,938.97
        bedrooms   -2,385.15  9,968.92
furnishingstatus   -3,750.82 27,719.17

Resumen de |error|:
count         109.00
mean      866,833.48
std       745,916.06
min         3,031.60
10%       123,765.02
25%       356,997.61
50%       691,088.47
75%     1,038,287.91
90%     2,063,355.89
max     3,509,401.43

Peores 10 casos (|error| alto):
 y_true       y_pred      abs_err  area  bedrooms  bathrooms  stories mainroad guestroom basement hotwaterheating airconditioni

In [18]:
# --- 10.5 Importancias por permutación a NIVEL OHE (alineadas con VarThreshold) ---
# Aquí medimos sobre el espacio de features que realmente llegan al modelo (prep -> var0).
try:
    prep = winner_pipe.named_steps["prep"]
    var0 = winner_pipe.named_steps.get("var0", None)
    model = winner_pipe.named_steps["model"]

    # 1) Transformación hasta antes del modelo
    X_prep = prep.transform(X_test)              # después de ColumnTransformer (OHE + escala)
    feat_names_ohe_full = prep.get_feature_names_out()  # nombres OHE (num + cat expandido)

    # 2) Aplicar el selector de varianza si existe (para alinear con lo que ve el modelo)
    if var0 is not None:
        support_mask = var0.get_support()
        X_model_in   = var0.transform(X_prep)
        feat_names_ohe = np.array(feat_names_ohe_full)[support_mask]
    else:
        X_model_in   = X_prep
        feat_names_ohe = feat_names_ohe_full

    # 3) Permutation importance en el espacio OHE ya filtrado
    r2 = permutation_importance(
        estimator=model,
        X=X_model_in, y=y_test,
        n_repeats=10,
        random_state=RANDOM_STATE,
        scoring="neg_root_mean_squared_error"
    )
    imp_ohe = (pd.DataFrame({
                "feature": feat_names_ohe,
                "importance": r2.importances_mean,
                "std": r2.importances_std
             })
             .sort_values("importance", ascending=False)
             .head(20))
    print("\nTop-20 importancias (perm, espacio OHE/model):")
    print(imp_ohe.to_string(index=False))
except Exception as e:
    print(f"\n[AVISO] No se pudo calcular importancias OHE detalladas: {e}")


Top-20 importancias (perm, espacio OHE/model):
                             feature  importance       std
                           num__area  182,344.97 31,838.60
                      num__bathrooms  138,846.63 37,007.56
                        num__stories   94,767.63 35,128.32
                        num__parking   52,479.10 18,063.67
                    cat__basement_no   15,260.47  4,523.72
                   cat__basement_yes   15,260.47  4,523.72
                    cat__prefarea_no   14,152.62 11,556.02
                   cat__prefarea_yes   14,152.62 11,556.02
            cat__hotwaterheating_yes    9,952.86  6,541.29
             cat__hotwaterheating_no    9,952.86  6,541.29
                  cat__guestroom_yes    6,497.25  4,124.01
                   cat__guestroom_no    6,497.25  4,124.01
                    cat__mainroad_no    4,398.90  7,844.16
                   cat__mainroad_yes    4,398.90  7,844.16
cat__furnishingstatus_semi-furnished    1,133.46  5,995.07
        

In [19]:
imp_ohe

Unnamed: 0,feature,importance,std
0,num__area,182344.97,31838.6
2,num__bathrooms,138846.63,37007.56
3,num__stories,94767.63,35128.32
4,num__parking,52479.1,18063.67
9,cat__basement_no,15260.47,4523.72
10,cat__basement_yes,15260.47,4523.72
15,cat__prefarea_no,14152.62,11556.02
16,cat__prefarea_yes,14152.62,11556.02
12,cat__hotwaterheating_yes,9952.86,6541.29
11,cat__hotwaterheating_no,9952.86,6541.29
