In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    accuracy_score, roc_auc_score, classification_report
)

import mlflow
import mlflow.sklearn

import matplotlib.pyplot as plt

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

pd.set_option("display.max_columns", 100)


In [2]:
# Ruta al CSV desde la carpeta del notebook
data_path = "../data/listings_clean_core_eda.csv"

df = pd.read_csv(data_path)

df.shape, df.head()


((20234, 13),
        id   price  price_per_guest  accommodates        room_type  \
 0   35797  3799.0        1899.5000           2.0  Entire home/apt   
 1   56074   585.0         292.5000           2.0  Entire home/apt   
 2   67703  1696.0         424.0000           4.0  Entire home/apt   
 3   70644  1004.0         502.0000           2.0  Entire home/apt   
 4  165772  4071.0         254.4375          16.0  Entire home/apt   
 
   neighbourhood_cleansed  minimum_nights  availability_365  \
 0  Cuajimalpa de Morelos             1.0             364.0   
 1             Cuauhtémoc            15.0             338.0   
 2             Cuauhtémoc             2.0             267.0   
 3               Coyoacán             3.0             211.0   
 4         Miguel Hidalgo             2.0             177.0   
 
    estimated_occupancy_l365d  estimated_revenue_l365d host_is_superhost  \
 0                        0.0                      0.0                 f   
 1                       30.0   

In [3]:
# Limpieza básica: quitar filas con NA
df = df.dropna().copy()
print("Filas después de eliminar NA:", df.shape[0])

# Convertir host_is_superhost a 0/1 (False/True)
df["host_is_superhost"] = (df["host_is_superhost"] == "t").astype(int)

# --------
# Target de REGRESIÓN: precio en log
# --------
df["price_log"] = np.log1p(df["price"])  # log(1 + price) para estabilizar

# --------
# Target de CLASIFICACIÓN: alta ocupación
#     alta_ocupación = ocupación > mediana del histórico
# --------
umbral_ocupacion = df["estimated_occupancy_l365d"].median()
df["high_occupancy"] = (df["estimated_occupancy_l365d"] > umbral_ocupacion).astype(int)

print("Umbral de alta ocupación:", umbral_ocupacion)

target_reg = "price_log"
target_clf = "high_occupancy"


Filas después de eliminar NA: 16521
Umbral de alta ocupación: 90.0


In [4]:
# Columnas que NO serán features (las quitamos de X)
cols_a_excluir = [
    "id",
    "price",
    "price_log",
    "estimated_occupancy_l365d",
    "high_occupancy",
]

feature_cols = [c for c in df.columns if c not in cols_a_excluir]

X = df[feature_cols].copy()
y_reg = df[target_reg].copy()
y_clf = df[target_clf].copy()

print("Features:", feature_cols)
print("Tamaño de X:", X.shape)
print("Tamaño de y_reg:", y_reg.shape)
print("Tamaño de y_clf:", y_clf.shape)


Features: ['price_per_guest', 'accommodates', 'room_type', 'neighbourhood_cleansed', 'minimum_nights', 'availability_365', 'estimated_revenue_l365d', 'host_is_superhost', 'host_response_rate', 'host_acceptance_rate']
Tamaño de X: (16521, 10)
Tamaño de y_reg: (16521,)
Tamaño de y_clf: (16521,)


In [5]:
# Identificar columnas numéricas y categóricas
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

print("Columnas numéricas:", numeric_features)
print("Columnas categóricas:", categorical_features)

# Transformador de columnas: escalar numéricas y one-hot a categóricas
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)


Columnas numéricas: ['price_per_guest', 'accommodates', 'minimum_nights', 'availability_365', 'estimated_revenue_l365d', 'host_is_superhost', 'host_response_rate', 'host_acceptance_rate']
Columnas categóricas: ['room_type', 'neighbourhood_cleansed']


In [6]:
X_train, X_test, y_reg_train, y_reg_test, y_clf_train, y_clf_test = train_test_split(
    X,
    y_reg,
    y_clf,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y_clf  # estratificar por la variable de clasificación
)

X_train.shape, X_test.shape


((13216, 10), (3305, 10))

In [8]:
# ===== Modelo base de REGRESIÓN (price_log) =====

regressor_base = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", MLPRegressor(
        hidden_layer_sizes=(64, 32),  # 2 capas ocultas
        max_iter=200,
        random_state=RANDOM_STATE,
        early_stopping=True
    ))
])

# Registrar en MLflow
mlflow.sklearn.autolog()
mlflow.set_experiment("etapa4b_andes_regresion")

with mlflow.start_run(run_name="mlp_regresion_base"):
    regressor_base.fit(X_train, y_reg_train)

    # Predicción en escala log
    y_reg_pred_log = regressor_base.predict(X_test)

    # Volver a precio original
    y_reg_test_price = np.expm1(y_reg_test)
    y_reg_pred_price = np.expm1(y_reg_pred_log)

    mae = mean_absolute_error(y_reg_test_price, y_reg_pred_price)

    mse = mean_squared_error(y_reg_test_price, y_reg_pred_price)
    rmse = np.sqrt(mse)

    r2 = r2_score(y_reg_test, y_reg_pred_log)

    mlflow.log_metric("mae_test_precio", mae)
    mlflow.log_metric("rmse_test_precio", rmse)
    mlflow.log_metric("r2_test_logprice", r2)

print("===== Resultados REGRESIÓN (modelo base) =====")
print("MAE (precio):", mae)
print("RMSE (precio):", rmse)
print("R2 (sobre log(price)):", r2)




===== Resultados REGRESIÓN (modelo base) =====
MAE (precio): 40.20372850865186
RMSE (precio): 188.52443674727164
R2 (sobre log(price)): 0.9963083732137799


In [9]:
# ===== Modelo base de CLASIFICACIÓN (high_occupancy) =====

classifier_base = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", MLPClassifier(
        hidden_layer_sizes=(64, 32),
        max_iter=200,
        random_state=RANDOM_STATE,
        early_stopping=True
    ))
])


mlflow.set_experiment("etapa4b_andes_clasificacion")

with mlflow.start_run(run_name="mlp_clasificacion_base"):
    classifier_base.fit(X_train, y_clf_train)

    # Probabilidades de clase positiva
    y_clf_proba = classifier_base.predict_proba(X_test)[:, 1]
    # Clasificación usando umbral 0.5
    y_clf_pred = (y_clf_proba >= 0.5).astype(int)

    acc = accuracy_score(y_clf_test, y_clf_pred)
    auc = roc_auc_score(y_clf_test, y_clf_proba)

    mlflow.log_metric("accuracy_test", acc)
    mlflow.log_metric("roc_auc_test", auc)

print("===== Resultados CLASIFICACIÓN (modelo base) =====")
print("Accuracy:", acc)
print("ROC-AUC:", auc)
print("\nReporte de clasificación:\n")
print(classification_report(y_clf_test, y_clf_pred))


2025/11/29 12:49:26 INFO mlflow.tracking.fluent: Experiment with name 'etapa4b_andes_clasificacion' does not exist. Creating a new experiment.


===== Resultados CLASIFICACIÓN (modelo base) =====
Accuracy: 0.9900151285930409
ROC-AUC: 0.9994043023665875

Reporte de clasificación:

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1706
           1       0.99      0.99      0.99      1599

    accuracy                           0.99      3305
   macro avg       0.99      0.99      0.99      3305
weighted avg       0.99      0.99      0.99      3305



In [10]:
from sklearn.model_selection import RandomizedSearchCV

# ===== Búsqueda de hiperparámetros - REGRESIÓN =====

param_dist_reg = {
    "model__hidden_layer_sizes": [
        (32, 16),
        (64, 32),
        (128, 64),
        (64, 64, 32),
    ],
    "model__alpha": [1e-5, 1e-4, 1e-3, 1e-2],
    "model__learning_rate_init": [1e-4, 5e-4, 1e-3],
    "model__batch_size": [32, 64, 128],
}

regressor_tuning = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", MLPRegressor(
        max_iter=200,
        random_state=RANDOM_STATE,
        early_stopping=True
    ))
])

random_search_reg = RandomizedSearchCV(
    regressor_tuning,
    param_distributions=param_dist_reg,
    n_iter=10,                
    scoring="neg_mean_squared_error",
    cv=3,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbose=2
)


mlflow.set_experiment("etapa4b_andes_regresion")

with mlflow.start_run(run_name="mlp_regresion_tuning"):
    random_search_reg.fit(X_train, y_reg_train)

best_reg_model = random_search_reg.best_estimator_
print("Mejores hiperparámetros (regresión):")
print(random_search_reg.best_params_)

# Evaluar modelo ajustado en test
y_reg_pred_log_best = best_reg_model.predict(X_test)
y_reg_pred_price_best = np.expm1(y_reg_pred_log_best)

y_reg_test_price = np.expm1(y_reg_test)

mae_best = mean_absolute_error(y_reg_test_price, y_reg_pred_price_best)
mse_best = mean_squared_error(y_reg_test_price, y_reg_pred_price_best)
rmse_best = np.sqrt(mse_best)

r2_best = r2_score(y_reg_test, y_reg_pred_log_best)

print("\n===== Resultados REGRESIÓN (modelo ajustado) =====")
print("MAE (precio):", mae_best)
print("RMSE (precio):", rmse_best)
print("R2 (sobre log(price)):", r2_best)

# logear en MLflow
mlflow.log_metric("mae_test_precio_best", mae_best)
mlflow.log_metric("rmse_test_precio_best", rmse_best)
mlflow.log_metric("r2_test_logprice_best", r2_best)




Fitting 3 folds for each of 10 candidates, totalling 30 fits


2025/11/29 13:04:01 INFO mlflow.sklearn.utils: Logging the 5 best runs, 5 runs will be omitted.


Mejores hiperparámetros (regresión):
{'model__learning_rate_init': 0.0005, 'model__hidden_layer_sizes': (64, 64, 32), 'model__batch_size': 32, 'model__alpha': 0.001}

===== Resultados REGRESIÓN (modelo ajustado) =====
MAE (precio): 29.16857528996965
RMSE (precio): 72.29617425130937
R2 (sobre log(price)): 0.9979777293461255


In [12]:
import mlflow
mlflow.end_run()


In [13]:
from sklearn.model_selection import RandomizedSearchCV

# ===== Búsqueda de hiperparámetros - CLASIFICACIÓN =====

param_dist_clf = {
    "model__hidden_layer_sizes": [
        (32, 16),
        (64, 32),
        (128, 64),
        (64, 64, 32),
    ],
    "model__alpha": [1e-5, 1e-4, 1e-3, 1e-2],
    "model__learning_rate_init": [1e-4, 5e-4, 1e-3],
    "model__batch_size": [32, 64, 128],
}

clf_tuning = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", MLPClassifier(
        max_iter=200,
        random_state=RANDOM_STATE,
        early_stopping=True
    ))
])

random_search_clf = RandomizedSearchCV(
    clf_tuning,
    param_distributions=param_dist_clf,
    n_iter=10,               
    scoring="roc_auc",
    cv=3,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbose=2
)


mlflow.set_experiment("etapa4b_andes_clasificacion")

with mlflow.start_run(run_name="mlp_clasificacion_tuning"):
    random_search_clf.fit(X_train, y_clf_train)

best_clf_model = random_search_clf.best_estimator_

print("Mejores hiperparámetros (clasificación):")
print(random_search_clf.best_params_)

# Evaluar modelo ajustado en test
y_clf_proba_best = best_clf_model.predict_proba(X_test)[:, 1]
y_clf_pred_best = (y_clf_proba_best >= 0.5).astype(int)

acc_best = accuracy_score(y_clf_test, y_clf_pred_best)
auc_best = roc_auc_score(y_clf_test, y_clf_proba_best)

print("\n===== Resultados CLASIFICACIÓN (modelo ajustado) =====")
print("Accuracy:", acc_best)
print("ROC-AUC:", auc_best)
print("\nReporte de clasificación (modelo ajustado):\n")
print(classification_report(y_clf_test, y_clf_pred_best))

# logear en MLflow
mlflow.log_metric("accuracy_test_best", acc_best)
mlflow.log_metric("roc_auc_test_best", auc_best)




Fitting 3 folds for each of 10 candidates, totalling 30 fits


2025/11/29 13:19:01 INFO mlflow.sklearn.utils: Logging the 5 best runs, 5 runs will be omitted.


Mejores hiperparámetros (clasificación):
{'model__learning_rate_init': 0.001, 'model__hidden_layer_sizes': (64, 32), 'model__batch_size': 128, 'model__alpha': 0.0001}

===== Resultados CLASIFICACIÓN (modelo ajustado) =====
Accuracy: 0.9854765506807867
ROC-AUC: 0.9989152804324507

Reporte de clasificación (modelo ajustado):

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1706
           1       0.98      0.99      0.99      1599

    accuracy                           0.99      3305
   macro avg       0.99      0.99      0.99      3305
weighted avg       0.99      0.99      0.99      3305



In [14]:
# Resumen de métricas clave de la Etapa 4b

resumen_resultados = []


resumen_resultados.append({
    "modelo": "Regresión - base",
    "tipo": "regresión",
    "MAE": mae,
    "RMSE": rmse,
    "R2_log_price": r2,
    "Accuracy": None,
    "ROC_AUC": None,
})

resumen_resultados.append({
    "modelo": "Regresión - ajustado",
    "tipo": "regresión",
    "MAE": mae_best,
    "RMSE": rmse_best,
    "R2_log_price": r2_best,
    "Accuracy": None,
    "ROC_AUC": None,
})

resumen_resultados.append({
    "modelo": "Clasificación - base",
    "tipo": "clasificación",
    "MAE": None,
    "RMSE": None,
    "R2_log_price": None,
    "Accuracy": acc,
    "ROC_AUC": auc,
})

resumen_resultados.append({
    "modelo": "Clasificación - ajustado",
    "tipo": "clasificación",
    "MAE": None,
    "RMSE": None,
    "R2_log_price": None,
    "Accuracy": acc_best,
    "ROC_AUC": auc_best,
})

resumen_df = pd.DataFrame(resumen_resultados)
resumen_df


Unnamed: 0,modelo,tipo,MAE,RMSE,R2_log_price,Accuracy,ROC_AUC
0,Regresión - base,regresión,40.203729,188.524437,0.996308,,
1,Regresión - ajustado,regresión,29.168575,72.296174,0.997978,,
2,Clasificación - base,clasificación,,,,0.990015,0.999404
3,Clasificación - ajustado,clasificación,,,,0.985477,0.998915
