In [34]:
# Importación de librerías necesarias

from utils.transformations import ExtendedTransformation, SimpleTransformation
from utils.filters import SimpleFilter

import pandas as pd
import numpy as np
import optuna
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_pinball_loss, mean_absolute_error, mean_absolute_percentage_error, root_mean_squared_error, make_scorer
from mapie.regression import MapieQuantileRegressor
from mapie.metrics import regression_coverage_score, regression_mean_width_score

In [3]:
# Preprocesamiento de los datos 

df_train = pd.read_csv("train_data/preprocessed/train_data.csv")
X_train, y_train = df_train.drop(columns=["Price"]), df_train[["Price"]]

preprocessor = ExtendedTransformation()
filter = SimpleFilter()

preprocessor.fit(X_train, y_train)
X_processed, y_processed = preprocessor.transform(X_train, y_train)

filter.fit(X_processed, y_processed)
X_filtered, y_filtered = filter.transform(X_processed, y_processed)


X shape:  (20974, 40)
bin_vars_columns shape:  (36,)
low_card_columns shape:  37
X shape:  (20974, 40)
X_low_card   shape:  (20974, 113)
X_high_card shape:  (20974, 50)
X_crossed_features shape:  (20974, 6670)
X_EXPANDED shape:  (20974, 6835)
(20974, 6835)
(20974, 4173)
(20974, 3198)
(20974, 1630)
(20974, 4173)
(20974, 3198)
(20974, 1630)


In [4]:
df_test = pd.read_csv("train_data/preprocessed/test_data.csv")
X_test, y_test = df_test.drop(columns=['Price']), df_test[['Price']]

X_test_proccesed, y_test_proccessed = preprocessor.transform(X_test, y_test)
X_test_filtered, y_test_filtered = filter.transform(X_test_proccesed, y_test_proccessed)

X shape:  (8989, 40)
X_low_card   shape:  (8989, 113)
X_high_card shape:  (8989, 50)
X_crossed_features shape:  (8989, 6670)
X_EXPANDED shape:  (8989, 6835)
(8989, 4173)
(8989, 3198)
(8989, 1630)


In [55]:
# Optimización de hiperparámetros
pinball_scorer = make_scorer(mean_pinball_loss, alpha=0.5)

def objective(trial):
    x, y = X_filtered, y_filtered.flatten()

    n_estimators = trial.suggest_int("n_estimators", 10, 500, log=True)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.1, log=True)
    max_depth = trial.suggest_int("max_depth", 5, 32)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)

    model = GradientBoostingRegressor(
        loss="quantile",
        alpha=0.5,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        subsample=subsample
    )


    score = cross_val_score(model, x, y, cv=3, scoring=pinball_scorer, n_jobs=-1)


    return score.mean()


In [56]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

print("Best trial:")
print(study.best_trial)

[I 2025-05-18 20:22:39,692] A new study created in memory with name: no-name-87d5e6dd-5a9d-4291-a334-ed9fe3cdc286
[I 2025-05-18 20:32:13,096] Trial 0 finished with value: 0.26840336321772273 and parameters: {'n_estimators': 395, 'learning_rate': 0.0050277736815868505, 'max_depth': 23, 'min_samples_leaf': 11, 'subsample': 0.6824943890448341}. Best is trial 0 with value: 0.26840336321772273.
[I 2025-05-18 20:32:45,913] Trial 1 finished with value: 0.3817858414148289 and parameters: {'n_estimators': 26, 'learning_rate': 0.010669697996243184, 'max_depth': 26, 'min_samples_leaf': 18, 'subsample': 0.6673320139780574}. Best is trial 0 with value: 0.26840336321772273.
[I 2025-05-18 20:37:56,298] Trial 2 finished with value: 0.2525646050298986 and parameters: {'n_estimators': 195, 'learning_rate': 0.025537615834627384, 'max_depth': 21, 'min_samples_leaf': 7, 'subsample': 0.6546191101937902}. Best is trial 2 with value: 0.2525646050298986.
[I 2025-05-18 20:43:11,326] Trial 3 finished with value:

Best trial:
FrozenTrial(number=2, state=1, values=[0.2525646050298986], datetime_start=datetime.datetime(2025, 5, 18, 20, 32, 45, 915568), datetime_complete=datetime.datetime(2025, 5, 18, 20, 37, 56, 297779), params={'n_estimators': 195, 'learning_rate': 0.025537615834627384, 'max_depth': 21, 'min_samples_leaf': 7, 'subsample': 0.6546191101937902}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=500, log=True, low=10, step=1), 'learning_rate': FloatDistribution(high=0.1, log=True, low=0.001, step=None), 'max_depth': IntDistribution(high=32, log=False, low=5, step=1), 'min_samples_leaf': IntDistribution(high=20, log=False, low=1, step=1), 'subsample': FloatDistribution(high=1.0, log=False, low=0.5, step=None)}, trial_id=2, value=None)


In [57]:
study.best_trial.params

{'n_estimators': 195,
 'learning_rate': 0.025537615834627384,
 'max_depth': 21,
 'min_samples_leaf': 7,
 'subsample': 0.6546191101937902}

In [58]:
# Configuración del estimador base

final_model = GradientBoostingRegressor(n_estimators=195, max_depth=21, learning_rate=0.025537615834627384, 
                                        min_samples_leaf=7, subsample=0.6546191101937902, loss = "quantile")
final_model.fit(X_filtered, y_filtered.flatten())

In [59]:
# Configuración del modelo mapie basado en quantile regressor

alpha_80 = 0.2  # 80% de confianza
quantile_params_80 = {"method": "quantile", "cv": "split", "alpha": alpha_80}

mapie_80 = MapieQuantileRegressor(final_model, **quantile_params_80)
mapie_80.fit(
    X_filtered,
    y_filtered,
    calib_size=0.2,
    random_state=0
)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [60]:
y_pred_80, y_pis_80 = mapie_80.predict(X_test_filtered)

INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.


In [61]:
y_pis_80[:,0]
preprocessor.inverse_transform(y_pis_80[:,0])



array([[5650134.79786772],
       [4250995.13600731],
       [5304638.03482538],
       ...,
       [4200000.        ],
       [4956916.88754388],
       [9511929.32079343]])

In [62]:
# convert to original scale
y_median_80 = preprocessor.inverse_transform(y_pred_80.reshape(-1,1))
y_low_80 = preprocessor.inverse_transform(y_pis_80[:,0])
y_high_80 = preprocessor.inverse_transform(y_pis_80[:,1])



In [63]:
# evaluamos su cobertura, para ver si realmente en el 80% de los casos el valor real está dentro del intervalo mostrado.
coverage = regression_coverage_score(y_test, y_low_80, y_high_80)
mean_width = regression_mean_width_score(y_low_80, y_high_80)

print(f"regresion coverage: {coverage}")
print(f"interval mean width: {mean_width}")

regresion coverage: 0.7925241962398487
interval mean width: 18203281.215656463


In [64]:
alpha_90 = 0.1  # 90% de confianza
quantile_params_90 = {"method": "quantile", "cv": "split", "alpha": alpha_90}

mapie_90 = MapieQuantileRegressor(final_model, **quantile_params_90)
mapie_90.fit(
    X_filtered,
    y_filtered,
    calib_size=0.2,
    random_state=0
)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [65]:
y_pred_90, y_pis_90 = mapie_90.predict(X_test_filtered)

INFO:root:The predictions are ill-sorted.
INFO:root:The predictions are ill-sorted.


In [66]:
y_pis_90[:,0]
preprocessor.inverse_transform(y_pis_90[:,0])



array([[4167948.93491462],
       [3003113.37005417],
       [3800000.        ],
       ...,
       [3200000.        ],
       [3794106.33068332],
       [7694860.41044039]])

In [67]:
# convert to original scale
y_median_90 = preprocessor.inverse_transform(y_pred_90.reshape(-1,1))
y_low_90 = preprocessor.inverse_transform(y_pis_90[:,0])
y_high_90 = preprocessor.inverse_transform(y_pis_90[:,1])



In [68]:
# evaluamos su cobertura, para ver si realmente en el 90% de los casos el valor real está dentro del intervalo mostrado.
coverage = regression_coverage_score(y_test, y_low_90, y_high_90)
mean_width = regression_mean_width_score(y_low_90, y_high_90)

print(f"regresion coverage: {coverage}")
print(f"interval mean width: {mean_width}")

regresion coverage: 0.8918678384692402
interval mean width: 31812795.702991497


In [69]:
alpha_99 = 0.01  # 99% de confianza
quantile_params_99 = {"method": "quantile", "cv": "split", "alpha": alpha_99}

mapie_99 = MapieQuantileRegressor(final_model, **quantile_params_99)
mapie_99.fit(
    X_filtered,
    y_filtered,
    calib_size=0.2,
    random_state=0
)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [70]:
y_pred_99, y_pis_99 = mapie_99.predict(X_test_filtered)

INFO:root:The predictions are ill-sorted.


In [71]:
y_pis_99[:,0]
preprocessor.inverse_transform(y_pis_99[:,0])



array([[2387401.03921279],
       [2121124.70466151],
       [3599723.29228243],
       ...,
       [2400000.        ],
       [2297750.03163396],
       [2791023.34220575]])

In [72]:
# convert to original scale
y_mediam_99 = preprocessor.inverse_transform(y_pred_99.reshape(-1,1))
y_low_99 = preprocessor.inverse_transform(y_pis_99[:,0])
y_high_99 = preprocessor.inverse_transform(y_pis_99[:,1])



In [73]:
# evaluamos su cobertura, para ver si realmente en el 99% de los casos el valor real está dentro del intervalo mostrado.
coverage = regression_coverage_score(y_test, y_low_99, y_high_99)
mean_width = regression_mean_width_score(y_low_99, y_high_99)

print(f"regresion coverage: {coverage}")
print(f"interval mean width: {mean_width}")

regresion coverage: 0.9911002336188675
interval mean width: 283496356.99844235


In [75]:
# Guardar los modelos para su exportación a un entorno de serving/inferencia

import os
import pickle

CHECKPOINTS_DIR = "checkpoints"
os.makedirs(CHECKPOINTS_DIR, exist_ok=True)

with open(os.path.join(CHECKPOINTS_DIR, "preprocessor.pkl"), "wb") as f:
    pickle.dump(preprocessor, f)

with open(os.path.join(CHECKPOINTS_DIR, "filter.pkl"), "wb") as f:
    pickle.dump(filter, f)

with open(os.path.join(CHECKPOINTS_DIR, "mapie_model_80.pkl"), "wb") as f:
    pickle.dump(mapie_80, f)

with open(os.path.join(CHECKPOINTS_DIR, "mapie_model_90.pkl"), "wb") as f:
    pickle.dump(mapie_90, f)

with open(os.path.join(CHECKPOINTS_DIR, "mapie_model_99.pkl"), "wb") as f:
    pickle.dump(mapie_99, f)
