In [1]:
from mapie.metrics import regression_coverage_score, regression_coverage_score_v2, regression_mean_width_score
from mapie.regression import MapieQuantileRegressor

from utils.transformations import ExtendedTransformation, SimpleTransformation
from utils.filters import SimpleFilter
import pandas as pd
import numpy as np

In [2]:
# Load the training split and separate the features from the 'Price' target.
df_train = pd.read_csv("train_data/preprocessed/train_data.csv")
X_train = df_train.drop(columns=['Price'])
y_train = df_train[['Price']]  # double brackets keep the target as a one-column DataFrame

# Instantiate both pipeline steps up front.
# NOTE(review): `filter` shadows the Python builtin of the same name; later cells rely on this name.
preprocessor = ExtendedTransformation()
filter = SimpleFilter()

# Step 1: fit the preprocessor on the training data, then transform features and target together.
preprocessor.fit(X_train, y_train)
X_processed, y_processed = preprocessor.transform(X_train, y_train)

# Step 2: fit the filter on the preprocessed data, then apply it.
filter.fit(X_processed, y_processed)
X_filtered, y_filtered = filter.transform(X_processed, y_processed)

X shape:  (20974, 40)
bin_vars_columns shape:  (36,)
low_card_columns shape:  37
X shape:  (20974, 40)
X_low_card   shape:  (20974, 113)
X_high_card shape:  (20974, 50)
X_crossed_features shape:  (20974, 6670)
X_EXPANDED shape:  (20974, 6835)
(20974, 6835)
(20974, 4173)
(20974, 3198)
(20974, 1630)
(20974, 4173)
(20974, 3198)
(20974, 1630)


In [None]:

# Load the test split and run it through the steps fitted on the training data
# (transform only; nothing is re-fitted here).
df_test = pd.read_csv("train_data/preprocessed/test_data.csv")
X_test = df_test.drop(columns=['Price'])
y_test = df_test[['Price']]  # kept on the original price scale for the coverage check
X_test_proccesed, y_test_proccessed = preprocessor.transform(X_test, y_test)
X_test_filtered, y_test_filtered = filter.transform(X_test_proccesed, y_test_proccessed)

In [None]:
# Tune the hyperparameters so that the median (0.5-quantile) regressor is as accurate as possible.

import optuna
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor, RandomForestRegressor
import sklearn.model_selection
from sklearn.metrics import mean_pinball_loss, make_scorer
def objective(trial):
    """Optuna objective: cross-validated pinball loss of a median regressor.

    Reads the notebook-level ``X_filtered`` / ``y_filtered`` variables, so it
    must be evaluated while those names still hold the training data.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators``, ``max_depth`` and
        ``learning_rate`` (all on a log scale).

    Returns
    -------
    float
        Mean pinball loss (alpha=0.5) over 3 cross-validation folds.
        Lower is better, so the study should use ``direction="minimize"``.
    """
    x, y = X_filtered, y_filtered.flatten()

    n_estimators = trial.suggest_int("n_estimators", 10, 500, log=True)
    max_depth = trial.suggest_int("max_depth", 5, 32, log=True)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.1, log=True)
    estimator = HistGradientBoostingRegressor(
        max_iter=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        loss="quantile",
        quantile=0.5,
        # Early stopping defaults to 'auto' and may draw a random internal
        # validation split on data of this size; fixing the seed keeps trial
        # scores comparable and repeatable across runs.
        random_state=0,
    )

    # With make_scorer's default settings the loss is returned un-negated,
    # so the value handed back to Optuna is the loss itself.
    score = make_scorer(mean_pinball_loss, alpha=0.5)
    scoring = sklearn.model_selection.cross_val_score(estimator, x, y, n_jobs=-1, cv=3, scoring=score)

    return scoring.mean()

In [None]:
# Minimise the cross-validated pinball loss returned by `objective`.
# TPE is already Optuna's default single-objective sampler; it is spelled out here
# only to pass a seed, so the suggested hyperparameters are repeatable between runs.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=0),
)
study.optimize(objective, n_trials=10)
print(study.best_trial)

In [None]:
# Display the hyperparameters chosen by the best trial of the study.
study.best_trial.params

In [None]:
# Turn the best trial's hyperparameters into HistGradientBoostingRegressor keyword arguments:
# add the fixed quantile-loss settings, then rename `n_estimators` to `max_iter`.
estimator_params = study.best_trial.params
estimator_params.update(loss="quantile", quantile=0.5)
estimator_params['max_iter'] = estimator_params.pop('n_estimators')
estimator_params

In [None]:
# Unfitted median (0.5-quantile) regressor built from the tuned parameters; it is passed to MAPIE in a later cell.
estimator = HistGradientBoostingRegressor(**estimator_params)

In [None]:
# Miscoverage level: alpha = 0.2 targets 80% confidence intervals.
alpha = 0.2
# Keyword arguments forwarded to MapieQuantileRegressor.
quantile_params = dict(method="quantile", cv="split", alpha=alpha)

In [None]:
# Wrap the median regressor in MAPIE's quantile regressor using the settings from `quantile_params`.
mapie = MapieQuantileRegressor(estimator, **quantile_params)
# calib_size=0.3 requests a 30% calibration hold-out; random_state=0 pins that split.
mapie.fit(X_filtered, y_filtered, calib_size=0.3, random_state=0)

In [None]:
# Predict on the filtered test features. `y_pred` is the point prediction and `y_pis` holds the
# interval bounds; later cells read index 0 on axis 1 as the lower bound and index 1 as the upper bound.
y_pred, y_pis = mapie.predict(X_test_filtered)

In [None]:
# Quick look at the lower interval bounds mapped back through the preprocessor.
# (The earlier bare `y_pis[:,0]` line was dropped: only a cell's last expression is displayed.)
preprocessor.inverse_transform(y_pis[:,0])

In [None]:
# Map the point prediction and both interval bounds back to the original scale.
# The three inverse transforms run in the same order as before: median, lower, upper.
y_mediam, y_low, y_high = (
    preprocessor.inverse_transform(scaled_values)
    for scaled_values in (y_pred.reshape(-1,1), y_pis[:,0], y_pis[:,1])
)

In [None]:
# Evaluate coverage: check whether the true value really falls inside the reported
# interval in about 80% of the cases, and how wide the intervals are on average.
coverage = regression_coverage_score(y_test, y_low, y_high)
mean_width = regression_mean_width_score(y_low, y_high)

print(
    f"regresion coverage: {coverage}",
    f"interval mean width: {mean_width}",
    sep="\n",
)

In [None]:
import os
import pickle

CHECKPOINTS_DIR = "checkpoints"
# Create the target directory if it is missing; open(..., "wb") would otherwise
# raise FileNotFoundError on a fresh checkout.
os.makedirs(CHECKPOINTS_DIR, exist_ok=True)

# Persist the three fitted objects needed to reproduce predictions later.
with open(os.path.join(CHECKPOINTS_DIR, "preprocessor.pkl"), "wb") as f:
    pickle.dump(preprocessor, f)

with open(os.path.join(CHECKPOINTS_DIR, "filter.pkl"), "wb") as f:
    pickle.dump(filter, f)

with open(os.path.join(CHECKPOINTS_DIR, "model_with_intervals.pkl"), "wb") as f:
    pickle.dump(mapie, f)

In [None]:
def _load_checkpoint(file_name):
    """Unpickle and return one object stored under CHECKPOINTS_DIR."""
    with open(os.path.join(CHECKPOINTS_DIR, file_name), "rb") as checkpoint_file:
        return pickle.load(checkpoint_file)


# Reload the objects written by the save cell.
# pickle can execute arbitrary code on load, so only use this on files you created yourself.
my_preprocessor = _load_checkpoint("preprocessor.pkl")
my_filter = _load_checkpoint("filter.pkl")
model_w_intervals = _load_checkpoint("model_with_intervals.pkl")

In [None]:
# Preprocess the raw test split with the reloaded preprocessor.
# NOTE(review): this rebinds X_processed / y_processed, which earlier held the training data;
# re-running earlier cells after this one would silently use test data instead.
X_processed, y_processed = my_preprocessor.transform(X_test, y_test)

In [None]:

# Apply the reloaded filter to the preprocessed test data.
# NOTE(review): this rebinds X_filtered / y_filtered, the names that `objective` and `mapie.fit`
# read as training data; avoid re-running those cells after this one without re-running the data prep first.
X_filtered, y_filtered = my_filter.transform(X_processed, y_processed)

In [None]:
# Predict with the reloaded model; at this point X_filtered holds the filtered test features.
pred, intervals = model_w_intervals.predict(X_filtered)

In [None]:
# Map the reloaded model's outputs back to the original scale.
# This must read `pred` / `intervals` (produced by the reloaded model), not the `y_pred` / `y_pis`
# left over from the in-memory model; otherwise the save/load round trip is never actually checked.
y_mediam = my_preprocessor.inverse_transform(pred.reshape(-1,1))
y_low = my_preprocessor.inverse_transform(intervals[:,0])
y_high = my_preprocessor.inverse_transform(intervals[:,1])

In [None]:
# Re-evaluate coverage with the current y_low / y_high: check whether the true value falls inside
# the reported interval in about 80% of the cases, and how wide the intervals are on average.
coverage = regression_coverage_score(y_test, y_low, y_high)
mean_width = regression_mean_width_score(y_low, y_high)

print(
    f"regresion coverage: {coverage}",
    f"interval mean width: {mean_width}",
    sep="\n",
)