In [54]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor

from crepes import WrapRegressor
from crepes.extras import DifficultyEstimator, MondrianCategorizer

import numpy as np

# Seed NumPy's global random state once, up front, so that code drawing from
# it gives the same results when the notebook is re-run from a fresh kernel.
SEED = 602211023
np.random.seed(SEED)

### Carga de datos y división _train_ / _test_ / _calibration_

In [67]:
# Download the "house_sales" dataset (version 3) from OpenML.
dataset = fetch_openml(name="house_sales", version=3, parser="auto")

# Plain float arrays for the features and the target.
X = dataset.data.values.astype(float)
y = dataset.target.values.astype(float)

# Min-max scale the target to [0, 1].
# Vectorised: the previous per-element list comprehension re-evaluated
# y.min() and y.max() on every iteration (quadratic in len(y)); this single
# array expression yields the same values in one pass.
y = (y - y.min()) / (y.max() - y.min())

# 70 % of the rows go to training; the remaining 30 % is split in half
# into a test set and a calibration set (15 % each).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

X_test, X_cal, y_test, y_cal = train_test_split(X_test, y_test, test_size=0.5)

# NOTE: `.size` is the total number of elements (rows x columns), not the
# number of samples; use `.shape[0]` if row counts are wanted instead.
print(X_train.size)
print(X_test.size)
print(X_cal.size)

317709
68082
68082


### Entrenar modelo

In [69]:
# Wrap a 500-tree random forest in crepes' WrapRegressor and fit it on the
# training split. The recorded output shows the wrapper is still
# uncalibrated (calibrated=False) at this point.
rf = WrapRegressor(
    RandomForestRegressor(
        n_estimators=500,
        oob_score=True,
        n_jobs=-1,
    )
)
rf.fit(X_train, y_train)

display(rf)


WrapRegressor(learner=RandomForestRegressor(n_estimators=500, n_jobs=-1, oob_score=True), calibrated=False)

### Calibrar modelo

In [70]:
# Calibrate the wrapped forest on the held-out calibration split; the
# recorded repr then reports calibrated=True with a plain ConformalRegressor
# (normalized=False, mondrian=False).
rf.calibrate(X_cal, y_cal)

display(rf)

WrapRegressor(learner=RandomForestRegressor(n_estimators=500, n_jobs=-1, oob_score=True), calibrated=True, predictor=ConformalRegressor(fitted=True, normalized=False, mondrian=False))

### Regresión sobre valores de _test_

In [75]:
# Prediction intervals for the test split at 99 % confidence.
# No y_min / y_max bounds are passed here, so the lower bounds can fall
# below 0 (as the recorded output shows) even though the target was scaled
# to [0, 1].
intervals = rf.predict_int(X_test, confidence=0.99)

display(intervals)

array([[ 0.02076212,  0.14884836],
       [-0.00193508,  0.12615115],
       [-0.010069  ,  0.11801724],
       ...,
       [ 0.01949849,  0.14758473],
       [-0.04973273,  0.07835351],
       [-0.01413094,  0.11395529]])

### Regresión Mondrian (categorías según la dificultad estimada)

In [76]:
# Reuse the random forest already fitted inside `rf`; nothing is retrained.
learner_prop = rf.learner

# Build the three objects this cell works with before fitting any of them.
de_var = DifficultyEstimator()
mc = MondrianCategorizer()
rf_mond = WrapRegressor(learner_prop)

# Difficulty estimate derived from the fitted learner (the recorded repr
# reports type=variance), with scaling enabled.
de_var.fit(
    X=X_train,
    learner=learner_prop,
    scaler=True,
)
display(de_var)

# Bin the calibration objects into 20 Mondrian categories using the
# difficulty estimator.
mc.fit(
    X_cal,
    de=de_var,
    no_bins=20,
)
display(mc)

# Calibrate a second wrapper around the same learner, this time with the
# Mondrian categorizer.
rf_mond.calibrate(X_cal, y_cal, mc=mc)
display(rf_mond)

# No confidence is given, so the library default applies; the intervals are
# bounded to [0, 1], the range of the min-max-scaled target.
intervals_mond = rf_mond.predict_int(
    X_test,
    y_min=0,
    y_max=1,
)
display(intervals_mond)


DifficultyEstimator(fitted=True, type=variance, scaler=True, beta=0.01, oob=False)

MondrianCategorizer(fitted=True, de=DifficultyEstimator(fitted=True, type=variance, scaler=True, beta=0.01, oob=False), no_bins=20)

WrapRegressor(learner=RandomForestRegressor(n_estimators=500, n_jobs=-1, oob_score=True), calibrated=True, predictor=ConformalRegressor(fitted=True, normalized=False, mondrian=True))

array([[0.06585493, 0.10375554],
       [0.03615718, 0.08805889],
       [0.04536713, 0.06258111],
       ...,
       [0.05993744, 0.10714578],
       [0.00829137, 0.02032942],
       [0.02658152, 0.07324283]])