In [1]:
import catboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pgbm_nb
import seaborn as sns
import scipy.stats as stats

from sklearn.model_selection import train_test_split

from src.probabilistic_flow_boosting.pipelines.modeling.utils import setup_random_seed

RANDOM_SEED = 1

setup_random_seed(RANDOM_SEED)

In [2]:
df = pd.read_csv('data/01_raw/CatData/sydney_house/SydneyHousePrices.csv')

In [3]:
x = df.drop(columns = ['Date', 'Id', 'sellPrice'])
x = pd.get_dummies(x)
y = np.log10(df[['sellPrice']])

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=RANDOM_SEED)
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state=RANDOM_SEED)

In [5]:
x_train.shape, x_test.shape

((159603, 697), (39901, 697))

In [6]:
def mseloss_objective(yhat, y, sample_weight=None):
    gradient = (yhat - y)
    hessian = np.ones_like(yhat)
    return gradient, hessian


def rmseloss_metric(yhat, y, sample_weight=None):
    loss = np.sqrt(np.mean(np.square(yhat - y)))
    return loss

In [7]:
model = pgbm_nb.PGBM()

params = {
    'min_split_gain':0,
    'min_data_in_leaf':2,
    'max_leaves':8,
    'max_bin':64,
    'learning_rate':0.1,
    'verbose':2,
    'early_stopping_rounds':200,
    'feature_fraction':1,
    'bagging_fraction':1,
    'seed':RANDOM_SEED,
    'reg_lambda':1,
    'device':'gpu',
    'gpu_device_id':0,
    'derivatives':'exact',
    'distribution':'normal',
    'n_estimators': 2000
}

In [8]:
model.train(
    train_set=(x_train.values, y_train.values), 
    objective=mseloss_objective, 
    metric=rmseloss_metric, 
    valid_set=(x_val.values, y_val.values),
    params=params
)

Estimator 0/2000, Train metric: 0.2415, Validation metric: 0.2413
Estimator 1/2000, Train metric: 0.2330, Validation metric: 0.2329
Estimator 2/2000, Train metric: 0.2258, Validation metric: 0.2257
Estimator 3/2000, Train metric: 0.2196, Validation metric: 0.2196
Estimator 4/2000, Train metric: 0.2143, Validation metric: 0.2143
Estimator 5/2000, Train metric: 0.2096, Validation metric: 0.2097
Estimator 6/2000, Train metric: 0.2058, Validation metric: 0.2059
Estimator 7/2000, Train metric: 0.2024, Validation metric: 0.2025
Estimator 8/2000, Train metric: 0.1995, Validation metric: 0.1996
Estimator 9/2000, Train metric: 0.1970, Validation metric: 0.1972
Estimator 10/2000, Train metric: 0.1949, Validation metric: 0.1951
Estimator 11/2000, Train metric: 0.1930, Validation metric: 0.1932
Estimator 12/2000, Train metric: 0.1913, Validation metric: 0.1916
Estimator 13/2000, Train metric: 0.1899, Validation metric: 0.1901
Estimator 14/2000, Train metric: 0.1885, Validation metric: 0.1888
Estim

In [9]:
model.optimize_distribution(x_val.values, y_val.values.reshape(-1))

CRPS: 0.19 (Distribution: normal, Tree correlation: 0.000)
CRPS: 0.14 (Distribution: normal, Tree correlation: 0.010)
CRPS: 0.11 (Distribution: normal, Tree correlation: 0.020)
CRPS: 0.09 (Distribution: normal, Tree correlation: 0.030)
CRPS: 0.08 (Distribution: normal, Tree correlation: 0.040)
CRPS: 0.08 (Distribution: normal, Tree correlation: 0.050)
CRPS: 0.08 (Distribution: normal, Tree correlation: 0.060)
CRPS: 0.08 (Distribution: normal, Tree correlation: 0.070)
CRPS: 0.08 (Distribution: normal, Tree correlation: 0.080)
CRPS: 0.08 (Distribution: normal, Tree correlation: 0.090)
CRPS: 0.08 (Distribution: normal, Tree correlation: 0.100)
CRPS: 0.08 (Distribution: normal, Tree correlation: 0.110)
CRPS: 0.08 (Distribution: normal, Tree correlation: 0.120)
CRPS: 0.08 (Distribution: normal, Tree correlation: 0.130)
CRPS: 0.08 (Distribution: normal, Tree correlation: 0.140)
CRPS: 0.08 (Distribution: normal, Tree correlation: 0.150)
CRPS: 0.09 (Distribution: normal, Tree correlation: 0.16

('logistic', 0.06)

In [10]:
y_train_dist = model.predict_dist(x_train.values, n_forecasts=1000)
y_val_dist = model.predict_dist(x_val.values, n_forecasts=1000)
y_test_dist = model.predict_dist(x_test.values, n_forecasts=1000)

In [11]:
print(model.crps_ensemble(y_train_dist, y_train.values.reshape(-1)).mean())
print(model.crps_ensemble(y_val_dist, y_val.values.reshape(-1)).mean())
print(model.crps_ensemble(y_test_dist, y_test.values.reshape(-1)).mean())

0.07620389661486603
0.0762696452176025
0.07580891578187751
