In [1]:
import catboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pgbm_nb
import seaborn as sns
import scipy.stats as stats

from sklearn.model_selection import train_test_split

from src.probabilistic_flow_boosting.pipelines.modeling.utils import setup_random_seed

RANDOM_SEED = 1

setup_random_seed(RANDOM_SEED)

In [2]:
df = pd.read_csv('data/01_raw/CatData/wine_reviews/winemag-data_first150k.csv', index_col=0)
df['country'] = df['country'].fillna('')
df['province'] = df['province'].fillna('')
df = df.dropna(subset = ['price'])

In [3]:
x = df.drop(columns = ['description', 'price', 'designation', 'region_1', 'region_2', 'winery'])
x = pd.get_dummies(x)
y = df[['price']]

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=RANDOM_SEED)
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state=RANDOM_SEED)

In [5]:
x_train.shape, x_test.shape

((109788, 1114), (27447, 1114))

In [6]:
def mseloss_objective(yhat, y, sample_weight=None):
    gradient = (yhat - y)
    hessian = np.ones_like(yhat)
    return gradient, hessian


def rmseloss_metric(yhat, y, sample_weight=None):
    loss = np.sqrt(np.mean(np.square(yhat - y)))
    return loss

In [7]:
model = pgbm_nb.PGBM()

params = {
    'min_split_gain':0,
    'min_data_in_leaf':2,
    'max_leaves':8,
    'max_bin':64,
    'learning_rate':0.1,
    'verbose':2,
    'early_stopping_rounds':200,
    'feature_fraction':1,
    'bagging_fraction':1,
    'seed':RANDOM_SEED,
    'reg_lambda':1,
    'device':'gpu',
    'gpu_device_id':0,
    'derivatives':'exact',
    'distribution':'normal',
    'n_estimators': 2000
}

In [8]:
model.train(
    train_set=(x_train.values, y_train.values), 
    objective=mseloss_objective, 
    metric=rmseloss_metric, 
    valid_set=(x_val.values, y_val.values),
    params=params
)

Estimator 0/2000, Train metric: 35.5933, Validation metric: 35.8299
Estimator 1/2000, Train metric: 34.6437, Validation metric: 34.8117
Estimator 2/2000, Train metric: 33.8359, Validation metric: 34.0112
Estimator 3/2000, Train metric: 33.1471, Validation metric: 33.3208
Estimator 4/2000, Train metric: 32.4774, Validation metric: 32.7104
Estimator 5/2000, Train metric: 31.9447, Validation metric: 32.1193
Estimator 6/2000, Train metric: 31.5099, Validation metric: 31.6324
Estimator 7/2000, Train metric: 31.1139, Validation metric: 31.2217
Estimator 8/2000, Train metric: 30.7894, Validation metric: 30.8906
Estimator 9/2000, Train metric: 30.4739, Validation metric: 30.6510
Estimator 10/2000, Train metric: 30.1860, Validation metric: 30.4219
Estimator 11/2000, Train metric: 29.9003, Validation metric: 29.9455
Estimator 12/2000, Train metric: 29.7013, Validation metric: 29.7166
Estimator 13/2000, Train metric: 29.4922, Validation metric: 29.4775
Estimator 14/2000, Train metric: 29.3376, Va

In [9]:
model.optimize_distribution(x_val.values, y_val.values.reshape(-1))

CRPS: 28.48 (Distribution: normal, Tree correlation: 0.000)
CRPS: 21.50 (Distribution: normal, Tree correlation: 0.010)
CRPS: 16.26 (Distribution: normal, Tree correlation: 0.020)
CRPS: 12.95 (Distribution: normal, Tree correlation: 0.030)
CRPS: 11.14 (Distribution: normal, Tree correlation: 0.040)
CRPS: 10.18 (Distribution: normal, Tree correlation: 0.050)
CRPS: 9.65 (Distribution: normal, Tree correlation: 0.060)
CRPS: 9.36 (Distribution: normal, Tree correlation: 0.070)
CRPS: 9.19 (Distribution: normal, Tree correlation: 0.080)
CRPS: 9.11 (Distribution: normal, Tree correlation: 0.090)
CRPS: 9.07 (Distribution: normal, Tree correlation: 0.100)
CRPS: 9.07 (Distribution: normal, Tree correlation: 0.110)
CRPS: 9.09 (Distribution: normal, Tree correlation: 0.120)
CRPS: 9.12 (Distribution: normal, Tree correlation: 0.130)
CRPS: 9.17 (Distribution: normal, Tree correlation: 0.140)
CRPS: 9.21 (Distribution: normal, Tree correlation: 0.150)
CRPS: 9.26 (Distribution: normal, Tree correlation

('gumbel', 0.1)

In [10]:
y_train_dist = model.predict_dist(x_train.values, n_forecasts=1000)
y_val_dist = model.predict_dist(x_val.values, n_forecasts=1000)
y_test_dist = model.predict_dist(x_test.values, n_forecasts=1000)

In [11]:
print(model.crps_ensemble(y_train_dist, y_train.values.reshape(-1)).mean())
print(model.crps_ensemble(y_val_dist, y_val.values.reshape(-1)).mean())
print(model.crps_ensemble(y_test_dist, y_test.values.reshape(-1)).mean())

9.159219991680366
8.886825367092108
9.240899956844492
