In [54]:
import os
import pandas as pd
import numpy as np
import setuptools
import openml
from sklearn.linear_model import LinearRegression 
import lightgbm as lgbm
import lightgbmlss
import optuna
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process.kernels import Matern
from engression import engression, engression_bagged
import torch
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import mahalanobis
from scipy.stats import norm
from sklearn.metrics import mean_squared_error
from rtdl_revisiting_models import MLP, ResNet, FTTransformer
from properscoring import crps_gaussian, crps_ensemble
import random
import gpytorch
import tqdm.auto as tqdm
from lightgbmlss.model import *
from lightgbmlss.distributions.Gaussian import *
from drf import drf
from pygam import LinearGAM, s, f
from utils import EarlyStopping, train, train_trans, train_no_early_stopping, train_trans_no_early_stopping, train_GP, ExactGPModel
from torch.utils.data import TensorDataset, DataLoader

# OpenML benchmark suite used by this notebook. Other suite ids, kept for reference:
#   337 - Classification on numerical features
#   335 - Regression on numerical and categorical features
#   334 - Classification on numerical and categorical features
SUITE_ID = 336  # Regression on numerical features
benchmark_suite = openml.study.get_suite(SUITE_ID)

# OpenML task that the following cells download and evaluate.
task_id = 361072

    

In [55]:
# Create the checkpoint directory if it doesn't exist
os.makedirs('CHECKPOINTS/CLUSTERING', exist_ok=True)
CHECKPOINT_PATH = f'CHECKPOINTS/CLUSTERING/task_{task_id}.pt'

print(f"Task {task_id}")

# Download the OpenML task and its dataset, then separate features from the target.
task = openml.tasks.get_task(task_id)
dataset = task.get_dataset()

X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="dataframe", target=dataset.default_target_attribute)

# Run configuration
N_TRIALS = 2          # Optuna trials per model family
N_SAMPLES = 100       # quantile levels / samples drawn per predictive distribution
PATIENCE = 40         # not used in the cells shown here; presumably early-stopping patience - TODO confirm
N_EPOCHS = 1000       # not used in the cells shown here; presumably a training-epoch cap - TODO confirm
GP_ITERATIONS = 1000  # not used in the cells shown here; presumably GP optimiser steps - TODO confirm
BATCH_SIZE = 1024     # mini-batch size for the DataLoaders and for engression
seed = 10

# Seed every random number generator used by the notebook (each one exactly once).
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)


# Cluster-based hold-out splitting
N_CLUSTERS = 20


def cluster_holdout_split(features, n_clusters=N_CLUSTERS, n_parts=5, random_state=0):
    """Split the rows of ``features`` into a "close" and a "far" group of clusters.

    The rows are standardised and grouped with K-means. Each cluster is then
    ranked by the Mahalanobis distance between its centroid (mean of the
    *unscaled* rows) and the overall mean of ``features``. Starting from the
    most distant cluster, clusters are added to the "far" group until the
    cumulative row count is as close as possible to ``len(features) / n_parts``.

    Parameters
    ----------
    features : pandas.DataFrame
        Numeric feature matrix; its index labels are returned.
    n_clusters : int
        Number of K-means clusters.
    n_parts : int
        The far group targets ``1 / n_parts`` of the rows (5 -> about 20 %).
    random_state : int
        Seed forwarded to ``KMeans``.

    Returns
    -------
    close_index, far_index : pandas.Index
        Index labels of the rows in the close / far group.
    summary : pandas.DataFrame
        One row per cluster, sorted by decreasing distance, with the columns
        ``mahalanobis_dist``, ``count``, ``cumulative_count`` and ``abs_diff``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the covariance matrix of ``features`` is singular.
    """
    target_size = len(features) / n_parts

    overall_mean = np.mean(features, axis=0)
    # The inverse covariance is the same for every cluster, so invert it once.
    inv_cov = np.linalg.inv(np.cov(features.T))

    # K-means runs on standardised features; distances use the raw features.
    scaled = StandardScaler().fit_transform(features)
    cluster_ids = KMeans(n_clusters=n_clusters, random_state=random_state, n_init="auto").fit(scaled).labels_

    cluster_sizes = np.bincount(cluster_ids, minlength=n_clusters)
    centroid_dist = [
        mahalanobis(np.mean(features.loc[cluster_ids == k, :], axis=0), overall_mean, inv_cov)
        for k in range(n_clusters)
    ]

    summary = pd.DataFrame(
        data={'mahalanobis_dist': centroid_dist, 'count': cluster_sizes},
        index=np.arange(n_clusters),
    ).sort_values('mahalanobis_dist', ascending=False)
    summary['cumulative_count'] = summary['count'].cumsum()
    summary['abs_diff'] = np.abs(summary['cumulative_count'] - target_size)

    # First position where the cumulative size is closest to the target.
    cutoff = int(np.argmin(summary['abs_diff'].to_numpy()))
    far_clusters = summary.index[:cutoff + 1].to_list()

    is_far = np.isin(cluster_ids, far_clusters)
    return features.index[~is_far], features.index[is_far], summary


# Outer split: the clusters farthest from the data mean form the test set.
close_index, far_index, dist_df = cluster_holdout_split(X)
X_train, X_test = X.loc[close_index, :], X.loc[far_index, :]
y_train, y_test = y.loc[close_index], y.loc[far_index]

# Inner split: repeat the procedure on the training rows to obtain a validation set.
# NOTE(review): intermediate globals of the previous copy-pasted version
# (kmeans, scaler, mean, cov, labels, ... and their `_` twins) are no longer created.
close_index_, far_index_, dist_df_ = cluster_holdout_split(X_train)
X_train_, X_val = X_train.loc[close_index_, :], X_train.loc[far_index_, :]
y_train_, y_val = y_train.loc[close_index_], y_train.loc[far_index_]


# Turn every split into float32 tensors, placed on the GPU when one is available.
def _to_float_tensor(pandas_obj):
    """Return ``pandas_obj.values`` as a float32 tensor, moved to CUDA if available."""
    tensor = torch.tensor(pandas_obj.values, dtype=torch.float32)
    if torch.cuda.is_available():
        tensor = tensor.cuda()
    return tensor


X_train__tensor, y_train__tensor = _to_float_tensor(X_train_), _to_float_tensor(y_train_)
X_train_tensor, y_train_tensor = _to_float_tensor(X_train), _to_float_tensor(y_train)
X_val_tensor, y_val_tensor = _to_float_tensor(X_val), _to_float_tensor(y_val)
X_test_tensor, y_test_tensor = _to_float_tensor(X_test), _to_float_tensor(y_test)

# Flat NumPy copies of the targets, used when scoring predictions.
y_val_np = y_val.values.flatten()
y_test_np = y_test.values.flatten()

# One TensorDataset per split.
train__dataset = TensorDataset(X_train__tensor, y_train__tensor)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Only the two training loaders shuffle; validation and test keep row order.
train__loader = DataLoader(train__dataset, shuffle=True, batch_size=BATCH_SIZE)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

Task 361072


Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.
Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.


In [56]:
# Training data shared by every Optuna trial of the boosted-tree search.
# `lgbm` is the alias this notebook imports LightGBM under; the previous `lgb`
# only existed as a side effect of `from lightgbmlss.model import *`.
dtrain_ = lgbm.Dataset(torch.tensor(X_train_.values, dtype=torch.float32), label=y_train_.values)


def boosted(trial):
    """Optuna objective: mean validation CRPS of a Gaussian LightGBMLSS model.

    Trains on ``dtrain_`` with the hyper-parameters suggested by ``trial``,
    predicts the Gaussian ``loc``/``scale`` for ``X_val`` and scores them
    against ``y_val_np``. Returns the mean CRPS (lower is better).
    """
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.5, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 30),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
    }
    # The number of boosting rounds is passed to train() separately.
    n_rounds = params.pop("n_estimators")
    # Needed because the same Dataset is reused while min_child_samples changes.
    params['feature_pre_filter'] = False

    # Distributional gradient boosting with a Gaussian response (LightGBMLSS).
    boosted_tree_model = LightGBMLSS(Gaussian(stabilization="None", response_fn="exp", loss_fn="nll"))
    boosted_tree_model.train(params, dtrain_, num_boost_round=n_rounds)

    # Predicted mean (loc) and standard deviation (scale) per validation row.
    pred_params = boosted_tree_model.predict(X_val, pred_type="parameters")

    # crps_gaussian broadcasts over arrays, so score all rows in one call.
    crps_values = crps_gaussian(
        y_val_np,
        mu=np.asarray(pred_params['loc']),
        sig=np.asarray(pred_params['scale']),
    )
    return np.mean(crps_values)


sampler_boost = optuna.samplers.TPESampler(seed=seed)
study_boost = optuna.create_study(sampler=sampler_boost, direction='minimize')
study_boost.optimize(boosted, n_trials=N_TRIALS)

# Quantile levels drawn uniformly at random (after re-seeding NumPy); the
# predicted quantiles at these levels are scored as an ensemble.
np.random.seed(seed)
quantiles = list(np.random.uniform(0, 1, N_SAMPLES))


def rf(trial):
    """Optuna objective: mean validation CRPS of a distributional random forest.

    Fits ``drf`` on ``X_train_``/``y_train_`` with the hyper-parameters
    suggested by ``trial``, predicts the quantiles listed in ``quantiles`` for
    ``X_val`` and scores them against ``y_val_np``. Returns the mean CRPS.
    """
    params = {
        'num_trees': trial.suggest_int('num_trees', 100, 500),
        'mtry': trial.suggest_int('mtry', 1, 30),
        'min_node_size': trial.suggest_int('min_node_size', 10, 100),
    }

    drf_model = drf(**params, seed=seed)
    drf_model.fit(X_train_, y_train_)

    # Predicted quantiles for every validation row.
    y_val_hat = drf_model.predict(newdata=X_val, functional="quantile", quantiles=quantiles)

    # One ensemble per row: flatten everything after the row axis, then score
    # all rows in a single vectorised crps_ensemble call.
    n_val = len(y_val_np)
    ensembles = np.asarray(y_val_hat.quantile)[:n_val].reshape(n_val, -1)
    crps_values = crps_ensemble(y_val_np, ensembles)

    return np.mean(crps_values)


sampler_drf = optuna.samplers.TPESampler(seed=seed)
study_drf = optuna.create_study(sampler=sampler_drf, direction='minimize')
study_drf.optimize(rf, n_trials=N_TRIALS)


def engressor_NN(trial):
    """Optuna objective: mean validation CRPS of an engression network.

    Trains engression on ``X_train__tensor``/``y_train__tensor`` with the
    hyper-parameters suggested by ``trial``, draws ``N_SAMPLES`` samples per
    row of ``X_val`` and scores them against ``y_val_np``. Returns the mean CRPS.
    """
    params = {'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.01, log=True),
            'num_epoches': trial.suggest_int('num_epoches', 100, 1000),
            'num_layer': trial.suggest_int('num_layer', 2, 5),
            'hidden_dim': trial.suggest_int('hidden_dim', 100, 500),
            'resblock': trial.suggest_categorical('resblock', [True, False])}
    params['noise_dim'] = params['hidden_dim']

    fit_kwargs = dict(
        lr=params['learning_rate'],
        num_epoches=params['num_epoches'],
        num_layer=params['num_layer'],
        hidden_dim=params['hidden_dim'],
        noise_dim=params['noise_dim'],
        batch_size=BATCH_SIZE,
        resblock=params['resblock'],
    )
    # Only request the GPU when one exists; otherwise keep engression's default device.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        fit_kwargs['device'] = "cuda"
    engressor_model = engression(X_train__tensor, y_train__tensor.reshape(-1, 1), **fit_kwargs)

    # Sample row by row, as before, so the order of random draws is unchanged.
    val_rows = X_val.values
    crps_values = []
    for i in range(len(y_val_np)):
        x_row = torch.Tensor(np.array([val_rows[i]]))
        if use_cuda:
            # Keep the input on the same device as the model (as the test-set code does).
            x_row = x_row.cuda()
        draws = engressor_model.sample(x_row, sample_size=N_SAMPLES)
        # Bring samples back to the CPU first; np.array() fails on CUDA tensors.
        draws_np = torch.as_tensor(draws).detach().cpu().numpy().reshape(-1,)
        crps_values.append(crps_ensemble(y_val_np[i], draws_np))

    return np.mean(crps_values)


sampler_engression = optuna.samplers.TPESampler(seed=seed)
study_engression = optuna.create_study(sampler=sampler_engression, direction='minimize')
study_engression.optimize(engressor_NN, n_trials=N_TRIALS)


# Final evaluation: refit each model on the full training split with its best
# hyper-parameters and report the mean CRPS on the held-out test split.

# Boosted trees (Gaussian LightGBMLSS)
# `lgbm` is the alias this notebook imports LightGBM under; the previous `lgb`
# only existed as a side effect of `from lightgbmlss.model import *`.
dtrain = lgbm.Dataset(torch.tensor(X_train.values, dtype=torch.float32), label=y_train.values)
opt_params = study_boost.best_params.copy()
n_rounds = opt_params.pop("n_estimators")  # passed to train() separately
opt_params['feature_pre_filter'] = False
boosted_tree_model = LightGBMLSS(Gaussian(stabilization="None", response_fn="exp", loss_fn="nll"))
boosted_tree_model.train(opt_params, dtrain, num_boost_round=n_rounds)
# Predicted mean (loc) and standard deviation (scale) per test row.
pred_params = boosted_tree_model.predict(X_test, pred_type="parameters")
y_test_hat_boost = pred_params['loc']
y_test_hat_std = pred_params['scale']
# crps_gaussian broadcasts over arrays, so score all rows in one call.
crps_values = crps_gaussian(y_test_np, mu=np.asarray(y_test_hat_boost), sig=np.asarray(y_test_hat_std))
CRPS_boosted = np.mean(crps_values)

# Distributional random forest
drf_model = drf(**study_drf.best_params, seed=seed)
drf_model.fit(X_train, y_train)
y_test_hat_drf = drf_model.predict(newdata=X_test, functional="quantile", quantiles=quantiles)
# One ensemble of predicted quantiles per test row, scored in a single call.
n_test = len(y_test_np)
crps_values = crps_ensemble(y_test_np, np.asarray(y_test_hat_drf.quantile)[:n_test].reshape(n_test, -1))
CRPS_rf = np.mean(crps_values)

# Linear regression with a homoscedastic Gaussian predictive distribution
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_test_hat_linreg = lin_reg.predict(X_test)
# Estimate the predictive spread from TRAINING residuals. The previous version
# used the test residuals, which leaks test labels into the forecast.
std_dev = np.std(y_train - lin_reg.predict(X_train))
crps_values = crps_gaussian(y_test_np, mu=y_test_hat_linreg, sig=std_dev)
CRPS_linreg = np.mean(crps_values)

# Engression
params = study_engression.best_params
params['noise_dim'] = params['hidden_dim']
# NOTE(review): this rebinds the tensors created in the data-preparation cell
# (CPU tensors, y with shape (n, 1)); kept so later cells see the same objects as before.
X_train_tensor = torch.Tensor(np.array(X_train))
y_train_tensor = torch.Tensor(np.array(y_train).reshape(-1,1))

fit_kwargs = dict(
    lr=params['learning_rate'],
    num_epoches=params['num_epoches'],
    num_layer=params['num_layer'],
    hidden_dim=params['hidden_dim'],
    noise_dim=params['noise_dim'],
    batch_size=BATCH_SIZE,
    resblock=params['resblock'],
)
# Only request the GPU when one exists; otherwise keep engression's default device.
use_cuda = torch.cuda.is_available()
if use_cuda:
    fit_kwargs['device'] = "cuda"
engressor_model = engression(X_train_tensor, y_train_tensor.reshape(-1, 1), **fit_kwargs)

# Draw N_SAMPLES samples per test row, one row at a time.
test_rows = X_test.values
y_test_hat_engression_samples = []
for i in range(len(test_rows)):
    x_row = torch.Tensor(np.array([test_rows[i]]))
    if use_cuda:
        x_row = x_row.cuda()
    y_test_hat_engression_samples.append(engressor_model.sample(x_row, sample_size=N_SAMPLES))
# Bring samples back to the CPU before scoring; np.array() fails on CUDA tensors.
crps_values = [
    crps_ensemble(y_test_np[i], torch.as_tensor(draws).detach().cpu().numpy().reshape(-1,))
    for i, draws in enumerate(y_test_hat_engression_samples)
]
CRPS_engression = np.mean(crps_values)

print("CRPS linear regression: ",CRPS_linreg)
print("CRPS boosted trees", CRPS_boosted)
print("CRPS random forest", CRPS_rf)
print("CRPS engression", CRPS_engression)

[I 2024-03-04 19:02:45,213] A new study created in memory with name: no-name-356d681b-1170-40ba-b0c6-b3f0b278d480
[I 2024-03-04 19:02:52,404] Trial 0 finished with value: 6.937608925970823 and parameters: {'learning_rate': 0.0713003929222653, 'n_estimators': 108, 'reg_lambda': 0.005044685709888605, 'max_depth': 23, 'min_child_samples': 55}. Best is trial 0 with value: 6.937608925970823.
[I 2024-03-04 19:02:55,046] Trial 1 finished with value: 12.527109038809398 and parameters: {'learning_rate': 0.0006784471913345375, 'n_estimators': 179, 'reg_lambda': 0.0699481785242808, 'max_depth': 6, 'min_child_samples': 18}. Best is trial 0 with value: 6.937608925970823.
[I 2024-03-04 19:02:55,049] A new study created in memory with name: no-name-ea90304b-f111-4642-bd43-36333a33cebc


[0.771320643266746, 0.0207519493594015, 0.6336482349262754, 0.7488038825386119, 0.4985070123025904, 0.22479664553084766, 0.19806286475962398, 0.7605307121989587, 0.16911083656253545, 0.08833981417401027, 0.6853598183677972, 0.9533933461949365, 0.003948266327914451, 0.5121922633857766, 0.8126209616521135, 0.6125260668293881, 0.7217553174317995, 0.29187606817063316, 0.9177741225129434, 0.7145757833976906, 0.5425443680112613, 0.14217004760152696, 0.3733407600514692, 0.6741336150663453, 0.4418331744229961, 0.4340139933332937, 0.6177669784693172, 0.5131382425543909, 0.6503971819314672, 0.6010389534045444, 0.8052231968327465, 0.5216471523936341, 0.9086488808086682, 0.3192360889885453, 0.09045934927090737, 0.30070005663620336, 0.11398436186354977, 0.8286813263076767, 0.04689631938924976, 0.6262871483113925, 0.5475861559192435, 0.8192869956700687, 0.1989475396788123, 0.8568503024577332, 0.3516526394320879, 0.7546476915298572, 0.2959617068796787, 0.8839364795611863, 0.3255116378322488, 0.165015

[I 2024-03-04 19:03:00,524] Trial 0 finished with value: 9.07003089184482 and parameters: {'num_trees': 409, 'mtry': 1, 'min_node_size': 67}. Best is trial 0 with value: 9.07003089184482.


[0.771320643266746, 0.0207519493594015, 0.6336482349262754, 0.7488038825386119, 0.4985070123025904, 0.22479664553084766, 0.19806286475962398, 0.7605307121989587, 0.16911083656253545, 0.08833981417401027, 0.6853598183677972, 0.9533933461949365, 0.003948266327914451, 0.5121922633857766, 0.8126209616521135, 0.6125260668293881, 0.7217553174317995, 0.29187606817063316, 0.9177741225129434, 0.7145757833976906, 0.5425443680112613, 0.14217004760152696, 0.3733407600514692, 0.6741336150663453, 0.4418331744229961, 0.4340139933332937, 0.6177669784693172, 0.5131382425543909, 0.6503971819314672, 0.6010389534045444, 0.8052231968327465, 0.5216471523936341, 0.9086488808086682, 0.3192360889885453, 0.09045934927090737, 0.30070005663620336, 0.11398436186354977, 0.8286813263076767, 0.04689631938924976, 0.6262871483113925, 0.5475861559192435, 0.8192869956700687, 0.1989475396788123, 0.8568503024577332, 0.3516526394320879, 0.7546476915298572, 0.2959617068796787, 0.8839364795611863, 0.3255116378322488, 0.165015

[I 2024-03-04 19:03:08,247] Trial 1 finished with value: 7.275542030588492 and parameters: {'num_trees': 400, 'mtry': 15, 'min_node_size': 30}. Best is trial 1 with value: 7.275542030588492.
[I 2024-03-04 19:03:08,250] A new study created in memory with name: no-name-7a4ea8ea-e886-40fc-89d9-9a08012593dc


Running on CPU.

Residual blocks (skip-connections) are typically recommended for more than 2 layers; turn it on by setting resblock=True.
Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1024.
[Epoch 1 (0%), batch 6] energy-loss: 0.4376,  E(|Y-Yhat|): 1.0720,  E(|Yhat-Yhat'|): 1.2687
[Epoch 100 (84%), batch 6] energy-loss: 0.2423,  E(|Y-Yhat|): 0.4711,  E(|Yhat-Yhat'|): 0.4575

Training loss on the original (non-standardized) scale:
	Energy-loss: 1.1010,  E(|Y-Yhat|): 2.3519,  E(|Yhat-Yhat'|): 2.5017

Prediction-loss E(|Y-Yhat|) and variance-loss E(|Yhat-Yhat'|) should ideally be equally large
-- consider training for more epochs or adjusting hyperparameters if there is a mismatch 


[I 2024-03-04 19:04:49,599] Trial 0 finished with value: 3.041407315526319 and parameters: {'learning_rate': 0.0034885205571560775, 'num_epoches': 118, 'num_layer': 4, 'hidden_dim': 400, 'resblock': True}. Best is trial 0 with value: 3.041407315526319.


Running on CPU.

Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1024.
[Epoch 1 (0%), batch 6] energy-loss: 0.6360,  E(|Y-Yhat|): 0.8290,  E(|Yhat-Yhat'|): 0.3860
[Epoch 100 (13%), batch 6] energy-loss: 0.2307,  E(|Y-Yhat|): 0.4422,  E(|Yhat-Yhat'|): 0.4230
[Epoch 200 (25%), batch 6] energy-loss: 0.2261,  E(|Y-Yhat|): 0.4416,  E(|Yhat-Yhat'|): 0.4311
[Epoch 300 (38%), batch 6] energy-loss: 0.2274,  E(|Y-Yhat|): 0.4418,  E(|Yhat-Yhat'|): 0.4289
[Epoch 400 (51%), batch 6] energy-loss: 0.2288,  E(|Y-Yhat|): 0.4356,  E(|Yhat-Yhat'|): 0.4136
[Epoch 500 (64%), batch 6] energy-loss: 0.2395,  E(|Y-Yhat|): 0.4487,  E(|Yhat-Yhat'|): 0.4183
[Epoch 600 (76%), batch 6] energy-loss: 0.2258,  E(|Y-Yhat|): 0.4368,  E(|Yhat-Yhat'|): 0.4220
[Epoch 700 (89%), batch 6] energy-loss: 0.

[I 2024-03-04 19:06:17,801] Trial 1 finished with value: 4.233781359117593 and parameters: {'learning_rate': 0.0002489577954043506, 'num_epoches': 785, 'num_layer': 2, 'hidden_dim': 135, 'resblock': False}. Best is trial 0 with value: 3.041407315526319.


[0.771320643266746, 0.0207519493594015, 0.6336482349262754, 0.7488038825386119, 0.4985070123025904, 0.22479664553084766, 0.19806286475962398, 0.7605307121989587, 0.16911083656253545, 0.08833981417401027, 0.6853598183677972, 0.9533933461949365, 0.003948266327914451, 0.5121922633857766, 0.8126209616521135, 0.6125260668293881, 0.7217553174317995, 0.29187606817063316, 0.9177741225129434, 0.7145757833976906, 0.5425443680112613, 0.14217004760152696, 0.3733407600514692, 0.6741336150663453, 0.4418331744229961, 0.4340139933332937, 0.6177669784693172, 0.5131382425543909, 0.6503971819314672, 0.6010389534045444, 0.8052231968327465, 0.5216471523936341, 0.9086488808086682, 0.3192360889885453, 0.09045934927090737, 0.30070005663620336, 0.11398436186354977, 0.8286813263076767, 0.04689631938924976, 0.6262871483113925, 0.5475861559192435, 0.8192869956700687, 0.1989475396788123, 0.8568503024577332, 0.3516526394320879, 0.7546476915298572, 0.2959617068796787, 0.8839364795611863, 0.3255116378322488, 0.165015

In [57]:
# NOTE(review): this cell repeats the boosted-tree tuning stage of the previous
# cell; running it redefines `boosted` and recreates `study_boost`.
# Training data shared by every Optuna trial of the boosted-tree search.
# `lgbm` is the alias this notebook imports LightGBM under; the previous `lgb`
# only existed as a side effect of `from lightgbmlss.model import *`.
dtrain_ = lgbm.Dataset(torch.tensor(X_train_.values, dtype=torch.float32), label=y_train_.values)


def boosted(trial):
    """Optuna objective: mean validation CRPS of a Gaussian LightGBMLSS model.

    Trains on ``dtrain_`` with the hyper-parameters suggested by ``trial``,
    predicts the Gaussian ``loc``/``scale`` for ``X_val`` and scores them
    against ``y_val_np``. Returns the mean CRPS (lower is better).
    """
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.5, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 30),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
    }
    # The number of boosting rounds is passed to train() separately.
    n_rounds = params.pop("n_estimators")
    # Needed because the same Dataset is reused while min_child_samples changes.
    params['feature_pre_filter'] = False

    # Distributional gradient boosting with a Gaussian response (LightGBMLSS).
    boosted_tree_model = LightGBMLSS(Gaussian(stabilization="None", response_fn="exp", loss_fn="nll"))
    boosted_tree_model.train(params, dtrain_, num_boost_round=n_rounds)

    # Predicted mean (loc) and standard deviation (scale) per validation row.
    pred_params = boosted_tree_model.predict(X_val, pred_type="parameters")

    # crps_gaussian broadcasts over arrays, so score all rows in one call.
    crps_values = crps_gaussian(
        y_val_np,
        mu=np.asarray(pred_params['loc']),
        sig=np.asarray(pred_params['scale']),
    )
    return np.mean(crps_values)


sampler_boost = optuna.samplers.TPESampler(seed=seed)
study_boost = optuna.create_study(sampler=sampler_boost, direction='minimize')
study_boost.optimize(boosted, n_trials=N_TRIALS)

# Quantile levels drawn uniformly at random (after re-seeding NumPy); the
# predicted quantiles at these levels are scored as an ensemble.
np.random.seed(seed)
quantiles = list(np.random.uniform(0, 1, N_SAMPLES))


def rf(trial):
    """Optuna objective: mean validation CRPS of a distributional random forest.

    Fits ``drf`` on ``X_train_``/``y_train_`` with the hyper-parameters
    suggested by ``trial``, predicts the quantiles listed in ``quantiles`` for
    ``X_val`` and scores them against ``y_val_np``. Returns the mean CRPS.
    """
    params = {
        'num_trees': trial.suggest_int('num_trees', 100, 500),
        'mtry': trial.suggest_int('mtry', 1, 30),
        'min_node_size': trial.suggest_int('min_node_size', 10, 100),
    }

    drf_model = drf(**params, seed=seed)
    drf_model.fit(X_train_, y_train_)

    # Predicted quantiles for every validation row.
    y_val_hat = drf_model.predict(newdata=X_val, functional="quantile", quantiles=quantiles)

    # One ensemble per row: flatten everything after the row axis, then score
    # all rows in a single vectorised crps_ensemble call.
    n_val = len(y_val_np)
    ensembles = np.asarray(y_val_hat.quantile)[:n_val].reshape(n_val, -1)
    crps_values = crps_ensemble(y_val_np, ensembles)

    return np.mean(crps_values)


sampler_drf = optuna.samplers.TPESampler(seed=seed)
study_drf = optuna.create_study(sampler=sampler_drf, direction='minimize')
study_drf.optimize(rf, n_trials=N_TRIALS)


def engressor_NN(trial):
    """Optuna objective for the engression neural network.

    Draws the learning rate, number of epochs, depth, hidden width and the
    residual-block flag from `trial` (the noise dimension is tied to the hidden
    width), trains an engression model on the inner training tensors
    (`X_train__tensor`, `y_train__tensor`) and draws N_SAMPLES predictive
    samples for every validation row.

    Returns:
        The mean CRPS over the validation rows, scoring each row's samples as
        an ensemble against `y_val_np` (lower is better).
    """
    params = {'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.01, log=True),
            'num_epoches': trial.suggest_int('num_epoches', 100, 1000),
            'num_layer': trial.suggest_int('num_layer', 2, 5),
            'hidden_dim': trial.suggest_int('hidden_dim', 100, 500),
            'resblock': trial.suggest_categorical('resblock', [True, False])}
    params['noise_dim']=params['hidden_dim']

    # Decide the device once. `device` is only passed on GPU so that CPU runs
    # keep whatever default engression uses, exactly as before.
    use_cuda = torch.cuda.is_available()
    device_kwargs = {'device': "cuda"} if use_cuda else {}
    engressor_model = engression(
        X_train__tensor,
        y_train__tensor.reshape(-1,1),
        lr=params['learning_rate'],
        num_epoches=params['num_epoches'],
        num_layer=params['num_layer'],
        hidden_dim=params['hidden_dim'],
        noise_dim=params['noise_dim'],
        batch_size=BATCH_SIZE,
        resblock=params['resblock'],
        **device_kwargs,
    )

    def _val_row(i):
        # Single validation row as a (1, n_features) tensor, placed on the same
        # device as the model (matches how the test rows are fed later on).
        row = torch.Tensor(np.array([X_val_values[i]]))
        return row.cuda() if use_cuda else row

    def _as_numpy(samples):
        # NumPy cannot read CUDA (or grad-tracking) tensors directly, so bring
        # tensors back to the host first; anything else is converted as-is.
        if torch.is_tensor(samples):
            samples = samples.detach().cpu()
        return np.array(samples)

    # `.values` is loop-invariant; compute it once instead of once per row.
    X_val_values = X_val.values

    # Generate a sample from the engression model for each data point
    # (one call per row, in row order).
    y_val_hat_engression_samples = [
        engressor_model.sample(_val_row(i), sample_size=N_SAMPLES)
        for i in range(len(X_val))
    ]

    # Calculate the CRPS for each prediction
    crps_values = [
        crps_ensemble(y_val_np[i], _as_numpy(y_val_hat_engression_samples[i]).reshape(-1,))
        for i in range(len(y_val_np))
    ]

    return np.mean(crps_values)

# Hyperparameter search for engression: seeded TPE sampler, N_TRIALS
# evaluations of the `engressor_NN` objective, minimising its returned mean
# validation CRPS.
sampler_engression = optuna.samplers.TPESampler(seed=seed)
study_engression = optuna.create_study(
    direction='minimize',
    sampler=sampler_engression,
)
study_engression.optimize(engressor_NN, n_trials=N_TRIALS)


# --- Boosted trees (LightGBMLSS, Gaussian): refit with the best tuned
# hyperparameters on the full training set and score on the test set. ---
# NOTE(review): `lgb` is not imported by name in the notebook's import cell
# (only `lightgbm as lgbm` is); presumably it arrives through
# `from lightgbmlss.model import *` — confirm.
dtrain = lgb.Dataset(
    torch.tensor(X_train.values, dtype=torch.float32).clone().detach(),
    label=y_train.values,
)
opt_params = study_boost.best_params.copy()
# `n_estimators` is handed to train() as the number of boosting rounds, so it
# is taken out of the parameter dict.
n_rounds = opt_params.pop("n_estimators")
opt_params['feature_pre_filter'] = False
# Distributional model with a Gaussian response, fitted with the "nll" loss.
boosted_tree_model = LightGBMLSS(Gaussian(stabilization="None", response_fn="exp", loss_fn="nll"))
boosted_tree_model.train(opt_params, dtrain, num_boost_round=n_rounds)
# Per-row distribution parameters: location (`loc`) and scale (`scale`).
pred_params = boosted_tree_model.predict(X_test, pred_type="parameters")
y_test_hat_boost = pred_params['loc']
y_test_hat_std = pred_params['scale']
# Closed-form Gaussian CRPS for every test row, averaged into the test score.
crps_values = [
    crps_gaussian(y_test_np[i], mu=y_test_hat_boost[i], sig=y_test_hat_std[i])
    for i in range(len(y_test))
]
CRPS_boosted = np.mean(crps_values)

# --- Distributional random forest: refit with the best tuned hyperparameters
# on the full training set and score on the test set. ---
drf_model = drf(**study_drf.best_params, seed=seed)
drf_model.fit(X_train, y_train)
# Conditional quantiles of every test row at the pre-drawn `quantiles` levels.
y_test_hat_drf = drf_model.predict(
    newdata=X_test,
    functional="quantile",
    quantiles=quantiles,
)
# Score each row's quantiles (flattened to 1-D) as an ensemble, then average.
crps_values = [
    crps_ensemble(y_test_np[i], y_test_hat_drf.quantile[i].reshape(-1))
    for i in range(len(y_test_np))
]
CRPS_rf = np.mean(crps_values)

# --- Linear-regression baseline: Gaussian predictive distribution centred on
# the OLS prediction, with one shared standard deviation for all rows. ---
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_test_hat_linreg = lin_reg.predict(X_test)
# Estimate the predictive standard deviation from the TRAINING residuals.
# Using the test residuals here would tune the baseline's predictive
# distribution on the very labels it is scored against (test-set leakage) and
# make its CRPS incomparable with the other models, which never see y_test.
train_residuals = np.asarray(y_train) - lin_reg.predict(X_train)
std_dev = np.std(train_residuals)
# Closed-form Gaussian CRPS for every test row, averaged into the test score.
crps_values = [
    crps_gaussian(y_test_np[i], mu=y_test_hat_linreg[i], sig=std_dev)
    for i in range(len(y_test_np))
]
CRPS_linreg = np.mean(crps_values)

# --- Engression: refit with the best tuned hyperparameters on the full
# training set and score on the test set. ---
# Work on a copy so the added key never touches the study's own record
# (same pattern as the boosted-tree refit).
params = study_engression.best_params.copy()
params['noise_dim'] = params['hidden_dim']  # noise dimension is tied to the hidden width, as during tuning
X_train_tensor = torch.Tensor(np.array(X_train))
y_train_tensor = torch.Tensor(np.array(y_train).reshape(-1,1))  # already a column; no second reshape needed

# Decide the device once. `device` is only passed on GPU so that CPU runs keep
# whatever default engression uses, exactly as before.
_engression_on_cuda = torch.cuda.is_available()
_engression_device_kwargs = {'device': "cuda"} if _engression_on_cuda else {}
engressor_model = engression(
    X_train_tensor,
    y_train_tensor,
    lr=params['learning_rate'],
    num_epoches=params['num_epoches'],
    num_layer=params['num_layer'],
    hidden_dim=params['hidden_dim'],
    noise_dim=params['noise_dim'],
    batch_size=BATCH_SIZE,
    resblock=params['resblock'],
    **_engression_device_kwargs,
)

# `.values` is loop-invariant; compute it once instead of once per row.
_X_test_values = X_test.values
# Generate a sample from the engression model for each data point: one call per
# test row, with the (1, n_features) input placed on the model's device.
y_test_hat_engression_samples = [
    engressor_model.sample(
        torch.Tensor(np.array([_X_test_values[i]])).to("cuda" if _engression_on_cuda else "cpu"),
        sample_size=N_SAMPLES,
    )
    for i in range(len(X_test))
]
# Calculate the CRPS for each prediction. NumPy cannot read CUDA (or
# grad-tracking) tensors directly, so tensors are brought back to the host
# before conversion; non-tensor outputs are converted as-is.
crps_values = [
    crps_ensemble(
        y_test_np[i],
        np.array(
            y_test_hat_engression_samples[i].detach().cpu()
            if torch.is_tensor(y_test_hat_engression_samples[i])
            else y_test_hat_engression_samples[i]
        ).reshape(-1,),
    )
    for i in range(len(y_test_np))
]
CRPS_engression = np.mean(crps_values)

# Report the test-set CRPS of every model, one line per model (lower is better).
_crps_report = (
    ("CRPS linear regression: ", CRPS_linreg),
    ("CRPS boosted trees", CRPS_boosted),
    ("CRPS random forest", CRPS_rf),
    ("CRPS engression", CRPS_engression),
)
for _crps_label, _crps_score in _crps_report:
    print(_crps_label, _crps_score)

[I 2024-03-04 19:08:30,657] A new study created in memory with name: no-name-e2a9d29b-e253-4507-af13-0e3a103bf9bd
[I 2024-03-04 19:08:33,473] Trial 0 finished with value: 6.937608925970823 and parameters: {'learning_rate': 0.0713003929222653, 'n_estimators': 108, 'reg_lambda': 0.005044685709888605, 'max_depth': 23, 'min_child_samples': 55}. Best is trial 0 with value: 6.937608925970823.
[I 2024-03-04 19:08:36,708] Trial 1 finished with value: 12.527109038809398 and parameters: {'learning_rate': 0.0006784471913345375, 'n_estimators': 179, 'reg_lambda': 0.0699481785242808, 'max_depth': 6, 'min_child_samples': 18}. Best is trial 0 with value: 6.937608925970823.
[I 2024-03-04 19:08:36,711] A new study created in memory with name: no-name-4f377d26-6161-4e4f-8457-c817e189b3ef


[0.771320643266746, 0.0207519493594015, 0.6336482349262754, 0.7488038825386119, 0.4985070123025904, 0.22479664553084766, 0.19806286475962398, 0.7605307121989587, 0.16911083656253545, 0.08833981417401027, 0.6853598183677972, 0.9533933461949365, 0.003948266327914451, 0.5121922633857766, 0.8126209616521135, 0.6125260668293881, 0.7217553174317995, 0.29187606817063316, 0.9177741225129434, 0.7145757833976906, 0.5425443680112613, 0.14217004760152696, 0.3733407600514692, 0.6741336150663453, 0.4418331744229961, 0.4340139933332937, 0.6177669784693172, 0.5131382425543909, 0.6503971819314672, 0.6010389534045444, 0.8052231968327465, 0.5216471523936341, 0.9086488808086682, 0.3192360889885453, 0.09045934927090737, 0.30070005663620336, 0.11398436186354977, 0.8286813263076767, 0.04689631938924976, 0.6262871483113925, 0.5475861559192435, 0.8192869956700687, 0.1989475396788123, 0.8568503024577332, 0.3516526394320879, 0.7546476915298572, 0.2959617068796787, 0.8839364795611863, 0.3255116378322488, 0.165015

[I 2024-03-04 19:08:47,006] Trial 0 finished with value: 9.07003089184482 and parameters: {'num_trees': 409, 'mtry': 1, 'min_node_size': 67}. Best is trial 0 with value: 9.07003089184482.


[0.771320643266746, 0.0207519493594015, 0.6336482349262754, 0.7488038825386119, 0.4985070123025904, 0.22479664553084766, 0.19806286475962398, 0.7605307121989587, 0.16911083656253545, 0.08833981417401027, 0.6853598183677972, 0.9533933461949365, 0.003948266327914451, 0.5121922633857766, 0.8126209616521135, 0.6125260668293881, 0.7217553174317995, 0.29187606817063316, 0.9177741225129434, 0.7145757833976906, 0.5425443680112613, 0.14217004760152696, 0.3733407600514692, 0.6741336150663453, 0.4418331744229961, 0.4340139933332937, 0.6177669784693172, 0.5131382425543909, 0.6503971819314672, 0.6010389534045444, 0.8052231968327465, 0.5216471523936341, 0.9086488808086682, 0.3192360889885453, 0.09045934927090737, 0.30070005663620336, 0.11398436186354977, 0.8286813263076767, 0.04689631938924976, 0.6262871483113925, 0.5475861559192435, 0.8192869956700687, 0.1989475396788123, 0.8568503024577332, 0.3516526394320879, 0.7546476915298572, 0.2959617068796787, 0.8839364795611863, 0.3255116378322488, 0.165015

[I 2024-03-04 19:08:59,240] Trial 1 finished with value: 7.275542030588492 and parameters: {'num_trees': 400, 'mtry': 15, 'min_node_size': 30}. Best is trial 1 with value: 7.275542030588492.
[I 2024-03-04 19:08:59,270] A new study created in memory with name: no-name-53c4582e-504f-440f-9a36-d447215e770f


Running on CPU.

Residual blocks (skip-connections) are typically recommended for more than 2 layers; turn it on by setting resblock=True.
Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1024.
[Epoch 1 (0%), batch 6] energy-loss: 0.4376,  E(|Y-Yhat|): 1.0720,  E(|Yhat-Yhat'|): 1.2687
[Epoch 100 (84%), batch 6] energy-loss: 0.2423,  E(|Y-Yhat|): 0.4711,  E(|Yhat-Yhat'|): 0.4575

Training loss on the original (non-standardized) scale:
	Energy-loss: 1.1010,  E(|Y-Yhat|): 2.3519,  E(|Yhat-Yhat'|): 2.5017

Prediction-loss E(|Y-Yhat|) and variance-loss E(|Yhat-Yhat'|) should ideally be equally large
-- consider training for more epochs or adjusting hyperparameters if there is a mismatch 


[I 2024-03-04 19:11:02,423] Trial 0 finished with value: 3.041407315526319 and parameters: {'learning_rate': 0.0034885205571560775, 'num_epoches': 118, 'num_layer': 4, 'hidden_dim': 400, 'resblock': True}. Best is trial 0 with value: 3.041407315526319.


Running on CPU.

Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1024.
[Epoch 1 (0%), batch 6] energy-loss: 0.6360,  E(|Y-Yhat|): 0.8290,  E(|Yhat-Yhat'|): 0.3860
[Epoch 100 (13%), batch 6] energy-loss: 0.2307,  E(|Y-Yhat|): 0.4422,  E(|Yhat-Yhat'|): 0.4230
[Epoch 200 (25%), batch 6] energy-loss: 0.2261,  E(|Y-Yhat|): 0.4416,  E(|Yhat-Yhat'|): 0.4311
[Epoch 300 (38%), batch 6] energy-loss: 0.2274,  E(|Y-Yhat|): 0.4418,  E(|Yhat-Yhat'|): 0.4289
[Epoch 400 (51%), batch 6] energy-loss: 0.2288,  E(|Y-Yhat|): 0.4356,  E(|Yhat-Yhat'|): 0.4136
[Epoch 500 (64%), batch 6] energy-loss: 0.2395,  E(|Y-Yhat|): 0.4487,  E(|Yhat-Yhat'|): 0.4183
[Epoch 600 (76%), batch 6] energy-loss: 0.2258,  E(|Y-Yhat|): 0.4368,  E(|Yhat-Yhat'|): 0.4220
[Epoch 700 (89%), batch 6] energy-loss: 0.

[I 2024-03-04 19:13:02,384] Trial 1 finished with value: 4.233781359117593 and parameters: {'learning_rate': 0.0002489577954043506, 'num_epoches': 785, 'num_layer': 2, 'hidden_dim': 135, 'resblock': False}. Best is trial 0 with value: 3.041407315526319.


[0.771320643266746, 0.0207519493594015, 0.6336482349262754, 0.7488038825386119, 0.4985070123025904, 0.22479664553084766, 0.19806286475962398, 0.7605307121989587, 0.16911083656253545, 0.08833981417401027, 0.6853598183677972, 0.9533933461949365, 0.003948266327914451, 0.5121922633857766, 0.8126209616521135, 0.6125260668293881, 0.7217553174317995, 0.29187606817063316, 0.9177741225129434, 0.7145757833976906, 0.5425443680112613, 0.14217004760152696, 0.3733407600514692, 0.6741336150663453, 0.4418331744229961, 0.4340139933332937, 0.6177669784693172, 0.5131382425543909, 0.6503971819314672, 0.6010389534045444, 0.8052231968327465, 0.5216471523936341, 0.9086488808086682, 0.3192360889885453, 0.09045934927090737, 0.30070005663620336, 0.11398436186354977, 0.8286813263076767, 0.04689631938924976, 0.6262871483113925, 0.5475861559192435, 0.8192869956700687, 0.1989475396788123, 0.8568503024577332, 0.3516526394320879, 0.7546476915298572, 0.2959617068796787, 0.8839364795611863, 0.3255116378322488, 0.165015