In [125]:
import pandas as pd
import numpy as np
import setuptools
import openml
from sklearn.linear_model import LinearRegression 
import lightgbm as lgbm
import lightgbmlss
import optuna
from sklearn.cluster import KMeans
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process.kernels import Matern
from engression import engression, engression_bagged
import torch
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import mahalanobis
from scipy.stats import norm
from sklearn.metrics import mean_squared_error
from rtdl_revisiting_models import MLP, ResNet, FTTransformer
from CRPS import CRPS
from properscoring import crps_gaussian, crps_ensemble

In [48]:
# pip install pandas
# pip install setuptools
# pip install openml
# pip install lightgbm
# pip install optuna
# pip install engression


# numpy-1.26.3 pandas-2.1.4 setuptools-69.0.3 openml lightgbm-4.2.0  optuna-3.5.0

In [49]:
#openml.config.apikey = 'FILL_IN_OPENML_API_KEY'  # set the OpenML Api Key
SUITE_ID = 336 # Regression on numerical features
#SUITE_ID = 337 # Classification on numerical features
#SUITE_ID = 335 # Regression on numerical and categorical features
#SUITE_ID = 334 # Classification on numerical and categorical features
benchmark_suite = openml.study.get_suite(SUITE_ID)  # obtain the benchmark suite

task = openml.tasks.get_task(361072)  # download the OpenML task
dataset = task.get_dataset()

X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="dataframe", target=dataset.default_target_attribute)



In [74]:
# New new implementation
N_CLUSTERS=20
# calculate the mean and covariance matrix of the dataset
mean = np.mean(X, axis=0)
cov = np.cov(X.T)
scaler = StandardScaler()

# transform data to compute the clusters
X_scaled = scaler.fit_transform(X)

kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0, n_init="auto").fit(X_scaled)
distances=[]
mahalanobis_dist=[]
counts=[]
ideal_len=len(kmeans.labels_)/5
for i in np.arange(N_CLUSTERS):
    distances.append(np.abs(np.sum(kmeans.labels_==i)-ideal_len))
    counts.append(np.sum(kmeans.labels_==i))
    mean_k= np.mean(X.loc[kmeans.labels_==i,:], axis=0)
    mahalanobis_dist.append(mahalanobis(mean_k, mean, np.linalg.inv(cov)))

dist_df=pd.DataFrame(data={'mahalanobis_dist': mahalanobis_dist, 'count': counts}, index=np.arange(N_CLUSTERS))
dist_df=dist_df.sort_values('mahalanobis_dist', ascending=False)
dist_df['cumulative_count']=dist_df['count'].cumsum()
dist_df['abs_diff']=np.abs(dist_df['cumulative_count']-ideal_len)

final=(np.where(dist_df['abs_diff']==np.min(dist_df['abs_diff']))[0])[0]
labelss=dist_df.index[0:final+1].to_list()
labels=pd.Series(kmeans.labels_).isin(labelss)
labels.index=X.index
close_index=labels.index[np.where(labels==False)[0]]
far_index=labels.index[np.where(labels==True)[0]]

X_train = X.loc[close_index,:]
X_test = X.loc[far_index,:]
y_train = y.loc[close_index]
y_test = y.loc[far_index]

# calculate the mean and covariance matrix of the dataset
mean_ = np.mean(X_train, axis=0)
cov_ = np.cov(X_train.T)
scaler_ = StandardScaler()

# transform data to compute the clusters
X_train_scaled = scaler_.fit_transform(X_train)

kmeans_ = KMeans(n_clusters=N_CLUSTERS, random_state=0, n_init="auto").fit(X_train_scaled)
distances_=[]
counts_=[]
mahalanobis_dist_=[]
ideal_len_=len(kmeans_.labels_)/5
for i in np.arange(N_CLUSTERS):
    distances_.append(np.abs(np.sum(kmeans_.labels_==i)-ideal_len_))
    counts_.append(np.sum(kmeans_.labels_==i))
    mean_k_= np.mean(X_train.loc[kmeans_.labels_==i,:], axis=0)
    mahalanobis_dist_.append(mahalanobis(mean_k_, mean_, np.linalg.inv(cov_)))

dist_df_=pd.DataFrame(data={'mahalanobis_dist': mahalanobis_dist_, 'count': counts_}, index=np.arange(N_CLUSTERS))
dist_df_=dist_df_.sort_values('mahalanobis_dist', ascending=False)
dist_df_['cumulative_count']=dist_df_['count'].cumsum()
dist_df_['abs_diff']=np.abs(dist_df_['cumulative_count']-ideal_len_)

final_=(np.where(dist_df_['abs_diff']==np.min(dist_df_['abs_diff']))[0])[0]
labelss_=dist_df_.index[0:final_+1].to_list()
labels_=pd.Series(kmeans_.labels_).isin(labelss_)
labels_.index=X_train.index
close_index_=labels_.index[np.where(labels_==False)[0]]
far_index_=labels_.index[np.where(labels_==True)[0]]

X_train_ = X_train.loc[close_index_,:]
X_val = X_train.loc[far_index_,:]
y_train_ = y_train.loc[close_index_]
y_val = y_train.loc[far_index_]

#### Gaussian process with stochastic variational inference

In [21]:
import gpytorch
from gpytorch.variational import CholeskyVariationalDistribution
from gpytorch.variational import VariationalStrategy

N_TRIALS=2

def SVGP_opt(trial):

    seed=10
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)

    class SVGPMODEL(gpytorch.models.ApproximateGP):
        def __init__(self, inducing_points):
            variational_distribution = CholeskyVariationalDistribution(inducing_points.size(0))
            variational_strategy = VariationalStrategy(self, inducing_points, variational_distribution, learn_inducing_locations=True)
            super(SVGPMODEL, self).__init__(variational_strategy)
            self.mean_module = gpytorch.means.ConstantMean()
            self.covar_module = gpytorch.kernels.ScaleKernel(
                gpytorch.kernels.MaternKernel(nu=1.5, ard_num_dims=inducing_points.shape[1]),
                ard_num_dims=inducing_points.shape[1]
            )

        def forward(self, x):
            mean_x = self.mean_module(x)
            covar_x = self.covar_module(x)
            return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

    # Convert data to PyTorch tensors
    X_train_tensor = torch.tensor(X_train_.values, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train_.values, dtype=torch.float32)

    # Initialize the Gaussian Process model and likelihood
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = SVGPMODEL(X_train_tensor)

    # Set the model in training mode
    model.train()
    likelihood.train()

    # Define the learning params
    n_epochs=5 #trial.suggest_int('n_epochs', 100, 5000)
    learning_rate=trial.suggest_float('learning_rate', 0.0001, 0.05, log=True)

    # Use the negative log likelihood as the loss
    mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=y_train_tensor.numel())
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(n_epochs):
        optimizer.zero_grad()
        output = model(X_train_tensor)
        loss = -mll(output, y_train_tensor)
        loss.backward()
        optimizer.step()

    # Set the model in evaluation mode
    model.eval()
    likelihood.eval()

    # Make predictions on the validation set
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        y_pred = model(torch.tensor(X_val.values, dtype=torch.float32))

    # Calculate CRPS
    y_val_np = y_val.values.flatten()
    y_pred_np = y_pred.mean.numpy().flatten()
    y_pred_std_np = y_pred.stddev.numpy().flatten()
    
    # Calculate the CRPS for each prediction
    crps_values = [crps_gaussian(y_val_np[i], mu=y_pred_np[i], sig=y_pred_std_np[i]) for i in range(len(y_val_np))]

    # Calculate the mean CRPS
    mean_crps= np.mean(crps_values)
    
    return mean_crps

sampler_SVGP = optuna.samplers.TPESampler(seed=10)
study_SVGP = optuna.create_study(sampler=sampler_SVGP, direction='minimize')
study_SVGP.optimize(SVGP_opt, n_trials=N_TRIALS)

[I 2024-01-22 17:10:06,859] A new study created in memory with name: no-name-bea4cf5f-3031-4dec-a045-a078b921cc60


[W 2024-01-22 17:18:36,997] Trial 0 failed with parameters: {'lengthscale': 0.08747537025773001, 'learning_rate': 0.00011376505702653915} because of the following error: AxisError(-1, 0, None).
Traceback (most recent call last):
  File "c:\Users\dalma\Desktop\THESIS_ETH_NEW\CODE\.venv\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\dalma\AppData\Local\Temp\ipykernel_14344\2994460075.py", line 77, in SVGP_opt
    crps_values = [CRPS(y_val_np[i], pred_distributions[i]) for i in range(len(y_val_np))]
  File "C:\Users\dalma\AppData\Local\Temp\ipykernel_14344\2994460075.py", line 77, in <listcomp>
    crps_values = [CRPS(y_val_np[i], pred_distributions[i]) for i in range(len(y_val_np))]
  File "c:\Users\dalma\Desktop\THESIS_ETH_NEW\CODE\.venv\lib\site-packages\CRPS\CRPS.py", line 109, in __init__
    self.fc = np.sort(ensemble_members)
  File "c:\Users\dalma\Desktop\THESIS_ETH_NEW\CODE\.venv\lib\site-packages\numpy\cor

AxisError: axis -1 is out of bounds for array of dimension 0

In [None]:
# Access the best parameters
best_params_SVGP = study_SVGP.best_params
lengthscale = best_params_SVGP['lengthscale']
n_epochs = best_params_SVGP['n_epochs']
learning_rate = best_params_SVGP['learning_rate']

class SVGPMODEL(gpytorch.models.ApproximateGP):
    def __init__(self, inducing_points):
        variational_distribution = CholeskyVariationalDistribution(inducing_points.size(0))
        variational_strategy = VariationalStrategy(self, inducing_points, variational_distribution, learn_inducing_locations=True)
        super(SVGPMODEL, self).__init__(variational_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.MaternKernel(nu=1.5, ard_num_dims=inducing_points.shape[1], lengthscale=lengthscale),
            ard_num_dims=inducing_points.shape[1]
        )

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

# Convert data to PyTorch tensors
X_tensor = torch.tensor(X_train, dtype=torch.float32)
y_tensor = torch.tensor(y_train, dtype=torch.float32)

# Initialize the final Gaussian Process model with the best parameters
likelihood = gpytorch.likelihoods.GaussianLikelihood()
final_model = SVGPMODEL(X_tensor)

# Set the model in training mode
final_model.train()
likelihood.train()

# Use the negative log likelihood as the loss
mll = gpytorch.mlls.VariationalELBO(likelihood, final_model, num_data=y_tensor.numel())
optimizer = torch.optim.Adam(final_model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(n_epochs):
    optimizer.zero_grad()
    output = final_model(X_tensor)
    loss = -mll(output, y_tensor)
    loss.backward()
    optimizer.step()

# Set the final model in evaluation mode
final_model.eval()
likelihood.eval()

# Make predictions on the validation set
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    y_pred = final_model(torch.tensor(X_test.values, dtype=torch.float32))


# Calculate CRPS
y_test_np = y_test.values.flatten()
y_pred_np = y_pred.mean.numpy().flatten()
y_pred_std_np = y_pred.stddev.numpy().flatten()

# Calculate the CRPS for each prediction
crps_values = [crps_gaussian(y_test_np[i], mu=y_pred_np[i], sig=y_pred_std_np[i]) for i in range(len(y_test_np))]

# Calculate the mean CRPS
CRPS_SVGP= np.mean(crps_values)

print("CRPS SVGP: ", CRPS_SVGP)

NameError: name 'study_SVGP' is not defined

#### MLP

In [80]:
N_TRIALS=2

d_out = 1  
d_in=X_train_.shape[1]

def MLP_opt(trial):

    seed=10
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)

    n_blocks = trial.suggest_int("n_blocks", 1, 5)
    d_block = trial.suggest_int("d_block", 10, 500)
    dropout = trial.suggest_float("dropout", 0, 1)

    MLP_model = MLP(
    d_in=d_in,
    d_out=d_out,
    n_blocks=n_blocks,
    d_block=d_block,
    dropout=dropout,
    )
    n_epochs=trial.suggest_int('n_epochs', 100, 5000)
    learning_rate=trial.suggest_float('learning_rate', 0.0001, 0.05, log=True)
    optimizer=torch.optim.Adam(MLP_model.parameters(), lr=learning_rate)
    criterion = torch.nn.MSELoss()
    loss_Adam=[]

    for i in range(n_epochs):
        # making a pridiction in forward pass
        y_train_hat = MLP_model(torch.Tensor(X_train_.values)).reshape(-1,)
        # calculating the loss between original and predicted data points
        loss = criterion(y_train_hat, torch.Tensor(y_train_.values))
        # store loss into list
        loss_Adam.append(loss.item())
        # zeroing gradients after each iteration
        optimizer.zero_grad()
        # backward pass for computing the gradients of the loss w.r.t to learnable parameters
        loss.backward()
        # updating the parameters after each iteration
        optimizer.step()

    # Point prediction
    y_val_hat_MLP = (MLP_model(torch.Tensor(X_val.values)).reshape(-1,)).detach().numpy()

    # Estimate standard deviation of the prediction error
    std_dev_error = np.std(y_val - y_val_hat_MLP)

    # Calculate the CRPS for each prediction
    y_val_np = y_val.values.flatten()
    crps_values = [crps_gaussian(y_val_np[i], mu=y_val_hat_MLP[i], sig=std_dev_error) for i in range(len(y_val_hat_MLP))]

    # Calculate the mean CRPS
    mean_crps = np.mean(crps_values)

    return mean_crps

sampler_MLP = optuna.samplers.TPESampler(seed=10)
study_MLP = optuna.create_study(sampler=sampler_MLP, direction='minimize')
study_MLP.optimize(MLP_opt, n_trials=N_TRIALS)

[I 2024-01-22 18:06:39,734] A new study created in memory with name: no-name-4d486c74-4c97-49f6-a0fa-d30b76dbe392


[I 2024-01-22 18:07:06,770] Trial 0 finished with value: 63.01536570936997 and parameters: {'n_blocks': 4, 'd_block': 20, 'dropout': 0.6336482349262754, 'n_epochs': 3769, 'learning_rate': 0.002215416944953109}. Best is trial 0 with value: 63.01536570936997.
[I 2024-01-22 18:07:21,098] Trial 1 finished with value: 2458.6672925365024 and parameters: {'n_blocks': 2, 'd_block': 107, 'dropout': 0.7605307121989587, 'n_epochs': 928, 'learning_rate': 0.0001731515998646626}. Best is trial 0 with value: 63.01536570936997.


In [82]:
seed=10
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)

MLP_model = MLP(
    d_in=d_in,
    d_out=d_out,
    n_blocks=study_MLP.best_params['n_blocks'],
    d_block=study_MLP.best_params['d_block'],
    dropout=study_MLP.best_params['dropout'],
    )
n_epochs=study_MLP.best_params['n_epochs']
learning_rate=study_MLP.best_params['learning_rate']
optimizer=torch.optim.Adam(MLP_model.parameters(), lr=learning_rate)
criterion = torch.nn.MSELoss()
loss_Adam=[]

for i in range(n_epochs):
    # making a pridiction in forward pass
    y_train_hat = MLP_model(torch.Tensor(X_train.values)).reshape(-1,)
    # calculating the loss between original and predicted data points
    loss = criterion(y_train_hat, torch.Tensor(y_train.values))
    # store loss into list
    loss_Adam.append(loss.item())
    # zeroing gradients after each iteration
    optimizer.zero_grad()
    # backward pass for computing the gradients of the loss w.r.t to learnable parameters
    loss.backward()
    # updating the parameters after each iteration
    optimizer.step()

# Point prediction
y_test_hat_MLP = (MLP_model(torch.Tensor(X_test.values)).reshape(-1,)).detach().numpy()

# Estimate standard deviation of the prediction error
std_dev_error = np.std(y_test - y_test_hat_MLP)

# Create a normal distribution for each prediction
pred_distributions = [norm(loc=y_test_hat_MLP[i], scale=std_dev_error) for i in range(len(y_test_hat_MLP))]

# Calculate the CRPS for each prediction
y_test_np = y_test.values.flatten()
crps_values = [crps_gaussian(y_test_np[i], mu=y_test_hat_MLP[i], sig=std_dev_error) for i in range(len(y_test_hat_MLP))]

# Calculate the mean CRPS
crps_MLP = np.mean(crps_values)

print("CRPS MLP: ", crps_MLP)

CRPS MLP:  48.72480124825327


#### ResNet

In [None]:
N_TRIALS=5

d_out = 1  
d_in=X_train_.shape[1]

def ResNet_opt(trial):

    seed=10
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)

    n_blocks = trial.suggest_int("n_blocks", 1, 5)
    d_block = trial.suggest_int("d_block", 10, 500)
    dropout1 = trial.suggest_float("dropout1", 0, 1)
    dropout2 = trial.suggest_float("dropout2", 0, 1)
    d_hidden_multiplier=trial.suggest_float("d_hidden_multiplier", 0.5, 3)

    ResNet_model = ResNet(
    d_in=d_in,
    d_out=d_out,
    n_blocks=n_blocks,
    d_block=d_block,
    d_hidden=None,
    d_hidden_multiplier=d_hidden_multiplier,
    dropout1=dropout1,
    dropout2=dropout2,
    )
    n_epochs=trial.suggest_int('n_epochs', 100, 5000)
    learning_rate=trial.suggest_float('learning_rate', 0.0001, 0.05, log=True)
    optimizer=torch.optim.Adam(ResNet_model.parameters(), lr=learning_rate)
    criterion = torch.nn.MSELoss()
    loss_Adam=[]

    for i in range(n_epochs):
        # making a pridiction in forward pass
        y_train_hat = ResNet_model(torch.Tensor(X_train_.values)).reshape(-1,)
        # calculating the loss between original and predicted data points
        loss = criterion(y_train_hat, torch.Tensor(y_train_.values))
        # store loss into list
        loss_Adam.append(loss.item())
        # zeroing gradients after each iteration
        optimizer.zero_grad()
        # backward pass for computing the gradients of the loss w.r.t to learnable parameters
        loss.backward()
        # updating the parameters after each iteration
        optimizer.step()

    # Point prediction
    y_val_hat_ResNet = (ResNet_model(torch.Tensor(X_val.values)).reshape(-1,)).detach().numpy()

    # Estimate standard deviation of the prediction error
    std_dev_error = np.std(y_val - y_val_hat_ResNet)

    # Calculate the CRPS for each prediction
    y_val_np = y_val.values.flatten()
    crps_values = [crps_gaussian(y_val_np[i], mu=y_val_hat_ResNet[i], sig=std_dev_error) for i in range(len(y_val_hat_ResNet))]

    # Calculate the mean CRPS
    crps_ResNet = np.mean(crps_values)

    return crps_ResNet

sampler_ResNet = optuna.samplers.TPESampler(seed=10)
study_ResNet = optuna.create_study(sampler=sampler_ResNet, direction='minimize')
study_ResNet.optimize(ResNet_opt, n_trials=N_TRIALS)

[I 2023-12-23 18:10:31,640] A new study created in memory with name: no-name-944004cd-5e6d-4aa6-9fe6-432d77df0455


[I 2023-12-23 18:10:52,768] Trial 0 finished with value: 30.78984260559082 and parameters: {'n_blocks': 4, 'd_block': 20, 'dropout1': 0.6336482349262754, 'dropout2': 0.7488038825386119, 'd_hidden_multiplier': 1.7462675307564761, 'n_epochs': 1201, 'learning_rate': 0.009983336951505236}. Best is trial 0 with value: 30.78984260559082.
[I 2023-12-23 18:11:04,384] Trial 1 finished with value: 34.90650177001953 and parameters: {'n_blocks': 4, 'd_block': 93, 'dropout1': 0.08833981417401027, 'dropout2': 0.6853598183677972, 'd_hidden_multiplier': 2.8834833654873413, 'n_epochs': 119, 'learning_rate': 0.02565839394295025}. Best is trial 0 with value: 30.78984260559082.
[I 2023-12-27 08:43:55,096] Trial 2 finished with value: 23.0227108001709 and parameters: {'n_blocks': 5, 'd_block': 310, 'dropout1': 0.7217553174317995, 'dropout2': 0.29187606817063316, 'd_hidden_multiplier': 2.7944353062823586, 'n_epochs': 3602, 'learning_rate': 0.027172963963761936}. Best is trial 2 with value: 23.0227108001709.

In [None]:
seed=10
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)

ResNet_model = ResNet(
    d_in=d_in,
    d_out=d_out,
    n_blocks=study_ResNet.best_params['n_blocks'],
    d_block=study_ResNet.best_params['d_block'],
    d_hidden=None,
    d_hidden_multiplier=study_ResNet.best_params['d_hidden_multiplier'],
    dropout1=study_ResNet.best_params['dropout1'],
    dropout2=study_ResNet.best_params['dropout2'],
    )
n_epochs=study_ResNet.best_params['n_epochs']
learning_rate=study_ResNet.best_params['learning_rate']
optimizer=torch.optim.Adam(ResNet_model.parameters(), lr=learning_rate)
criterion = torch.nn.MSELoss()
loss_Adam=[]

for i in range(n_epochs):
    # making a pridiction in forward pass
    y_train_hat = ResNet_model(torch.Tensor(X_train.values)).reshape(-1,)
    # calculating the loss between original and predicted data points
    loss = criterion(y_train_hat, torch.Tensor(y_train.values))
    # store loss into list
    loss_Adam.append(loss.item())
    # zeroing gradients after each iteration
    optimizer.zero_grad()
    # backward pass for computing the gradients of the loss w.r.t to learnable parameters
    loss.backward()
    # updating the parameters after each iteration
    optimizer.step()

# Point prediction
y_test_hat_ResNet = (ResNet_model(torch.Tensor(X_test.values)).reshape(-1,)).detach().numpy()

# Estimate standard deviation of the prediction error
std_dev_error = np.std(y_test - y_test_hat_ResNet)

# Calculate the CRPS for each prediction
y_test_np = y_test.values.flatten()
crps_values = [crps_gaussian(y_test_np[i], mu=y_test_hat_ResNet[i], sig=std_dev_error) for i in range(len(y_test_hat_ResNet))]

# Calculate the mean CRPS
crps_ResNet = np.mean(crps_values)

print("CRPS ResNet: ", crps_ResNet)

RMSE ResNet:  28.044754


#### FFTransformer

In [None]:
N_TRIALS=5

d_out = 1  
d_in=X_train_.shape[1]

def FTTrans_opt(trial):

    seed=10
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)

    n_blocks = trial.suggest_int("n_blocks", 1, 5)
    d_block_multiplier = trial.suggest_int("d_block_multiplier", 1, 25)
    attention_n_heads = trial.suggest_int("attention_n_heads", 1, 20)
    attention_dropout = trial.suggest_float("attention_dropout", 0, 1)
    ffn_d_hidden_multiplier=trial.suggest_float("ffn_d_hidden_multiplier", 0.5, 3)
    ffn_dropout = trial.suggest_float("ffn_dropout", 0, 1)
    residual_dropout = trial.suggest_float("residual_dropout", 0, 1)

    FTTrans_model = FTTransformer(
    n_cont_features=d_in,
    cat_cardinalities=[],
    d_out=d_out,
    n_blocks=n_blocks,
    d_block=d_block_multiplier*attention_n_heads,
    attention_n_heads=attention_n_heads,
    attention_dropout=attention_dropout,
    ffn_d_hidden=None,
    ffn_d_hidden_multiplier=ffn_d_hidden_multiplier,
    ffn_dropout=ffn_dropout,
    residual_dropout=residual_dropout,
    )

    n_epochs=trial.suggest_int('n_epochs', 100, 5000)
    learning_rate=trial.suggest_float('learning_rate', 0.0001, 0.05, log=True)
    optimizer=torch.optim.Adam(FTTrans_model.parameters(), lr=learning_rate)
    criterion = torch.nn.MSELoss()
    loss_Adam=[]

    for i in range(n_epochs):
        # making a pridiction in forward pass
        y_train_hat = FTTrans_model(torch.Tensor(X_train_.values),None).reshape(-1,)
        # calculating the loss between original and predicted data points
        loss = criterion(y_train_hat, torch.Tensor(y_train_.values))
        # store loss into list
        loss_Adam.append(loss.item())
        # zeroing gradients after each iteration
        optimizer.zero_grad()
        # backward pass for computing the gradients of the loss w.r.t to learnable parameters
        loss.backward()
        # updating the parameters after each iteration
        optimizer.step()

    # Point prediction
    y_val_hat_FTTrans = (FTTrans_model(torch.Tensor(X_val.values), None).reshape(-1,)).detach().numpy()

    # Estimate standard deviation of the prediction error
    std_dev_error = np.std(y_val - y_val_hat_FTTrans)

    # Calculate the CRPS for each prediction
    y_val_np = y_val.values.flatten()
    crps_values = [crps_gaussian(y_val_np[i], mu=y_val_hat_FTTrans[i], sig=std_dev_error) for i in range(len(y_val_hat_FTTrans))]

    # Calculate the mean CRPS
    crps_FTTrans= np.mean(crps_values)

    return crps_FTTrans

sampler_FTTrans = optuna.samplers.TPESampler(seed=10)
study_FTTrans = optuna.create_study(sampler=sampler_FTTrans, direction='minimize')
study_FTTrans.optimize(FTTrans_opt, n_trials=N_TRIALS)

[I 2024-01-08 14:46:52,555] A new study created in memory with name: no-name-7acaca2b-54b2-449c-8afa-679aaa0aa628


In [None]:
seed=10
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)

FTTrans_model = FTTransformer(
    n_cont_features=d_in,
    cat_cardinalities=[],
    d_out=d_out,
    n_blocks=study_FTTrans.best_params['n_blocks'],
    d_block=study_FTTrans.best_params['d_block'],
    attention_n_heads=study_FTTrans.best_params['attention_n_heads'],
    attention_dropout=study_FTTrans.best_params['attention_dropout'],
    ffn_d_hidden=None,
    ffn_d_hidden_multiplier=study_FTTrans.best_params['ffn_d_hidden_multiplier'],
    ffn_dropout=study_FTTrans.best_params['ffn_dropout'],
    residual_dropout=study_FTTrans.best_params['residual_dropout'],
    )
n_epochs=study_FTTrans.best_params['n_epochs']
learning_rate=study_FTTrans.best_params['learning_rate']
optimizer=torch.optim.Adam(FTTrans_model.parameters(), lr=learning_rate)
criterion = torch.nn.MSELoss()
loss_Adam=[]

for i in range(n_epochs):
    # making a pridiction in forward pass
    y_train_hat = FTTrans_model(torch.Tensor(X_train.values), None).reshape(-1,)
    # calculating the loss between original and predicted data points
    loss = criterion(y_train_hat, torch.Tensor(y_train.values))
    # store loss into list
    loss_Adam.append(loss.item())
    # zeroing gradients after each iteration
    optimizer.zero_grad()
    # backward pass for computing the gradients of the loss w.r.t to learnable parameters
    loss.backward()
    # updating the parameters after each iteration
    optimizer.step()

# Point prediction
y_test_hat_FTTrans = (FTTrans_model(torch.Tensor(X_val.values), None).reshape(-1,)).detach().numpy()

# Estimate standard deviation of the prediction error
std_dev_error = np.std(y_test - y_test_hat_FTTrans)

# Calculate the CRPS for each prediction
y_test_np = y_test.values.flatten()
crps_values = [crps_gaussian(y_test_np[i], mu=y_test_hat_FTTrans[i], sig=std_dev_error) for i in range(len(y_test_hat_FTTrans))]

# Calculate the mean CRPS
crps_FTTrans= np.mean(crps_values)

print("CRPS FTTrans: ", crps_FTTrans)

In [90]:
import lightgbmlss
from properscoring import crps_gaussian

N_TRIALS=2

def boosted(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.5, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 30),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
    }

    # Use LightGBMLossGuideRegressor for distributional prediction
    boosted_tree_model = lightgbmlss.LightGBMLSS(**params)
    boosted_tree_model.fit(X_train_, y_train_)

    # Predict both the mean and standard deviation
    y_val_hat_boost, y_val_hat_std = boosted_tree_model.predict(X_val)

    # Calculate the CRPS for each prediction
    crps_values = [crps_gaussian(y_val[i], mu=y_val_hat_boost[i], sig=y_val_hat_std[i]) for i in range(len(y_val))]

    # Return the mean CRPS as the objective to be minimized
    return np.mean(crps_values)


sampler_boost = optuna.samplers.TPESampler(seed=10)
study_boost = optuna.create_study(sampler=sampler_boost, direction='minimize')
study_boost.optimize(boosted, n_trials=N_TRIALS)

[I 2024-01-22 18:56:20,656] A new study created in memory with name: no-name-9300278e-a3ee-4e92-a210-7fcfd2356752
[W 2024-01-22 18:56:20,663] Trial 0 failed with parameters: {'learning_rate': 0.12071779104534666, 'n_estimators': 108, 'reg_lambda': 0.005044685709888605, 'max_depth': 23, 'min_child_samples': 55} because of the following error: AttributeError("module 'lightgbmlss' has no attribute 'LightGBMLSS'").
Traceback (most recent call last):
  File "c:\Users\dalma\Desktop\THESIS_ETH_NEW\CODE\.venv\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\dalma\AppData\Local\Temp\ipykernel_14344\280956561.py", line 16, in boosted
    boosted_tree_model = lightgbmlss.LightGBMLSS(**params)
AttributeError: module 'lightgbmlss' has no attribute 'LightGBMLSS'
[W 2024-01-22 18:56:20,666] Trial 0 failed with value None.


AttributeError: module 'lightgbmlss' has no attribute 'LightGBMLSS'

#### Boosted trees, random forest, engression, linear regression

In [182]:
N_TRIALS=5

def boosted(trial):

    params = {'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.5, log=True),
              'n_estimators': trial.suggest_int('n_estimators', 100, 500),
              'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
              'max_depth': trial.suggest_int('max_depth', 1, 30),
              'min_child_samples': trial.suggest_int('min_child_samples', 10, 100)}
    
    boosted_tree_model=lgbm.LGBMRegressor(**params)
    boosted_tree_model.fit(X_train_, y_train_)
    y_val_hat_boost=boosted_tree_model.predict(X_val)
    RMSE_boost=np.sqrt(np.mean((y_val-y_val_hat_boost)**2))

    return RMSE_boost

sampler_boost = optuna.samplers.TPESampler(seed=10)
study_boost = optuna.create_study(sampler=sampler_boost, direction='minimize')
study_boost.optimize(boosted, n_trials=N_TRIALS)

boosted_model=lgbm.LGBMRegressor(**study_boost.best_params)

def rf(trial):

    params = {'n_estimators': trial.suggest_int('n_estimators', 100, 500),
              'max_depth': trial.suggest_int('max_depth', 1, 30),
              'max_features': trial.suggest_int('max_features', 1, 30),
              'min_samples_leaf': trial.suggest_int('min_samples_leaf', 10, 100)}
    
    rf_model=RandomForestRegressor(**params)
    rf_model.fit(X_train_, y_train_)
    y_val_hat_rf=rf_model.predict(X_val)
    # Calculate the standard deviation of the residuals
    std_dev = np.std(y_val - y_val_hat_rf)
    # Calculate the CRPS for each prediction
    y_val_np = y_val.values.flatten()
    crps_values = [crps_gaussian(y_val_np[i], mu=y_val_hat_rf[i], sig=std_dev) for i in range(len(y_val_np))]
    CRPS_rf = np.mean(crps_values)

    return CRPS_rf

sampler_rf = optuna.samplers.TPESampler(seed=10)
study_rf = optuna.create_study(sampler=sampler_rf, direction='minimize')
study_rf.optimize(rf, n_trials=N_TRIALS)

rf_model=RandomForestRegressor(**study_rf.best_params)

N_SAMPLES=100
np.random.seed(10)
def engressor_NN(trial):

    params = {'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.5, log=True),
              'num_epoches': trial.suggest_int('num_epoches', 100, 1000),
              'num_layer': trial.suggest_int('num_layer', 2, 5),
              'hidden_dim': trial.suggest_int('hidden_dim', 50, 100),
              'noise_dim': trial.suggest_int('noise_dim', 50, 100),}
    
    engressor_model=engression(torch.Tensor(np.array(X_train_)), torch.Tensor(np.array(y_train_).reshape(-1,1)), lr=params['learning_rate'], num_epoches=params['num_epoches'],num_layer=params['num_layer'], hidden_dim=params['hidden_dim'], noise_dim=params['noise_dim'], batch_size=1000)
    # Generate a sample from the engression model for each data point
    y_val_hat_engression_samples = [engressor_model.predict(torch.Tensor(np.array([X_val.values[i]])), target=list(np.random.uniform(0,1,N_SAMPLES))) for i in range(len(X_val))]

    # Calculate the CRPS for each prediction
    y_val_np = y_val.values.flatten()
    crps_values = [crps_ensemble(y_val_np[i], np.array(y_val_hat_engression_samples[i]).reshape(-1,)) for i in range(len(y_val_np))]

    return np.mean(crps_values)

sampler_engression = optuna.samplers.TPESampler(seed=10)
study_engression = optuna.create_study(sampler=sampler_engression, direction='minimize')
study_engression.optimize(engressor_NN, n_trials=N_TRIALS)

[I 2024-01-22 21:04:07,671] A new study created in memory with name: no-name-efacb2a7-d010-45d6-a7da-2763ca786a0e


[I 2024-01-22 21:04:08,030] Trial 0 finished with value: 18.851961717752452 and parameters: {'learning_rate': 0.12071779104534666, 'n_estimators': 108, 'reg_lambda': 0.005044685709888605, 'max_depth': 23, 'min_child_samples': 55}. Best is trial 0 with value: 18.851961717752452.
[I 2024-01-22 21:04:08,242] Trial 1 finished with value: 21.13441405830955 and parameters: {'learning_rate': 0.004043145805966843, 'n_estimators': 179, 'reg_lambda': 0.0699481785242808, 'max_depth': 6, 'min_child_samples': 18}. Best is trial 0 with value: 18.851961717752452.
[I 2024-01-22 21:04:08,748] Trial 2 finished with value: 18.918708311325076 and parameters: {'learning_rate': 0.07075637776590661, 'n_estimators': 482, 'reg_lambda': 1.08526150100961e-08, 'max_depth': 16, 'min_child_samples': 83}. Best is trial 0 with value: 18.851961717752452.
[I 2024-01-22 21:04:09,156] Trial 3 finished with value: 18.944544861898308 and parameters: {'learning_rate': 0.044997613517186334, 'n_estimators': 389, 'reg_lambda':

Running on CPU.

Residual blocks (skip-connections) are typically recommended for more than 2 layers; turn it on by setting resblock=True.
Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch 6] energy-loss: 0.8660,  E(|Y-Yhat|): 1.1726,  E(|Yhat-Yhat'|): 0.6133
[Epoch 100 (84%), batch 6] energy-loss: 0.2349,  E(|Y-Yhat|): 0.4827,  E(|Yhat-Yhat'|): 0.4956

Training loss on the original (non-standardized) scale:
	Energy-loss: 1.5087,  E(|Y-Yhat|): 2.2914,  E(|Yhat-Yhat'|): 1.5654

Prediction-loss E(|Y-Yhat|) and variance-loss E(|Yhat-Yhat'|) should ideally be equally large
-- consider training for more epochs or adjusting hyperparameters if there is a mismatch 


[I 2024-01-22 21:05:36,580] Trial 0 finished with value: 4.915328465896906 and parameters: {'learning_rate': 0.12071779104534666, 'num_epoches': 118, 'num_layer': 4, 'hidden_dim': 88, 'noise_dim': 75}. Best is trial 0 with value: 4.915328465896906.


Running on CPU.

Residual blocks (skip-connections) are typically recommended for more than 2 layers; turn it on by setting resblock=True.
Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch 6] energy-loss: 0.3952,  E(|Y-Yhat|): 0.7716,  E(|Yhat-Yhat'|): 0.7529
[Epoch 100 (36%), batch 6] energy-loss: 0.2185,  E(|Y-Yhat|): 0.4328,  E(|Yhat-Yhat'|): 0.4285
[Epoch 200 (72%), batch 6] energy-loss: 0.1959,  E(|Y-Yhat|): 0.4093,  E(|Yhat-Yhat'|): 0.4267

Training loss on the original (non-standardized) scale:
	Energy-loss: 1.3192,  E(|Y-Yhat|): 2.5551,  E(|Yhat-Yhat'|): 2.4718

Prediction-loss E(|Y-Yhat|) and variance-loss E(|Yhat-Yhat'|) should ideally be equally large
-- consider training for more epochs or adjusting hyperparameters if there is a

[I 2024-01-22 21:07:03,442] Trial 1 finished with value: 4.974529882222395 and parameters: {'learning_rate': 0.004043145805966843, 'num_epoches': 278, 'num_layer': 5, 'hidden_dim': 58, 'noise_dim': 54}. Best is trial 0 with value: 4.915328465896906.


Running on CPU.

Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch 6] energy-loss: 0.5086,  E(|Y-Yhat|): 0.7961,  E(|Yhat-Yhat'|): 0.5750
[Epoch 100 (10%), batch 6] energy-loss: 0.2307,  E(|Y-Yhat|): 0.4512,  E(|Yhat-Yhat'|): 0.4409
[Epoch 200 (21%), batch 6] energy-loss: 0.2154,  E(|Y-Yhat|): 0.4430,  E(|Yhat-Yhat'|): 0.4552
[Epoch 300 (31%), batch 6] energy-loss: 0.2142,  E(|Y-Yhat|): 0.4360,  E(|Yhat-Yhat'|): 0.4436
[Epoch 400 (42%), batch 6] energy-loss: 0.2072,  E(|Y-Yhat|): 0.4318,  E(|Yhat-Yhat'|): 0.4491
[Epoch 500 (52%), batch 6] energy-loss: 0.2092,  E(|Y-Yhat|): 0.4236,  E(|Yhat-Yhat'|): 0.4288
[Epoch 600 (62%), batch 6] energy-loss: 0.1923,  E(|Y-Yhat|): 0.4103,  E(|Yhat-Yhat'|): 0.4359
[Epoch 700 (73%), batch 6] energy-loss: 0.

[I 2024-01-22 21:09:16,268] Trial 2 finished with value: 13.737828458648657 and parameters: {'learning_rate': 0.07075637776590661, 'num_epoches': 959, 'num_layer': 2, 'hidden_dim': 76, 'noise_dim': 91}. Best is trial 0 with value: 4.915328465896906.


Running on CPU.

Residual blocks (skip-connections) are typically recommended for more than 2 layers; turn it on by setting resblock=True.
Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch 6] energy-loss: 0.5964,  E(|Y-Yhat|): 1.1647,  E(|Yhat-Yhat'|): 1.1367
[Epoch 100 (13%), batch 6] energy-loss: 0.2197,  E(|Y-Yhat|): 0.4295,  E(|Yhat-Yhat'|): 0.4195
[Epoch 200 (27%), batch 6] energy-loss: 0.2073,  E(|Y-Yhat|): 0.4470,  E(|Yhat-Yhat'|): 0.4793
[Epoch 300 (40%), batch 6] energy-loss: 0.1971,  E(|Y-Yhat|): 0.4186,  E(|Yhat-Yhat'|): 0.4431
[Epoch 400 (53%), batch 6] energy-loss: 0.1847,  E(|Y-Yhat|): 0.4081,  E(|Yhat-Yhat'|): 0.4470
[Epoch 500 (67%), batch 6] energy-loss: 0.1961,  E(|Y-Yhat|): 0.3958,  E(|Yhat-Yhat'|): 0.3994
[Epoch 600 (80%

[I 2024-01-22 21:11:38,688] Trial 3 finished with value: 12.374831756211957 and parameters: {'learning_rate': 0.044997613517186334, 'num_epoches': 750, 'num_layer': 3, 'hidden_dim': 96, 'noise_dim': 86}. Best is trial 0 with value: 4.915328465896906.


Running on CPU.

Residual blocks (skip-connections) are typically recommended for more than 2 layers; turn it on by setting resblock=True.
Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch 6] energy-loss: 0.3035,  E(|Y-Yhat|): 0.5615,  E(|Yhat-Yhat'|): 0.5158
[Epoch 100 (43%), batch 6] energy-loss: 0.2047,  E(|Y-Yhat|): 0.4474,  E(|Yhat-Yhat'|): 0.4855
[Epoch 200 (87%), batch 6] energy-loss: 0.2179,  E(|Y-Yhat|): 0.4464,  E(|Yhat-Yhat'|): 0.4568

Training loss on the original (non-standardized) scale:
	Energy-loss: 1.0209,  E(|Y-Yhat|): 2.1303,  E(|Yhat-Yhat'|): 2.2189

Prediction-loss E(|Y-Yhat|) and variance-loss E(|Yhat-Yhat'|) should ideally be equally large
-- consider training for more epochs or adjusting hyperparameters if there is a

[I 2024-01-22 21:12:38,496] Trial 4 finished with value: 7.3621405925077 and parameters: {'learning_rate': 0.029128020748551788, 'num_epoches': 228, 'num_layer': 3, 'hidden_dim': 84, 'noise_dim': 72}. Best is trial 0 with value: 4.915328465896906.


In [185]:
boosted_model.fit(X_train, y_train)
y_test_hat_boosted=boosted_model.predict(X_test)
RMSE_boosted=np.sqrt(np.mean((y_test-y_test_hat_boosted)**2))

rf_model.fit(X_train, y_train)
y_test_hat_rf=rf_model.predict(X_test)
# Calculate the standard deviation of the residuals
std_dev = np.std(y_test - y_test_hat_rf)
# Calculate the CRPS for each prediction
y_test_np = y_test.values.flatten()
crps_values = [crps_gaussian(y_test_np[i], mu=y_test_hat_rf[i], sig=std_dev) for i in range(len(y_test_np))]
CRPS_rf = np.mean(crps_values)

lin_reg=LinearRegression()
lin_reg.fit(X_train, y_train)
y_test_hat_linreg=lin_reg.predict(X_test)
# Calculate the standard deviation of the residuals
std_dev = np.std(y_test - y_test_hat_linreg)
# Calculate the CRPS for each prediction
y_test_np = y_test.values.flatten()
crps_values = [crps_gaussian(y_test_np[i], mu=y_test_hat_linreg[i], sig=std_dev) for i in range(len(y_test_np))]
CRPS_linreg = np.mean(crps_values)

params=study_engression.best_params
engressor_model=engression(torch.Tensor(np.array(X_train)), torch.Tensor(np.array(y_train).reshape(-1,1)), lr=params['learning_rate'], num_epoches=params['num_epoches'],num_layer=params['num_layer'], hidden_dim=params['hidden_dim'], noise_dim=params['noise_dim'], batch_size=1000)
N_SAMPLES=100
np.random.seed(10)
# Generate a sample from the engression model for each data point
y_test_hat_engression_samples = [engressor_model.predict(torch.Tensor(np.array([X_test.values[i]])), target=list(np.random.uniform(0,1,N_SAMPLES))) for i in range(len(X_test))]
# Calculate the CRPS for each prediction
y_test_np = y_test.values.flatten()
crps_values = [crps_ensemble(y_test_np[i], np.array(y_test_hat_engression_samples[i]).reshape(-1,)) for i in range(len(y_test_np))]
CRPS_engression=np.mean(crps_values)


print("CRPS linear regression: ",CRPS_linreg)
print("RMSE boosted trees", RMSE_boosted)
print("CRPS random forest", CRPS_rf)
print("CRPS engression", CRPS_engression)

Running on CPU.

Residual blocks (skip-connections) are typically recommended for more than 2 layers; turn it on by setting resblock=True.
Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch 7] energy-loss: 0.5162,  E(|Y-Yhat|): 0.7144,  E(|Yhat-Yhat'|): 0.3962
[Epoch 100 (84%), batch 7] energy-loss: 0.1096,  E(|Y-Yhat|): 0.2499,  E(|Yhat-Yhat'|): 0.2806

Training loss on the original (non-standardized) scale:
	Energy-loss: 1.7721,  E(|Y-Yhat|): 3.6974,  E(|Yhat-Yhat'|): 3.8507

Prediction-loss E(|Y-Yhat|) and variance-loss E(|Yhat-Yhat'|) should ideally be equally large
-- consider training for more epochs or adjusting hyperparameters if there is a mismatch 
CRPS linear regression:  30.33599373187919
RMSE boosted trees 5.043509012980777
CRPS

In [186]:
boosted_model.fit(X_train, y_train)
y_test_hat_boosted=boosted_model.predict(X_test)
RMSE_boosted=np.sqrt(np.mean((y_test-y_test_hat_boosted)**2))

rf_model.fit(X_train, y_train)
y_test_hat_rf=rf_model.predict(X_test)
# Calculate the standard deviation of the residuals
std_dev = np.std(y_test - y_test_hat_rf)
# Calculate the CRPS for each prediction
y_test_np = y_test.values.flatten()
crps_values = [crps_gaussian(y_test_np[i], mu=y_test_hat_rf[i], sig=std_dev) for i in range(len(y_test_np))]
CRPS_rf = np.mean(crps_values)

lin_reg=LinearRegression()
lin_reg.fit(X_train, y_train)
y_test_hat_linreg=lin_reg.predict(X_test)
# Calculate the standard deviation of the residuals
std_dev = np.std(y_test - y_test_hat_linreg)
# Calculate the CRPS for each prediction
y_test_np = y_test.values.flatten()
crps_values = [crps_gaussian(y_test_np[i], mu=y_test_hat_linreg[i], sig=std_dev) for i in range(len(y_test_np))]
CRPS_linreg = np.mean(crps_values)

params=study_engression.best_params
engressor_model=engression(torch.Tensor(np.array(X_train)), torch.Tensor(np.array(y_train).reshape(-1,1)), lr=params['learning_rate'], num_epoches=params['num_epoches'],num_layer=params['num_layer'], hidden_dim=params['hidden_dim'], noise_dim=params['noise_dim'], batch_size=1000)
N_SAMPLES=100
np.random.seed(10)
# Generate a sample from the engression model for each data point
y_test_hat_engression_samples = [engressor_model.predict(torch.Tensor(np.array([X_test.values[i]])), target=list(np.random.uniform(0,1,N_SAMPLES))) for i in range(len(X_test))]
# Calculate the CRPS for each prediction
y_test_np = y_test.values.flatten()
crps_values = [crps_ensemble(y_test_np[i], np.array(y_test_hat_engression_samples[i]).reshape(-1,)) for i in range(len(y_test_np))]
CRPS_engression=np.mean(crps_values)


print("CRPS linear regression: ",CRPS_linreg)
print("RMSE boosted trees", RMSE_boosted)
print("CRPS random forest", CRPS_rf)
print("CRPS engression", CRPS_engression)

Running on CPU.

Residual blocks (skip-connections) are typically recommended for more than 2 layers; turn it on by setting resblock=True.
Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch 7] energy-loss: 0.7340,  E(|Y-Yhat|): 1.1417,  E(|Yhat-Yhat'|): 0.8153
[Epoch 100 (84%), batch 7] energy-loss: 0.1448,  E(|Y-Yhat|): 0.3051,  E(|Yhat-Yhat'|): 0.3206

Training loss on the original (non-standardized) scale:
	Energy-loss: 7.6452,  E(|Y-Yhat|): 8.9416,  E(|Yhat-Yhat'|): 2.5929

Prediction-loss E(|Y-Yhat|) and variance-loss E(|Yhat-Yhat'|) should ideally be equally large
-- consider training for more epochs or adjusting hyperparameters if there is a mismatch 
CRPS linear regression:  30.33599373187919
RMSE boosted trees 5.043509012980777
CRPS

In [193]:
boosted_model.fit(X_train, y_train)
y_test_hat_boosted=boosted_model.predict(X_test)
RMSE_boosted=np.sqrt(np.mean((y_test-y_test_hat_boosted)**2))

rf_model.fit(X_train, y_train)
y_test_hat_rf=rf_model.predict(X_test)
# Calculate the standard deviation of the residuals
std_dev = np.std(y_test - y_test_hat_rf)
# Calculate the CRPS for each prediction
y_test_np = y_test.values.flatten()
crps_values = [crps_gaussian(y_test_np[i], mu=y_test_hat_rf[i], sig=std_dev) for i in range(len(y_test_np))]
CRPS_rf = np.mean(crps_values)

lin_reg=LinearRegression()
lin_reg.fit(X_train, y_train)
y_test_hat_linreg=lin_reg.predict(X_test)
# Calculate the standard deviation of the residuals
std_dev = np.std(y_test - y_test_hat_linreg)
# Calculate the CRPS for each prediction
y_test_np = y_test.values.flatten()
crps_values = [crps_gaussian(y_test_np[i], mu=y_test_hat_linreg[i], sig=std_dev) for i in range(len(y_test_np))]
CRPS_linreg = np.mean(crps_values)

params=study_engression.best_params
engressor_model=engression(torch.Tensor(np.array(X_train)), torch.Tensor(np.array(y_train).reshape(-1,1)), lr=params['learning_rate'], num_epoches=params['num_epoches'],num_layer=params['num_layer'], hidden_dim=params['hidden_dim'], noise_dim=params['noise_dim'], batch_size=1000)
N_SAMPLES=100
seed=10
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
# Generate a sample from the engression model for each data point
y_test_hat_engression_samples = [engressor_model.predict(torch.Tensor(np.array([X_test.values[i]])), target=list(np.random.uniform(0,1,N_SAMPLES))) for i in range(len(X_test))]
# Calculate the CRPS for each prediction
y_test_np = y_test.values.flatten()
crps_values = [crps_ensemble(y_test_np[i], np.array(y_test_hat_engression_samples[i]).reshape(-1,)) for i in range(len(y_test_np))]
CRPS_engression=np.mean(crps_values)


print("CRPS linear regression: ",CRPS_linreg)
print("RMSE boosted trees", RMSE_boosted)
print("CRPS random forest", CRPS_rf)
print("CRPS engression", CRPS_engression)

Running on CPU.

Residual blocks (skip-connections) are typically recommended for more than 2 layers; turn it on by setting resblock=True.
Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch 7] energy-loss: 0.6452,  E(|Y-Yhat|): 1.0007,  E(|Yhat-Yhat'|): 0.7110
[Epoch 100 (84%), batch 7] energy-loss: 0.1494,  E(|Y-Yhat|): 0.2912,  E(|Yhat-Yhat'|): 0.2835

Training loss on the original (non-standardized) scale:
	Energy-loss: 2.5185,  E(|Y-Yhat|): 3.2513,  E(|Yhat-Yhat'|): 1.4656

Prediction-loss E(|Y-Yhat|) and variance-loss E(|Yhat-Yhat'|) should ideally be equally large
-- consider training for more epochs or adjusting hyperparameters if there is a mismatch 
CRPS linear regression:  30.33599373187919
RMSE boosted trees 5.043509012980777
CRPS