In [35]:

import pandas as pd
import numpy as np
import setuptools
import openml
from sklearn.linear_model import LinearRegression 
import lightgbm as lgbm
import optuna
from scipy.spatial.distance import mahalanobis
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process.kernels import Matern
from engression import engression, engression_bagged
import torch
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import mahalanobis
from scipy.stats import norm
from sklearn.metrics import mean_squared_error
from rtdl_revisiting_models import MLP, ResNet, FTTransformer
import random
import gpytorch
import tqdm.auto as tqdm
import os
from pygam import LinearGAM, s, f
import torch
from torch import nn
from torch.optim import Adam
from sklearn.metrics import accuracy_score

#SUITE_ID = 336 # Regression on numerical features
SUITE_ID = 337 # Classification on numerical features
#SUITE_ID = 335 # Regression on numerical and categorical features
#SUITE_ID = 334 # Classification on numerical and categorical features
benchmark_suite = openml.study.get_suite(SUITE_ID)  # obtain the benchmark suite

In [36]:
len(benchmark_suite.tasks)

16

In [37]:
'''for task_id in benchmark_suite.tasks:
    task = openml.tasks.get_task(task_id)
    dataset = task.get_dataset()
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="dataframe", target=dataset.default_target_attribute)
    print(y.value_counts())'''

'for task_id in benchmark_suite.tasks:\n    task = openml.tasks.get_task(task_id)\n    dataset = task.get_dataset()\n    X, y, categorical_indicator, attribute_names = dataset.get_data(\n        dataset_format="dataframe", target=dataset.default_target_attribute)\n    print(y.value_counts())'

In [38]:
y.dtype

dtype('uint8')

In [39]:
task_id=361055
task = openml.tasks.get_task(task_id)  # download the OpenML task
dataset = task.get_dataset()

X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="dataframe", target=dataset.default_target_attribute)

# Transform y to int type, to then be able to apply BCEWithLogitsLoss
y=y.astype('int')

# Set the random seed for reproducibility
N_TRIALS=5
N_SAMPLES=100
seed=10
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)

# calculate the mean and covariance matrix of the dataset
mean = np.mean(X, axis=0)
cov = np.cov(X.T)

# calculate the Mahalanobis distance for each data point
mahalanobis_dist = [mahalanobis(x, mean, np.linalg.inv(cov)) for x in X.values]

mahalanobis_dist=pd.Series(mahalanobis_dist,index=X.index)
far_index=mahalanobis_dist.index[np.where(mahalanobis_dist>=np.quantile(mahalanobis_dist,0.8))[0]]
close_index=mahalanobis_dist.index[np.where(mahalanobis_dist<np.quantile(mahalanobis_dist,0.8))[0]]

X_train = X.loc[close_index,:]
X_test = X.loc[far_index,:]
y_train = y.loc[close_index]
y_test = y.loc[far_index]

mean = np.mean(X_train, axis=0)
cov = np.cov(X_train.T)

# calculate the Mahalanobis distance for each data point
mahalanobis_dist_ = [mahalanobis(x, mean, np.linalg.inv(cov)) for x in X_train.values]

mahalanobis_dist_=pd.Series(mahalanobis_dist_,index=X_train.index)
far_index_=mahalanobis_dist_.index[np.where(mahalanobis_dist_>=np.quantile(mahalanobis_dist_,0.8))[0]]
close_index_=mahalanobis_dist_.index[np.where(mahalanobis_dist_<np.quantile(mahalanobis_dist_,0.8))[0]]

X_train_ = X_train.loc[close_index_,:]
X_val = X_train.loc[far_index_,:]
y_train_ = y_train.loc[close_index_]
y_val = y_train.loc[far_index_]


# Convert data to PyTorch tensors
X_train__tensor = torch.tensor(X_train_.values, dtype=torch.float32)
y_train__tensor = torch.tensor(y_train_.values, dtype=torch.float32)
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Convert to use GPU if available
if torch.cuda.is_available():
    X_train__tensor = X_train__tensor.cuda()
    y_train__tensor = y_train__tensor.cuda()
    X_train_tensor = X_train_tensor.cuda()
    y_train_tensor = y_train_tensor.cuda()
    X_val_tensor = X_val_tensor.cuda()
    y_val_tensor = y_val_tensor.cuda()
    X_test_tensor = X_test_tensor.cuda()
    y_test_tensor = y_test_tensor.cuda()

# Create flattened versions of the data
y_val_np = y_val.values.flatten()
y_test_np = y_test.values.flatten()



In [42]:
# #### Define train function
def train(model,criterion,loss_Adam,optimizer,training_iterations,X_train_tensor,y_train_tensor):
    iterator = tqdm.tqdm(range(training_iterations), desc="Train")

    for _ in iterator:
        # making a pridiction in forward pass
        y_train_hat = model(X_train_tensor).reshape(-1,)
        # calculating the loss between original and predicted data points
        loss = criterion(y_train_hat, torch.Tensor(y_train_tensor))
        # store loss into list
        loss_Adam.append(loss.item())
        # zeroing gradients after each iteration
        optimizer.zero_grad()
        # backward pass for computing the gradients of the loss w.r.t to learnable parameters
        loss.backward()
        # updating the parameters after each iteration
        optimizer.step()
        iterator.set_postfix(loss=loss.item())
        torch.cuda.empty_cache()

# #### MLP
d_out = 1  
d_in=X_train_.shape[1]

def MLP_opt(trial):

    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)

    n_blocks = trial.suggest_int("n_blocks", 1, 5)
    d_block = trial.suggest_int("d_block", 10, 500)
    dropout = trial.suggest_float("dropout", 0, 1)

    MLP_model = MLP(
    d_in=d_in,
    d_out=1,  # For binary classification, output dimension should be 1
    n_blocks=n_blocks,
    d_block=d_block,
    dropout=dropout,
    )
    n_epochs=trial.suggest_int('n_epochs', 1, 100)
    learning_rate=trial.suggest_float('learning_rate', 0.0001, 0.05, log=True)
    weight_decay=trial.suggest_float('weight_decay', 1e-8, 1e-3, log=True)
    optimizer=torch.optim.Adam(MLP_model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = torch.nn.BCEWithLogitsLoss()  # Use Binary Cross Entropy loss for binary classification
    loss_Adam=[]

    if torch.cuda.is_available():
        MLP_model = MLP_model.cuda()
    
    train(MLP_model,criterion,loss_Adam,optimizer,n_epochs,X_train__tensor,y_train__tensor)

    # Point prediction
    y_val_hat_MLP = torch.sigmoid(MLP_model(X_val_tensor).reshape(-1,))  # Apply sigmoid to get probabilities
    accuracy_MLP = accuracy_score(y_val_tensor.cpu().numpy(), y_val_hat_MLP.ge(0.5).float().cpu().numpy())  # Calculate accuracy

    return accuracy_MLP

sampler_MLP = optuna.samplers.TPESampler(seed=seed)
study_MLP = optuna.create_study(sampler=sampler_MLP, direction='maximize')  # We want to maximize accuracy
study_MLP.optimize(MLP_opt, n_trials=N_TRIALS)

MLP_model = MLP(
    d_in=d_in,
    d_out=1,  # For binary classification, output dimension should be 1
    n_blocks=study_MLP.best_params['n_blocks'],
    d_block=study_MLP.best_params['d_block'],
    dropout=study_MLP.best_params['dropout'],
    )

if torch.cuda.is_available():
    MLP_model = MLP_model.cuda()
    
n_epochs=study_MLP.best_params['n_epochs']
learning_rate=study_MLP.best_params['learning_rate']
weight_decay=study_MLP.best_params['weight_decay']
optimizer=torch.optim.Adam(MLP_model.parameters(), lr=learning_rate, weight_decay=weight_decay)
criterion = torch.nn.BCEWithLogitsLoss()  # Use Binary Cross Entropy loss for binary classification
loss_Adam=[]

train(MLP_model,criterion,loss_Adam,optimizer,n_epochs,X_train_tensor,y_train_tensor)

# Point prediction
y_test_hat_MLP = torch.sigmoid(MLP_model(X_test_tensor).reshape(-1,))  # Apply sigmoid to get probabilities
accuracy_MLP = accuracy_score(y_test_tensor.cpu().numpy(), y_test_hat_MLP.ge(0.5).float().cpu().numpy())  # Calculate accuracy
print("Accuracy MLP: ", accuracy_MLP)

[I 2024-01-30 18:40:26,318] A new study created in memory with name: no-name-b3ff4b09-1560-464e-827d-d5c557220d80


Train:   0%|          | 0/75 [00:00<?, ?it/s]

[I 2024-01-30 18:40:27,866] Trial 0 finished with value: 0.5237383177570093 and parameters: {'n_blocks': 4, 'd_block': 20, 'dropout': 0.6336482349262754, 'n_epochs': 75, 'learning_rate': 0.002215416944953109, 'weight_decay': 1.33040303714882e-07}. Best is trial 0 with value: 0.5237383177570093.


Train:   0%|          | 0/9 [00:00<?, ?it/s]

[I 2024-01-30 18:40:28,378] Trial 1 finished with value: 0.3024299065420561 and parameters: {'n_blocks': 1, 'd_block': 383, 'dropout': 0.16911083656253545, 'n_epochs': 9, 'learning_rate': 0.007075637776590665, 'weight_decay': 0.0005847452881552242}. Best is trial 0 with value: 0.5237383177570093.


Train:   0%|          | 0/62 [00:00<?, ?it/s]

[I 2024-01-30 18:40:30,433] Trial 2 finished with value: 0.2927102803738318 and parameters: {'n_blocks': 1, 'd_block': 261, 'dropout': 0.8126209616521135, 'n_epochs': 62, 'learning_rate': 0.008871477434617912, 'weight_decay': 2.879919449586155e-07}. Best is trial 0 with value: 0.5237383177570093.


Train:   0%|          | 0/15 [00:00<?, ?it/s]

[I 2024-01-30 18:40:38,895] Trial 3 finished with value: 0.4033644859813084 and parameters: {'n_blocks': 5, 'd_block': 360, 'dropout': 0.5425443680112613, 'n_epochs': 15, 'learning_rate': 0.0010177368807699995, 'weight_decay': 2.3478377182859888e-05}. Best is trial 0 with value: 0.5237383177570093.


Train:   0%|          | 0/52 [00:00<?, ?it/s]

[I 2024-01-30 18:40:47,405] Trial 4 finished with value: 0.29233644859813085 and parameters: {'n_blocks': 3, 'd_block': 223, 'dropout': 0.6177669784693172, 'n_epochs': 52, 'learning_rate': 0.005693803629695728, 'weight_decay': 1.0120332166548561e-05}. Best is trial 0 with value: 0.5237383177570093.


Train:   0%|          | 0/75 [00:00<?, ?it/s]

Accuracy MLP:  0.5770266227938977


In [43]:
d_out = 1  
d_in=X_train_.shape[1]

def ResNet_opt(trial):

    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)

    n_blocks = trial.suggest_int("n_blocks", 1, 5)
    d_block = trial.suggest_int("d_block", 10, 500)
    dropout1 = trial.suggest_float("dropout1", 0, 1)
    dropout2 = trial.suggest_float("dropout2", 0, 1)
    d_hidden_multiplier=trial.suggest_float("d_hidden_multiplier", 0.5, 3)

    ResNet_model = ResNet(
    d_in=d_in,
    d_out=1,  # For binary classification, output dimension should be 1
    n_blocks=n_blocks,
    d_block=d_block,
    d_hidden=None,
    d_hidden_multiplier=d_hidden_multiplier,
    dropout1=dropout1,
    dropout2=dropout2,
    )
    if torch.cuda.is_available():
        ResNet_model = ResNet_model.cuda()
    n_epochs=trial.suggest_int('n_epochs', 1, 100)
    learning_rate=trial.suggest_float('learning_rate', 0.0001, 0.05, log=True)
    weight_decay=trial.suggest_float('weight_decay', 1e-8, 1e-3, log=True)
    optimizer=torch.optim.Adam(ResNet_model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = torch.nn.BCEWithLogitsLoss()  # Use Binary Cross Entropy loss for binary classification
    loss_Adam=[]

    train(ResNet_model,criterion,loss_Adam,optimizer,n_epochs,X_train__tensor,y_train__tensor)

    # Point prediction
    y_val_hat_ResNet = torch.sigmoid(ResNet_model(X_val_tensor).reshape(-1,))  # Apply sigmoid to get probabilities
    accuracy_ResNet = accuracy_score(y_val_tensor.cpu().numpy(), y_val_hat_ResNet.ge(0.5).float().cpu().numpy())  # Calculate accuracy

    return accuracy_ResNet

sampler_ResNet = optuna.samplers.TPESampler(seed=seed)
study_ResNet = optuna.create_study(sampler=sampler_ResNet, direction='maximize')  # We want to maximize accuracy
study_ResNet.optimize(ResNet_opt, n_trials=N_TRIALS)

ResNet_model = ResNet(
    d_in=d_in,
    d_out=1,  # For binary classification, output dimension should be 1
    n_blocks=study_ResNet.best_params['n_blocks'],
    d_block=study_ResNet.best_params['d_block'],
    d_hidden=None,
    d_hidden_multiplier=study_ResNet.best_params['d_hidden_multiplier'],
    dropout1=study_ResNet.best_params['dropout1'],
    dropout2=study_ResNet.best_params['dropout2'],
    )

if torch.cuda.is_available():
    ResNet_model = ResNet_model.cuda()

n_epochs=study_ResNet.best_params['n_epochs']
learning_rate=study_ResNet.best_params['learning_rate']
weight_decay=study_ResNet.best_params['weight_decay']
optimizer=torch.optim.Adam(ResNet_model.parameters(), lr=learning_rate, weight_decay=weight_decay)
criterion = torch.nn.BCEWithLogitsLoss()  # Use Binary Cross Entropy loss for binary classification
loss_Adam=[]

train(ResNet_model,criterion,loss_Adam,optimizer,n_epochs,X_train_tensor,y_train_tensor)

# Point prediction
y_test_hat_ResNet = torch.sigmoid(ResNet_model(X_test_tensor).reshape(-1,))  # Apply sigmoid to get probabilities
accuracy_ResNet = accuracy_score(y_test_tensor.cpu().numpy(), y_test_hat_ResNet.ge(0.5).float().cpu().numpy())  # Calculate accuracy
print("Accuracy ResNet: ", accuracy_ResNet)

[I 2024-01-30 18:41:35,770] A new study created in memory with name: no-name-33f8520b-de15-43c3-98c6-49a88fa307ad


Train:   0%|          | 0/23 [00:00<?, ?it/s]

[I 2024-01-30 18:41:37,102] Trial 0 finished with value: 0.4672897196261682 and parameters: {'n_blocks': 4, 'd_block': 20, 'dropout1': 0.6336482349262754, 'dropout2': 0.7488038825386119, 'd_hidden_multiplier': 1.7462675307564761, 'n_epochs': 23, 'learning_rate': 0.000342425210145967, 'weight_decay': 6.348243270946383e-05}. Best is trial 0 with value: 0.4672897196261682.


Train:   0%|          | 0/52 [00:00<?, ?it/s]

[I 2024-01-30 18:41:38,381] Trial 1 finished with value: 0.5364485981308411 and parameters: {'n_blocks': 1, 'd_block': 53, 'dropout1': 0.6853598183677972, 'dropout2': 0.9533933461949365, 'd_hidden_multiplier': 0.5098706658197861, 'n_epochs': 52, 'learning_rate': 0.015604131457893529, 'weight_decay': 1.155128593079557e-05}. Best is trial 1 with value: 0.5364485981308411.


Train:   0%|          | 0/15 [00:00<?, ?it/s]

[I 2024-01-30 18:41:45,953] Trial 2 finished with value: 0.2930841121495327 and parameters: {'n_blocks': 4, 'd_block': 153, 'dropout1': 0.9177741225129434, 'dropout2': 0.7145757833976906, 'd_hidden_multiplier': 1.8563609200281532, 'n_epochs': 15, 'learning_rate': 0.0010177368807699995, 'weight_decay': 2.3478377182859888e-05}. Best is trial 1 with value: 0.5364485981308411.


Train:   0%|          | 0/61 [00:00<?, ?it/s]

[I 2024-01-30 18:42:31,859] Trial 3 finished with value: 0.4291588785046729 and parameters: {'n_blocks': 3, 'd_block': 223, 'dropout1': 0.6177669784693172, 'dropout2': 0.5131382425543909, 'd_hidden_multiplier': 2.125992954828668, 'n_epochs': 61, 'learning_rate': 0.014902984681415245, 'weight_decay': 4.057287303826027e-06}. Best is trial 1 with value: 0.5364485981308411.


Train:   0%|          | 0/83 [00:00<?, ?it/s]

[I 2024-01-30 18:43:03,974] Trial 4 finished with value: 0.2930841121495327 and parameters: {'n_blocks': 5, 'd_block': 166, 'dropout1': 0.09045934927090737, 'dropout2': 0.30070005663620336, 'd_hidden_multiplier': 0.7849609046588744, 'n_epochs': 83, 'learning_rate': 0.00013383563361780206, 'weight_decay': 1.3534298216580227e-05}. Best is trial 1 with value: 0.5364485981308411.


Train:   0%|          | 0/52 [00:00<?, ?it/s]

Accuracy ResNet:  0.3104995513012264


In [44]:
# #### FFTransformer

def train_trans(model,criterion,loss_Adam,optimizer,training_iterations,X_train_tensor,y_train_tensor):
    iterator = tqdm.tqdm(range(training_iterations), desc="Train")

    for _ in iterator:
        # making a pridiction in forward pass
        y_train_hat = model(X_train_tensor, None).reshape(-1,)
        # calculating the loss between original and predicted data points
        loss = criterion(y_train_hat, torch.Tensor(y_train_tensor))
        # store loss into list
        loss_Adam.append(loss.item())
        # zeroing gradients after each iteration
        optimizer.zero_grad()
        # backward pass for computing the gradients of the loss w.r.t to learnable parameters
        loss.backward()
        # updating the parameters after each iteration
        optimizer.step()
        iterator.set_postfix(loss=loss.item())
        torch.cuda.empty_cache()

d_out = 1  
d_in=X_train_.shape[1]

def FTTrans_opt(trial):

    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)

    n_blocks = trial.suggest_int("n_blocks", 1, 5)
    d_block_multiplier = trial.suggest_int("d_block_multiplier", 1, 25)
    attention_n_heads = trial.suggest_int("attention_n_heads", 1, 20)
    attention_dropout = trial.suggest_float("attention_dropout", 0, 1)
    ffn_d_hidden_multiplier=trial.suggest_float("ffn_d_hidden_multiplier", 0.5, 3)
    ffn_dropout = trial.suggest_float("ffn_dropout", 0, 1)
    residual_dropout = trial.suggest_float("residual_dropout", 0, 1)

    FTTrans_model = FTTransformer(
    n_cont_features=d_in,
    cat_cardinalities=[],
    d_out=1,  # For binary classification, output dimension should be 1
    n_blocks=n_blocks,
    d_block=d_block_multiplier*attention_n_heads,
    attention_n_heads=attention_n_heads,
    attention_dropout=attention_dropout,
    ffn_d_hidden=None,
    ffn_d_hidden_multiplier=ffn_d_hidden_multiplier,
    ffn_dropout=ffn_dropout,
    residual_dropout=residual_dropout,
    )

    if torch.cuda.is_available():
        FTTrans_model = FTTrans_model.cuda()

    n_epochs=trial.suggest_int('n_epochs', 1, 3)
    learning_rate=trial.suggest_float('learning_rate', 0.0001, 0.05, log=True)
    weight_decay=trial.suggest_float('weight_decay', 1e-8, 1e-3, log=True)
    optimizer=torch.optim.Adam(FTTrans_model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = torch.nn.BCEWithLogitsLoss()  # Use Binary Cross Entropy loss for binary classification
    loss_Adam=[]

    train_trans(FTTrans_model,criterion,loss_Adam,optimizer,n_epochs,X_train__tensor,y_train__tensor)

    # Point prediction
    y_val_hat_FTTrans = torch.sigmoid(FTTrans_model(X_val_tensor, None).reshape(-1,))  # Apply sigmoid to get probabilities
    accuracy_FTTrans = accuracy_score(y_val_tensor.cpu().numpy(), y_val_hat_FTTrans.ge(0.5).float().cpu().numpy())  # Calculate accuracy

    return accuracy_FTTrans

sampler_FTTrans = optuna.samplers.TPESampler(seed=seed)
study_FTTrans = optuna.create_study(sampler=sampler_FTTrans, direction='maximize')  # We want to maximize accuracy
study_FTTrans.optimize(FTTrans_opt, n_trials=N_TRIALS)


FTTrans_model = FTTransformer(
    n_cont_features=d_in,
    cat_cardinalities=[],
    d_out=1,  # For binary classification, output dimension should be 1
    n_blocks=study_FTTrans.best_params['n_blocks'],
    d_block=study_FTTrans.best_params['d_block_multiplier']*study_FTTrans.best_params['attention_n_heads'],
    attention_n_heads=study_FTTrans.best_params['attention_n_heads'],
    attention_dropout=study_FTTrans.best_params['attention_dropout'],
    ffn_d_hidden=None,
    ffn_d_hidden_multiplier=study_FTTrans.best_params['ffn_d_hidden_multiplier'],
    ffn_dropout=study_FTTrans.best_params['ffn_dropout'],
    residual_dropout=study_FTTrans.best_params['residual_dropout'],
    )

if torch.cuda.is_available():
    FTTrans_model = FTTrans_model.cuda()

n_epochs=study_FTTrans.best_params['n_epochs']
learning_rate=study_FTTrans.best_params['learning_rate']
weight_decay=study_FTTrans.best_params['weight_decay']
optimizer=torch.optim.Adam(FTTrans_model.parameters(), lr=learning_rate, weight_decay=weight_decay)
criterion = torch.nn.BCEWithLogitsLoss()  # Use Binary Cross Entropy loss for binary classification
loss_Adam=[]

train_trans(FTTrans_model,criterion,loss_Adam,optimizer,n_epochs,X_train_tensor,y_train_tensor)

# Point prediction
y_test_hat_FTTrans = torch.sigmoid(FTTrans_model(X_test_tensor, None).reshape(-1,))  # Apply sigmoid to get probabilities
accuracy_FTTrans = accuracy_score(y_test_tensor.cpu().numpy(), y_test_hat_FTTrans.ge(0.5).float().cpu().numpy())  # Calculate accuracy
print("Accuracy FTTrans: ", accuracy_FTTrans)

[I 2024-01-30 18:43:50,479] A new study created in memory with name: no-name-26b1d987-23fe-4e9e-870e-6f9839d4e3b3


Train:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2024-01-30 18:44:05,395] Trial 0 finished with value: 0.4654205607476635 and parameters: {'n_blocks': 4, 'd_block_multiplier': 1, 'attention_n_heads': 13, 'attention_dropout': 0.7488038825386119, 'ffn_d_hidden_multiplier': 1.7462675307564761, 'ffn_dropout': 0.22479664553084766, 'residual_dropout': 0.19806286475962398, 'n_epochs': 3, 'learning_rate': 0.0002860388842288948, 'weight_decay': 2.765025054332623e-08}. Best is trial 0 with value: 0.4654205607476635.


Train:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-01-30 18:44:07,731] Trial 1 finished with value: 0.29121495327102803 and parameters: {'n_blocks': 4, 'd_block_multiplier': 24, 'attention_n_heads': 1, 'attention_dropout': 0.5121922633857766, 'ffn_d_hidden_multiplier': 2.531552404130284, 'ffn_dropout': 0.6125260668293881, 'residual_dropout': 0.7217553174317995, 'n_epochs': 1, 'learning_rate': 0.029994721053560828, 'weight_decay': 3.7400629930578146e-05}. Best is trial 0 with value: 0.4654205607476635.


Train:   0%|          | 0/2 [00:00<?, ?it/s]

[I 2024-01-30 18:44:14,214] Trial 2 finished with value: 0.32560747663551404 and parameters: {'n_blocks': 3, 'd_block_multiplier': 4, 'attention_n_heads': 8, 'attention_dropout': 0.6741336150663453, 'ffn_d_hidden_multiplier': 1.6045829360574901, 'ffn_dropout': 0.4340139933332937, 'residual_dropout': 0.6177669784693172, 'n_epochs': 2, 'learning_rate': 0.005693803629695728, 'weight_decay': 1.0120332166548561e-05}. Best is trial 0 with value: 0.4654205607476635.


Train:   0%|          | 0/3 [00:00<?, ?it/s]

In [16]:
import pandas as pd
import numpy as np
import setuptools
import openml
from sklearn.linear_model import LogisticRegression 
import lightgbm as lgbm
import optuna
from scipy.spatial.distance import mahalanobis
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process.kernels import Matern
from engression import engression, engression_bagged
import torch
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import mahalanobis
from scipy.stats import norm
from sklearn.metrics import mean_squared_error
from rtdl_revisiting_models import MLP, ResNet, FTTransformer
import random
import gpytorch
import tqdm.auto as tqdm
import os
from pygam import LogisticGAM, s
import torch
from torch import nn
from torch.optim import Adam
from sklearn.metrics import accuracy_score

#SUITE_ID = 336 # Regression on numerical features
SUITE_ID = 337 # Classification on numerical features
#SUITE_ID = 335 # Regression on numerical and categorical features
#SUITE_ID = 334 # Classification on numerical and categorical features
benchmark_suite = openml.study.get_suite(SUITE_ID)  # obtain the benchmark suite

task_id=361055
task = openml.tasks.get_task(task_id)  # download the OpenML task
dataset = task.get_dataset()

X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="dataframe", target=dataset.default_target_attribute)

# Transform y to int type, to then be able to apply BCEWithLogitsLoss
y=y.astype('int')

# Set the random seed for reproducibility
N_TRIALS=2
N_SAMPLES=100
seed=10
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)


# calculate the mean and covariance matrix of the dataset
mean = np.mean(X, axis=0)
cov = np.cov(X.T)

# calculate the Mahalanobis distance for each data point
mahalanobis_dist = [mahalanobis(x, mean, np.linalg.inv(cov)) for x in X.values]

mahalanobis_dist=pd.Series(mahalanobis_dist,index=X.index)
far_index=mahalanobis_dist.index[np.where(mahalanobis_dist>=np.quantile(mahalanobis_dist,0.8))[0]]
close_index=mahalanobis_dist.index[np.where(mahalanobis_dist<np.quantile(mahalanobis_dist,0.8))[0]]

X_train = X.loc[close_index,:]
X_test = X.loc[far_index,:]
y_train = y.loc[close_index]
y_test = y.loc[far_index]

mean = np.mean(X_train, axis=0)
cov = np.cov(X_train.T)

# calculate the Mahalanobis distance for each data point
mahalanobis_dist_ = [mahalanobis(x, mean, np.linalg.inv(cov)) for x in X_train.values]

mahalanobis_dist_=pd.Series(mahalanobis_dist_,index=X_train.index)
far_index_=mahalanobis_dist_.index[np.where(mahalanobis_dist_>=np.quantile(mahalanobis_dist_,0.8))[0]]
close_index_=mahalanobis_dist_.index[np.where(mahalanobis_dist_<np.quantile(mahalanobis_dist_,0.8))[0]]

X_train_ = X_train.loc[close_index_,:]
X_val = X_train.loc[far_index_,:]
y_train_ = y_train.loc[close_index_]
y_val = y_train.loc[far_index_]


# Convert data to PyTorch tensors
X_train__tensor = torch.tensor(X_train_.values, dtype=torch.float32)
y_train__tensor = torch.tensor(y_train_.values, dtype=torch.float32)
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Convert to use GPU if available
if torch.cuda.is_available():
    X_train__tensor = X_train__tensor.cuda()
    y_train__tensor = y_train__tensor.cuda()
    X_train_tensor = X_train_tensor.cuda()
    y_train_tensor = y_train_tensor.cuda()
    X_val_tensor = X_val_tensor.cuda()
    y_val_tensor = y_val_tensor.cuda()
    X_test_tensor = X_test_tensor.cuda()
    y_test_tensor = y_test_tensor.cuda()

# Create flattened versions of the data
y_val_np = y_val.values.flatten()
y_test_np = y_test.values.flatten()



In [17]:
def boosted(trial):

    params = {'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.5, log=True),
              'n_estimators': trial.suggest_int('n_estimators', 100, 500),
              'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
              'max_depth': trial.suggest_int('max_depth', 1, 30),
              'min_child_samples': trial.suggest_int('min_child_samples', 10, 100)}
    
    boosted_tree_model=lgbm.LGBMClassifier(**params)
    boosted_tree_model.fit(X_train_, y_train_)
    y_val_hat_boost=boosted_tree_model.predict(X_val)
    accuracy_boost=accuracy_score(y_val, y_val_hat_boost)

    return accuracy_boost

sampler_boost = optuna.samplers.TPESampler(seed=seed)
study_boost = optuna.create_study(sampler=sampler_boost, direction='maximize')
study_boost.optimize(boosted, n_trials=N_TRIALS)
boosted_model=lgbm.LGBMClassifier(**study_boost.best_params)

def rf(trial):

    params = {'n_estimators': trial.suggest_int('n_estimators', 100, 500),
              'max_depth': trial.suggest_int('max_depth', 1, 30),
              'max_features': trial.suggest_int('max_features', 1, 30),
              'min_samples_leaf': trial.suggest_int('min_samples_leaf', 10, 100)}
    
    rf_model=RandomForestClassifier(**params)
    rf_model.fit(X_train_, y_train_)
    y_val_hat_rf=rf_model.predict(X_val)
    accuracy_rf=accuracy_score(y_val, y_val_hat_rf)

    return accuracy_rf

sampler_rf = optuna.samplers.TPESampler(seed=seed)
study_rf = optuna.create_study(sampler=sampler_rf, direction='maximize')
study_rf.optimize(rf, n_trials=N_TRIALS)
rf_model=RandomForestClassifier(**study_rf.best_params)


# Fit the boosted model and make predictions
boosted_model.fit(X_train, y_train)
y_test_hat_boosted = boosted_model.predict(X_test)
accuracy_boosted = accuracy_score(y_test, y_test_hat_boosted)

# Fit the random forest model and make predictions
rf_model.fit(X_train, y_train)
y_test_hat_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_test_hat_rf)

# Fit the logistic regression model and make predictions
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_test_hat_logreg = log_reg.predict(X_test)
accuracy_logreg = accuracy_score(y_test, y_test_hat_logreg)

print("Accuracy logistic regression: ", accuracy_logreg)
print("Accuracy boosted trees: ", accuracy_boosted)
print("Accuracy random forest: ", accuracy_rf)

# GAM model
def gam_model(trial):

    # Define the hyperparameters to optimize
    params = {'n_splines': trial.suggest_int('n_splines', 5, 20),
              'lam': trial.suggest_loguniform('lam', 1e-3, 1)}

    # Create and train the model
    gam = LogisticGAM(s(0, n_splines=params['n_splines'], lam=params['lam'])).fit(X_train_, y_train_)

    # Predict on the validation set and calculate the accuracy
    y_val_hat_gam = gam.predict(X_val)
    accuracy_gam = accuracy_score(y_val, y_val_hat_gam)

    return accuracy_gam

# Create the sampler and study
sampler_gam = optuna.samplers.TPESampler(seed=seed)
study_gam = optuna.create_study(sampler=sampler_gam, direction='maximize')

# Optimize the model
study_gam.optimize(gam_model, n_trials=N_TRIALS)

# Create the final model with the best parameters
best_params = study_gam.best_params
final_gam_model = LogisticGAM(s(0, n_splines=best_params['n_splines'], lam=best_params['lam']))

# Fit the model
final_gam_model.fit(X_train, y_train)

# Predict on the test set
y_test_hat_gam = final_gam_model.predict(X_test)
# Calculate the accuracy
accuracy_gam = accuracy_score(y_test, y_test_hat_gam)
print("Accuracy GAM: ", accuracy_gam)

[I 2024-01-30 17:22:58,030] A new study created in memory with name: no-name-a9479560-c8b2-457d-9e60-17d267dba0d7
[I 2024-01-30 17:22:58,162] Trial 0 finished with value: 0.8160747663551402 and parameters: {'learning_rate': 0.12071779104534666, 'n_estimators': 108, 'reg_lambda': 0.005044685709888605, 'max_depth': 23, 'min_child_samples': 55}. Best is trial 0 with value: 0.8160747663551402.


[LightGBM] [Info] Number of positive: 4080, number of negative: 6616
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000355 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 871
[LightGBM] [Info] Number of data points in the train set: 10696, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.381451 -> initscore=-0.483394
[LightGBM] [Info] Start training from score -0.483394
[LightGBM] [Info] Number of positive: 4080, number of negative: 6616
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000296 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 871
[LightGBM] [Info] Number of data points in the train set: 10696, number of used features: 10
[LightGBM] [Info] [binar

[I 2024-01-30 17:22:58,310] Trial 1 finished with value: 0.7547663551401869 and parameters: {'learning_rate': 0.004043145805966843, 'n_estimators': 179, 'reg_lambda': 0.0699481785242808, 'max_depth': 6, 'min_child_samples': 18}. Best is trial 0 with value: 0.8160747663551402.
[I 2024-01-30 17:22:58,312] A new study created in memory with name: no-name-eb52592e-93e1-4bea-9990-bab4c9266d7b




[I 2024-01-30 17:23:00,957] Trial 0 finished with value: 0.7293457943925233 and parameters: {'n_estimators': 409, 'max_depth': 1, 'max_features': 20, 'min_samples_leaf': 78}. Best is trial 0 with value: 0.7293457943925233.
[I 2024-01-30 17:23:05,578] Trial 1 finished with value: 0.7742056074766355 and parameters: {'n_estimators': 299, 'max_depth': 7, 'max_features': 6, 'min_samples_leaf': 79}. Best is trial 1 with value: 0.7742056074766355.


[LightGBM] [Info] Number of positive: 5971, number of negative: 7400
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000514 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 885
[LightGBM] [Info] Number of data points in the train set: 13371, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.446563 -> initscore=-0.214566
[LightGBM] [Info] Start training from score -0.214566


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[I 2024-01-30 17:23:11,553] A new study created in memory with name: no-name-2fbe94fa-4558-4448-aa8c-e475d193bafe
  'lam': trial.suggest_loguniform('lam', 1e-3, 1)}


Accuracy logistic regression:  0.7870176488184266
Accuracy boosted trees:  0.8232126832186659
Accuracy random forest:  0.8196230930302124


  elp = np.exp(lp)
  return dist.levels * elp / (elp + 1)
[I 2024-01-30 17:23:11,873] Trial 0 finished with value: 0.6919626168224299 and parameters: {'n_splines': 17, 'lam': 0.001154132971137168}. Best is trial 0 with value: 0.6919626168224299.
  'lam': trial.suggest_loguniform('lam', 1e-3, 1)}
  elp = np.exp(lp)
  return dist.levels * elp / (elp + 1)
[I 2024-01-30 17:23:12,109] Trial 1 finished with value: 0.6923364485981308 and parameters: {'n_splines': 15, 'lam': 0.17636469336159113}. Best is trial 1 with value: 0.6923364485981308.


Accuracy GAM:  0.7218067603948549


In [25]:
class DirichletGPModel(gpytorch.models.ExactGP):
    
    def __init__(self, train_x, train_y, likelihood, num_classes, kernel):
        super(DirichletGPModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean(batch_shape=torch.Size((num_classes,)))
        self.covar_module = kernel

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
    
# Define the learning params
training_iterations = 1

# Define the kernels
kernels = [
    gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=0.5, ard_num_dims=X_train_.shape[1])),
    gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=1.5, ard_num_dims=X_train_.shape[1])),
    gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=2.5, ard_num_dims=X_train_.shape[1])),
    gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel(ard_num_dims=X_train_.shape[1])),
]

best_accuracy = 0
best_kernel = None

def train(model,X_train_tensor,y_train_tensor):
    iterator = tqdm.tqdm(range(training_iterations), desc="Train")

    for _ in iterator:
        # Zero backprop gradients
        optimizer.zero_grad()
        # Get output from model
        output = model(X_train_tensor)
        # Calc loss and backprop derivatives
        loss = -mll(output, y_train_tensor).sum()
        loss.backward()
        iterator.set_postfix(loss=loss.item())
        optimizer.step()
        torch.cuda.empty_cache()

for kernel in kernels:
    # Initialize the Gaussian Process model and likelihood
    likelihood = gpytorch.likelihoods.DirichletClassificationLikelihood(y_train__tensor.long(), learn_additional__noise=True)
    model = DirichletGPModel(X_train__tensor, likelihood.transformed_targets, likelihood, num_classes=likelihood.num_classes, kernel=kernel)

    if torch.cuda.is_available():
        model = model.cuda()

    # Find optimal model hyperparameters
    model.train()
    likelihood.train()

    # Use the adam optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # "Loss" for GPs - the marginal log likelihood
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    # Train the model
    train(model,X_train__tensor,y_train__tensor.long())
    
    # Set the model in evaluation mode
    model.eval()
    likelihood.eval()

    # Make predictions on the validation set
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        output = model(X_val_tensor)
        preds = likelihood(output)

    # Calculate accuracy
    accuracy = accuracy_score(y_val_tensor, preds.mean.ge(0.5).float())

    # Update the best kernel if the current kernel has a higher accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_kernel = kernel

        

class DirichletGPModel(gpytorch.models.ExactGP):
    
    def __init__(self, train_x, train_y, likelihood, num_classes):
        super(DirichletGPModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean(batch_shape=torch.Size((num_classes,)))
        self.covar_module = best_kernel

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

# Initialize the Gaussian Process model and likelihood
likelihood = gpytorch.likelihoods.DirichletClassificationLikelihood(y_train_tensor.long(), learn_additional_noise=True)
model = DirichletGPModel(X_train_tensor, likelihood.transformed_targets, likelihood, num_classes=likelihood.num_classes)

if torch.cuda.is_available():
    model = model.cuda()


# Find optimal model hyperparameters
model.train()
likelihood.train()

# Use the adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# "Loss" for GPs - the marginal log likelihood
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

# Train the model
train(model,X_train_tensor,y_train_tensor.long())

# Set the model in evaluation mode
model.eval()
likelihood.eval()

# Make predictions on the validation set
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    output = model(X_test_tensor)
    preds = likelihood(output)

# Calculate accuracy
accuracy_GP = accuracy_score(y_test_tensor, preds.mean.argmax(dim=1))
print("Accuracy GP: ", accuracy_GP)

Train:   0%|          | 0/1 [00:00<?, ?it/s]



ValueError: Found input variables with inconsistent numbers of samples: [2675, 2]

In [26]:
X_train_tensor.shape

torch.Size([13371, 10])

In [27]:
def engressor_NN(trial):

    params = {'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.01, log=True),
              'num_epoches': trial.suggest_int('num_epoches', 100, 1000),
              'num_layer': trial.suggest_int('num_layer', 2, 5),
              'hidden_dim': trial.suggest_int('hidden_dim', 100, 500),}
    params['noise_dim']=params['hidden_dim']

    # Check if CUDA is available and if so, move the tensors and the model to the GPU
    if torch.cuda.is_available():
        engressor_model=engression(X_train__tensor, y_train__tensor.reshape(-1,1), lr=params['learning_rate'], num_epoches=params['num_epoches'],num_layer=params['num_layer'], hidden_dim=params['hidden_dim'], noise_dim=params['noise_dim'], batch_size=1000, sigmoid=True, device="cuda")
    else: 
        engressor_model=engression(X_train__tensor, y_train__tensor.reshape(-1,1), lr=params['learning_rate'], num_epoches=params['num_epoches'],num_layer=params['num_layer'], hidden_dim=params['hidden_dim'], noise_dim=params['noise_dim'], batch_size=1000, sigmoid=True)
    
    # Generate a sample from the engression model for each data point
    y_val_hat_engression=engressor_model.predict(X_val_tensor, target="mean")
    y_val_hat_engression = y_val_hat_engression.ge(0.5).float()  # Apply threshold to get binary predictions

    accuracy_engression = accuracy_score(y_val_tensor.cpu().numpy(), y_val_hat_engression.cpu().numpy())  # Calculate accuracy

    return accuracy_engression

sampler_engression = optuna.samplers.TPESampler(seed=seed)
study_engression = optuna.create_study(sampler=sampler_engression, direction='maximize')  # We want to maximize accuracy
study_engression.optimize(engressor_NN, n_trials=N_TRIALS)

[I 2024-01-30 18:17:16,304] A new study created in memory with name: no-name-e3e67ada-b8b6-49b3-9383-6a9426722612


Running on CPU.

Residual blocks (skip-connections) are typically recommended for more than 2 layers; turn it on by setting resblock=True.
Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch 11] energy-loss: 0.1685,  E(|Y-Yhat|): 0.3708,  E(|Yhat-Yhat'|): 0.4045
[Epoch 100 (84%), batch 11] energy-loss: 0.1775,  E(|Y-Yhat|): 0.3623,  E(|Yhat-Yhat'|): 0.3696

Training loss on the original (non-standardized) scale:
	Energy-loss: 0.1698,  E(|Y-Yhat|): 0.3465,  E(|Yhat-Yhat'|): 0.3535

Prediction-loss E(|Y-Yhat|) and variance-loss E(|Yhat-Yhat'|) should ideally be equally large
-- consider training for more epochs or adjusting hyperparameters if there is a mismatch 


[I 2024-01-30 18:21:43,643] Trial 0 finished with value: 0.8 and parameters: {'learning_rate': 0.0034885205571560775, 'num_epoches': 118, 'num_layer': 4, 'hidden_dim': 400}. Best is trial 0 with value: 0.8.


Running on CPU.

Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch 11] energy-loss: 0.3275,  E(|Y-Yhat|): 0.4255,  E(|Yhat-Yhat'|): 0.1961
[Epoch 100 (33%), batch 11] energy-loss: 0.1882,  E(|Y-Yhat|): 0.3657,  E(|Yhat-Yhat'|): 0.3549
[Epoch 200 (66%), batch 11] energy-loss: 0.1783,  E(|Y-Yhat|): 0.3528,  E(|Yhat-Yhat'|): 0.3490
[Epoch 300 (99%), batch 11] energy-loss: 0.1864,  E(|Y-Yhat|): 0.3561,  E(|Yhat-Yhat'|): 0.3394

Training loss on the original (non-standardized) scale:
	Energy-loss: 0.1735,  E(|Y-Yhat|): 0.3456,  E(|Yhat-Yhat'|): 0.3441

Prediction-loss E(|Y-Yhat|) and variance-loss E(|Yhat-Yhat'|) should ideally be equally large
-- consider training for more epochs or adjusting hyperparameters if there is a mismatch 


[I 2024-01-30 18:25:17,712] Trial 1 finished with value: 0.8213084112149532 and parameters: {'learning_rate': 0.000993148119483195, 'num_epoches': 302, 'num_layer': 2, 'hidden_dim': 404}. Best is trial 1 with value: 0.8213084112149532.


In [31]:
# Engression model
params=study_engression.best_params
params['noise_dim']=params['hidden_dim']
# Check if CUDA is available and if so, move the tensors and the model to the GPU
if torch.cuda.is_available():
    engressor_model=engression(X_train_tensor, y_train_tensor.reshape(-1,1), lr=params['learning_rate'], num_epoches=params['num_epoches'],num_layer=params['num_layer'], hidden_dim=params['hidden_dim'], noise_dim=params['noise_dim'], batch_size=1000, sigmoid=True, device="cuda")
else: 
    engressor_model=engression(X_train_tensor, y_train_tensor.reshape(-1,1), lr=params['learning_rate'], num_epoches=params['num_epoches'],num_layer=params['num_layer'], hidden_dim=params['hidden_dim'], noise_dim=params['noise_dim'], batch_size=1000, sigmoid=True)
# Assuming the model outputs probabilities for the two classes
y_test_hat_engression=engressor_model.predict(X_test_tensor, target="mean")
# Convert the probabilities to class labels
y_test_hat_engression = y_test_hat_engression.ge(0.5).float()  # Apply threshold to get binary predictions
accuracy_engression = accuracy_score(y_test_tensor.cpu().numpy(), y_test_hat_engression.cpu().numpy())  # Calculate accuracy

print("Accuracy logistic regression: ", accuracy_logreg)
print("Accuracy boosted trees: ", accuracy_boosted)
print("Accuracy random forest: ", accuracy_rf)
print("Accuracy engression: ", accuracy_engression)

Running on CPU.

Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch 14] energy-loss: 0.3112,  E(|Y-Yhat|): 0.4466,  E(|Yhat-Yhat'|): 0.2709
[Epoch 100 (33%), batch 14] energy-loss: 0.1418,  E(|Y-Yhat|): 0.3083,  E(|Yhat-Yhat'|): 0.3330
[Epoch 200 (66%), batch 14] energy-loss: 0.1566,  E(|Y-Yhat|): 0.3433,  E(|Yhat-Yhat'|): 0.3733
[Epoch 300 (99%), batch 14] energy-loss: 0.1661,  E(|Y-Yhat|): 0.3170,  E(|Yhat-Yhat'|): 0.3019

Training loss on the original (non-standardized) scale:
	Energy-loss: 0.1656,  E(|Y-Yhat|): 0.3354,  E(|Yhat-Yhat'|): 0.3396

Prediction-loss E(|Y-Yhat|) and variance-loss E(|Yhat-Yhat'|) should ideally be equally large
-- consider training for more epochs or adjusting hyperparameters if there is a mismatch 


ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [32]:
y_test_hat_engression = y_test_hat_engression.ge(0.5).float()  # Apply threshold to get binary predictions
accuracy_engression = accuracy_score(y_test_tensor.cpu().numpy(), y_test_hat_engression.cpu().numpy())  # Calculate accuracy

print("Accuracy logistic regression: ", accuracy_logreg)
print("Accuracy boosted trees: ", accuracy_boosted)
print("Accuracy random forest: ", accuracy_rf)
print("Accuracy engression: ", accuracy_engression)

Accuracy logistic regression:  0.7870176488184266
Accuracy boosted trees:  0.8232126832186659
Accuracy random forest:  0.8196230930302124
Accuracy engression:  0.8175291654202812


In [33]:
import pandas as pd
import numpy as np
import setuptools
import openml
from sklearn.linear_model import LinearRegression 
import lightgbm as lgbm
import optuna
from scipy.spatial.distance import mahalanobis
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process.kernels import Matern
from engression import engression, engression_bagged
import torch
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import mahalanobis
from scipy.stats import norm
from sklearn.metrics import mean_squared_error
from rtdl_revisiting_models import MLP, ResNet, FTTransformer
from properscoring import crps_gaussian, crps_ensemble
import random
import gpytorch
import tqdm.auto as tqdm
import os
from pygam import LinearGAM, s, f

SUITE_ID = 336 # Regression on numerical features
#SUITE_ID = 337 # Classification on numerical features
#SUITE_ID = 335 # Regression on numerical and categorical features
#SUITE_ID = 334 # Classification on numerical and categorical features
benchmark_suite = openml.study.get_suite(SUITE_ID)  # obtain the benchmark suite

task_id=361072
task = openml.tasks.get_task(task_id)  # download the OpenML task
dataset = task.get_dataset()

X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="dataframe", target=dataset.default_target_attribute)

# Set the random seed for reproducibility
N_TRIALS=100
N_SAMPLES=100
seed=10
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)


# calculate the mean and covariance matrix of the dataset
mean = np.mean(X, axis=0)
cov = np.cov(X.T)

# calculate the Mahalanobis distance for each data point
mahalanobis_dist = [mahalanobis(x, mean, np.linalg.inv(cov)) for x in X.values]

mahalanobis_dist=pd.Series(mahalanobis_dist,index=X.index)
far_index=mahalanobis_dist.index[np.where(mahalanobis_dist>=np.quantile(mahalanobis_dist,0.8))[0]]
close_index=mahalanobis_dist.index[np.where(mahalanobis_dist<np.quantile(mahalanobis_dist,0.8))[0]]

X_train = X.loc[close_index,:]
X_test = X.loc[far_index,:]
y_train = y.loc[close_index]
y_test = y.loc[far_index]

mean = np.mean(X_train, axis=0)
cov = np.cov(X_train.T)

# calculate the Mahalanobis distance for each data point
mahalanobis_dist_ = [mahalanobis(x, mean, np.linalg.inv(cov)) for x in X_train.values]

mahalanobis_dist_=pd.Series(mahalanobis_dist_,index=X_train.index)
far_index_=mahalanobis_dist_.index[np.where(mahalanobis_dist_>=np.quantile(mahalanobis_dist_,0.8))[0]]
close_index_=mahalanobis_dist_.index[np.where(mahalanobis_dist_<np.quantile(mahalanobis_dist_,0.8))[0]]

X_train_ = X_train.loc[close_index_,:]
X_val = X_train.loc[far_index_,:]
y_train_ = y_train.loc[close_index_]
y_val = y_train.loc[far_index_]


# Convert data to PyTorch tensors
X_train__tensor = torch.tensor(X_train_.values, dtype=torch.float32)
y_train__tensor = torch.tensor(y_train_.values, dtype=torch.float32)
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Convert to use GPU if available
if torch.cuda.is_available():
    X_train__tensor = X_train__tensor.cuda()
    y_train__tensor = y_train__tensor.cuda()
    X_train_tensor = X_train_tensor.cuda()
    y_train_tensor = y_train_tensor.cuda()
    X_val_tensor = X_val_tensor.cuda()
    y_val_tensor = y_val_tensor.cuda()
    X_test_tensor = X_test_tensor.cuda()
    y_test_tensor = y_test_tensor.cuda()

# Create flattened versions of the data
y_val_np = y_val.values.flatten()
y_test_np = y_test.values.flatten()

#### Gaussian process
class ExactGPModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood, kernel):
        super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = kernel

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

# Define the learning params
training_iterations = 3 #1000

# Define the kernels
kernels = [
    gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=0.5, ard_num_dims=X_train_.shape[1])),
    gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=1.5, ard_num_dims=X_train_.shape[1])),
    gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=2.5, ard_num_dims=X_train_.shape[1])),
    gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel(ard_num_dims=X_train_.shape[1])),
]

best_RMSE = float('inf')
best_kernel = None

def train(model,X_train_tensor,y_train_tensor):
    iterator = tqdm.tqdm(range(training_iterations), desc="Train")

    for _ in iterator:
        # Zero backprop gradients
        optimizer.zero_grad()
        # Get output from model
        output = model(X_train_tensor)
        # Calc loss and backprop derivatives
        loss = -mll(output, y_train_tensor)
        loss.backward()
        iterator.set_postfix(loss=loss.item())
        optimizer.step()
        torch.cuda.empty_cache()

for kernel in kernels:
    # Initialize the Gaussian Process model and likelihood
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = ExactGPModel(X_train__tensor, y_train__tensor, likelihood, kernel)

    if torch.cuda.is_available():
        model = model.cuda()

    # Use the adam optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # "Loss" for GPs - the marginal log likelihood
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    # Train the model
    train(model,X_train__tensor,y_train__tensor)
    
    # Set the model in evaluation mode
    model.eval()
    likelihood.eval()

    # Make predictions on the validation set
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        y_pred = model(X_val_tensor)

    # Calculate RMSE
    RMSE = torch.sqrt(torch.mean(torch.square(y_val_tensor - y_pred.mean)))

    # Update the best kernel if the current kernel has a lower RMSE
    if RMSE < best_RMSE:
        best_RMSE = RMSE
        best_kernel = kernel


# Set the random seed for reproducibility

class ExactGPModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = best_kernel

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

# Define the learning params
training_iterations = 3 #1000

# Initialize the Gaussian Process model and likelihood
likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = ExactGPModel(X_train_tensor, y_train_tensor, likelihood)

# Use the adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# "Loss" for GPs - the marginal log likelihood
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

if torch.cuda.is_available():
    model = model.cuda()

# Train the model
train(model,X_train_tensor,y_train_tensor)

# Set the model in evaluation mode
model.eval()
likelihood.eval()

# Make predictions on the validation set
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    y_pred = model(X_test_tensor)

# Calculate RMSE
RMSE_GP = torch.sqrt(torch.mean(torch.square(y_test_tensor - y_pred.mean)))
print("RMSE GP: ", RMSE_GP)



Train:   0%|          | 0/3 [00:00<?, ?it/s]

Train:   0%|          | 0/3 [00:00<?, ?it/s]

Train:   0%|          | 0/3 [00:00<?, ?it/s]

Train:   0%|          | 0/3 [00:00<?, ?it/s]

Train:   0%|          | 0/3 [00:00<?, ?it/s]

RMSE GP:  tensor(71.9368)


In [34]:
#### Gaussian process
class ExactGPModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood, kernel):
        super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = kernel

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

# Define the learning params
training_iterations = 3 #1000

# Define the kernels
kernels = [
    gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=0.5, ard_num_dims=X_train_.shape[1])),
    gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=1.5, ard_num_dims=X_train_.shape[1])),
    gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=2.5, ard_num_dims=X_train_.shape[1])),
    gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel(ard_num_dims=X_train_.shape[1])),
]

best_RMSE = float('inf')
best_kernel = None

def train(model,X_train_tensor,y_train_tensor):
    iterator = tqdm.tqdm(range(training_iterations), desc="Train")

    for _ in iterator:
        # Zero backprop gradients
        optimizer.zero_grad()
        # Get output from model
        output = model(X_train_tensor)
        # Calc loss and backprop derivatives
        loss = -mll(output, y_train_tensor)
        loss.backward()
        iterator.set_postfix(loss=loss.item())
        optimizer.step()
        torch.cuda.empty_cache()

for kernel in kernels:
    # Initialize the Gaussian Process model and likelihood
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = ExactGPModel(X_train__tensor, y_train__tensor, likelihood, kernel)

    if torch.cuda.is_available():
        model = model.cuda()

    # Find optimal model hyperparameters
    model.train()
    likelihood.train()

    # Use the adam optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # "Loss" for GPs - the marginal log likelihood
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    # Train the model
    train(model,X_train__tensor,y_train__tensor)
    
    # Set the model in evaluation mode
    model.eval()
    likelihood.eval()

    # Make predictions on the validation set
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        y_pred = model(X_val_tensor)

    # Calculate RMSE
    RMSE = torch.sqrt(torch.mean(torch.square(y_val_tensor - y_pred.mean)))

    # Update the best kernel if the current kernel has a lower RMSE
    if RMSE < best_RMSE:
        best_RMSE = RMSE
        best_kernel = kernel


# Set the random seed for reproducibility

class ExactGPModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = best_kernel

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

# Define the learning params
training_iterations = 3 #1000

# Initialize the Gaussian Process model and likelihood
likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = ExactGPModel(X_train_tensor, y_train_tensor, likelihood)

# Find optimal model hyperparameters
model.train()
likelihood.train()

# Use the adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# "Loss" for GPs - the marginal log likelihood
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

if torch.cuda.is_available():
    model = model.cuda()

# Train the model
train(model,X_train_tensor,y_train_tensor)

# Set the model in evaluation mode
model.eval()
likelihood.eval()

# Make predictions on the validation set
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    y_pred = model(X_test_tensor)

# Calculate RMSE
RMSE_GP = torch.sqrt(torch.mean(torch.square(y_test_tensor - y_pred.mean)))
print("RMSE GP: ", RMSE_GP)

Train:   0%|          | 0/3 [00:00<?, ?it/s]

Train:   0%|          | 0/3 [00:00<?, ?it/s]

Train:   0%|          | 0/3 [00:00<?, ?it/s]

Train:   0%|          | 0/3 [00:00<?, ?it/s]

Train:   0%|          | 0/3 [00:00<?, ?it/s]

RMSE GP:  tensor(71.9368)


In [20]:
from umap import UMAP
import pandas as pd
import numpy as np
import setuptools
import openml
from sklearn.linear_model import LinearRegression 
import lightgbm as lgbm
import lightgbmlss
import optuna
from scipy.spatial.distance import mahalanobis
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process.kernels import Matern
from engression import engression, engression_bagged
import torch
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import mahalanobis
from scipy.stats import norm
from sklearn.metrics import mean_squared_error
from rtdl_revisiting_models import MLP, ResNet, FTTransformer
from properscoring import crps_gaussian, crps_ensemble
import random
import gpytorch
import tqdm.auto as tqdm
from lightgbmlss.model import *
from lightgbmlss.distributions.Gaussian import *
from drf import drf
import os
from pygam import LinearGAM, s, f
from sklearn.metrics.pairwise import euclidean_distances

#openml.config.apikey = 'FILL_IN_OPENML_API_KEY'  # set the OpenML Api Key
#SUITE_ID = 336 # Regression on numerical features
#SUITE_ID = 337 # Classification on numerical features
SUITE_ID = 335 # Regression on numerical and categorical features
#SUITE_ID = 334 # Classification on numerical and categorical features
benchmark_suite = openml.study.get_suite(SUITE_ID)  # obtain the benchmark suite

task_id=361093
task = openml.tasks.get_task(task_id)  # download the OpenML task
dataset = task.get_dataset()

X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="dataframe", target=dataset.default_target_attribute)


# Set the random seed for reproducibility
N_TRIALS=100
N_SAMPLES=100
seed=10
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)


# Apply UMAP decomposition
umap = UMAP(n_components=2, random_state=42)
X_umap = umap.fit_transform(X)

# calculate the Euclidean distance matrix
euclidean_dist_matrix = euclidean_distances(X_umap)

# calculate the Euclidean distance for each data point
euclidean_dist = np.mean(euclidean_dist_matrix, axis=1)

euclidean_dist = pd.Series(euclidean_dist, index=X.index)
far_index = euclidean_dist.index[np.where(euclidean_dist >= np.quantile(euclidean_dist, 0.8))[0]]
close_index = euclidean_dist.index[np.where(euclidean_dist < np.quantile(euclidean_dist, 0.8))[0]]

X_train = X.loc[close_index,:]

# Apply UMAP decomposition on the training set
X_umap_train = umap.fit_transform(X_train)

# calculate the Euclidean distance matrix for the training set
euclidean_dist_matrix_train = euclidean_distances(X_umap_train)

# calculate the Euclidean distance for each data point in the training set
euclidean_dist_train = np.mean(euclidean_dist_matrix_train, axis=1)

euclidean_dist_train = pd.Series(euclidean_dist_train, index=X_train.index)
far_index_train = euclidean_dist_train.index[np.where(euclidean_dist_train >= np.quantile(euclidean_dist_train, 0.8))[0]]
close_index_train = euclidean_dist_train.index[np.where(euclidean_dist_train < np.quantile(euclidean_dist_train, 0.8))[0]]


# Convert data to PyTorch tensors
# Modify X_train_, X_val, X_train, and X_test to have dummy variables
X = pd.get_dummies(X.astype(str), drop_first=True)

X_train = X.loc[close_index,:]
X_test = X.loc[far_index,:]
y_train = y.loc[close_index]
y_test = y.loc[far_index]

X_train_ = X_train.loc[close_index_train,:]
X_val = X_train.loc[far_index_train,:]
y_train_ = y_train.loc[close_index_train]
y_val = y_train.loc[far_index_train]

# Convert data to PyTorch tensors
X_train__tensor = torch.tensor(X_train_.values, dtype=torch.float32)
y_train__tensor = torch.tensor(y_train_.values, dtype=torch.float32)
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Convert to use GPU if available
if torch.cuda.is_available():
    X_train__tensor = X_train__tensor.cuda()
    y_train__tensor = y_train__tensor.cuda()
    X_train_tensor = X_train_tensor.cuda()
    y_train_tensor = y_train_tensor.cuda()
    X_val_tensor = X_val_tensor.cuda()
    y_val_tensor = y_val_tensor.cuda()
    X_test_tensor = X_test_tensor.cuda()
    y_test_tensor = y_test_tensor.cuda()

# Create flattened versions of the data
y_val_np = y_val.values.flatten()
y_test_np = y_test.values.flatten()

Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.
Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.
n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.
n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.


In [21]:
N_TRIALS=5

In [22]:
def boosted(trial):

    params = {'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.5, log=True),
              'n_estimators': trial.suggest_int('n_estimators', 100, 500),
              'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
              'max_depth': trial.suggest_int('max_depth', 1, 30),
              'min_child_samples': trial.suggest_int('min_child_samples', 10, 100)}
    
    boosted_tree_model=lgbm.LGBMRegressor(**params)
    boosted_tree_model.fit(X_train_, y_train_)
    y_val_hat_boost=boosted_tree_model.predict(X_val)
    RMSE_boost=np.sqrt(np.mean((y_val-y_val_hat_boost)**2))

    return RMSE_boost

sampler_boost = optuna.samplers.TPESampler(seed=seed)
study_boost = optuna.create_study(sampler=sampler_boost, direction='minimize')
study_boost.optimize(boosted, n_trials=N_TRIALS)
boosted_model=lgbm.LGBMRegressor(**study_boost.best_params)

def rf(trial):

    params = {'n_estimators': trial.suggest_int('n_estimators', 100, 500),
              'max_depth': trial.suggest_int('max_depth', 1, 30),
              'max_features': trial.suggest_int('max_features', 1, 30),
              'min_samples_leaf': trial.suggest_int('min_samples_leaf', 10, 100)}
    
    rf_model=RandomForestRegressor(**params)
    rf_model.fit(X_train_, y_train_)
    y_val_hat_rf=rf_model.predict(X_val)
    RMSE_rf=np.sqrt(np.mean((y_val-y_val_hat_rf)**2))

    return RMSE_rf

sampler_rf = optuna.samplers.TPESampler(seed=seed)
study_rf = optuna.create_study(sampler=sampler_rf, direction='minimize')
study_rf.optimize(rf, n_trials=N_TRIALS)
rf_model=RandomForestRegressor(**study_rf.best_params)


def engressor_NN(trial):

    params = {'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.01, log=True),
              'num_epoches': trial.suggest_int('num_epoches', 100, 1000),
              'num_layer': trial.suggest_int('num_layer', 2, 5),
              'hidden_dim': trial.suggest_int('hidden_dim', 100, 500),}
    params['noise_dim']=params['hidden_dim']

    # Check if CUDA is available and if so, move the tensors and the model to the GPU
    if torch.cuda.is_available():
        engressor_model=engression(X_train__tensor, y_train__tensor.reshape(-1,1), lr=params['learning_rate'], num_epoches=params['num_epoches'],num_layer=params['num_layer'], hidden_dim=params['hidden_dim'], noise_dim=params['noise_dim'], batch_size=1000, device="cuda")
    else: 
        engressor_model=engression(X_train__tensor, y_train__tensor.reshape(-1,1), lr=params['learning_rate'], num_epoches=params['num_epoches'],num_layer=params['num_layer'], hidden_dim=params['hidden_dim'], noise_dim=params['noise_dim'], batch_size=1000)
    
    # Generate a sample from the engression model for each data point
    y_val_hat_engression=engressor_model.predict(X_val_tensor, target="mean")
    RMSE_engression=torch.sqrt(torch.mean(torch.square(y_val_tensor.reshape(-1,1) - y_val_hat_engression)))

    return RMSE_engression

sampler_engression = optuna.samplers.TPESampler(seed=seed)
study_engression = optuna.create_study(sampler=sampler_engression, direction='minimize')
study_engression.optimize(engressor_NN, n_trials=N_TRIALS)


boosted_model.fit(X_train, y_train)
y_test_hat_boosted=boosted_model.predict(X_test)
RMSE_boosted=np.sqrt(np.mean((y_test-y_test_hat_boosted)**2))

rf_model.fit(X_train, y_train)
y_test_hat_rf=rf_model.predict(X_test)
RMSE_rf=np.sqrt(np.mean((y_test-y_test_hat_rf)**2))

lin_reg=LinearRegression()
lin_reg.fit(X_train, y_train)
y_test_hat_linreg=lin_reg.predict(X_test)
RMSE_linreg=np.sqrt(np.mean((y_test-y_test_hat_linreg)**2))

params=study_engression.best_params
params['noise_dim']=params['hidden_dim']
# Check if CUDA is available and if so, move the tensors and the model to the GPU
if torch.cuda.is_available():
    engressor_model=engression(X_train_tensor, y_train_tensor.reshape(-1,1), lr=params['learning_rate'], num_epoches=params['num_epoches'],num_layer=params['num_layer'], hidden_dim=params['hidden_dim'], noise_dim=params['noise_dim'], batch_size=1000, device="cuda")
else: 
    engressor_model=engression(X_train_tensor, y_train_tensor.reshape(-1,1), lr=params['learning_rate'], num_epoches=params['num_epoches'],num_layer=params['num_layer'], hidden_dim=params['hidden_dim'], noise_dim=params['noise_dim'], batch_size=1000)
y_test_hat_engression=engressor_model.predict(X_test_tensor, target="mean")
RMSE_engression=torch.sqrt(torch.mean(torch.square(y_test_tensor.reshape(-1,1) - y_test_hat_engression)))


print("RMSE linear regression: ",RMSE_linreg)
print("RMSE boosted trees", RMSE_boosted)
print("RMSE random forest", RMSE_rf)
print("RMSE engression", RMSE_engression)

[I 2024-02-02 18:12:10,537] A new study created in memory with name: no-name-a791fa4b-af88-403d-afc4-ceadc6387b1d
[I 2024-02-02 18:12:11,182] Trial 0 finished with value: 0.28205499224360375 and parameters: {'learning_rate': 0.12071779104534666, 'n_estimators': 108, 'reg_lambda': 0.005044685709888605, 'max_depth': 23, 'min_child_samples': 55}. Best is trial 0 with value: 0.28205499224360375.
[I 2024-02-02 18:12:11,229] Trial 1 finished with value: 0.29362264402953486 and parameters: {'learning_rate': 0.004043145805966843, 'n_estimators': 179, 'reg_lambda': 0.0699481785242808, 'max_depth': 6, 'min_child_samples': 18}. Best is trial 0 with value: 0.28205499224360375.
[I 2024-02-02 18:12:11,360] Trial 2 finished with value: 0.2875465361619137 and parameters: {'learning_rate': 0.07075637776590661, 'n_estimators': 482, 'reg_lambda': 1.08526150100961e-08, 'max_depth': 16, 'min_child_samples': 83}. Best is trial 0 with value: 0.28205499224360375.
[I 2024-02-02 18:12:11,473] Trial 3 finished w

Running on CPU.

Residual blocks (skip-connections) are typically recommended for more than 2 layers; turn it on by setting resblock=True.
Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch 3] energy-loss: 0.4471,  E(|Y-Yhat|): 0.9615,  E(|Yhat-Yhat'|): 1.0288
[Epoch 100 (84%), batch 3] energy-loss: 0.0536,  E(|Y-Yhat|): 0.1128,  E(|Yhat-Yhat'|): 0.1184

Training loss on the original (non-standardized) scale:
	Energy-loss: 0.1241,  E(|Y-Yhat|): 0.1767,  E(|Yhat-Yhat'|): 0.1053

Prediction-loss E(|Y-Yhat|) and variance-loss E(|Yhat-Yhat'|) should ideally be equally large
-- consider training for more epochs or adjusting hyperparameters if there is a mismatch 


[I 2024-02-02 18:13:10,130] Trial 0 finished with value: 873.9304809570312 and parameters: {'learning_rate': 0.0034885205571560775, 'num_epoches': 118, 'num_layer': 4, 'hidden_dim': 400}. Best is trial 0 with value: 873.9304809570312.


Running on CPU.

Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch 3] energy-loss: 0.3969,  E(|Y-Yhat|): 0.6212,  E(|Yhat-Yhat'|): 0.4485
[Epoch 100 (33%), batch 3] energy-loss: 0.0832,  E(|Y-Yhat|): 0.1829,  E(|Yhat-Yhat'|): 0.1994
[Epoch 200 (66%), batch 3] energy-loss: 0.0406,  E(|Y-Yhat|): 0.1058,  E(|Yhat-Yhat'|): 0.1303
[Epoch 300 (99%), batch 3] energy-loss: 0.0330,  E(|Y-Yhat|): 0.0883,  E(|Yhat-Yhat'|): 0.1105

Training loss on the original (non-standardized) scale:
	Energy-loss: 0.0214,  E(|Y-Yhat|): 0.0527,  E(|Yhat-Yhat'|): 0.0625

Prediction-loss E(|Y-Yhat|) and variance-loss E(|Yhat-Yhat'|) should ideally be equally large
-- consider training for more epochs or adjusting hyperparameters if there is a mismatch 


[I 2024-02-02 18:14:05,414] Trial 1 finished with value: 421.4085388183594 and parameters: {'learning_rate': 0.000993148119483195, 'num_epoches': 302, 'num_layer': 2, 'hidden_dim': 404}. Best is trial 1 with value: 421.4085388183594.


Running on CPU.

Residual blocks (skip-connections) are typically recommended for more than 2 layers; turn it on by setting resblock=True.
Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch 3] energy-loss: 0.4837,  E(|Y-Yhat|): 0.6800,  E(|Yhat-Yhat'|): 0.3925
[Epoch 100 (55%), batch 3] energy-loss: 0.0713,  E(|Y-Yhat|): 0.1832,  E(|Yhat-Yhat'|): 0.2239

Training loss on the original (non-standardized) scale:
	Energy-loss: 0.1023,  E(|Y-Yhat|): 0.1655,  E(|Yhat-Yhat'|): 0.1264

Prediction-loss E(|Y-Yhat|) and variance-loss E(|Yhat-Yhat'|) should ideally be equally large
-- consider training for more epochs or adjusting hyperparameters if there is a mismatch 


[I 2024-02-02 18:15:44,552] Trial 2 finished with value: 2361.20751953125 and parameters: {'learning_rate': 0.00021788216053884017, 'num_epoches': 179, 'num_layer': 4, 'hidden_dim': 482}. Best is trial 1 with value: 421.4085388183594.


Running on CPU.

Residual blocks (skip-connections) are typically recommended for more than 2 layers; turn it on by setting resblock=True.
Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch 3] energy-loss: 0.4192,  E(|Y-Yhat|): 0.6044,  E(|Yhat-Yhat'|): 0.3705
[Epoch 100 (18%), batch 3] energy-loss: 0.0742,  E(|Y-Yhat|): 0.2349,  E(|Yhat-Yhat'|): 0.3214
[Epoch 200 (35%), batch 3] energy-loss: 0.0583,  E(|Y-Yhat|): 0.1741,  E(|Yhat-Yhat'|): 0.2316
[Epoch 300 (53%), batch 3] energy-loss: 0.0432,  E(|Y-Yhat|): 0.1381,  E(|Yhat-Yhat'|): 0.1899
[Epoch 400 (71%), batch 3] energy-loss: 0.0400,  E(|Y-Yhat|): 0.1147,  E(|Yhat-Yhat'|): 0.1493
[Epoch 500 (89%), batch 3] energy-loss: 0.0421,  E(|Y-Yhat|): 0.1125,  E(|Yhat-Yhat'|): 0.1409

Training loss 

[I 2024-02-02 18:20:46,277] Trial 3 finished with value: 806.5403442382812 and parameters: {'learning_rate': 0.00010183487453386067, 'num_epoches': 561, 'num_layer': 5, 'hidden_dim': 345}. Best is trial 1 with value: 421.4085388183594.


Running on CPU.

Residual blocks (skip-connections) are typically recommended for more than 2 layers; turn it on by setting resblock=True.
Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch 3] energy-loss: 0.4663,  E(|Y-Yhat|): 0.8335,  E(|Yhat-Yhat'|): 0.7344
[Epoch 100 (27%), batch 3] energy-loss: 0.0273,  E(|Y-Yhat|): 0.0838,  E(|Yhat-Yhat'|): 0.1129
[Epoch 200 (55%), batch 3] energy-loss: 0.0690,  E(|Y-Yhat|): 0.1447,  E(|Yhat-Yhat'|): 0.1516
[Epoch 300 (83%), batch 3] energy-loss: 0.0280,  E(|Y-Yhat|): 0.0770,  E(|Yhat-Yhat'|): 0.0980

Training loss on the original (non-standardized) scale:
	Energy-loss: 0.1706,  E(|Y-Yhat|): 0.2243,  E(|Yhat-Yhat'|): 0.1074

Prediction-loss E(|Y-Yhat|) and variance-loss E(|Yhat-Yhat'|) should ideally b

[I 2024-02-02 18:24:39,617] Trial 4 finished with value: 471.7718200683594 and parameters: {'learning_rate': 0.0027765828373405767, 'num_epoches': 362, 'num_layer': 5, 'hidden_dim': 386}. Best is trial 1 with value: 421.4085388183594.


Running on CPU.

Data is standardized for training only; the printed training losses are on the standardized scale. 
However during evaluation, the predictions, evaluation metrics, and plots will be on the original scale.

Training based on mini-batch gradient descent with a batch size of 1000.
[Epoch 1 (0%), batch 4] energy-loss: 0.3629,  E(|Y-Yhat|): 0.6052,  E(|Yhat-Yhat'|): 0.4847
[Epoch 100 (33%), batch 4] energy-loss: 0.0492,  E(|Y-Yhat|): 0.1462,  E(|Yhat-Yhat'|): 0.1940
[Epoch 200 (66%), batch 4] energy-loss: 0.0794,  E(|Y-Yhat|): 0.1462,  E(|Yhat-Yhat'|): 0.1336
[Epoch 300 (99%), batch 4] energy-loss: 0.0403,  E(|Y-Yhat|): 0.0967,  E(|Yhat-Yhat'|): 0.1130

Training loss on the original (non-standardized) scale:
	Energy-loss: 0.0213,  E(|Y-Yhat|): 0.0510,  E(|Yhat-Yhat'|): 0.0595

Prediction-loss E(|Y-Yhat|) and variance-loss E(|Yhat-Yhat'|) should ideally be equally large
-- consider training for more epochs or adjusting hyperparameters if there is a mismatch 
RMSE linear regr

In [None]:
#### GAM model
def gam_model(trial):

    # Define the hyperparameters to optimize
    params = {'n_splines': trial.suggest_int('n_splines', 5, 20),
              'lam': trial.suggest_loguniform('lam', 1e-3, 1)}

    # Create and train the model
    gam = LinearGAM(s(0, n_splines=params['n_splines'], lam=params['lam'])).fit(X_train_, y_train_)

    # Predict on the validation set and calculate the RMSE
    y_val_hat_gam = gam.predict(X_val)
    RMSE_gam = np.sqrt(np.mean((y_val - y_val_hat_gam) ** 2))

    return RMSE_gam

# Create the sampler and study
sampler_gam = optuna.samplers.TPESampler(seed=seed)
study_gam = optuna.create_study(sampler=sampler_gam, direction='minimize')

# Optimize the model
study_gam.optimize(gam_model, n_trials=N_TRIALS)

# Create the final model with the best parameters
best_params = study_gam.best_params
final_gam_model = LinearGAM(s(0, n_splines=best_params['n_splines'], lam=best_params['lam']))

# Fit the model
final_gam_model.fit(X_train, y_train)

# Predict on the test set
y_test_hat_gam = final_gam_model.predict(X_test)
# Calculate the RMSE
RMSE_gam = np.sqrt(np.mean((y_test - y_test_hat_gam) ** 2))
print("RMSE GAM: ", RMSE_gam)

In [17]:
print(X_train.shape)
print(X_train_.shape)
print(X_val.shape)
print(X_test.shape)

(3241, 49)
(2592, 49)
(649, 49)
(811, 49)


In [18]:
X.shape

(4052, 49)

In [19]:
#### GAM model
def gam_model(trial):

    # Define the hyperparameters to optimize
    params = {'n_splines': trial.suggest_int('n_splines', 5, 20),
              'lam': trial.suggest_loguniform('lam', 1e-3, 1)}

    # Create and train the model
    gam = LinearGAM(s(0, n_splines=params['n_splines'], lam=params['lam'])).fit(X_train_, y_train_)

    # Predict on the validation set and calculate the CRPS
    y_val_hat_gam = gam.predict(X_val)
    std_dev_error = np.std(y_val - y_val_hat_gam)
    crps_gam = [crps_gaussian(y_val_np[i], mu=y_val_hat_gam[i], sig=std_dev_error) for i in range(len(y_val_hat_gam))]
    crps_gam = np.mean(crps_gam)

    return crps_gam

# Create the sampler and study
sampler_gam = optuna.samplers.TPESampler(seed=seed)
study_gam = optuna.create_study(sampler=sampler_gam, direction='minimize')

# Optimize the model
study_gam.optimize(gam_model, n_trials=N_TRIALS)

# Create the final model with the best parameters
best_params = study_gam.best_params
final_gam_model = LinearGAM(s(0, n_splines=best_params['n_splines'], lam=best_params['lam']))

# Fit the model
final_gam_model.fit(X_train, y_train)

# Predict on the test set
y_test_hat_gam = final_gam_model.predict(X_test)

# Calculate the CRPS
std_dev_error = np.std(y_test - y_test_hat_gam)
crps_gam = [crps_gaussian(y_test_np[i], mu=y_test_hat_gam[i], sig=std_dev_error) for i in range(len(y_test_hat_gam))]
crps_gam = np.mean(crps_gam)
print("CRPS GAM: ", crps_gam)

[I 2024-02-02 12:44:44,634] A new study created in memory with name: no-name-42393d17-3449-4f52-aac7-534dc03deab4
suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.
[I 2024-02-02 12:44:44,674] Trial 0 finished with value: 0.17618370891210408 and parameters: {'n_splines': 17, 'lam': 0.001154132971137168}. Best is trial 0 with value: 0.17618370891210408.
suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.
[I 2024-02-02 12:44:44,707] Trial 1 finished with value: 0.1761837089120805 and parameters: {'n_splines': 15, 'lam': 0.17636469336159113}. Best is trial 1 with value: 0.1761837089120805.
suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/re

CRPS GAM:  0.33310733322794117


In [7]:
import pandas as pd
import numpy as np
import setuptools
import openml
from sklearn.linear_model import LogisticRegression 
import lightgbm as lgbm
import optuna
from scipy.spatial.distance import mahalanobis
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process.kernels import Matern
from engression import engression, engression_bagged
import torch
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import mahalanobis
from scipy.stats import norm
from sklearn.metrics import mean_squared_error
from rtdl_revisiting_models import MLP, ResNet, FTTransformer
import random
import gpytorch
import tqdm.auto as tqdm
import os
from pygam import LogisticGAM, s
import torch
from torch import nn
from torch.optim import Adam
from sklearn.metrics import accuracy_score
import gower


#SUITE_ID = 336 # Regression on numerical features
#SUITE_ID = 337 # Classification on numerical features
#SUITE_ID = 335 # Regression on numerical and categorical features
SUITE_ID = 334 # Classification on numerical and categorical features
benchmark_suite = openml.study.get_suite(SUITE_ID)  # obtain the benchmark suite

task_id=361110
task = openml.tasks.get_task(task_id)  # download the OpenML task
dataset = task.get_dataset()

X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="dataframe", target=dataset.default_target_attribute)



In [10]:
# Set the random seed for reproducibility
N_TRIALS=100
N_SAMPLES=100
seed=10
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)


# Compute Gower distance and define train and test set
# calculate the Gower distance matrix
X_gower = X.copy()

for col in X_gower.select_dtypes(['category']).columns:
    X_gower[col] = X_gower[col].astype('object')

gower_dist_matrix = gower.gower_matrix(X_gower)

# calculate the Gower distance for each data point
gower_dist = np.mean(gower_dist_matrix, axis=1)

gower_dist=pd.Series(gower_dist,index=X.index)
far_index=gower_dist.index[np.where(gower_dist>=np.quantile(gower_dist,0.8))[0]]
close_index=gower_dist.index[np.where(gower_dist<np.quantile(gower_dist,0.8))[0]]

X_train = X.loc[close_index,:]
X_gower_ = X_train.copy()

for col in X_gower_.select_dtypes(['category']).columns:
    X_gower_[col] = X_gower_[col].astype('object')

# calculate the Gower distance matrix for the training set
gower_dist_matrix_train = gower.gower_matrix(X_gower_)

# calculate the Gower distance for each data point in the training set
gower_dist_train = np.mean(gower_dist_matrix_train, axis=1)

gower_dist_train=pd.Series(gower_dist_train,index=X_train.index)
far_index_train=gower_dist_train.index[np.where(gower_dist_train>=np.quantile(gower_dist_train,0.8))[0]]
close_index_train=gower_dist_train.index[np.where(gower_dist_train<np.quantile(gower_dist_train,0.8))[0]]


# Convert data to PyTorch tensors
# Modify X_train_, X_val, X_train, and X_test to have dummy variables
X = pd.get_dummies(X.astype(str), drop_first=True)

X_train = X.loc[close_index,:]
X_test = X.loc[far_index,:]
y_train = y.loc[close_index]
y_test = y.loc[far_index]

X_train_ = X_train.loc[close_index_train,:]
X_val = X_train.loc[far_index_train,:]
y_train_ = y_train.loc[close_index_train]
y_val = y_train.loc[far_index_train]

# Convert data to PyTorch tensors
X_train__tensor = torch.tensor(X_train_.values, dtype=torch.float32)
y_train__tensor = torch.tensor(y_train_.values, dtype=torch.float32)
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Convert to use GPU if available
if torch.cuda.is_available():
    X_train__tensor = X_train__tensor.cuda()
    y_train__tensor = y_train__tensor.cuda()
    X_train_tensor = X_train_tensor.cuda()
    y_train_tensor = y_train_tensor.cuda()
    X_val_tensor = X_val_tensor.cuda()
    y_val_tensor = y_val_tensor.cuda()
    X_test_tensor = X_test_tensor.cuda()
    y_test_tensor = y_test_tensor.cuda()

# Create flattened versions of the data
y_val_np = y_val.values.flatten()
y_test_np = y_test.values.flatten()

RuntimeError: [enforce fail at alloc_cpu.cpp:80] data. DefaultCPUAllocator: not enough memory: you tried to allocate 1835201436 bytes.

In [8]:
X.shape

(38474, 8)

In [9]:
SUITE_ID = 334 # Classification on numerical and categorical features
benchmark_suite = openml.study.get_suite(SUITE_ID)

In [5]:
benchmark_suite.tasks

[361110, 361111, 361113, 361282, 361283, 361285, 361286]