In [85]:
import pandas as pd
import numpy as np
import openml
from sklearn.linear_model import LinearRegression 
import lightgbm as lgbm
import optuna
from sklearn.ensemble import RandomForestRegressor
from engression import engression
import torch
from rtdl_revisiting_models import MLP, ResNet, FTTransformer
import random
import re
import os
from pygam import LinearGAM
import gower
from utils import EarlyStopping, train, train_trans, train_no_early_stopping, train_trans_no_early_stopping
from torch.utils.data import TensorDataset, DataLoader
import shutil
import gpboost as gpb

# --- OpenML benchmark selection ---
#openml.config.apikey = 'FILL_IN_OPENML_API_KEY'  # set the OpenML Api Key
# Available benchmark suites:
#   334: Classification on numerical and categorical features
#   335: Regression on numerical and categorical features
#   336: Regression on numerical features
#   337: Classification on numerical features
SUITE_ID = 335 # Regression on numerical and categorical features
benchmark_suite = openml.study.get_suite(SUITE_ID)  # obtain the benchmark suite

task_id=361287

# --- Experiment constants ---
# BATCH_SIZE is used for the DataLoaders further down; the other constants are
# not referenced in this part of the notebook.
N_TRIALS=100
N_SAMPLES=100
PATIENCE=40
N_EPOCHS=1000
GP_ITERATIONS=1000
BATCH_SIZE=1024

# --- Set the random seed for reproducibility ---
# Every random number generator the notebook relies on is seeded exactly once.
seed=10
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

CHECKPOINT_PATH = f'CHECKPOINTS/GOWER/task_{task_id}.pt'

print(f"Task {task_id}")

# Download the OpenML task and load its dataset as pandas objects.
task = openml.tasks.get_task(task_id)  # download the OpenML task
dataset = task.get_dataset()

X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="dataframe", target=dataset.default_target_attribute)

# The target of task 361099 is log-transformed.
if task_id==361099:
    y=np.log(y)

# Cap the number of rows. The sampled values are index *labels* (drawn from
# X.index), so both X and y are selected with .loc; using .iloc here would only
# be correct when the index happens to be a default RangeIndex.
MAX_ROWS = 15000
if len(X) > MAX_ROWS:
    indices = np.random.choice(X.index, size=MAX_ROWS, replace=False)
    X = X.loc[indices]
    y = y.loc[indices]

# Feature filtering.
#   X       : model features  -> drop categorical columns with more than 20 unique
#             values and non-categorical columns with fewer than 10 unique values.
#   X_clean : frame used for the distance computation -> additionally drop
#             non-categorical columns where one value covers more than 70% of the
#             rows, and one feature of every highly correlated pair.
categorical_cols = [name for name, is_cat in zip(attribute_names, categorical_indicator) if is_cat]
numeric_cols = [name for name, is_cat in zip(attribute_names, categorical_indicator) if not is_cat]

# nunique(dropna=False) counts a missing value as its own level, like len(unique()).
high_cardinality_cols = [col for col in categorical_cols if X[col].nunique(dropna=False) > 20]
X = X.drop(columns=high_cardinality_cols)

X_clean = X.copy()
low_cardinality_cols = [col for col in numeric_cols if X[col].nunique(dropna=False) < 10]
dominated_cols = [
    col for col in numeric_cols
    if col not in low_cardinality_cols
    and X[col].value_counts(normalize=True).max() > 0.7
]
X = X.drop(columns=low_cardinality_cols)
X_clean = X_clean.drop(columns=low_cardinality_cols + dominated_cols)

# Find features with absolute correlation > 0.9.
# Only numeric columns take part: X_clean may still contain categorical columns,
# and DataFrame.corr() on non-numeric data raises in recent pandas versions.
corr_matrix = X_clean.select_dtypes(include="number").corr().abs()
# Keep the strict upper triangle so each pair is inspected once.
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
high_corr_features = [column for column in upper_tri.columns if (upper_tri[column] > 0.9).any()]

# Drop one of the highly correlated features from X_clean
X_clean = X_clean.drop(columns=high_corr_features)

# Rename columns to avoid problems with LGBM
X = X.rename(columns=lambda name: re.sub('[^A-Za-z0-9_]+', '', name))

# Compute Gower distance and define train and test set.
# Rows whose mean Gower distance to all other rows is in the top 20% form the
# "far" (test) set; the remaining rows form the "close" (train) set.

# Cast pandas categoricals to object before handing the frame to gower
# (presumably so they are treated as categorical features - TODO confirm).
for col in X_clean.select_dtypes(['category']).columns:
    X_clean[col] = X_clean[col].astype('object')

# calculate the Gower distance matrix for the entire dataset
gower_dist_matrix = gower.gower_matrix(X_clean)

# mean Gower distance of each data point to the whole dataset
gower_dist = pd.Series(np.mean(gower_dist_matrix, axis=1), index=X_clean.index)

test_threshold = np.quantile(gower_dist, 0.8)
far_index = gower_dist.index[(gower_dist >= test_threshold).to_numpy()]
close_index = gower_dist.index[(gower_dist < test_threshold).to_numpy()]

# Repeat the procedure inside the training rows to obtain a validation split.
# X_clean already holds object columns here, so no second dtype cast is needed.
X_clean_ = X_clean.loc[close_index,:]

# calculate the Gower distance matrix for the training set
gower_dist_matrix_train = gower.gower_matrix(X_clean_)

# mean Gower distance of each data point within the training set
gower_dist_train = pd.Series(np.mean(gower_dist_matrix_train, axis=1), index=X_clean_.index)

val_threshold = np.quantile(gower_dist_train, 0.8)
far_index_train = gower_dist_train.index[(gower_dist_train >= val_threshold).to_numpy()]
close_index_train = gower_dist_train.index[(gower_dist_train < val_threshold).to_numpy()]

# Dummy-encode the features, build the train/validation/test splits and
# standardize the non-dummy columns.
def _standardize(fit_frame, apply_frame, columns):
    """Standardize `columns` of two frames with statistics from `fit_frame`.

    Parameters
    ----------
    fit_frame : pandas.DataFrame
        Frame the per-column mean and (population) standard deviation are
        computed from.
    apply_frame : pandas.DataFrame
        Second frame that is scaled with the statistics of `fit_frame`.
    columns : index-like of column labels
        Columns to standardize; all other columns are left untouched.

    Returns
    -------
    tuple
        (standardized copy of fit_frame, standardized copy of apply_frame,
        column means, column standard deviations as used for scaling).
    """
    center = np.mean(fit_frame[columns], axis=0)
    scale = np.std(fit_frame[columns], axis=0)
    # A constant column has zero spread; dividing by 1 maps it to 0 instead of
    # producing NaN/inf values.
    scale = scale.mask(scale == 0, 1.0)

    # Work on copies so the inputs (which are slices of X) are never written to.
    fit_scaled = fit_frame.copy()
    apply_scaled = apply_frame.copy()
    fit_scaled[columns] = (fit_scaled[columns] - center) / scale
    apply_scaled[columns] = (apply_scaled[columns] - center) / scale
    return fit_scaled, apply_scaled, center, scale


# Columns that are numeric before encoding; these are the ones standardized.
non_dummy_cols = X.select_dtypes(exclude=['bool', 'category', 'object', 'string']).columns
X = pd.get_dummies(X, drop_first=True).astype('float32')

# Outer split: close rows -> train, far rows -> test.
X_train = X.loc[close_index,:].copy()
X_test = X.loc[far_index,:].copy()
y_train = y.loc[close_index]
y_test = y.loc[far_index]

# Inner split of the (still unscaled) training rows: train_ / validation.
X_train_ = X_train.loc[close_index_train,:].copy()
X_val = X_train.loc[far_index_train,:].copy()
y_train_ = y_train.loc[close_index_train]
y_val = y_train.loc[far_index_train]

# Standardize the data for non-dummy variables. Each pair uses the statistics
# of its own training part only.
X_train_, X_val, mean_X_train_, std_X_train_ = _standardize(X_train_, X_val, non_dummy_cols)
X_train, X_test, mean_X_train, std_X_train = _standardize(X_train, X_test, non_dummy_cols)

# Turn every split into a float32 tensor and move it to the GPU when one exists.
def _as_float_tensor(pandas_obj):
    """Return the values of a DataFrame/Series as a float32 torch tensor."""
    return torch.tensor(pandas_obj.values, dtype=torch.float32)


X_train__tensor = _as_float_tensor(X_train_)
y_train__tensor = _as_float_tensor(y_train_)
X_train_tensor = _as_float_tensor(X_train)
y_train_tensor = _as_float_tensor(y_train)
X_val_tensor = _as_float_tensor(X_val)
y_val_tensor = _as_float_tensor(y_val)
X_test_tensor = _as_float_tensor(X_test)
y_test_tensor = _as_float_tensor(y_test)

# Report the device first, then transfer the tensors if CUDA is present.
use_gpu = torch.cuda.is_available()
print("Using GPU" if use_gpu else "Using CPU")
if use_gpu:
    X_train__tensor, y_train__tensor = X_train__tensor.cuda(), y_train__tensor.cuda()
    X_train_tensor, y_train_tensor = X_train_tensor.cuda(), y_train_tensor.cuda()
    X_val_tensor, y_val_tensor = X_val_tensor.cuda(), y_val_tensor.cuda()
    X_test_tensor, y_test_tensor = X_test_tensor.cuda(), y_test_tensor.cuda()

# 1-D NumPy copies of the validation and test targets
y_val_np = y_val.values.reshape(-1).copy()
y_test_np = y_test.values.reshape(-1).copy()

# Pair features and targets of every split in a TensorDataset
train__dataset, train_dataset, val_dataset, test_dataset = (
    TensorDataset(features, targets)
    for features, targets in (
        (X_train__tensor, y_train__tensor),
        (X_train_tensor, y_train_tensor),
        (X_val_tensor, y_val_tensor),
        (X_test_tensor, y_test_tensor),
    )
)

# Batch the datasets; only the two training loaders shuffle their rows
train__loader, train_loader, val_loader, test_loader = (
    DataLoader(split, batch_size=BATCH_SIZE, shuffle=do_shuffle)
    for split, do_shuffle in (
        (train__dataset, True),
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

# Output dimension (single target) and input dimension (number of feature columns)
d_in = X_train_.shape[1]
d_out = 1

Task 361287




Using CPU


In [94]:
# Re-load the raw data and repeat the feature filtering of the first cell.
# NOTE: this reassigns X, y and X_clean, so the dummy-encoded / standardized
# objects built earlier no longer match X after this cell has run.
X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="dataframe", target=dataset.default_target_attribute)

# The target of task 361099 is log-transformed.
if task_id==361099:
    y=np.log(y)

# Cap the number of rows. The sampled values are index *labels* (drawn from
# X.index), so both X and y are selected with .loc; using .iloc here would only
# be correct when the index happens to be a default RangeIndex.
MAX_ROWS = 15000
if len(X) > MAX_ROWS:
    indices = np.random.choice(X.index, size=MAX_ROWS, replace=False)
    X = X.loc[indices]
    y = y.loc[indices]

# Feature filtering.
#   X       : drop categorical columns with more than 20 unique values and
#             non-categorical columns with fewer than 10 unique values.
#   X_clean : additionally drop non-categorical columns where one value covers
#             more than 70% of the rows, and one feature of every highly
#             correlated pair.
categorical_cols = [name for name, is_cat in zip(attribute_names, categorical_indicator) if is_cat]
numeric_cols = [name for name, is_cat in zip(attribute_names, categorical_indicator) if not is_cat]

# nunique(dropna=False) counts a missing value as its own level, like len(unique()).
high_cardinality_cols = [col for col in categorical_cols if X[col].nunique(dropna=False) > 20]
X = X.drop(columns=high_cardinality_cols)

X_clean = X.copy()
low_cardinality_cols = [col for col in numeric_cols if X[col].nunique(dropna=False) < 10]
dominated_cols = [
    col for col in numeric_cols
    if col not in low_cardinality_cols
    and X[col].value_counts(normalize=True).max() > 0.7
]
X = X.drop(columns=low_cardinality_cols)
X_clean = X_clean.drop(columns=low_cardinality_cols + dominated_cols)

# Find features with absolute correlation > 0.9.
# Only numeric columns take part: X_clean may still contain categorical columns,
# and DataFrame.corr() on non-numeric data raises in recent pandas versions.
corr_matrix = X_clean.select_dtypes(include="number").corr().abs()
# Keep the strict upper triangle so each pair is inspected once.
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
high_corr_features = [column for column in upper_tri.columns if (upper_tri[column] > 0.9).any()]

# Drop one of the highly correlated features from X_clean
X_clean = X_clean.drop(columns=high_corr_features)

# Rename columns to avoid problems with LGBM
X = X.rename(columns=lambda name: re.sub('[^A-Za-z0-9_]+', '', name))

In [95]:
# Show X as left by the cell above: filtered and renamed columns, not yet dummy-encoded.
X

Unnamed: 0,oz1,oz2,oz3,oz4,oz5,oz6,oz7,oz8,oz9,oz10,...,oz254,oz256,oz257,oz258,oz259,oz260,oz261,oz262,oz264,oz265
0,0.106112,0.153159,0.533333,0.177273,0.164345,0.180812,0.188449,0.171053,0.156839,0.366485,...,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0
1,0.106112,0.099068,0.178231,0.181818,0.145179,0.184502,0.181485,0.184211,0.188336,0.620572,...,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0
2,0.106112,0.120172,0.317007,0.118182,0.096524,0.110701,0.108506,0.105263,0.102034,0.301090,...,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0
3,0.089232,0.087194,0.193878,0.095455,0.061344,0.092251,0.081890,0.092105,0.081679,0.274523,...,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0
4,0.111846,0.118452,0.271429,0.150000,0.119729,0.143911,0.153152,0.118421,0.164176,0.561308,...,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8880,0.455253,0.394826,0.284354,0.527273,0.351773,0.564576,0.442494,0.631579,0.456440,0.561308,...,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0
8881,0.129336,0.161179,0.425850,0.245455,0.226020,0.254613,0.281939,0.263158,0.241231,0.608311,...,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0
8882,0.111846,0.155457,0.505442,0.204545,0.209552,0.201107,0.244162,0.197368,0.205962,0.561308,...,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0
8883,0.129336,0.179882,0.531973,0.236364,0.223691,0.239852,0.236435,0.236842,0.214358,0.437330,...,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0


In [96]:
# Show X_clean, the reduced frame without the dominated and highly correlated columns.
X_clean

Unnamed: 0,oz1,oz3,oz5,oz10,oz12,oz18,oz24,oz31,oz36,oz38,...,oz216,oz217,oz223,oz224,oz231,oz232,oz239,oz256,oz260,oz265
0,0.106112,0.533333,0.164345,0.366485,0.007072,0.230769,0.001084,0.042617,0.057284,0.000135,...,0.000001,0.000193,0.000000,0.021665,0.000000,0.003564,0.001650,0,0,0
1,0.106112,0.178231,0.145179,0.620572,0.002829,0.239560,0.000974,0.033559,0.018877,0.000286,...,0.000007,0.000656,0.000000,0.024162,0.000000,0.000000,0.000000,0,0,0
2,0.106112,0.317007,0.096524,0.301090,0.041018,0.353846,0.001015,0.082513,0.077102,0.000027,...,0.000000,0.000055,0.000000,0.012414,0.000000,0.000000,0.000000,0,0,0
3,0.089232,0.193878,0.061344,0.274523,0.079208,0.349451,0.000985,0.018710,0.038895,0.000019,...,0.000000,0.000038,0.000000,0.009285,0.000000,0.000000,0.000000,0,0,0
4,0.111846,0.271429,0.119729,0.561308,0.005658,0.320879,0.000973,0.092412,0.065234,0.000085,...,0.000001,0.000236,0.000000,0.029936,0.000000,0.000000,0.000000,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8880,0.455253,0.284354,0.351773,0.561308,0.000000,0.134066,0.002055,0.165025,0.081000,0.034444,...,0.000008,0.000001,0.140954,0.117352,0.000000,0.000000,0.029703,0,0,0
8881,0.129336,0.425850,0.226020,0.608311,0.000000,0.186813,0.000986,0.038707,0.025471,0.001018,...,0.000038,0.001372,0.000000,0.034126,0.003255,0.008147,0.001320,0,0,0
8882,0.111846,0.505442,0.209552,0.561308,0.001414,0.279121,0.000983,0.092808,0.043313,0.000313,...,0.000007,0.000636,0.000000,0.039725,0.000000,0.002037,0.000000,0,0,0
8883,0.129336,0.531973,0.223691,0.437330,0.001414,0.257143,0.001167,0.106370,0.086353,0.000445,...,0.000002,0.000147,0.100716,0.012114,0.000000,0.000000,0.010891,0,0,0


In [86]:
# Show X_train: the "close" rows of the dummy-encoded X with standardized non-dummy columns.
X_train

Unnamed: 0,oz1,oz2,oz3,oz4,oz5,oz6,oz7,oz8,oz9,oz10,...,oz254,oz257,oz258,oz259,oz261,oz262,oz264,oz256_1,oz260_1,oz265_1
0,-0.884618,-0.739788,0.515927,-0.595265,-0.544639,-0.471132,-0.469190,-0.254175,-0.772635,-0.893560,...,-0.046927,-0.062632,-0.071735,-0.021859,-0.133986,-0.051635,-0.061955,0.0,0.0,0.0
1,-0.884618,-1.522186,-1.937543,-0.537708,-0.797438,-0.425048,-0.550386,-0.085653,-0.362768,1.683122,...,-0.046927,-0.062632,-0.071735,-0.021859,-0.133986,-0.051635,-0.061955,0.0,0.0,0.0
4,-0.805870,-1.241807,-1.293620,-0.940646,-1.133123,-0.931978,-0.880730,-0.928263,-0.677160,1.082130,...,-0.046927,-0.062632,-0.071735,-0.021859,-0.133986,-0.051635,-0.061955,0.0,0.0,0.0
5,-0.805870,-0.976687,-0.494599,-0.422580,-0.501244,-0.355923,-0.149957,-0.085653,-0.218000,1.648582,...,-0.046927,-0.062632,-0.071735,-0.021859,-0.133986,-0.051635,-0.061955,0.0,0.0,0.0
8,-1.341054,-1.580305,-0.527500,-1.631396,-1.350336,-1.623228,-1.431307,-1.602351,-1.485807,-0.409999,...,-0.046927,-0.062632,-0.071735,-0.021859,-0.133986,-0.051635,-0.061955,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8877,0.445795,0.768337,0.417229,0.613551,0.529935,0.842257,0.407290,0.925466,0.361061,-0.188947,...,-0.046927,-0.062632,-0.071735,-0.021859,-0.133986,-0.051635,-0.061955,0.0,0.0,0.0
8878,-0.565669,-0.247272,0.797940,-0.077199,0.229362,-0.010298,0.400621,0.251378,-0.048585,0.612381,...,-0.046927,-0.062632,-0.071735,-0.021859,-0.133986,-0.051635,-0.061955,0.0,0.0,0.0
8881,-0.565669,-0.623783,-0.226695,0.268182,0.268852,0.450548,0.620842,0.925466,0.325549,1.558784,...,-0.046927,-0.062632,-0.071735,-0.021859,-0.133986,-0.051635,-0.061955,0.0,0.0,0.0
8882,-0.805870,-0.706548,0.323223,-0.249896,0.051640,-0.217673,0.180387,0.082856,-0.133403,1.082130,...,-0.046927,-0.062632,-0.071735,-0.021859,-0.133986,-0.051635,-0.061955,0.0,0.0,0.0


In [91]:
# Count the X_clean columns whose dtype is not 'float' (presumably the categorical ones - TODO confirm).
np.sum(X_clean.dtypes!='float')

3

In [92]:
# Preview X_clean dummy-encoded (first level dropped) and cast to float32; the result is displayed only, not stored.
pd.get_dummies(X_clean, drop_first=True).astype('float32')

Unnamed: 0,oz1,oz3,oz5,oz10,oz12,oz18,oz24,oz31,oz36,oz38,...,oz216,oz217,oz223,oz224,oz231,oz232,oz239,oz256_1,oz260_1,oz265_1
0,0.106112,0.533333,0.164345,0.366485,0.007072,0.230769,0.001084,0.042617,0.057284,0.000135,...,0.000001,0.000193,0.000000,0.021665,0.000000,0.003564,0.001650,0.0,0.0,0.0
1,0.106112,0.178231,0.145179,0.620572,0.002829,0.239560,0.000974,0.033559,0.018877,0.000286,...,0.000007,0.000656,0.000000,0.024162,0.000000,0.000000,0.000000,0.0,0.0,0.0
2,0.106112,0.317007,0.096524,0.301090,0.041018,0.353846,0.001015,0.082513,0.077102,0.000027,...,0.000000,0.000055,0.000000,0.012414,0.000000,0.000000,0.000000,0.0,0.0,0.0
3,0.089232,0.193878,0.061344,0.274523,0.079208,0.349451,0.000985,0.018710,0.038895,0.000019,...,0.000000,0.000038,0.000000,0.009285,0.000000,0.000000,0.000000,0.0,0.0,0.0
4,0.111846,0.271429,0.119729,0.561308,0.005658,0.320879,0.000973,0.092412,0.065234,0.000085,...,0.000001,0.000236,0.000000,0.029936,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8880,0.455253,0.284354,0.351773,0.561308,0.000000,0.134066,0.002055,0.165025,0.081000,0.034444,...,0.000008,0.000001,0.140954,0.117352,0.000000,0.000000,0.029703,0.0,0.0,0.0
8881,0.129336,0.425850,0.226020,0.608311,0.000000,0.186813,0.000986,0.038707,0.025471,0.001018,...,0.000038,0.001372,0.000000,0.034126,0.003255,0.008147,0.001320,0.0,0.0,0.0
8882,0.111846,0.505442,0.209552,0.561308,0.001414,0.279121,0.000983,0.092808,0.043313,0.000313,...,0.000007,0.000636,0.000000,0.039725,0.000000,0.002037,0.000000,0.0,0.0,0.0
8883,0.129336,0.531973,0.223691,0.437330,0.001414,0.257143,0.001167,0.106370,0.086353,0.000445,...,0.000002,0.000147,0.100716,0.012114,0.000000,0.000000,0.010891,0.0,0.0,0.0


In [93]:
# Show X. NOTE(review): execution count 93 precedes the cell that rebuilds X (94),
# so the saved output presumably shows the dummy-encoded X from the first cell.
X

Unnamed: 0,oz1,oz2,oz3,oz4,oz5,oz6,oz7,oz8,oz9,oz10,...,oz254,oz257,oz258,oz259,oz261,oz262,oz264,oz256_1,oz260_1,oz265_1
0,0.106112,0.153159,0.533333,0.177273,0.164345,0.180812,0.188449,0.171053,0.156839,0.366485,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.106112,0.099068,0.178231,0.181818,0.145179,0.184502,0.181485,0.184211,0.188336,0.620572,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.106112,0.120172,0.317007,0.118182,0.096524,0.110701,0.108506,0.105263,0.102034,0.301090,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.089232,0.087194,0.193878,0.095455,0.061344,0.092251,0.081890,0.092105,0.081679,0.274523,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.111846,0.118452,0.271429,0.150000,0.119729,0.143911,0.153152,0.118421,0.164176,0.561308,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8880,0.455253,0.394826,0.284354,0.527273,0.351773,0.564576,0.442494,0.631579,0.456440,0.561308,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8881,0.129336,0.161179,0.425850,0.245455,0.226020,0.254613,0.281939,0.263158,0.241231,0.608311,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8882,0.111846,0.155457,0.505442,0.204545,0.209552,0.201107,0.244162,0.197368,0.205962,0.561308,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8883,0.129336,0.179882,0.531973,0.236364,0.223691,0.239852,0.236435,0.236842,0.214358,0.437330,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
