In [5]:
import pandas as pd
import numpy as np
import openml
from sklearn.linear_model import LinearRegression 
import lightgbm as lgbm
import optuna
from sklearn.ensemble import RandomForestRegressor
from engression import engression
import torch
from rtdl_revisiting_models import MLP, ResNet, FTTransformer
import random
import re
import os
from pygam import LinearGAM
import gower
from utils import EarlyStopping, train, train_trans, train_no_early_stopping, train_trans_no_early_stopping
from torch.utils.data import TensorDataset, DataLoader
import shutil
import gpboost as gpb

task_id = 361093

#openml.config.apikey = 'FILL_IN_OPENML_API_KEY'  # set the OpenML Api Key
#SUITE_ID = 336 # Regression on numerical features
#SUITE_ID = 337 # Classification on numerical features
SUITE_ID = 335 # Regression on numerical and categorical features
#SUITE_ID = 334 # Classification on numerical and categorical features
benchmark_suite = openml.study.get_suite(SUITE_ID)  # obtain the benchmark suite

for task_id in benchmark_suite.tasks:

    # Set the random seed for reproducibility
    N_TRIALS=100
    N_SAMPLES=100
    PATIENCE=40
    N_EPOCHS=1000

    BATCH_SIZE=1024
    seed=10
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    random.seed(seed)

    CHECKPOINT_PATH = f'CHECKPOINTS/GOWER/task_{task_id}.pt'

    print(f"Task {task_id}")

    task = openml.tasks.get_task(task_id)  # download the OpenML task
    dataset = task.get_dataset()

    X, y, categorical_indicator, attribute_names = dataset.get_data(
            dataset_format="dataframe", target=dataset.default_target_attribute)

    if len(X) > 15000:
        indices = np.random.choice(X.index, size=15000, replace=False)
        X = X.iloc[indices,]
        y = y[indices]

    # Remove categorical columns with more than 20 unique values and non-categorical columns with less than 10 unique values
    # Remove non-categorical columns with more than 70% of the data in one category from X_clean
    for col in [attribute for attribute, indicator in zip(attribute_names, categorical_indicator) if indicator]:
        if len(X[col].unique()) > 20:
            X = X.drop(col, axis=1)

    X_clean=X.copy()
    for col in [attribute for attribute, indicator in zip(attribute_names, categorical_indicator) if not indicator]:
        if len(X[col].unique()) < 10:
            X = X.drop(col, axis=1)
            X_clean = X_clean.drop(col, axis=1)
        elif X[col].value_counts(normalize=True).max() > 0.7:
            X_clean = X_clean.drop(col, axis=1)

    # Find features with absolute correlation > 0.9
    corr_matrix = X_clean.corr().abs()
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    high_corr_features = [column for column in upper_tri.columns if any(upper_tri[column] > 0.9)]

    # Drop one of the highly correlated features from X_clean
    X_clean = X_clean.drop(high_corr_features, axis=1)

    # Rename columns to avoid problems with LGBM
    X = X.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

    # Compute Gower distance and define train and test set
    # calculate the Gower distance matrix for the entire dataset
    for col in X_clean.select_dtypes(['category']).columns:
        X_clean[col] = X_clean[col].astype('object')

    gower_dist_matrix = gower.gower_matrix(X_clean)

    # calculate the Gower distance for each data point
    gower_dist = np.mean(gower_dist_matrix, axis=1)

    gower_dist=pd.Series(gower_dist,index=X_clean.index)
    far_index=gower_dist.index[np.where(gower_dist>=np.quantile(gower_dist,0.8))[0]]
    close_index=gower_dist.index[np.where(gower_dist<np.quantile(gower_dist,0.8))[0]]

    X_clean_ = X_clean.loc[close_index,:]

    for col in X_clean_.select_dtypes(['category']).columns:
        X_clean_[col] = X_clean_[col].astype('object')

    # calculate the Gower distance matrix for the training set
    gower_dist_matrix_train = gower.gower_matrix(X_clean_)

    # calculate the Gower distance for each data point in the training set
    gower_dist_train = np.mean(gower_dist_matrix_train, axis=1)

    gower_dist_train=pd.Series(gower_dist_train,index=X_clean_.index)
    far_index_train=gower_dist_train.index[np.where(gower_dist_train>=np.quantile(gower_dist_train,0.8))[0]]
    close_index_train=gower_dist_train.index[np.where(gower_dist_train<np.quantile(gower_dist_train,0.8))[0]]

    # Check if categorical variables have the same cardinality in X and X_train_, and remove the ones that don't
    dummy_cols = X.select_dtypes(['bool', 'category', 'object', 'string']).columns
    X_train = X.loc[close_index,:]
    X_train_ = X_train.loc[close_index_train,:]
    for col in dummy_cols:
        if len(X[col].unique()) != len(X_train_[col].unique()):
            X = X.drop(col, axis=1)

    # Convert data to PyTorch tensors
    # Modify X_train_, X_val, X_train, and X_test to have dummy variables
    non_dummy_cols = X.select_dtypes(exclude=['bool', 'category', 'object', 'string']).columns
    X = pd.get_dummies(X, drop_first=True).astype('float32')

    print(X.dtypes)

Task 361093




Actions_taken       float32
Year_of_decision    float32
Liberal_1           float32
Unanimous_1         float32
dtype: object
Task 361094




northing       float32
easting        float32
resistivity    float32
dtype: object
Task 361096




carat        float32
depth        float32
table        float32
x            float32
y            float32
z            float32
color_1      float32
color_2      float32
color_3      float32
color_4      float32
color_5      float32
color_6      float32
clarity_1    float32
clarity_2    float32
clarity_3    float32
clarity_4    float32
clarity_5    float32
clarity_6    float32
clarity_7    float32
dtype: object
Task 361097




X3_1      float32
X3_2      float32
X3_3      float32
X3_4      float32
X3_5      float32
           ...   
X377_1    float32
X378_1    float32
X379_1    float32
X380_1    float32
X385_1    float32
Length: 270, dtype: object
Task 361098




area                  float32
rooms                 float32
bathroom              float32
parking_spaces        float32
hoa_BRL               float32
rent_amount_BRL       float32
property_tax_BRL      float32
fire_insurance_BRL    float32
city_1                float32
city_2                float32
city_3                float32
city_4                float32
animal_1              float32
furniture_1           float32
dtype: object
Task 361099




month           float32
hour            float32
temp            float32
feel_temp       float32
humidity        float32
windspeed       float32
season_1        float32
season_2        float32
season_3        float32
year_1          float32
workingday_1    float32
dtype: object
Task 361101




tolls_amount                    float32
total_amount                    float32
lpep_pickup_datetime_day        float32
lpep_pickup_datetime_hour       float32
lpep_pickup_datetime_minute     float32
lpep_dropoff_datetime_day       float32
lpep_dropoff_datetime_hour      float32
lpep_dropoff_datetime_minute    float32
dtype: object
Task 361102




bedrooms         float32
bathrooms        float32
sqft_living      float32
sqft_lot         float32
grade            float32
sqft_above       float32
sqft_basement    float32
yr_built         float32
yr_renovated     float32
lat              float32
long             float32
sqft_living15    float32
sqft_lot15       float32
date_month       float32
date_day         float32
date_year_1      float32
dtype: object
Task 361103




Hour                                         float32
Altitudem                                    float32
PMsub25subparticulatematterHourlymeasured    float32
Month_1                                      float32
Month_10                                     float32
Month_11                                     float32
Month_2                                      float32
Month_3                                      float32
Month_4                                      float32
Month_5                                      float32
Month_6                                      float32
Month_7                                      float32
Month_8                                      float32
Month_9                                      float32
DayofWeek_1                                  float32
DayofWeek_2                                  float32
DayofWeek_3                                  float32
DayofWeek_4                                  float32
DayofWeek_5                                  f



Run2      float32
Run3      float32
Run4      float32
KWG_1     float32
KWI_1     float32
STRM_1    float32
STRN_1    float32
SA_1      float32
SB_1      float32
dtype: object
Task 361287




oz1        float32
oz2        float32
oz3        float32
oz4        float32
oz5        float32
            ...   
oz259      float32
oz261      float32
oz262      float32
oz264      float32
oz265_1    float32
Length: 253, dtype: object
Task 361288




Length            float32
Diameter          float32
Height            float32
Whole_weight      float32
Shucked_weight    float32
Viscera_weight    float32
Shell_weight      float32
Sex_1             float32
Sex_2             float32
dtype: object
Task 361289




Occurred_hour    float32
Occurred_min     float32
dtype: object
Task 361291




Exception: File: C:\Users\dalma\.openml\org\openml\www\datasets\45045\dataset_45045.pq

In [2]:
X.dtypes

Actions_taken       float32
Year_of_decision    float32
Liberal_1           float32
Unanimous_1         float32
dtype: object

In [6]:
task_id = 361287
N_TRIALS=100
N_SAMPLES=100
PATIENCE=40
N_EPOCHS=1000

BATCH_SIZE=1024
seed=10
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)

CHECKPOINT_PATH = f'CHECKPOINTS/GOWER/task_{task_id}.pt'

print(f"Task {task_id}")

task = openml.tasks.get_task(task_id)  # download the OpenML task
dataset = task.get_dataset()

X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="dataframe", target=dataset.default_target_attribute)

Task 361287




In [7]:
X.dtypes

oz1       float64
oz2       float64
oz3       float64
oz4       float64
oz5       float64
           ...   
oz260    category
oz261     float64
oz262     float64
oz264     float64
oz265    category
Length: 255, dtype: object

In [8]:
categorical_indicator

[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,


In [9]:
X

Unnamed: 0,oz1,oz2,oz3,oz4,oz5,oz6,oz7,oz8,oz9,oz10,...,oz254,oz256,oz257,oz258,oz259,oz260,oz261,oz262,oz264,oz265
0,0.106112,0.153159,0.533333,0.177273,0.164345,0.180812,0.188449,0.171053,0.156839,0.366485,...,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0
1,0.106112,0.099068,0.178231,0.181818,0.145179,0.184502,0.181485,0.184211,0.188336,0.620572,...,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0
2,0.106112,0.120172,0.317007,0.118182,0.096524,0.110701,0.108506,0.105263,0.102034,0.301090,...,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0
3,0.089232,0.087194,0.193878,0.095455,0.061344,0.092251,0.081890,0.092105,0.081679,0.274523,...,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0
4,0.111846,0.118452,0.271429,0.150000,0.119729,0.143911,0.153152,0.118421,0.164176,0.561308,...,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8880,0.455253,0.394826,0.284354,0.527273,0.351773,0.564576,0.442494,0.631579,0.456440,0.561308,...,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0
8881,0.129336,0.161179,0.425850,0.245455,0.226020,0.254613,0.281939,0.263158,0.241231,0.608311,...,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0
8882,0.111846,0.155457,0.505442,0.204545,0.209552,0.201107,0.244162,0.197368,0.205962,0.561308,...,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0
8883,0.129336,0.179882,0.531973,0.236364,0.223691,0.239852,0.236435,0.236842,0.214358,0.437330,...,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0


In [12]:
X['oz261'].unique_values()

AttributeError: 'Series' object has no attribute 'unique_values'

In [13]:
pd.unique(X['oz261'])

array([0.      , 0.008214, 0.00616 , 0.01232 , 0.026694, 0.024641,
       0.045175, 0.188912, 0.016427, 0.028747, 0.010267, 0.049281,
       0.098563, 0.014374, 0.100616, 0.106776, 0.020534, 0.047228,
       0.057495, 0.032854, 0.685832, 0.022587, 0.030801, 0.36961 ,
       0.13963 , 0.041068, 0.104723, 0.143737, 0.078029, 0.110883,
       0.01848 , 0.102669, 0.071869, 0.390144, 0.053388, 0.542094,
       0.205339, 0.332649, 0.043121, 0.039014, 0.088296, 0.075975,
       0.137577, 0.123203, 0.283368, 0.193018, 0.065708, 0.063655,
       0.147844, 0.082136, 0.244353, 1.      , 0.184805, 0.036961,
       0.094456, 0.059548, 0.131417, 0.151951, 0.092402, 0.229979,
       0.201232, 0.051335, 0.11499 , 0.069815, 0.156057, 0.061602,
       0.119097, 0.135524, 0.073922])

In [14]:
X['oz261'].value_counts(normalize=True)

oz261
0.000000    0.942375
0.008214    0.012606
0.006160    0.010467
0.012320    0.005853
0.024641    0.003376
              ...   
0.137577    0.000113
0.123203    0.000113
0.283368    0.000113
0.063655    0.000113
0.073922    0.000113
Name: proportion, Length: 69, dtype: float64