In [1]:
# Standard library
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time
import collections
import os
import sys
# Put the parent and grandparent of the current working directory on the
# import path. Presumably this is what lets the project-local imports below
# (utils, models, optuna_kfoldCV, regression_param_specs) resolve when the
# notebook is started from its own folder — TODO confirm the expected cwd.
sys.path.append(os.path.dirname(os.getcwd()))
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

# Third-party
from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor, tensor
import pandas as pd
import openml

# Project-local (depend on the sys.path entries added above)
from utils.utils import print_name, print_shape
from models import ResNet, NeuralEulerODE, RidgeCVModule, E2EResNet, StagewiseRandFeatBoostRegression
from optuna_kfoldCV import evaluate_dataset_with_model, run_all_openML_with_model
from regression_param_specs import evaluate_GRFBoost, evaluate_Ridge, evaluate_XGBoostRegressor

np.set_printoptions(precision=3, threshold=5) # NumPy print options: 3 decimals; arrays with more than 5 elements are summarized

# OpenML code

In [2]:
# Build a per-dataset metadata table for an OpenML benchmark suite.
#
# For every dataset in the suite the cell downloads the data, counts the
# missing values, and records size / target-cardinality statistics. The
# result is `df_metadata`, indexed by (and sorted on) `dataset_id`.

# ID of the OpenML suite whose datasets are summarised.
OPENML_SUITE_ID = 353
# Dataset whose `has_categorical` flag is overridden by hand further down.
# NOTE(review): presumably OpenML's categorical indicator misses a
# categorical column for this dataset — confirm and document the reason.
FORCED_CATEGORICAL_DATASET_ID = 44962

collection = openml.study.get_suite(OPENML_SUITE_ID)
dataset_ids = collection.data
metadata_list = []

# Fetch and process each dataset
for i, dataset_id in enumerate(dataset_ids):
    dataset = openml.datasets.get_dataset(dataset_id)
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        target=dataset.default_target_attribute
    )

    # Count missing values while X still offers `.isnull()` (i.e. before the
    # conversion to a NumPy array below).
    missing_values_count = X.isnull().sum().sum()
    print(f"Missing values in X: {missing_values_count}")

    X = np.array(X)
    y = np.array(y)[..., None]  # append a trailing axis: (n,) -> (n, 1)
    print(X.shape)
    print(y.shape)

    # Determine if the dataset has categorical features
    has_categorical = any(categorical_indicator)

    # Compute the number of distinct targets once and reuse it for both the
    # absolute and the relative statistic.
    n_unique_y = len(np.unique(y))

    # Extract the required metadata
    metadata = {
        'dataset_id': dataset.id,
        'name': dataset.name,
        'n_obs': int(dataset.qualities['NumberOfInstances']),
        'n_features': int(dataset.qualities['NumberOfFeatures']),
        '%_unique_y': n_unique_y / len(y),
        'n_unique_y': n_unique_y,
        'has_categorical': has_categorical,
        'n_missing_values': missing_values_count,
    }

    metadata_list.append(metadata)
    print(f" {i+1}/{len(dataset_ids)} Processed dataset {dataset.id}: {dataset.name}")

# Create a DataFrame from the metadata list, ordered by dataset id. (Sorting
# by '%_unique_y' first would be pointless: `sort_index` overrides it.)
df_metadata = pd.DataFrame(metadata_list).set_index("dataset_id").sort_index()

# Manual correction. The membership check matters: assigning through `.loc`
# with a label that is not in the index would silently append a new,
# mostly-NaN row instead of raising.
if FORCED_CATEGORICAL_DATASET_ID in df_metadata.index:
    df_metadata.loc[FORCED_CATEGORICAL_DATASET_ID, "has_categorical"] = True

# Display the metadata DataFrame
df_metadata

Missing values in X: 0
(4177, 8)
(4177, 1)
 1/35 Processed dataset 44956: abalone
Missing values in X: 0
(1503, 5)
(1503, 1)
 2/35 Processed dataset 44957: airfoil_self_noise
Missing values in X: 0
(2043, 7)
(2043, 1)
 3/35 Processed dataset 44958: auction_verification
Missing values in X: 0
(1030, 8)
(1030, 1)
 4/35 Processed dataset 44959: concrete_compressive_strength
Missing values in X: 0
(45730, 9)
(45730, 1)
 5/35 Processed dataset 44963: physiochemical_protein
Missing values in X: 0
(21263, 81)
(21263, 1)
 6/35 Processed dataset 44964: superconductivity
Missing values in X: 0
(1059, 116)
(1059, 1)
 7/35 Processed dataset 44965: geographical_origin_of_music
Missing values in X: 0
(1066, 10)
(1066, 1)
 8/35 Processed dataset 44966: solar_flare
Missing values in X: 0
(11934, 14)
(11934, 1)
 9/35 Processed dataset 44969: naval_propulsion_plant
Missing values in X: 0
(4898, 11)
(4898, 1)
 10/35 Processed dataset 44971: white_wine
Missing values in X: 0
(1599, 11)
(1599, 1)
 11/35 Pr

Unnamed: 0_level_0,name,n_obs,n_features,%_unique_y,n_unique_y,has_categorical,n_missing_values
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
41021,Moneyball,1232,15,0.303571,374,True,3600
44956,abalone,4177,9,0.006703,28,True,0
44957,airfoil_self_noise,1503,6,0.968729,1456,False,0
44958,auction_verification,2043,8,0.998042,2039,True,0
44959,concrete_compressive_strength,1030,9,0.91068,938,False,0
44960,energy_efficiency,768,9,0.764323,587,False,0
44962,forest_fires,517,13,0.485493,251,True,0
44963,physiochemical_protein,45730,10,0.347759,15903,False,0
44964,superconductivity,21263,82,0.141419,3007,False,0
44965,geographical_origin_of_music,1059,117,0.029273,31,False,0


In [3]:
# IDs of the datasets flagged as having no categorical features, converted
# to plain Python ints and sorted ascending.
dataset_ids_no_categorical = sorted(
    int(dataset_id)
    for dataset_id in df_metadata.query("has_categorical == False").index
)
# A bare `len(...)` in the middle of a cell is discarded (only the last
# expression is displayed), so print the count explicitly.
print(f"{len(dataset_ids_no_categorical)} datasets without categorical features")
dataset_ids_no_categorical

[44957,
 44959,
 44960,
 44963,
 44964,
 44965,
 44969,
 44970,
 44971,
 44972,
 44973,
 44975,
 44976,
 44977,
 44978,
 44980,
 44981,
 44983,
 44994,
 45402]

# Run experiments (just for testing)

In [4]:
# Test run of the Ridge evaluation on the first two datasets without
# categorical features (not a full benchmark run).
run_all_openML_with_model(
    dataset_ids_no_categorical[0:2], 
    evaluate_Ridge,
    name_model="Ridge",
    k_folds=5,
    cv_seed=42,
    regression_or_classification="regression",
    n_optuna_trials=50,
    # Use the GPU when one is available, otherwise fall back to the CPU so
    # the cell does not fail on CUDA-less machines.
    device="cuda" if torch.cuda.is_available() else "cpu",
    # NOTE(review): absolute, machine-specific path — consider deriving it
    # from a configurable base directory.
    save_dir = "/home/nikita/Code/zephyrox/pytorch_based/SWIM/save/"
)

[I 2024-11-24 19:46:24,665] A new study created in memory with name: no-name-be5af6b5-db14-41e2-b95f-fa49cf0e7994
[I 2024-11-24 19:46:24,678] Trial 0 finished with value: 0.6950941801071167 and parameters: {'l2_reg': 0.00014552383013554754}. Best is trial 0 with value: 0.6950941801071167.
[I 2024-11-24 19:46:24,685] Trial 1 finished with value: 0.6950941801071167 and parameters: {'l2_reg': 3.371653053764772e-05}. Best is trial 0 with value: 0.6950941801071167.
[I 2024-11-24 19:46:24,690] Trial 2 finished with value: 0.6950941801071167 and parameters: {'l2_reg': 1.0174402027027259e-05}. Best is trial 0 with value: 0.6950941801071167.
[I 2024-11-24 19:46:24,695] Trial 3 finished with value: 0.6950941920280457 and parameters: {'l2_reg': 0.0002802311138180538}. Best is trial 0 with value: 0.6950941801071167.
[I 2024-11-24 19:46:24,703] Trial 4 finished with value: 0.6950941801071167 and parameters: {'l2_reg': 2.0914813240683054e-06}. Best is trial 0 with value: 0.6950941801071167.
[I 2024-

 1/2 Processed dataset 44957


[I 2024-11-24 19:46:27,367] Trial 25 finished with value: 0.6031849265098572 and parameters: {'l2_reg': 0.004849705845417992}. Best is trial 11 with value: 0.6031803607940673.
[I 2024-11-24 19:46:27,394] Trial 26 finished with value: 0.6031851172447205 and parameters: {'l2_reg': 0.001120206505332291}. Best is trial 11 with value: 0.6031803607940673.
[I 2024-11-24 19:46:27,424] Trial 27 finished with value: 0.6031832695007324 and parameters: {'l2_reg': 0.03802487701547936}. Best is trial 11 with value: 0.6031803607940673.
[I 2024-11-24 19:46:27,446] Trial 28 finished with value: 0.603184723854065 and parameters: {'l2_reg': 0.008731019506062648}. Best is trial 11 with value: 0.6031803607940673.
[I 2024-11-24 19:46:27,463] Trial 29 finished with value: 0.6031851768493652 and parameters: {'l2_reg': 1.5661424008083162e-05}. Best is trial 11 with value: 0.6031803607940673.
[I 2024-11-24 19:46:27,478] Trial 30 finished with value: 0.6031836271286011 and parameters: {'l2_reg': 0.02959391630803

 2/2 Processed dataset 44959


{'44957': {'Ridge': {'score_train': [0.6904380321502686,
    0.6888523101806641,
    0.690494954586029,
    0.6793413758277893,
    0.6790646910667419],
   'score_test': [0.6714305281639099,
    0.6768062710762024,
    0.6710960268974304,
    0.7160691618919373,
    0.7178903818130493],
   't_fit': [0.0008225209967349656,
    0.0007326819977606647,
    0.0006001869987812825,
    0.0007846649968996644,
    0.0006251939994399436],
   't_inference': [2.9670001822523773e-05,
    4.5535998651757836e-05,
    2.634900010889396e-05,
    3.233600000385195e-05,
    3.9145998016465455e-05],
   'hyperparams': [{'l2_reg': 0.09996579083173561},
    {'l2_reg': 0.0993482987597182},
    {'l2_reg': 0.09955265792249261},
    {'l2_reg': 0.09793907046663583},
    {'l2_reg': 0.0993831456972149}]}},
 '44959': {'Ridge': {'score_train': [0.5919756889343262,
    0.5584654808044434,
    0.5737619400024414,
    0.5931830406188965,
    0.5818951725959778],
   'score_test': [0.5422599911689758,
    0.67038035392761

In [5]:
# Test run of the GRFBoost evaluation on the first two datasets without
# categorical features (not a full benchmark run).
run_all_openML_with_model(
    dataset_ids_no_categorical[0:2], 
    evaluate_GRFBoost,
    name_model="GRFBoost",
    k_folds=5,
    cv_seed=42,
    regression_or_classification="regression",
    n_optuna_trials=50,
    # Use the GPU when one is available, otherwise fall back to the CPU so
    # the cell does not fail on CUDA-less machines.
    device="cuda" if torch.cuda.is_available() else "cpu",
    # NOTE(review): absolute, machine-specific path — consider deriving it
    # from a configurable base directory.
    save_dir = "/home/nikita/Code/zephyrox/pytorch_based/SWIM/save/"
)

[I 2024-11-24 19:46:29,648] A new study created in memory with name: no-name-0ac5a302-e7cb-4d7d-94e1-7cb0d93f6aa0
[I 2024-11-24 19:46:29,865] Trial 0 finished with value: 0.5100523412227631 and parameters: {'out_dim': 1, 'feature_type': 'SWIM', 'upscale': 'dense', 'hidden_dim': 72, 'bottleneck_dim': 65, 'n_layers': 2, 'l2_reg': 0.03708426232915256, 'boost_lr': 0.560317869487389}. Best is trial 0 with value: 0.5100523412227631.
[I 2024-11-24 19:46:29,920] Trial 1 finished with value: 0.672361958026886 and parameters: {'out_dim': 1, 'feature_type': 'SWIM', 'upscale': 'dense', 'hidden_dim': 58, 'bottleneck_dim': 117, 'n_layers': 1, 'l2_reg': 0.05201398614001344, 'boost_lr': 0.3402186362542897}. Best is trial 0 with value: 0.5100523412227631.
[I 2024-11-24 19:46:29,952] Trial 2 finished with value: 0.621610426902771 and parameters: {'out_dim': 1, 'feature_type': 'SWIM', 'upscale': 'dense', 'hidden_dim': 7, 'bottleneck_dim': 81, 'n_layers': 1, 'l2_reg': 0.0044139048150050484, 'boost_lr': 0.

 1/2 Processed dataset 44957


[I 2024-11-24 19:49:23,577] Trial 1 finished with value: 0.40877466201782225 and parameters: {'out_dim': 1, 'feature_type': 'SWIM', 'upscale': 'dense', 'hidden_dim': 81, 'bottleneck_dim': 71, 'n_layers': 20, 'l2_reg': 0.047163285041506395, 'boost_lr': 0.5371224898525647}. Best is trial 0 with value: 0.40447619557380676.
[I 2024-11-24 19:49:23,640] Trial 2 finished with value: 0.4659204721450806 and parameters: {'out_dim': 1, 'feature_type': 'SWIM', 'upscale': 'dense', 'hidden_dim': 9, 'bottleneck_dim': 118, 'n_layers': 3, 'l2_reg': 1.5981803488170246e-06, 'boost_lr': 0.11691832408806457}. Best is trial 0 with value: 0.40447619557380676.
[I 2024-11-24 19:49:23,695] Trial 3 finished with value: 0.45696598291397095 and parameters: {'out_dim': 1, 'feature_type': 'SWIM', 'upscale': 'dense', 'hidden_dim': 15, 'bottleneck_dim': 64, 'n_layers': 3, 'l2_reg': 1.056732381197542e-05, 'boost_lr': 0.6892311440150917}. Best is trial 0 with value: 0.40447619557380676.
[I 2024-11-24 19:49:23,724] Trial

 2/2 Processed dataset 44959


{'44957': {'GRFBoost': {'score_train': [0.3855682611465454,
    0.4249568581581116,
    0.40754759311676025,
    0.385372132062912,
    0.3511205315589905],
   'score_test': [0.3958364427089691,
    0.4429141879081726,
    0.4208241403102875,
    0.4325748682022095,
    0.41912660002708435],
   't_fit': [0.31405893599730916,
    0.137850911996793,
    0.47334874299849616,
    0.20859759800077882,
    0.3554881740055862],
   't_inference': [0.023381550003250595,
    0.008082480002485681,
    0.03352681500109611,
    0.017155818997707684,
    0.02257360799558228],
   'hyperparams': [{'out_dim': 1,
     'feature_type': 'SWIM',
     'upscale': 'dense',
     'hidden_dim': 94,
     'bottleneck_dim': 127,
     'n_layers': 32,
     'l2_reg': 0.00024209286367691364,
     'boost_lr': 0.40526641471091535},
    {'out_dim': 1,
     'feature_type': 'SWIM',
     'upscale': 'dense',
     'hidden_dim': 100,
     'bottleneck_dim': 98,
     'n_layers': 15,
     'l2_reg': 0.00260945652434937,
     'boost_

In [6]:
# Test run of the XGBoost regressor evaluation on the first two datasets
# without categorical features (not a full benchmark run).
run_all_openML_with_model(
    dataset_ids_no_categorical[0:2], 
    evaluate_XGBoostRegressor,
    name_model="XGBoost",
    k_folds=5,
    cv_seed=42,
    regression_or_classification="regression",
    n_optuna_trials=50,
    # Use the GPU when one is available, otherwise fall back to the CPU so
    # the cell does not fail on CUDA-less machines.
    device="cuda" if torch.cuda.is_available() else "cpu",
    # NOTE(review): absolute, machine-specific path — consider deriving it
    # from a configurable base directory.
    save_dir = "/home/nikita/Code/zephyrox/pytorch_based/SWIM/save/"
)

[I 2024-11-24 19:52:05,835] A new study created in memory with name: no-name-90ec5cbd-a142-4961-91b6-b2c07cc149ee


[I 2024-11-24 19:52:06,577] Trial 0 finished with value: 0.3211060523986816 and parameters: {'objective': 'reg:squarederror', 'lambda': 0.4617797097916705, 'learning_rate': 0.2698657694588828, 'n_estimators': 280, 'max_depth': 10, 'subsample': 0.6486526548636167, 'colsample_bytree': 0.5180372033272707}. Best is trial 0 with value: 0.3211060523986816.
[I 2024-11-24 19:52:07,085] Trial 1 finished with value: 0.23578083217144014 and parameters: {'objective': 'reg:squarederror', 'lambda': 3.2786338369759638, 'learning_rate': 0.2072046098292768, 'n_estimators': 181, 'max_depth': 9, 'subsample': 0.8825628456762293, 'colsample_bytree': 0.9219312640439705}. Best is trial 1 with value: 0.23578083217144014.
[I 2024-11-24 19:52:07,570] Trial 2 finished with value: 0.3044317960739136 and parameters: {'objective': 'reg:squarederror', 'lambda': 0.0029700295915025504, 'learning_rate': 0.28827992538791447, 'n_estimators': 165, 'max_depth': 10, 'subsample': 0.5094043070377109, 'colsample_bytree': 0.632

 1/2 Processed dataset 44957


[I 2024-11-24 19:54:08,253] Trial 0 finished with value: 0.27470452785491944 and parameters: {'objective': 'reg:squarederror', 'lambda': 0.06738100931702286, 'learning_rate': 0.0936496037535095, 'n_estimators': 172, 'max_depth': 5, 'subsample': 0.5801380244899117, 'colsample_bytree': 0.7941505796435488}. Best is trial 0 with value: 0.27470452785491944.
[I 2024-11-24 19:54:09,655] Trial 1 finished with value: 0.3099649906158447 and parameters: {'objective': 'reg:squarederror', 'lambda': 0.002603355526267849, 'learning_rate': 0.02180932481187994, 'n_estimators': 289, 'max_depth': 8, 'subsample': 0.758136350998588, 'colsample_bytree': 0.5516133626094021}. Best is trial 0 with value: 0.27470452785491944.
[I 2024-11-24 19:54:10,055] Trial 2 finished with value: 0.2734390288591385 and parameters: {'objective': 'reg:squarederror', 'lambda': 4.098913991330432, 'learning_rate': 0.28757214125374647, 'n_estimators': 173, 'max_depth': 5, 'subsample': 0.80977789284589, 'colsample_bytree': 0.7990489

 2/2 Processed dataset 44959


{'44957': {'XGBoost': {'score_train': [0.060171518474817276,
    0.06686989963054657,
    0.07503871619701385,
    0.0744931548833847,
    0.0881117731332779],
   'score_test': [0.20337547361850739,
    0.18778125941753387,
    0.22138790786266327,
    0.2409968227148056,
    0.24799378216266632],
   't_fit': [0.1219685549949645,
    0.13496758799738018,
    0.1705539519971353,
    0.05931845199665986,
    0.09367380099865841],
   't_inference': [0.004859751999902073,
    0.006807538004068192,
    0.010431914997752756,
    0.0024921549993450753,
    0.0038661970029352233],
   'hyperparams': [{'objective': 'reg:squarederror',
     'lambda': 4.108602592245915,
     'learning_rate': 0.15917714189659568,
     'n_estimators': 241,
     'max_depth': 9,
     'subsample': 0.7613727768444187,
     'colsample_bytree': 0.9435869119879303},
    {'objective': 'reg:squarederror',
     'lambda': 0.41911991517761277,
     'learning_rate': 0.09267352399641375,
     'n_estimators': 290,
     'max_depth'

In [7]:
# TODO change to fit and fit_transform
# TODO make code for classification
# TODO make all models for reg
# TODO make optuna thing for END2END
# TODO ABLATION STUDY: experiment with the default f(x_t), f([x_t, x_0]), [f(x_t), x_0], and [f_1(x_t), f_2(x_0)]
# TODO or do I want a JSON/big array instead, e.g. results[rmse_test][model][dataset][fold]? Could work, minus the parameters