In [4]:
# Standard library
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time
import collections
import os
import sys
# Put the parent and the grandparent of the current working directory on the
# import path. Presumably this is what makes the project-local modules imported
# below (optuna_kfoldCV, regression_param_specs) resolvable -- verify against
# the repository layout. Note this depends on where the kernel was started.
sys.path.append(os.path.dirname(os.getcwd()))
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

# Third-party
from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor
import pandas as pd
import openml

# Project-local
from optuna_kfoldCV import evaluate_dataset_with_model, run_all_openML_with_model
from regression_param_specs import evaluate_GRFBoost, evaluate_Ridge, evaluate_XGBoostRegressor

np.set_printoptions(precision=3, threshold=5) # 3 digits of float precision; arrays with more than 5 elements print summarized

# OpenML code

In [3]:
# Build one metadata record per dataset of OpenML benchmark suite 353 and
# collect the records in `df_metadata` (indexed and ordered by dataset id).
collection = openml.study.get_suite(353)
dataset_ids = collection.data
metadata_list = []

# Fetch and process each dataset
for i, dataset_id in enumerate(dataset_ids):
    dataset = openml.datasets.get_dataset(dataset_id)
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        target=dataset.default_target_attribute
    )

    # Count missing values before X is converted to a plain numpy array.
    missing_values_count = X.isnull().sum().sum()
    print(f"Missing values in X: {missing_values_count}")

    X = np.array(X)
    y = np.array(y)[..., None]  # add a trailing axis: (n,) -> (n, 1)
    print(X.shape)
    print(y.shape)

    # Determine if the dataset has categorical features
    has_categorical = any(categorical_indicator)

    # Number of distinct target values; computed once and reused below.
    n_unique_y = len(np.unique(y))

    # Extract the required metadata
    metadata = {
        'dataset_id': dataset.id,
        'name': dataset.name,
        'n_obs': int(dataset.qualities['NumberOfInstances']),
        # NOTE(review): this OpenML quality appears to count the target column
        # too (e.g. abalone: X has 8 columns, the quality reports 9) -- confirm
        # whether X.shape[1] is what downstream analysis actually wants.
        'n_features': int(dataset.qualities['NumberOfFeatures']),
        '%_unique_y': n_unique_y / len(y),
        'n_unique_y': n_unique_y,
        'has_categorical': has_categorical,
        'n_missing_values': missing_values_count,
    }

    metadata_list.append(metadata)
    print(f" {i+1}/{len(dataset_ids)} Processed dataset {dataset.id}: {dataset.name}")

# Create a DataFrame from the metadata list. Only the final sort_index()
# determines the row order, so no intermediate sort by '%_unique_y' is needed
# (sort_values returns a new frame; calling it without assignment is a no-op).
df_metadata = pd.DataFrame(metadata_list).set_index("dataset_id").sort_index()

# Manual override: treat dataset 44962 as having categorical features,
# presumably because OpenML's categorical indicator misses them -- TODO confirm.
# Guarded so that a missing id does not silently append a mostly-NaN row.
if 44962 in df_metadata.index:
    df_metadata.loc[44962, "has_categorical"] = True

# Display the metadata DataFrame
df_metadata

Missing values in X: 0
(4177, 8)
(4177, 1)
 1/35 Processed dataset 44956: abalone
Missing values in X: 0
(1503, 5)
(1503, 1)
 2/35 Processed dataset 44957: airfoil_self_noise
Missing values in X: 0
(2043, 7)
(2043, 1)
 3/35 Processed dataset 44958: auction_verification
Missing values in X: 0
(1030, 8)
(1030, 1)
 4/35 Processed dataset 44959: concrete_compressive_strength
Missing values in X: 0
(45730, 9)
(45730, 1)
 5/35 Processed dataset 44963: physiochemical_protein
Missing values in X: 0
(21263, 81)
(21263, 1)
 6/35 Processed dataset 44964: superconductivity
Missing values in X: 0
(1059, 116)
(1059, 1)
 7/35 Processed dataset 44965: geographical_origin_of_music
Missing values in X: 0
(1066, 10)
(1066, 1)
 8/35 Processed dataset 44966: solar_flare
Missing values in X: 0
(11934, 14)
(11934, 1)
 9/35 Processed dataset 44969: naval_propulsion_plant
Missing values in X: 0
(4898, 11)
(4898, 1)
 10/35 Processed dataset 44971: white_wine
Missing values in X: 0
(1599, 11)
(1599, 1)
 11/35 Pr

Unnamed: 0_level_0,name,n_obs,n_features,%_unique_y,n_unique_y,has_categorical,n_missing_values
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
41021,Moneyball,1232,15,0.303571,374,True,3600
44956,abalone,4177,9,0.006703,28,True,0
44957,airfoil_self_noise,1503,6,0.968729,1456,False,0
44958,auction_verification,2043,8,0.998042,2039,True,0
44959,concrete_compressive_strength,1030,9,0.91068,938,False,0
44960,energy_efficiency,768,9,0.764323,587,False,0
44962,forest_fires,517,13,0.485493,251,True,0
44963,physiochemical_protein,45730,10,0.347759,15903,False,0
44964,superconductivity,21263,82,0.141419,3007,False,0
44965,geographical_origin_of_music,1059,117,0.029273,31,False,0


In [3]:
# IDs of all datasets without categorical features, as a sorted list of plain
# Python ints. The cell's last expression displays the list.
dataset_ids_no_categorical = sorted(
    int(dataset_id)
    for dataset_id in df_metadata.query("has_categorical == False").index.values
)
dataset_ids_no_categorical

[44957,
 44959,
 44960,
 44963,
 44964,
 44965,
 44969,
 44970,
 44971,
 44972,
 44973,
 44975,
 44976,
 44977,
 44978,
 44980,
 44981,
 44983,
 44994,
 45402]

# Run experiments (just for testing)

In [4]:
# Smoke test: run the Ridge evaluation on the first two datasets only,
# with 5 CV folds and a deliberately tiny Optuna budget.
run_all_openML_with_model(
    dataset_ids_no_categorical[0:2],
    evaluate_Ridge,
    name_model="Ridge",
    k_folds=5,
    cv_seed=42,
    regression_or_classification="regression",
    n_optuna_trials=2,
    # Fall back to the CPU when no GPU is visible instead of failing on a
    # hard-coded "cuda".
    device="cuda" if torch.cuda.is_available() else "cpu",
    # NOTE(review): absolute, machine-specific path -- make this configurable
    # (e.g. relative to the repository root) before sharing the notebook.
    save_dir = "/home/nikita/Code/zephyrox/pytorch_based/SWIM/save/"
)

[I 2024-11-25 11:03:37,696] A new study created in memory with name: no-name-7e6c082b-99a0-4ca1-b92e-b5429f9b02db
[I 2024-11-25 11:03:37,731] Trial 0 finished with value: 0.6950941801071167 and parameters: {'l2_reg': 4.770460224451055e-07}. Best is trial 0 with value: 0.6950941801071167.
[I 2024-11-25 11:03:37,738] Trial 1 finished with value: 0.6950941801071167 and parameters: {'l2_reg': 1.4210086053412847e-05}. Best is trial 0 with value: 0.6950941801071167.
[I 2024-11-25 11:03:37,742] A new study created in memory with name: no-name-bc6d3322-924f-47fb-8065-a75d6ea56618
[I 2024-11-25 11:03:37,747] Trial 0 finished with value: 0.6902536392211914 and parameters: {'l2_reg': 0.00017669688248924572}. Best is trial 0 with value: 0.6902536392211914.
[I 2024-11-25 11:03:37,758] Trial 1 finished with value: 0.6902536273002624 and parameters: {'l2_reg': 3.692257350103396e-07}. Best is trial 1 with value: 0.6902536273002624.
[I 2024-11-25 11:03:37,760] A new study created in memory with name: n

 1/2 Processed dataset 44957
 2/2 Processed dataset 44959


{'44957': {'Ridge': {'score_train': [0.6904380321502686,
    0.6888523101806641,
    0.690494954586029,
    0.6793413758277893,
    0.6790646910667419],
   'score_test': [0.6714187264442444,
    0.6768081784248352,
    0.6710893511772156,
    0.7160781025886536,
    0.7178991436958313],
   't_fit': [0.0014943280002626125,
    0.0007019670001682243,
    0.0008961670000644517,
    0.00687843800005794,
    0.0005899299994780449],
   't_inference': [8.147500011546072e-05,
    6.105400007072603e-05,
    6.80860002830741e-05,
    7.047199960652506e-05,
    4.3855000512849074e-05],
   'hyperparams': [{'l2_reg': 4.770460224451055e-07},
    {'l2_reg': 3.692257350103396e-07},
    {'l2_reg': 0.012303607322394005},
    {'l2_reg': 1.0422206015206492e-05},
    {'l2_reg': 0.00042002862208719694}]}},
 '44959': {'Ridge': {'score_train': [0.5919755101203918,
    0.5584654808044434,
    0.5737617611885071,
    0.5931828022003174,
    0.5818951725959778],
   'score_test': [0.5422646403312683,
    0.670380

In [5]:
# Smoke test: run the GRFBoost evaluation on the first two datasets only,
# with 5 CV folds and a deliberately tiny Optuna budget.
run_all_openML_with_model(
    dataset_ids_no_categorical[0:2],
    evaluate_GRFBoost,
    name_model="GRFBoost",
    k_folds=5,
    cv_seed=42,
    regression_or_classification="regression",
    n_optuna_trials=2,
    # Fall back to the CPU when no GPU is visible instead of failing on a
    # hard-coded "cuda".
    device="cuda" if torch.cuda.is_available() else "cpu",
    # NOTE(review): absolute, machine-specific path -- make this configurable
    # (e.g. relative to the repository root) before sharing the notebook.
    save_dir = "/home/nikita/Code/zephyrox/pytorch_based/SWIM/save/"
)

[I 2024-11-25 11:03:37,985] A new study created in memory with name: no-name-add3a45d-ab4d-4687-adeb-43b13e38bf25
[I 2024-11-25 11:03:38,154] Trial 0 finished with value: 0.502512264251709 and parameters: {'out_dim': 1, 'feature_type': 'SWIM', 'upscale': 'dense', 'hidden_dim': 7, 'bottleneck_dim': 105, 'n_layers': 6, 'l2_reg': 0.0002456949199034808, 'boost_lr': 0.7202596033157238}. Best is trial 0 with value: 0.502512264251709.
[I 2024-11-25 11:03:38,277] Trial 1 finished with value: 0.5018909752368927 and parameters: {'out_dim': 1, 'feature_type': 'SWIM', 'upscale': 'dense', 'hidden_dim': 5, 'bottleneck_dim': 97, 'n_layers': 6, 'l2_reg': 0.02898469778422591, 'boost_lr': 0.18768074232656476}. Best is trial 1 with value: 0.5018909752368927.
[I 2024-11-25 11:03:38,320] A new study created in memory with name: no-name-c34009ce-6167-4ecc-90c0-d496b044184b
[I 2024-11-25 11:03:38,430] Trial 0 finished with value: 0.5110386133193969 and parameters: {'out_dim': 1, 'feature_type': 'SWIM', 'upsc

 1/2 Processed dataset 44957


[I 2024-11-25 11:03:40,100] Trial 0 finished with value: 0.4341720461845398 and parameters: {'out_dim': 1, 'feature_type': 'SWIM', 'upscale': 'dense', 'hidden_dim': 90, 'bottleneck_dim': 66, 'n_layers': 11, 'l2_reg': 0.002916384318351732, 'boost_lr': 0.31787767193090405}. Best is trial 0 with value: 0.4341720461845398.
[I 2024-11-25 11:03:40,176] Trial 1 finished with value: 0.43174594044685366 and parameters: {'out_dim': 1, 'feature_type': 'SWIM', 'upscale': 'dense', 'hidden_dim': 31, 'bottleneck_dim': 91, 'n_layers': 4, 'l2_reg': 4.1839106172508574e-05, 'boost_lr': 0.18883815575792723}. Best is trial 1 with value: 0.43174594044685366.
[I 2024-11-25 11:03:40,196] A new study created in memory with name: no-name-a9263743-9674-422c-b740-e1cf8bea6a68
[I 2024-11-25 11:03:41,163] Trial 0 finished with value: 0.3975066184997559 and parameters: {'out_dim': 1, 'feature_type': 'SWIM', 'upscale': 'dense', 'hidden_dim': 58, 'bottleneck_dim': 124, 'n_layers': 31, 'l2_reg': 0.0002871676970166237, 

 2/2 Processed dataset 44959


{'44957': {'GRFBoost': {'score_train': [0.47585028409957886,
    0.4695267677307129,
    0.4265369474887848,
    0.45623016357421875,
    0.43326061964035034],
   'score_test': [0.4703754782676697,
    0.47517815232276917,
    0.4415760636329651,
    0.5007573366165161,
    0.4916688799858093],
   't_fit': [0.0343653740001173,
    0.014975527999922633,
    0.10601014499934536,
    0.018334454999603622,
    0.056702144000155386],
   't_inference': [0.0033401739992768853,
    0.0010968409997076378,
    0.00960470400059421,
    0.0011972720003541326,
    0.0038464350000140257],
   'hyperparams': [{'out_dim': 1,
     'feature_type': 'SWIM',
     'upscale': 'dense',
     'hidden_dim': 5,
     'bottleneck_dim': 97,
     'n_layers': 6,
     'l2_reg': 0.02898469778422591,
     'boost_lr': 0.18768074232656476},
    {'out_dim': 1,
     'feature_type': 'SWIM',
     'upscale': 'dense',
     'hidden_dim': 39,
     'bottleneck_dim': 70,
     'n_layers': 2,
     'l2_reg': 0.0009039930010732471,
     

In [6]:
# Smoke test: run the XGBoost regressor evaluation on the first two datasets
# only, with 5 CV folds and a deliberately tiny Optuna budget.
run_all_openML_with_model(
    dataset_ids_no_categorical[0:2],
    evaluate_XGBoostRegressor,
    name_model="XGBoost",
    k_folds=5,
    cv_seed=42,
    regression_or_classification="regression",
    n_optuna_trials=2,
    # Fall back to the CPU when no GPU is visible instead of failing on a
    # hard-coded "cuda".
    device="cuda" if torch.cuda.is_available() else "cpu",
    # NOTE(review): absolute, machine-specific path -- make this configurable
    # (e.g. relative to the repository root) before sharing the notebook.
    save_dir = "/home/nikita/Code/zephyrox/pytorch_based/SWIM/save/"
)

[I 2024-11-25 11:03:43,523] A new study created in memory with name: no-name-c44e772d-6b36-4c73-bcf3-04577d7f9148
[I 2024-11-25 11:03:43,886] Trial 0 finished with value: 0.6167511105537414 and parameters: {'objective': 'reg:squarederror', 'lambda': 0.05452469009316798, 'learning_rate': 0.012785943639332416, 'n_estimators': 86, 'max_depth': 9, 'subsample': 0.8661561259409865, 'colsample_bytree': 0.6675369549054682}. Best is trial 0 with value: 0.6167511105537414.
[I 2024-11-25 11:03:44,176] Trial 1 finished with value: 0.2555988609790802 and parameters: {'objective': 'reg:squarederror', 'lambda': 0.06436044092893864, 'learning_rate': 0.10548019940205561, 'n_estimators': 179, 'max_depth': 6, 'subsample': 0.7874459483961107, 'colsample_bytree': 0.9359840421044765}. Best is trial 1 with value: 0.2555988609790802.
[I 2024-11-25 11:03:44,241] A new study created in memory with name: no-name-a0164860-f388-470f-a70c-3d665cf1d01b
[I 2024-11-25 11:03:45,087] Trial 0 finished with value: 0.29454

 1/2 Processed dataset 44957


[I 2024-11-25 11:03:49,866] Trial 0 finished with value: 0.3194282233715057 and parameters: {'objective': 'reg:squarederror', 'lambda': 0.027486172826441117, 'learning_rate': 0.20624709869755437, 'n_estimators': 72, 'max_depth': 7, 'subsample': 0.7423580745094693, 'colsample_bytree': 0.5221873711491241}. Best is trial 0 with value: 0.3194282233715057.
[I 2024-11-25 11:03:50,061] Trial 1 finished with value: 0.3729180932044983 and parameters: {'objective': 'reg:squarederror', 'lambda': 0.023369680377439195, 'learning_rate': 0.04189406062406382, 'n_estimators': 63, 'max_depth': 4, 'subsample': 0.6975529369693614, 'colsample_bytree': 0.7752277105870615}. Best is trial 0 with value: 0.3194282233715057.
[I 2024-11-25 11:03:50,121] A new study created in memory with name: no-name-a35fbc2f-799e-430a-a211-26b1abff6595
[I 2024-11-25 11:03:50,360] Trial 0 finished with value: 0.26703527867794036 and parameters: {'objective': 'reg:squarederror', 'lambda': 0.009568241003476868, 'learning_rate': 0.

 2/2 Processed dataset 44959


{'44957': {'XGBoost': {'score_train': [0.09720489382743835,
    0.12964589893817902,
    0.061221685260534286,
    0.03712787106633186,
    0.1342189610004425],
   'score_test': [0.23469944298267365,
    0.2275925874710083,
    0.23300482332706451,
    0.24160225689411163,
    0.30334922671318054],
   't_fit': [0.057530427000529016,
    0.23463043599986122,
    0.1763638810007251,
    0.16610003000005236,
    0.036808136999752605],
   't_inference': [0.002540259999477712,
    0.006157441000141262,
    0.0060171789991727564,
    0.004987080000319111,
    0.0024545870001020376],
   'hyperparams': [{'objective': 'reg:squarederror',
     'lambda': 0.06436044092893864,
     'learning_rate': 0.10548019940205561,
     'n_estimators': 179,
     'max_depth': 6,
     'subsample': 0.7874459483961107,
     'colsample_bytree': 0.9359840421044765},
    {'objective': 'reg:squarederror',
     'lambda': 0.005610204576382474,
     'learning_rate': 0.01571876388948241,
     'n_estimators': 194,
     'max

In [None]:
# TODO ablation study: experiment with the default f(x_t), f([x_t, x_0]), [f(x_t), x_0], and [f_1(x_t), f_2(x_0)]
# TODO ablation study for scalar/diag/dense DELTA
# TODO GradientRFBoost for binary classification

Regression Models

* XGBoost (trees)
* Ridge Regression
* E2E ResNet

for feat_type in [SWIM, Gaussian iid]:
* Random Feature ResNet
* Greedy RandFeatBoost
* Gradient RandFeatBoost
* (optional) Greedy SGD ResNetBoost

Classification differences:

* Ridge -> Logistic Regression
* No Greedy RFBoost

# SCALAR VS DIAG VS DENSE

Only works for regression. Should I have a separate graph for this?