In [1]:
# --- Standard library ---
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time
import collections
import os
import sys
# Append the parent and the grandparent of the current working directory to
# sys.path. This must run before the local imports further down.
# NOTE(review): presumably this is what makes `preprocessing`, `utils` and
# `models` importable when the kernel is started from a subdirectory of the
# repository -- confirm against the repo layout. The result depends on the
# directory the kernel was launched from.
sys.path.append(os.path.dirname(os.getcwd()))
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

# --- Third-party ---
from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor, tensor
import pandas as pd
import openml

#from aeon.regression.sklearn import RotationForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

# --- Local modules (presumably project code found via the sys.path additions above) ---
from preprocessing.stream_transforms import normalize_mean_std_traindata, normalize_streams, augment_time, add_basepoint_zero
from utils.utils import print_name, print_shape
from models import ResNet, NeuralEulerODE, RidgeCVModule, E2EResNet, StagewiseRandFeatBoostRegression

np.set_printoptions(precision=3, threshold=5) # Print floats with 3 digits of precision; summarize arrays with more than 5 elements

# OpenML code

In [2]:
# Build a metadata table (one row per dataset) for OpenML suite 353.
# The table is later used to select which datasets to benchmark on.
collection = openml.study.get_suite(353)
dataset_ids = collection.data
metadata_list = []

# Download each dataset and record its size and target statistics.
for i, dataset_id in enumerate(dataset_ids):
    dataset = openml.datasets.get_dataset(dataset_id)
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        target=dataset.default_target_attribute
    )
    X = np.array(X)
    y = np.array(y)[..., None]  # add a trailing axis: (n,) -> (n, 1)
    print(X.shape)
    print(y.shape)

    # True if at least one feature column is flagged as categorical.
    has_categorical = any(categorical_indicator)

    # Count the distinct target values once; both statistics below reuse it.
    n_unique_y = len(np.unique(y))

    # NOTE: n_obs / n_features come from the OpenML "qualities", not from X.
    # In the recorded output n_features is one larger than X.shape[1]
    # (e.g. abalone: X has 8 columns, n_features is 9), so it appears to
    # count the target column as well.
    metadata = {
        'dataset_id': dataset.id,
        'name': dataset.name,
        'n_obs': int(dataset.qualities['NumberOfInstances']),
        'n_features': int(dataset.qualities['NumberOfFeatures']),
        '%_unique_y': n_unique_y / len(y),
        'n_unique_y': n_unique_y,
        'has_categorical': has_categorical
    }

    metadata_list.append(metadata)
    print(f" {i+1}/{len(dataset_ids)} Processed dataset {dataset.id}: {dataset.name}")

# One row per dataset, indexed and ordered by dataset id. The previous
# intermediate sorts by '%_unique_y' had no effect: one was immediately
# overridden by sort_index(), the other discarded its (non in-place) result.
df_metadata = pd.DataFrame(metadata_list).set_index("dataset_id").sort_index()

# Manual override: treat dataset 44962 as having categorical features.
# NOTE(review): presumably OpenML's categorical_indicator misses them for this
# dataset -- confirm. The membership check prevents .loc from silently
# appending a new, mostly-NaN row if the id is ever absent from the suite.
if 44962 in df_metadata.index:
    df_metadata.loc[44962, "has_categorical"] = True

# Display the metadata DataFrame
df_metadata

(4177, 8)
(4177, 1)
 1/35 Processed dataset 44956: abalone
(1503, 5)
(1503, 1)
 2/35 Processed dataset 44957: airfoil_self_noise
(2043, 7)
(2043, 1)
 3/35 Processed dataset 44958: auction_verification
(1030, 8)
(1030, 1)
 4/35 Processed dataset 44959: concrete_compressive_strength
(45730, 9)
(45730, 1)
 5/35 Processed dataset 44963: physiochemical_protein
(21263, 81)
(21263, 1)
 6/35 Processed dataset 44964: superconductivity
(1059, 116)
(1059, 1)
 7/35 Processed dataset 44965: geographical_origin_of_music
(1066, 10)
(1066, 1)
 8/35 Processed dataset 44966: solar_flare
(11934, 14)
(11934, 1)
 9/35 Processed dataset 44969: naval_propulsion_plant
(4898, 11)
(4898, 1)
 10/35 Processed dataset 44971: white_wine
(1599, 11)
(1599, 1)
 11/35 Processed dataset 44972: red_wine
(10000, 12)
(10000, 1)
 12/35 Processed dataset 44973: grid_stability
(68784, 18)
(68784, 1)
 13/35 Processed dataset 44974: video_transcoding
(72000, 48)
(72000, 1)
 14/35 Processed dataset 44975: wave_energy
(48933, 21)

Unnamed: 0_level_0,name,n_obs,n_features,%_unique_y,n_unique_y,has_categorical
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
41021,Moneyball,1232,15,0.303571,374,True
44956,abalone,4177,9,0.006703,28,True
44957,airfoil_self_noise,1503,6,0.968729,1456,False
44958,auction_verification,2043,8,0.998042,2039,True
44959,concrete_compressive_strength,1030,9,0.91068,938,False
44960,energy_efficiency,768,9,0.764323,587,False
44962,forest_fires,517,13,0.485493,251,True
44963,physiochemical_protein,45730,10,0.347759,15903,False
44964,superconductivity,21263,82,0.141419,3007,False
44965,geographical_origin_of_music,1059,117,0.029273,31,False


In [3]:
# Ids of the datasets without categorical features, as plain Python ints in
# ascending order.
dataset_ids_no_categorical = sorted(
    int(dataset_id)
    for dataset_id in df_metadata.query("has_categorical == False").index
)
len(dataset_ids_no_categorical)

20

# Run experiments

In [6]:
from openMLReg import run_all_openMLreg_with_model, evaluate_Ridge

# Run the Ridge evaluation on the first two non-categorical datasets only.
# NOTE(review): the third argument is presumably the pickle file the results
# are written to (a later cell reads "Ridge_OpenML_reg.pkl") -- confirm in
# openMLReg.
run_all_openMLreg_with_model(
    dataset_ids_no_categorical[:2],
    evaluate_Ridge,
    "Ridge_OpenML_reg.pkl",
)

[I 2024-11-23 22:04:19,764] A new study created in memory with name: no-name-bf239016-fb16-4249-befe-fa10fc797e2f
[I 2024-11-23 22:04:19,865] Trial 0 finished with value: 0.6950936555862427 and parameters: {'l2_reg': 0.047776288119608205}. Best is trial 0 with value: 0.6950936555862427.
[I 2024-11-23 22:04:19,870] Trial 1 finished with value: 0.6950941920280457 and parameters: {'l2_reg': 3.948907536710273e-07}. Best is trial 0 with value: 0.6950936555862427.
[I 2024-11-23 22:04:19,874] Trial 2 finished with value: 0.6950939655303955 and parameters: {'l2_reg': 0.020255126262903718}. Best is trial 0 with value: 0.6950936555862427.
[I 2024-11-23 22:04:19,879] Trial 3 finished with value: 0.6950942039489746 and parameters: {'l2_reg': 5.8662597716318906e-05}. Best is trial 0 with value: 0.6950936555862427.
[I 2024-11-23 22:04:19,884] Trial 4 finished with value: 0.6950942039489746 and parameters: {'l2_reg': 8.5384322515243e-05}. Best is trial 0 with value: 0.6950936555862427.
[I 2024-11-23 

 1/2 Processed dataset 44957


[I 2024-11-23 22:04:27,596] Trial 14 finished with value: 0.6031813979148865 and parameters: {'l2_reg': 0.07674594726851662}. Best is trial 14 with value: 0.6031813979148865.
[I 2024-11-23 22:04:27,606] Trial 15 finished with value: 0.6031850218772888 and parameters: {'l2_reg': 0.0039806119780028335}. Best is trial 14 with value: 0.6031813979148865.
[I 2024-11-23 22:04:27,619] Trial 16 finished with value: 0.6031852126121521 and parameters: {'l2_reg': 1.1764730015364378e-05}. Best is trial 14 with value: 0.6031813979148865.
[I 2024-11-23 22:04:27,631] Trial 17 finished with value: 0.6031805634498596 and parameters: {'l2_reg': 0.09571032396890651}. Best is trial 17 with value: 0.6031805634498596.
[I 2024-11-23 22:04:27,647] Trial 18 finished with value: 0.6031852126121521 and parameters: {'l2_reg': 1.259978144017256e-07}. Best is trial 17 with value: 0.6031805634498596.
[I 2024-11-23 22:04:27,657] Trial 19 finished with value: 0.6031847834587097 and parameters: {'l2_reg': 0.008415971404

 2/2 Processed dataset 44959


In [7]:
# Load the stored XGBoost results and show the test RMSE entries.
# In the recorded output each row is a dataset id and each cell a list of
# values (presumably one per fold -- confirm).
# NOTE(review): no visible cell in this notebook writes this pickle, so a fresh
# "Restart & Run All" only works if the file already exists on disk.
df_reg = pd.read_pickle("XGBoost_OpenML_reg.pkl")
df_reg["RMSE_test"]#.mean().sort_values()

Unnamed: 0,XGBoost
44957,"[0.20223756, 0.18210493, 0.23075196, 0.2390833..."
44959,"[0.2511126, 0.28547558, 0.27461478, 0.25445154..."


In [8]:
# Load the stored GRFBoost results and show the test RMSE entries.
# NOTE(review): this rebinds `df_reg`, and no visible cell in this notebook
# writes this pickle -- it must already exist on disk.
df_reg = pd.read_pickle("GRFBoost_OpenML_reg.pkl")
df_reg["RMSE_test"]#.mean().sort_values()

Unnamed: 0,GRFBoost
44957,"[0.42312362790107727, 0.38834860920906067, 0.3..."
44959,"[0.3569311201572418, 0.4054701328277588, 0.371..."


In [9]:
# Load the Ridge results (the file name passed to run_all_openMLreg_with_model
# earlier in this notebook) and show the test RMSE entries.
# NOTE(review): the recorded output labels the column "GRFBoost" although this
# is the Ridge file -- the model name looks mislabeled upstream; confirm in
# openMLReg.
df_reg = pd.read_pickle("Ridge_OpenML_reg.pkl")
df_reg["RMSE_test"]#.mean().sort_values()

Unnamed: 0,GRFBoost
44957,"[0.6714304685592651, 0.6768062710762024, 0.671..."
44959,"[0.5422600507736206, 0.6703803539276123, 0.613..."


In [10]:
# Load the stored RidgeCV results and show the test RMSE entries.
# NOTE(review): no visible cell in this notebook writes this pickle, and the
# recorded output labels the column "GRFBoost" rather than RidgeCV -- confirm
# where the model name is set.
df_reg = pd.read_pickle("RidgeCV_OpenML_reg.pkl")
df_reg["RMSE_test"]#.mean().sort_values()

Unnamed: 0,GRFBoost
44957,"[0.6714304089546204, 0.6768071055412292, 0.671..."
44959,"[0.5422599911689758, 0.6703803539276123, 0.613..."


In [None]:
# TODO change to fit and fit_transform
# TODO make code for classification
# TODO make all models for reg
# TODO ablation study: experiment with f(x_t, x_0), [f(x_t), x_0], and the default f(x_t)
# TODO decide on the results storage format: do I want a JSON / one big array indexed as results[rmse_test][model][dataset][fold]? That could work, minus the parameters.