In [1]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time
import collections
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor, tensor
import pandas as pd
import openml

#from aeon.regression.sklearn import RotationForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

from preprocessing.stream_transforms import normalize_mean_std_traindata, normalize_streams, augment_time, add_basepoint_zero
from utils.utils import print_name, print_shape
from models import ResNet, NeuralEulerODE, RidgeCVModule, E2EResNet

np.set_printoptions(precision=3, threshold=5) # Print options

# OpenML code

In [2]:
# Fetch the OpenML suite with ID 353 and collect one metadata record per dataset.
collection = openml.study.get_suite(353)
dataset_ids = collection.data
metadata_list = []

# Fetch and process each dataset
for i, dataset_id in enumerate(dataset_ids):
    dataset = openml.datasets.get_dataset(dataset_id)
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        target=dataset.default_target_attribute
    )
    X = np.array(X)
    y = np.array(y)[..., None]

    # Determine if the dataset has categorical features
    has_categorical = any(categorical_indicator)

    # Count the distinct target values once; both the absolute count and the
    # ratio below are derived from it.
    n_unique_y = len(np.unique(y))

    # Extract the required metadata
    metadata = {
        'dataset_id': dataset.id,
        'name': dataset.name,
        'n_obs': int(dataset.qualities['NumberOfInstances']),
        'n_features': int(dataset.qualities['NumberOfFeatures']),
        '%_unique_y': n_unique_y/len(y),
        'n_unique_y': n_unique_y,
        'has_categorical': has_categorical
    }

    metadata_list.append(metadata)
    print(f" {i+1}/{len(dataset_ids)} Processed dataset {dataset.id}: {dataset.name}")

# Create a DataFrame from the metadata list, indexed by dataset id and ordered
# from the most to the least "continuous" target (fraction of unique y values).
# (A second, ascending `sort_values` call used to follow here; its result was
# discarded, so it had no effect and has been removed.)
df_metadata = (
    pd.DataFrame(metadata_list)
    .sort_values('%_unique_y', ascending=False)
    .set_index("dataset_id")
)

# Manual override: flag dataset 44962 as categorical regardless of what the
# OpenML categorical indicator reported.
# NOTE(review): the reason for this override is not visible here — confirm.
df_metadata.loc[44962, "has_categorical"] = True

# Display the metadata DataFrame
df_metadata

 1/35 Processed dataset 44956: abalone
 2/35 Processed dataset 44957: airfoil_self_noise
 3/35 Processed dataset 44958: auction_verification
 4/35 Processed dataset 44959: concrete_compressive_strength
 5/35 Processed dataset 44963: physiochemical_protein
 6/35 Processed dataset 44964: superconductivity
 7/35 Processed dataset 44965: geographical_origin_of_music
 8/35 Processed dataset 44966: solar_flare
 9/35 Processed dataset 44969: naval_propulsion_plant
 10/35 Processed dataset 44971: white_wine
 11/35 Processed dataset 44972: red_wine
 12/35 Processed dataset 44973: grid_stability
 13/35 Processed dataset 44974: video_transcoding
 14/35 Processed dataset 44975: wave_energy
 15/35 Processed dataset 44976: sarcos
 16/35 Processed dataset 44977: california_housing
 17/35 Processed dataset 44978: cpu_activity
 18/35 Processed dataset 44979: diamonds
 19/35 Processed dataset 44980: kin8nm
 20/35 Processed dataset 44981: pumadyn32nh
 21/35 Processed dataset 44983: miami_housing
 22/35 P

Unnamed: 0_level_0,name,n_obs,n_features,%_unique_y,n_unique_y,has_categorical
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
44973,grid_stability,10000,13,1.0,10000,False
44975,wave_energy,72000,49,0.999903,71993,False
44980,kin8nm,8192,9,0.999878,8191,False
44981,pumadyn32nh,8192,33,0.999878,8191,False
45402,space_ga,3107,7,0.999356,3105,False
44958,auction_verification,2043,8,0.998042,2039,True
44994,cars,804,18,0.992537,798,False
44957,airfoil_self_noise,1503,6,0.968729,1456,False
44970,QSAR_fish_toxicity,908,7,0.910793,827,False
44959,concrete_compressive_strength,1030,9,0.91068,938,False


# Download single dataset

In [3]:
def load_openml_dataset(dataset_id, 
                        normalize_X:bool = True,
                        normalize_y:bool = True,
                        train_test_size:float = 0.7,
                        split_seed:int = 0,
                        device="cpu",
                        ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
    """Download an OpenML dataset and return a train/test split as float32 tensors.

    Rows containing any missing value are dropped before splitting. The column
    named by the dataset's default target attribute becomes ``y`` (with a
    trailing singleton axis added); all remaining columns become ``X``.

    Args:
        dataset_id: OpenML dataset identifier passed to ``openml.datasets.get_dataset``.
        normalize_X: If True, pass the features through
            ``normalize_mean_std_traindata`` and then clip them to ``[-3, 3]``.
        normalize_y: If True, pass the targets through
            ``normalize_mean_std_traindata`` (no clipping).
        train_test_size: Forwarded to ``train_test_split`` as ``train_size``.
        split_seed: Forwarded to ``train_test_split`` as ``random_state``.
        device: Device on which the returned tensors are created.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as float32 tensors on ``device``.
        Note the order: both ``X`` tensors come before both ``y`` tensors.
    """
    # Fetch dataset from OpenML by its ID; only the data frame itself is needed.
    dataset = openml.datasets.get_dataset(dataset_id)
    df, *_ = dataset.get_data()

    # Work on new frames instead of mutating the one handed back by OpenML.
    df = df.dropna()
    target = dataset.default_target_attribute
    y = np.array(df[target])[..., None]
    X = np.array(df.drop(columns=target))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_test_size, random_state=split_seed
    )

    # normalize
    # NOTE(review): normalize_mean_std_traindata presumably standardises both
    # splits with statistics of the training split — confirm against the helper.
    if normalize_X:
        X_train, X_test = normalize_mean_std_traindata(X_train, X_test)
        # Clip the normalised features to limit the influence of outliers.
        X_train = np.clip(X_train, -3, 3)
        X_test = np.clip(X_test, -3, 3)
    if normalize_y:
        y_train, y_test = normalize_mean_std_traindata(y_train, y_test)

    def _to_tensor(array) -> Tensor:
        # float32, no autograd tracking, created directly on the target device.
        return torch.tensor(array.astype(np.float32), requires_grad=False, device=device)

    return (_to_tensor(X_train),
            _to_tensor(X_test),
            _to_tensor(y_train),
            _to_tensor(y_test))

# Load a single dataset for interactive use, with both X and y left
# un-normalised. 44971 is white_wine; 44970 (QSAR_fish_toxicity) is an alternative.
dataset_id = 44971
X_train, X_test, y_train, y_test = load_openml_dataset(
    dataset_id,
    normalize_X=False,
    normalize_y=False,
)


# Plot Activations

In [4]:
# import torch
# import torch.nn as nn
# import matplotlib.pyplot as plt
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots

# def get_activation(name, activations):
#     def hook(model, input, output):
#         activations[name] = output.detach()
#     return hook


# def register_hooks(model, activations):
#     for name, layer in model.named_modules():
#         print(name)
#         if ".dense" not in name:
#             layer.register_forward_hook(get_activation(name, activations))



# def neuron_distribution_for_each_layer(X_train, y_train, X_test):
#     D = X_train.shape[1]
#     n_layers = 2
#     g1 = torch.Generator().manual_seed(0)
#     model = SampledEulerODE(g1, D, 10*D, n_layers, upsample_module='sampled', sampling_method='gradient')
#     #model = SampledResNet(g1, D, 10*D, 10*D, n_layers, upsample_module='sampled', sampling_method='gradient')
#     model.fit(X_train, y_train)

#     activations = {}
#     register_hooks(model, activations)
    
#     # Forward pass
#     model(X_test)
    
#     # Plot input data distribution
#     fig = make_subplots(rows=1, cols=1)
#     fig.add_trace(go.Histogram(x=X_train.flatten().cpu().numpy(), nbinsx=50, name='Train', histnorm='probability density', opacity=0.5))
#     fig.add_trace(go.Histogram(x=X_test.flatten().cpu().numpy(), nbinsx=50, name='Test', histnorm='probability density', opacity=0.5))
#     fig.update_layout(title_text='Input Data Distribution', xaxis_title='Input Feature Value', yaxis_title='Probability Density', barmode='overlay')
#     fig.show()

#     # Plot activations
#     for name, activation in activations.items():
#         fig = make_subplots(rows=1, cols=1)
#         fig.add_trace(go.Histogram(x=activation.flatten().cpu().numpy(), nbinsx=50, name='Activation', histnorm='probability density', opacity=0.5))
#         fig.update_layout(title_text=f'Activations at Layer: {name}', xaxis_title='Activation Value', yaxis_title='Probability Density', barmode='overlay')
#         fig.show()


# neuron_distribution_for_each_layer(X_train, y_train, X_test)

# Fit on a dataset

In [5]:
def run_allmodels_1dataset(
        generator: torch.Generator,
        X_train: Tensor,
        y_train: Tensor,
        X_test: Tensor,
        y_test: Tensor,
        ):
    """Fit every benchmark model on one train/test split and score it.

    Each model is constructed, moved to ``X_train.device``, fitted with
    ``model.fit(X_train, y_train)`` and then applied to ``X_test``. Models are
    built one at a time inside the loop, so only one of them is alive at once.

    Args:
        generator: Random generator handed to every model constructor except
            ``RidgeCVModule``. The same object is shared by all models.
        X_train: Training features; ``X_train.shape[1]`` is used as the input dimension.
        y_train: Training targets.
        X_test: Test features.
        y_test: Test targets.

    Returns:
        ``(model_names, results)``: two parallel lists. ``results[k]`` is a
        NumPy array ``[rmse_train, rmse_test, t_fit, t_predict]`` where
        ``t_fit`` covers construction plus fitting and ``t_predict`` covers the
        forward pass on ``X_test`` (both in seconds, via ``time.perf_counter``).
    """
    D = X_train.shape[1]
    hidden_size = 128
    bottleneck_dim = 2*hidden_size

    # Constructor arguments shared by every model that takes a generator.
    common = {"generator": generator, "in_dim": D, "hidden_size": hidden_size}

    # (name, model class, constructor kwargs). Classes are instantiated lazily
    # in the evaluation loop below rather than here, to save memory.
    model_list = [
        ("RidgeCV", RidgeCVModule, {}),

        ("T=3 End2End", E2EResNet,
                {**common,
                 "bottleneck_dim": bottleneck_dim,
                 "out_dim": 1,
                 "n_blocks": 2,
                 "activation": nn.Tanh(),
                 "loss": nn.MSELoss(),
                 "lr": 1e-3,
                 "epochs": 50,
                 "batch_size": 64,}
                ),

        ("T=1 Dense", ResNet,
                {**common,
                 "bottleneck_dim": None,
                 "n_blocks": 0,
                 "upsample_layer": "dense",}
                ),

        ("T=1 SWIM Grad", ResNet,
                {**common,
                 "bottleneck_dim": None,
                 "n_blocks": 0,
                 "upsample_layer": "SWIM",}
                ),
    ]

    # Residual variants; the displayed "T" is the number of blocks plus one.
    for n_blocks in [2]:
        model_list += [
            (f"T={n_blocks+1} ResSWIM Grad-dense", ResNet,
                    {**common,
                     "bottleneck_dim": bottleneck_dim,
                     "n_blocks": n_blocks,
                     "upsample_layer": "SWIM",
                     "res_layer1": "SWIM",
                     "res_layer2": "dense",}
                    ),

            (f"T={n_blocks+1} ResDense", ResNet,
                    {**common,
                     "bottleneck_dim": hidden_size,
                     "n_blocks": n_blocks,
                     "upsample_layer": "dense",
                     "res_layer1": "dense",
                     "res_layer2": "identity",}
                    ),
        ]

    results = []
    model_names = []
    for name, model_cls, model_args in model_list:
        t0 = time.perf_counter()
        model = model_cls(**model_args).to(X_train.device)
        pred_train, _ = model.fit(X_train, y_train)
        t1 = time.perf_counter()
        # Inference only: disabling autograd keeps graph construction out of
        # the timed forward pass and avoids holding activations in memory.
        with torch.no_grad():
            pred_test = model(X_test)
        t2 = time.perf_counter()

        rmse_train = root_mean_squared_error(y_train.cpu(), pred_train.detach().cpu())
        rmse_test = root_mean_squared_error(y_test.cpu(), pred_test.detach().cpu())

        results.append(np.array([rmse_train, rmse_test, t1-t0, t2-t1]))
        model_names.append(name)

    return model_names, results



def run_all_experiments(
        dataset_ids: List,
        name_save: str = "PLACEHOLDER",
        device="cpu",
        ):
    """Run the full model comparison on every dataset and persist the results.

    For each id, the dataset is loaded with ``load_openml_dataset`` (split
    seed 0), a fresh ``torch.Generator`` seeded with 999 is created on
    ``device``, and ``run_allmodels_1dataset`` is evaluated on the split.

    Args:
        dataset_ids: OpenML dataset ids to process, in order.
        name_save: Suffix of the output file ``OpenML_reg_{name_save}.pkl``.
        device: Device used for both the data tensors and the generator.

    Returns:
        DataFrame with one row per dataset id and two-level columns
        ``(attribute, model name)``, sorted along the column axis. The same
        frame is printed and pickled to the working directory.
    """
    # Evaluate all models on each dataset; keys are dataset ids and values are
    # the (model_names, results) pairs returned by run_allmodels_1dataset.
    experiments = {}
    for i, dataset_id in enumerate(dataset_ids):
        X_train, X_test, y_train, y_test = load_openml_dataset(
            dataset_id, split_seed=0, device=device
        )
        generator = torch.Generator(device=device).manual_seed(999)
        experiments[dataset_id] = run_allmodels_1dataset(
            generator, X_train, y_train, X_test, y_test,
        )
        print(f" {i+1}/{len(dataset_ids)} Processed dataset {dataset_id}")

    # Position k of each per-model result array holds attributes[k].
    attributes = ["RMSE_train", "RMSE_test", "t_fit", "t_feat"]

    # One single-row frame per dataset, columns keyed by (attribute, model).
    per_dataset_frames = []
    for dataset_key, (model_names, results) in experiments.items():
        row = {
            (attribute, model_name): results[model_idx][attr_idx]
            for attr_idx, attribute in enumerate(attributes)
            for model_idx, model_name in enumerate(model_names)
        }
        per_dataset_frames.append(pd.DataFrame(row, index=[dataset_key]))

    # Stack the rows, order the columns, then report and save.
    df = pd.concat(per_dataset_frames).sort_index(axis=1)
    print(df)
    df.to_pickle(f"OpenML_reg_{name_save}.pkl")
    return df

In [6]:
# Restrict the benchmark to datasets flagged as having no categorical
# features, as plain Python ints in ascending id order, then run everything.
dataset_ids_not_categorical = sorted(
    int(dataset_id)
    for dataset_id in df_metadata.query("has_categorical == False").index.values
)
run_all_experiments(dataset_ids_not_categorical)

100%|██████████| 50/50 [00:04<00:00, 11.41it/s]


 1/20 Processed dataset 44957


100%|██████████| 50/50 [00:02<00:00, 18.20it/s]


 2/20 Processed dataset 44959


100%|██████████| 50/50 [00:02<00:00, 21.10it/s]


 3/20 Processed dataset 44960


100%|██████████| 50/50 [01:49<00:00,  2.20s/it]


 4/20 Processed dataset 44963


100%|██████████| 50/50 [00:51<00:00,  1.03s/it]


 5/20 Processed dataset 44964


100%|██████████| 50/50 [00:02<00:00, 16.71it/s]


 6/20 Processed dataset 44965


100%|██████████| 50/50 [00:28<00:00,  1.73it/s]


 7/20 Processed dataset 44969


100%|██████████| 50/50 [00:02<00:00, 21.22it/s]


 8/20 Processed dataset 44970


100%|██████████| 50/50 [00:12<00:00,  4.09it/s]


 9/20 Processed dataset 44971


100%|██████████| 50/50 [00:04<00:00, 11.81it/s]


 10/20 Processed dataset 44972


100%|██████████| 50/50 [00:24<00:00,  2.06it/s]


 11/20 Processed dataset 44973


100%|██████████| 50/50 [05:46<00:00,  6.94s/it]


 12/20 Processed dataset 44975


100%|██████████| 50/50 [01:46<00:00,  2.14s/it]


 13/20 Processed dataset 44976


100%|██████████| 50/50 [00:45<00:00,  1.11it/s]


 14/20 Processed dataset 44977


100%|██████████| 50/50 [00:18<00:00,  2.76it/s]


 15/20 Processed dataset 44978


100%|██████████| 50/50 [00:18<00:00,  2.77it/s]


 16/20 Processed dataset 44980


100%|██████████| 50/50 [00:18<00:00,  2.75it/s]


 17/20 Processed dataset 44981


100%|██████████| 50/50 [00:30<00:00,  1.64it/s]


 18/20 Processed dataset 44983


100%|██████████| 50/50 [00:01<00:00, 25.03it/s]


 19/20 Processed dataset 44994


100%|██████████| 50/50 [00:06<00:00,  7.19it/s]


 20/20 Processed dataset 45402
      RMSE_test                                                   \
        RidgeCV T=1 Dense T=1 SWIM Grad T=3 End2End T=3 ResDense   
44957  0.674484  0.423936      0.469324    0.424904     0.401338   
44959  0.542088  0.423389      0.364107    0.381762     0.414525   
44960  0.304327  0.201810      0.275287    0.254899     0.214287   
44963  0.839762  0.773993      0.789532    0.604565     0.763296   
44964  0.517322  0.495828      0.486101    0.345597     0.501926   
44965  0.914663  0.914064      0.910656    0.921391     0.929333   
44969  0.413739  0.050529      0.126296    0.133620     0.033593   
44970  0.666021  0.614415      0.620453    0.683948     0.615853   
44971  0.878211  0.835470      0.834845    0.853791     0.840534   
44972  0.766536  0.761113      0.767613    0.864814     0.764720   
44973  0.595158  0.487190      0.484540    0.267764     0.475566   
44975  0.006491  0.216284      0.014534    0.035913     0.292201   
44976  0.294862  

Unnamed: 0_level_0,RMSE_test,RMSE_test,RMSE_test,RMSE_test,RMSE_test,RMSE_test,RMSE_train,RMSE_train,RMSE_train,RMSE_train,...,t_feat,t_feat,t_feat,t_feat,t_fit,t_fit,t_fit,t_fit,t_fit,t_fit
Unnamed: 0_level_1,RidgeCV,T=1 Dense,T=1 SWIM Grad,T=3 End2End,T=3 ResDense,T=3 ResSWIM Grad-dense,RidgeCV,T=1 Dense,T=1 SWIM Grad,T=3 End2End,...,T=1 SWIM Grad,T=3 End2End,T=3 ResDense,T=3 ResSWIM Grad-dense,RidgeCV,T=1 Dense,T=1 SWIM Grad,T=3 End2End,T=3 ResDense,T=3 ResSWIM Grad-dense
44957,0.674484,0.423936,0.469324,0.424904,0.401338,0.389297,0.68945,0.412017,0.470378,0.391112,...,0.031088,0.002582,0.084339,0.019083,0.005936,0.027224,0.195524,5.536774,0.149723,0.09577
44959,0.542088,0.423389,0.364107,0.381762,0.414525,0.354896,0.58769,0.335113,0.33499,0.246201,...,0.042549,0.001247,0.01437,0.013781,0.002968,0.039605,0.133835,2.771779,0.054091,0.115553
44960,0.304327,0.20181,0.275287,0.254899,0.214287,0.17378,0.287106,0.164308,0.246421,0.202584,...,0.013571,0.002254,0.014005,0.015949,0.002454,0.046707,0.085382,2.394616,0.15874,0.183719
44963,0.839762,0.773993,0.789532,0.604565,0.763296,0.782451,0.839653,0.772298,0.788402,0.551501,...,0.021221,0.058223,0.039541,0.120275,0.077115,1.18941,1.107643,109.906274,1.161835,1.459096
44964,0.517322,0.495828,0.486101,0.345597,0.501926,0.468239,0.515351,0.493338,0.478616,0.303082,...,0.01921,0.033119,0.113801,0.109575,0.288647,0.425387,0.509402,51.375105,0.427006,0.64408
44965,0.914663,0.914064,0.910656,0.921391,0.929333,0.894771,0.845714,0.832564,0.848319,0.245918,...,0.068898,0.00131,0.091033,0.011581,0.065435,0.042814,0.126533,3.009805,0.13977,0.119261
44969,0.413739,0.050529,0.126296,0.13362,0.033593,0.083507,0.405012,0.048124,0.118457,0.120107,...,0.075631,0.018597,0.114895,0.029966,0.072067,0.129905,0.231164,28.878969,0.255941,0.308822
44970,0.666021,0.614415,0.620453,0.683948,0.615853,0.63521,0.62382,0.564359,0.552093,0.433803,...,0.001498,0.001251,0.106585,0.103345,0.002586,0.033813,0.126513,2.367048,0.024337,0.134118
44971,0.878211,0.83547,0.834845,0.853791,0.840534,0.830825,0.838359,0.783191,0.782518,0.442274,...,0.057808,0.007245,0.015284,0.020628,0.026167,0.062776,0.207449,12.261402,0.185315,0.172785
44972,0.766536,0.761113,0.767613,0.864814,0.76472,0.783346,0.789445,0.73696,0.760508,0.423389,...,0.008418,0.002817,0.084442,0.017002,0.00317,0.018964,0.133125,4.25049,0.076688,0.191656


In [17]:
# Reload the saved results table (written by run_all_experiments with the
# default name) and show each model's mean test RMSE, best first.
df_reg = pd.read_pickle("OpenML_reg_PLACEHOLDER.pkl")
(
    df_reg["RMSE_test"]
    .mean()
    .sort_values()
)

T=3 End2End               0.445906
T=3 ResSWIM Grad-dense    0.480592
T=1 SWIM Grad             0.497886
T=1 Dense                 0.504190
T=3 ResDense              0.507483
RidgeCV                   0.579533
dtype: float64

In [19]:
# Rank the models within each dataset by test RMSE (1 = lowest error),
# then average those ranks over datasets; best average rank first.
(
    df_reg["RMSE_test"]
    .rank(axis=1)
    .mean()
    .sort_values()
)

T=3 ResSWIM Grad-dense    2.70
T=3 End2End               2.85
T=1 Dense                 3.25
T=1 SWIM Grad             3.35
T=3 ResDense              3.50
RidgeCV                   5.35
dtype: float64

In [22]:
# Per-dataset ranks of every model on test RMSE (1 = lowest error in the row).
(
    df_reg["RMSE_test"]
    .rank(axis=1)
)

Unnamed: 0,RidgeCV,T=1 Dense,T=1 SWIM Grad,T=3 End2End,T=3 ResDense,T=3 ResSWIM Grad-dense
44957,6.0,3.0,5.0,4.0,2.0,1.0
44959,6.0,5.0,2.0,3.0,4.0,1.0
44960,6.0,2.0,5.0,4.0,3.0,1.0
44963,6.0,3.0,5.0,1.0,2.0,4.0
44964,6.0,4.0,3.0,1.0,5.0,2.0
44965,4.0,3.0,2.0,5.0,6.0,1.0
44969,6.0,2.0,4.0,5.0,1.0,3.0
44970,5.0,1.0,3.0,6.0,2.0,4.0
44971,6.0,3.0,2.0,5.0,4.0,1.0
44972,3.0,1.0,4.0,6.0,2.0,5.0


In [20]:
# Mean training RMSE of each model across datasets, lowest first.
(
    df_reg["RMSE_train"]
    .mean()
    .sort_values()
)

T=3 End2End               0.288300
T=3 ResSWIM Grad-dense    0.452648
T=1 Dense                 0.475713
T=1 SWIM Grad             0.476500
T=3 ResDense              0.476554
RidgeCV                   0.569632
dtype: float64

In [21]:
# Rank the models within each dataset by training RMSE (1 = lowest error),
# then average those ranks over datasets; best average rank first.
(
    df_reg["RMSE_train"]
    .rank(axis=1)
    .mean()
    .sort_values()
)

T=3 End2End               1.70
T=3 ResSWIM Grad-dense    3.05
T=3 ResDense              3.35
T=1 Dense                 3.50
T=1 SWIM Grad             3.75
RidgeCV                   5.65
dtype: float64