In [1]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time
import collections
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor, tensor
import pandas as pd
import openml

#from aeon.regression.sklearn import RotationForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

from preprocessing.stream_transforms import normalize_mean_std_traindata, normalize_streams, augment_time, add_basepoint_zero
from utils.utils import print_name, print_shape
from models import ResNet, NeuralEulerODE, RidgeCVModule

np.set_printoptions(precision=3, threshold=5) # Print options



# OpenML code

In [2]:
# Fetch the OpenML benchmark suite with ID 353 and gather one metadata row per dataset.
collection = openml.study.get_suite(353)
dataset_ids = collection.data
metadata_list = []

# Fetch and process each dataset
for i, dataset_id in enumerate(dataset_ids):
    dataset = openml.datasets.get_dataset(dataset_id)
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        target=dataset.default_target_attribute
    )
    X = np.array(X)
    y = np.array(y)[..., None]

    # Determine if the dataset has categorical features
    has_categorical = any(categorical_indicator)

    # Count distinct target values once; it feeds both the absolute and relative columns.
    n_unique_y = len(np.unique(y))

    # Extract the required metadata
    metadata = {
        'dataset_id': dataset.id,
        'name': dataset.name,
        'n_obs': int(dataset.qualities['NumberOfInstances']),
        'n_features': int(dataset.qualities['NumberOfFeatures']),
        '%_unique_y': n_unique_y/len(y),
        'n_unique_y': n_unique_y,
        'has_categorical': has_categorical
    }

    metadata_list.append(metadata)
    print(f" {i+1}/{len(dataset_ids)} Processed dataset {dataset.id}: {dataset.name}")

# Create a DataFrame from the metadata list, most "continuous" targets first.
# (A second, un-assigned ascending sort used to follow here; its result was discarded,
# so it had no effect and has been removed.)
df_metadata = pd.DataFrame(metadata_list).sort_values('%_unique_y', ascending=False).set_index("dataset_id")

# Manual override: treat dataset 44962 as having categorical features.
# NOTE(review): the reason is not recorded in this notebook - presumably the OpenML
# categorical indicator misses a categorical column; confirm.
# Guarded so that a missing ID does not silently append an all-NaN row.
if 44962 in df_metadata.index:
    df_metadata.loc[44962, "has_categorical"] = True

# Display the metadata DataFrame
df_metadata

 1/35 Processed dataset 44956: abalone
 2/35 Processed dataset 44957: airfoil_self_noise
 3/35 Processed dataset 44958: auction_verification
 4/35 Processed dataset 44959: concrete_compressive_strength
 5/35 Processed dataset 44963: physiochemical_protein
 6/35 Processed dataset 44964: superconductivity
 7/35 Processed dataset 44965: geographical_origin_of_music
 8/35 Processed dataset 44966: solar_flare
 9/35 Processed dataset 44969: naval_propulsion_plant
 10/35 Processed dataset 44971: white_wine
 11/35 Processed dataset 44972: red_wine
 12/35 Processed dataset 44973: grid_stability
 13/35 Processed dataset 44974: video_transcoding
 14/35 Processed dataset 44975: wave_energy
 15/35 Processed dataset 44976: sarcos
 16/35 Processed dataset 44977: california_housing
 17/35 Processed dataset 44978: cpu_activity
 18/35 Processed dataset 44979: diamonds
 19/35 Processed dataset 44980: kin8nm
 20/35 Processed dataset 44981: pumadyn32nh
 21/35 Processed dataset 44983: miami_housing
 22/35 P

Unnamed: 0_level_0,name,n_obs,n_features,%_unique_y,n_unique_y,has_categorical
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
44973,grid_stability,10000,13,1.0,10000,False
44975,wave_energy,72000,49,0.999903,71993,False
44980,kin8nm,8192,9,0.999878,8191,False
44981,pumadyn32nh,8192,33,0.999878,8191,False
45402,space_ga,3107,7,0.999356,3105,False
44958,auction_verification,2043,8,0.998042,2039,True
44994,cars,804,18,0.992537,798,False
44957,airfoil_self_noise,1503,6,0.968729,1456,False
44970,QSAR_fish_toxicity,908,7,0.910793,827,False
44959,concrete_compressive_strength,1030,9,0.91068,938,False


# Download single dataset

In [3]:
def load_openml_dataset(dataset_id, 
                        normalize_X:bool = True,
                        normalize_y:bool = True,
                        train_test_size:float = 0.7,
                        split_seed:int = 0,
                        device="cpu",
                        ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
    """Download an OpenML dataset and return a float32 train/test split as tensors.

    Rows containing any missing value are dropped before splitting. The target
    column is the dataset's ``default_target_attribute``; all remaining columns
    are used as features.

    Args:
        dataset_id: OpenML dataset ID passed to ``openml.datasets.get_dataset``.
        normalize_X: If True, features are passed through
            ``normalize_mean_std_traindata(X_train, X_test)`` and then clipped
            to the interval [-3, 3].
        normalize_y: If True, targets are passed through
            ``normalize_mean_std_traindata(y_train, y_test)`` (no clipping).
        train_test_size: Forwarded as ``train_size`` to ``train_test_split``.
        split_seed: Forwarded as ``random_state`` to ``train_test_split``.
        device: Torch device on which the returned tensors are created.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as float32 tensors on ``device``.
        The targets keep a trailing singleton axis, i.e. shape ``(n, 1)``.
    """
    # Fetch dataset from OpenML by its ID
    dataset = openml.datasets.get_dataset(dataset_id)
    df, _, _, _ = dataset.get_data()

    # Non-mutating dropna: leaves the frame handed back by openml untouched.
    df = df.dropna()
    target_name = dataset.default_target_attribute
    y = np.array(df[target_name])[..., None]
    X = np.array(df.drop(columns=[target_name]))
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_test_size, random_state=split_seed)

    #normalize
    if normalize_X:
        # presumably standardises both splits with statistics of the train split - confirm in helper
        X_train, X_test = normalize_mean_std_traindata(X_train, X_test)
        X_train = np.clip(X_train, -3, 3)
        X_test = np.clip(X_test, -3, 3)
    if normalize_y:
        y_train, y_test = normalize_mean_std_traindata(y_train, y_test)

    def to_tensor(array):
        # Single place for the float32 cast and device placement of every split.
        return torch.tensor(array.astype(np.float32), requires_grad=False, device=device)

    return (to_tensor(X_train),
            to_tensor(X_test),
            to_tensor(y_train),
            to_tensor(y_test))

# Load one dataset without any normalisation for ad-hoc inspection.
# 44971 is white_wine; 44970 (QSAR_fish_toxicity) is another candidate ID to swap in.
dataset_id = 44971
X_train, X_test, y_train, y_test = load_openml_dataset(
    dataset_id,
    normalize_X=False,
    normalize_y=False,
)


# Plot Activations

In [4]:
# import torch
# import torch.nn as nn
# import matplotlib.pyplot as plt
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots

# def get_activation(name, activations):
#     def hook(model, input, output):
#         activations[name] = output.detach()
#     return hook


# def register_hooks(model, activations):
#     for name, layer in model.named_modules():
#         print(name)
#         if ".dense" not in name:
#             layer.register_forward_hook(get_activation(name, activations))



# def neuron_distribution_for_each_layer(X_train, y_train, X_test):
#     D = X_train.shape[1]
#     n_layers = 2
#     g1 = torch.Generator().manual_seed(0)
#     model = SampledEulerODE(g1, D, 10*D, n_layers, upsample_module='sampled', sampling_method='gradient')
#     #model = SampledResNet(g1, D, 10*D, 10*D, n_layers, upsample_module='sampled', sampling_method='gradient')
#     model.fit(X_train, y_train)

#     activations = {}
#     register_hooks(model, activations)
    
#     # Forward pass
#     model(X_test)
    
#     # Plot input data distribution
#     fig = make_subplots(rows=1, cols=1)
#     fig.add_trace(go.Histogram(x=X_train.flatten().cpu().numpy(), nbinsx=50, name='Train', histnorm='probability density', opacity=0.5))
#     fig.add_trace(go.Histogram(x=X_test.flatten().cpu().numpy(), nbinsx=50, name='Test', histnorm='probability density', opacity=0.5))
#     fig.update_layout(title_text='Input Data Distribution', xaxis_title='Input Feature Value', yaxis_title='Probability Density', barmode='overlay')
#     fig.show()

#     # Plot activations
#     for name, activation in activations.items():
#         fig = make_subplots(rows=1, cols=1)
#         fig.add_trace(go.Histogram(x=activation.flatten().cpu().numpy(), nbinsx=50, name='Activation', histnorm='probability density', opacity=0.5))
#         fig.update_layout(title_text=f'Activations at Layer: {name}', xaxis_title='Activation Value', yaxis_title='Probability Density', barmode='overlay')
#         fig.show()


# neuron_distribution_for_each_layer(X_train, y_train, X_test)

# Fit on a dataset

In [11]:
def run_allmodels_1dataset(
        generator: torch.Generator,
        X_train: Tensor,
        y_train: Tensor,
        X_test: Tensor,
        y_test: Tensor,
        ):
    """Fit every benchmark model on one train/test split and record RMSE and timings.

    The model zoo consists of a ``RidgeCVModule`` baseline, three block-free
    ``ResNet`` variants ("T=1 ...") and, for ``n_blocks`` in (2, 4), five
    residual ``ResNet`` variants that differ in their upsample / residual
    layer types and bottleneck width.

    Args:
        generator: Torch generator handed to every ``ResNet`` constructor.
        X_train, y_train: Training features and targets.
        X_test, y_test: Held-out features and targets.

    Returns:
        ``(model_names, results)``: two parallel lists. ``results[i]`` is a
        NumPy array ``[rmse_train, rmse_test, t_fit, t_test_forward]`` where
        ``t_fit`` covers construction plus ``fit`` and ``t_test_forward`` covers
        the forward pass on ``X_test`` (both in seconds).
    """
    D = X_train.shape[1]
    hidden_size = 128
    bottleneck_dim = 2*hidden_size

    def resnet_entry(label, **overrides):
        # Every ResNet variant shares these three arguments; only the overrides differ.
        kwargs = {"generator": generator, "in_dim": D, "hidden_size": hidden_size}
        kwargs.update(overrides)
        return (label, ResNet, kwargs)

    # (name, model class, kwargs). Models are instantiated lazily in the loop
    # below so that only one of them is alive at a time.
    specs = [
        ("RidgeCV", RidgeCVModule, {}),
        resnet_entry("T=1 Dense",
                     bottleneck_dim=None, n_blocks=0, upsample_layer="dense"),
        resnet_entry("T=1 SWIM Grad",
                     bottleneck_dim=None, n_blocks=0, upsample_layer="SWIM"),
        resnet_entry("T=1 SWIM Unif",
                     bottleneck_dim=None, n_blocks=0, upsample_layer="SWIM",
                     sampling_method="uniform"),
    ]

    for n_blocks in [2, 4]:
        depth_tag = f"T={n_blocks+1}"
        specs.extend([
            resnet_entry(f"{depth_tag} ResSWIM Grad-dense",
                         bottleneck_dim=bottleneck_dim, n_blocks=n_blocks,
                         upsample_layer="SWIM", res_layer1="SWIM", res_layer2="dense"),
            resnet_entry(f"{depth_tag} ResSWIM Grad-id",
                         bottleneck_dim=hidden_size, n_blocks=n_blocks,
                         upsample_layer="SWIM", res_layer1="SWIM", res_layer2="identity"),
            resnet_entry(f"{depth_tag} ResSWIM Grad-dense UPDENSE",
                         bottleneck_dim=bottleneck_dim, n_blocks=n_blocks,
                         upsample_layer="dense", res_layer1="SWIM", res_layer2="dense"),
            resnet_entry(f"{depth_tag} ResSWIM Grad-id UPDENSE",
                         bottleneck_dim=hidden_size, n_blocks=n_blocks,
                         upsample_layer="dense", res_layer1="SWIM", res_layer2="identity"),
            resnet_entry(f"{depth_tag} ResDense",
                         bottleneck_dim=hidden_size, n_blocks=n_blocks,
                         upsample_layer="dense", res_layer1="dense", res_layer2="identity"),
        ])

    model_names = []
    results = []
    for label, model_cls, kwargs in specs:
        with torch.no_grad():
            tic = time.perf_counter()
            fitted = model_cls(**kwargs).to(X_train.device)
            pred_train, _ = fitted.fit(X_train, y_train)
            after_fit = time.perf_counter()
            pred_test = fitted(X_test)
            after_predict = time.perf_counter()

            rmse_train = root_mean_squared_error(y_train.cpu(), pred_train.cpu())
            rmse_test = root_mean_squared_error(y_test.cpu(), pred_test.cpu())

            results.append(
                np.array([rmse_train, rmse_test, after_fit - tic, after_predict - after_fit])
            )
            model_names.append(label)

    return model_names, results



def run_all_experiments(
        dataset_ids: List,
        name_save: str = "PLACEHOLDER",
        device="cpu",
        split_seed: int = 0,
        model_seed: int = 999,
        ):
    """Benchmark all models on every dataset and persist the results table.

    For each dataset ID the data is loaded with ``load_openml_dataset`` (default
    normalisation), a fresh generator seeded with ``model_seed`` is created, and
    ``run_allmodels_1dataset`` is run on the split.

    Args:
        dataset_ids: OpenML dataset IDs to evaluate. Must be non-empty.
        name_save: Suffix of the output file ``OpenML_reg_{name_save}.pkl``,
            written to the current working directory.
        device: Torch device for the data tensors and the generator.
        split_seed: Seed of the train/test split (previously hard-coded to 0).
        model_seed: Seed of the per-dataset generator (previously hard-coded to 999).

    Returns:
        DataFrame with one row per dataset ID and two-level columns
        ``(attribute, model_name)``, where attribute is one of ``RMSE_train``,
        ``RMSE_test``, ``t_fit``, ``t_feat``; columns are sorted lexicographically.

    Raises:
        ValueError: If ``dataset_ids`` is empty. Previously this surfaced only
            at the end, as an opaque error from ``pd.concat``.
    """
    if len(dataset_ids) == 0:
        raise ValueError("dataset_ids must contain at least one OpenML dataset ID")

    # Fetch and process each dataset
    experiments = {}
    for i, dataset_id in enumerate(dataset_ids):
        X_train, X_test, y_train, y_test = load_openml_dataset(
            dataset_id, split_seed=split_seed, device=device)
        # Re-seed per dataset so results do not depend on the order of dataset_ids.
        generator = torch.Generator(device=device).manual_seed(model_seed)
        results = run_allmodels_1dataset(
            generator, X_train, y_train, X_test, y_test, 
            )
        experiments[dataset_id] = results
        print(f" {i+1}/{len(dataset_ids)} Processed dataset {dataset_id}")

    # Save results
    # experiments maps dataset ID -> (model_names, results); results[m] holds the
    # four attributes below, in this order, for model m.
    attributes = ["RMSE_train", "RMSE_test", "t_fit", "t_feat"]
    frames = []
    for dataset_id, (model_names, results) in experiments.items():
        # Tuple keys become a two-level column index (attribute, model_name).
        row = {
            (attribute, model_name): results[model_idx][attr_idx]
            for attr_idx, attribute in enumerate(attributes)
            for model_idx, model_name in enumerate(model_names)
        }
        frames.append(pd.DataFrame(row, index=[dataset_id]))

    # Combine all datasets into a single DataFrame
    df = pd.concat(frames)
    df = df.sort_index(axis=1)
    print(df)
    df.to_pickle(f"OpenML_reg_{name_save}.pkl")
    return df

In [12]:
# Benchmark only the datasets flagged as having no categorical features.
# IDs are cast to plain Python ints (the index yields NumPy integers).
dataset_ids_not_categorical = [
    int(idx) for idx in df_metadata.query("has_categorical == False").index
]
run_all_experiments(dataset_ids_not_categorical)

 1/20 Processed dataset 44973
 2/20 Processed dataset 44975
 3/20 Processed dataset 44980
 4/20 Processed dataset 44981
 5/20 Processed dataset 45402
 6/20 Processed dataset 44994
 7/20 Processed dataset 44957
 8/20 Processed dataset 44970
 9/20 Processed dataset 44959
 10/20 Processed dataset 44960
 11/20 Processed dataset 44963
 12/20 Processed dataset 44976
 13/20 Processed dataset 44977
 14/20 Processed dataset 44983
 15/20 Processed dataset 44964
 16/20 Processed dataset 44965
 17/20 Processed dataset 44978
 18/20 Processed dataset 44969
 19/20 Processed dataset 44972
 20/20 Processed dataset 44971
      RMSE_test                                                     \
        RidgeCV T=1 Dense T=1 SWIM Grad T=1 SWIM Unif T=3 ResDense   
44973  0.595158  0.474014      0.512567      0.500707     0.491567   
44975  0.006491  0.201343      0.012490      0.015949     0.270486   
44980  0.771311  0.563822      0.568452      0.620518     0.583474   
44981  0.904478  0.906585      0.903832

Unnamed: 0_level_0,RMSE_test,RMSE_test,RMSE_test,RMSE_test,RMSE_test,RMSE_test,RMSE_test,RMSE_test,RMSE_test,RMSE_test,...,t_fit,t_fit,t_fit,t_fit,t_fit,t_fit,t_fit,t_fit,t_fit,t_fit
Unnamed: 0_level_1,RidgeCV,T=1 Dense,T=1 SWIM Grad,T=1 SWIM Unif,T=3 ResDense,T=3 ResSWIM Grad-dense,T=3 ResSWIM Grad-dense UPDENSE,T=3 ResSWIM Grad-id,T=3 ResSWIM Grad-id UPDENSE,T=5 ResDense,...,T=3 ResDense,T=3 ResSWIM Grad-dense,T=3 ResSWIM Grad-dense UPDENSE,T=3 ResSWIM Grad-id,T=3 ResSWIM Grad-id UPDENSE,T=5 ResDense,T=5 ResSWIM Grad-dense,T=5 ResSWIM Grad-dense UPDENSE,T=5 ResSWIM Grad-id,T=5 ResSWIM Grad-id UPDENSE
44973,0.595158,0.474014,0.512567,0.500707,0.491567,0.476908,0.462984,0.467011,0.477703,0.5029,...,0.200511,0.462708,0.476597,0.364117,0.336238,0.227336,0.35689,0.308436,0.379102,0.29159
44975,0.006491,0.201343,0.01249,0.015949,0.270486,0.032823,0.196621,0.035932,0.223442,0.359795,...,2.179623,2.627786,3.403556,2.81609,2.445383,2.5095,3.825774,3.0092,3.09283,3.587232
44980,0.771311,0.563822,0.568452,0.620518,0.583474,0.589465,0.60736,0.52179,0.599301,0.672451,...,0.213039,0.274598,0.604412,0.246491,0.275195,0.301195,0.295563,0.274974,0.258757,0.360342
44981,0.904478,0.906585,0.903832,0.904018,0.901855,0.90405,0.906565,0.903489,0.906099,0.917915,...,0.223871,0.316449,0.332408,0.378942,0.297078,0.22807,0.445072,0.368996,0.310041,0.344226
45402,0.70669,0.614241,0.693941,0.727876,0.584934,0.684362,0.61519,0.692317,0.595049,0.564018,...,0.151128,0.170343,0.176476,0.169969,0.172686,0.103309,0.186669,0.202834,0.163451,0.224247
44994,0.296725,0.231846,0.257324,0.253231,0.235761,0.227032,0.254914,0.231036,0.249284,0.245621,...,0.081535,0.102016,0.08554,0.06936,0.115756,0.076137,0.112446,0.137002,0.076171,0.089838
44957,0.674484,0.42789,0.470428,0.514897,0.413744,0.406055,0.406067,0.363779,0.389107,0.426108,...,0.126595,0.162076,0.065339,0.06735,0.067704,0.084409,0.067421,0.142091,0.132049,0.178356
44970,0.666021,0.625791,0.618279,0.641247,0.629893,0.631583,0.626008,0.62856,0.631417,0.616787,...,0.075887,0.136166,0.138136,0.076495,0.068097,0.101963,0.1302,0.086043,0.161279,0.141356
44959,0.542088,0.394469,0.361735,0.453,0.404947,0.377535,0.371643,0.377106,0.339317,0.435315,...,0.061303,0.162756,0.060949,0.105737,0.068992,0.146916,0.14272,0.144615,0.105701,0.152805
44960,0.304327,0.216437,0.261001,0.267134,0.215401,0.160237,0.195793,0.261051,0.192583,0.235288,...,0.077196,0.156039,0.071945,0.052134,0.132817,0.079126,0.089108,0.129641,0.074231,0.159434


In [13]:
# Reload the results table written by run_all_experiments with its default name_save.
# (Pickle files should only be loaded from a trusted source; this one is produced above.)
df_reg = pd.read_pickle("OpenML_reg_PLACEHOLDER.pkl")
# Mean test RMSE of each model across datasets, best (lowest) first.
df_reg["RMSE_test"].mean(axis=0).sort_values(ascending=True)

T=3 ResSWIM Grad-id               0.484309
T=5 ResSWIM Grad-id               0.486952
T=3 ResSWIM Grad-dense            0.487929
T=5 ResSWIM Grad-dense            0.492101
T=5 ResSWIM Grad-id UPDENSE       0.496408
T=5 ResSWIM Grad-dense UPDENSE    0.497866
T=1 SWIM Grad                     0.498297
T=3 ResSWIM Grad-id UPDENSE       0.498767
T=3 ResSWIM Grad-dense UPDENSE    0.501172
T=1 Dense                         0.501478
T=3 ResDense                      0.505489
T=1 SWIM Unif                     0.522766
T=5 ResDense                      0.523704
RidgeCV                           0.579533
dtype: float64

In [14]:
# Rank the models within each dataset by test RMSE (1 = lowest), then average the ranks per model.
df_reg["RMSE_test"].rank(axis="columns").mean(axis="index").sort_values(ascending=True)

T=3 ResSWIM Grad-id                4.80
T=5 ResSWIM Grad-id                6.10
T=5 ResSWIM Grad-id UPDENSE        6.40
T=3 ResSWIM Grad-dense             6.55
T=5 ResSWIM Grad-dense UPDENSE     6.65
T=5 ResSWIM Grad-dense             6.90
T=1 SWIM Grad                      7.25
T=3 ResDense                       7.25
T=3 ResSWIM Grad-dense UPDENSE     7.25
T=3 ResSWIM Grad-id UPDENSE        7.45
T=1 Dense                          7.65
T=1 SWIM Unif                      9.30
T=5 ResDense                       9.35
RidgeCV                           12.10
dtype: float64

In [15]:
# Mean train RMSE of each model across datasets, lowest first.
df_reg["RMSE_train"].mean(axis=0).sort_values(ascending=True)

T=3 ResSWIM Grad-id               0.455198
T=5 ResSWIM Grad-id               0.459098
T=5 ResSWIM Grad-dense            0.459615
T=3 ResSWIM Grad-dense            0.461495
T=5 ResSWIM Grad-id UPDENSE       0.466103
T=3 ResSWIM Grad-id UPDENSE       0.467362
T=5 ResSWIM Grad-dense UPDENSE    0.468648
T=3 ResSWIM Grad-dense UPDENSE    0.469117
T=1 SWIM Grad                     0.472960
T=1 Dense                         0.473068
T=3 ResDense                      0.474911
T=5 ResDense                      0.491259
T=1 SWIM Unif                     0.496572
RidgeCV                           0.569632
dtype: float64

In [18]:
# Rank the models within each dataset by train RMSE (1 = lowest), then average the ranks per model.
df_reg["RMSE_train"].rank(axis="columns").mean(axis="index").sort_values(ascending=True)

T=5 ResSWIM Grad-id UPDENSE        5.55
T=3 ResSWIM Grad-id                5.65
T=3 ResSWIM Grad-id UPDENSE        6.15
T=3 ResSWIM Grad-dense UPDENSE     6.30
T=5 ResSWIM Grad-dense             6.30
T=5 ResSWIM Grad-id                6.80
T=5 ResSWIM Grad-dense UPDENSE     7.05
T=1 Dense                          7.15
T=3 ResDense                       7.15
T=1 SWIM Grad                      7.50
T=3 ResSWIM Grad-dense             7.75
T=5 ResDense                       9.05
T=1 SWIM Unif                      9.45
RidgeCV                           13.15
dtype: float64