In [1]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time
import collections
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor, tensor
import pandas as pd
import openml

#from aeon.regression.sklearn import RotationForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

from preprocessing.stream_transforms import normalize_mean_std_traindata, normalize_streams, augment_time, add_basepoint_zero
from utils.utils import print_name, print_shape
from models import ResNet, NeuralEulerODE, RidgeCVModule, E2EResNet, StagewiseRandFeatBoostRegression

# Compact NumPy printing: 3 decimal places; arrays with more than 5 elements are summarized.
np.set_printoptions(precision=3, threshold=5)

In [None]:
from models import FittableModule, create_layer, GradientRandFeatBoostRegression
from ridge_ALOOCV import fit_ridge_ALOOCV


# Smoke test: fit GradientRandFeatBoostRegression on random Gaussian inputs and targets.
N = 100  # number of samples (rows of X and y)
D = 50   # input feature dimension (columns of X); also passed as hidden_dim below
p = 30   # NOTE(review): not referenced anywhere else in this cell -- confirm whether it is still needed
d = 4    # target dimension (columns of y); passed as out_dim below
bottleneck_dim = 70

# A single seeded generator drives both the synthetic data and the model,
# so the cell gives the same result on every fresh run.
gen = torch.Generator().manual_seed(42)
X = torch.randn(N, D, generator=gen)
y = torch.randn(N, d, generator=gen)
model = GradientRandFeatBoostRegression(
        gen,
        hidden_dim = D,
        bottleneck_dim = bottleneck_dim,
        out_dim = d,
        n_layers = 5,
        upscale = "dense",
    )
# fit returns a pair; both values are discarded here, the call only checks that fitting runs.
_, _ = model.fit(X, y)

# OpenML code

In [7]:
# Fetch the OpenML benchmark suite with ID 353 and collect one metadata row per dataset.
collection = openml.study.get_suite(353)
dataset_ids = collection.data
metadata_list = []

# Fetch and process each dataset
for i, dataset_id in enumerate(dataset_ids):
    dataset = openml.datasets.get_dataset(dataset_id)
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        target=dataset.default_target_attribute
    )
    X = np.array(X)
    y = np.array(y)[..., None]  # add a trailing axis so y is a column

    # Determine if the dataset has categorical features
    has_categorical = any(categorical_indicator)

    # Count the distinct target values once; both target-related fields reuse it.
    n_unique_y = len(np.unique(y))

    # Extract the required metadata
    metadata = {
        'dataset_id': dataset.id,
        'name': dataset.name,
        'n_obs': int(dataset.qualities['NumberOfInstances']),
        'n_features': int(dataset.qualities['NumberOfFeatures']),
        '%_unique_y': n_unique_y / len(y),
        'n_unique_y': n_unique_y,
        'has_categorical': has_categorical
    }

    metadata_list.append(metadata)
    print(f" {i+1}/{len(dataset_ids)} Processed dataset {dataset.id}: {dataset.name}")

# Create a DataFrame from the metadata list, indexed and ordered by dataset ID.
# (An earlier sort by '%_unique_y' was overridden by sort_index, and a second
# sort_values call discarded its result, so neither affected the frame.)
df_metadata = pd.DataFrame(metadata_list).set_index("dataset_id").sort_index()

# Manual override: treat dataset 44962 as having categorical features.
# NOTE(review): the reason for this override is not visible here -- confirm.
# The membership check keeps .loc from silently appending a mostly-NaN row
# when the ID is not part of the suite.
if 44962 in df_metadata.index:
    df_metadata.loc[44962, "has_categorical"] = True

# Display the metadata DataFrame
df_metadata

 1/35 Processed dataset 44956: abalone
 2/35 Processed dataset 44957: airfoil_self_noise
 3/35 Processed dataset 44958: auction_verification
 4/35 Processed dataset 44959: concrete_compressive_strength
 5/35 Processed dataset 44963: physiochemical_protein
 6/35 Processed dataset 44964: superconductivity
 7/35 Processed dataset 44965: geographical_origin_of_music
 8/35 Processed dataset 44966: solar_flare
 9/35 Processed dataset 44969: naval_propulsion_plant
 10/35 Processed dataset 44971: white_wine
 11/35 Processed dataset 44972: red_wine
 12/35 Processed dataset 44973: grid_stability
 13/35 Processed dataset 44974: video_transcoding
 14/35 Processed dataset 44975: wave_energy
 15/35 Processed dataset 44976: sarcos
 16/35 Processed dataset 44977: california_housing
 17/35 Processed dataset 44978: cpu_activity
 18/35 Processed dataset 44979: diamonds
 19/35 Processed dataset 44980: kin8nm
 20/35 Processed dataset 44981: pumadyn32nh
 21/35 Processed dataset 44983: miami_housing
 22/35 P

Unnamed: 0_level_0,name,n_obs,n_features,%_unique_y,n_unique_y,has_categorical
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
41021,Moneyball,1232,15,0.303571,374,True
44956,abalone,4177,9,0.006703,28,True
44957,airfoil_self_noise,1503,6,0.968729,1456,False
44958,auction_verification,2043,8,0.998042,2039,True
44959,concrete_compressive_strength,1030,9,0.91068,938,False
44960,energy_efficiency,768,9,0.764323,587,False
44962,forest_fires,517,13,0.485493,251,True
44963,physiochemical_protein,45730,10,0.347759,15903,False
44964,superconductivity,21263,82,0.141419,3007,False
44965,geographical_origin_of_music,1059,117,0.029273,31,False


# Download single dataset

In [None]:
def load_openml_dataset(dataset_id, 
                        normalize_X:bool = True,
                        normalize_y:bool = True,
                        train_test_size:float = 0.7,
                        split_seed:int = 0,
                        device="cpu",
                        ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
    """Download an OpenML dataset and return a train/test split as float32 tensors.

    Rows containing missing values are dropped before splitting. The dataset's
    default target attribute becomes ``y`` (with a trailing axis of size 1) and
    all remaining columns become ``X``.

    Args:
        dataset_id: OpenML dataset ID passed to ``openml.datasets.get_dataset``.
        normalize_X: If True, run ``normalize_mean_std_traindata`` on the
            features (presumably standardizing with training-split statistics
            -- confirm in ``preprocessing.stream_transforms``) and then clip
            both splits to ``[-3, 3]``.
        normalize_y: If True, run ``normalize_mean_std_traindata`` on the
            targets. Targets are not clipped.
        train_test_size: Forwarded to ``train_test_split`` as ``train_size``.
        split_seed: Forwarded to ``train_test_split`` as ``random_state``.
        device: Device on which the returned tensors are created.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as ``torch.float32`` tensors on
        ``device``.
    """
    # Fetch dataset from OpenML by its ID
    dataset = openml.datasets.get_dataset(dataset_id)
    df, _, _, _ = dataset.get_data()
    # Drop incomplete rows on a copy so the frame returned by openml is not mutated.
    df = df.dropna()
    y = np.array(df.pop(dataset.default_target_attribute))[..., None]
    X = np.array(df)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_test_size, random_state=split_seed)

    #normalize
    if normalize_X:
        X_train, X_test = normalize_mean_std_traindata(X_train, X_test)
        # Clip normalized features to limit the influence of outliers.
        X_train = np.clip(X_train, -3, 3)
        X_test = np.clip(X_test, -3, 3)
    if normalize_y:
        y_train, y_test = normalize_mean_std_traindata(y_train, y_test)

    def _to_tensor(array: np.ndarray) -> Tensor:
        # Cast to float32 and build a tensor on the target device that does not track gradients.
        return torch.tensor(array.astype(np.float32), requires_grad=False, device=device)

    return (_to_tensor(X_train),
            _to_tensor(X_test),
            _to_tensor(y_train),
            _to_tensor(y_test))

# Load a single dataset with both feature and target normalization switched off.
# Replace the ID with the dataset you want (alternative noted by the author: 44970).
dataset_id = 44971
X_train, X_test, y_train, y_test = load_openml_dataset(
    dataset_id,
    normalize_X=False,
    normalize_y=False,
)


# Plot Activations

In [None]:
# import torch
# import torch.nn as nn
# import matplotlib.pyplot as plt
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots

# def get_activation(name, activations):
#     def hook(model, input, output):
#         activations[name] = output.detach()
#     return hook


# def register_hooks(model, activations):
#     for name, layer in model.named_modules():
#         print(name)
#         if ".dense" not in name:
#             layer.register_forward_hook(get_activation(name, activations))



# def neuron_distribution_for_each_layer(X_train, y_train, X_test):
#     D = X_train.shape[1]
#     n_layers = 2
#     g1 = torch.Generator().manual_seed(0)
#     model = SampledEulerODE(g1, D, 10*D, n_layers, upsample_module='sampled', sampling_method='gradient')
#     #model = SampledResNet(g1, D, 10*D, 10*D, n_layers, upsample_module='sampled', sampling_method='gradient')
#     model.fit(X_train, y_train)

#     activations = {}
#     register_hooks(model, activations)
    
#     # Forward pass
#     model(X_test)
    
#     # Plot input data distribution
#     fig = make_subplots(rows=1, cols=1)
#     fig.add_trace(go.Histogram(x=X_train.flatten().cpu().numpy(), nbinsx=50, name='Train', histnorm='probability density', opacity=0.5))
#     fig.add_trace(go.Histogram(x=X_test.flatten().cpu().numpy(), nbinsx=50, name='Test', histnorm='probability density', opacity=0.5))
#     fig.update_layout(title_text='Input Data Distribution', xaxis_title='Input Feature Value', yaxis_title='Probability Density', barmode='overlay')
#     fig.show()

#     # Plot activations
#     for name, activation in activations.items():
#         fig = make_subplots(rows=1, cols=1)
#         fig.add_trace(go.Histogram(x=activation.flatten().cpu().numpy(), nbinsx=50, name='Activation', histnorm='probability density', opacity=0.5))
#         fig.update_layout(title_text=f'Activations at Layer: {name}', xaxis_title='Activation Value', yaxis_title='Probability Density', barmode='overlay')
#         fig.show()


# neuron_distribution_for_each_layer(X_train, y_train, X_test)

# Fit on a dataset

In [None]:
def run_allmodels_1dataset(
        generator: torch.Generator,
        X_train: Tensor,
        y_train: Tensor,
        X_test: Tensor,
        y_test: Tensor,
        ):
    """Fit every benchmark model on one dataset and record errors and timings.

    Each entry of the internal model list is instantiated, moved to the device
    of ``X_train``, fitted on the training split and evaluated on the test
    split.

    Args:
        generator: Random generator handed to every model that accepts one.
            The same object is shared by all models (presumably each model
            draws from it, so results depend on the model order -- confirm).
        X_train: Training features; ``X_train.shape[1]`` is used as ``in_dim``.
        y_train: Training targets.
        X_test: Test features.
        y_test: Test targets.

    Returns:
        ``(model_names, results)`` where ``results[i]`` is a NumPy array
        ``[rmse_train, rmse_test, t_fit, t_predict]`` for ``model_names[i]``.
        ``t_fit`` includes model construction and the move to the device.
    """
    D = X_train.shape[1]
    hidden_size = 128
    bottleneck_dim = hidden_size

    def _sync_device() -> None:
        # CUDA kernels are launched asynchronously; wait for pending work so
        # the perf_counter readings bracket the actual computation.
        if X_train.is_cuda:
            torch.cuda.synchronize(X_train.device)

    # (name, model, kwargs). kwargs separate to save memory
    model_list = [
        ["RidgeCV", RidgeCVModule, {}],

        ["T=3 End2End", E2EResNet,
                {"generator": generator,
                "in_dim": D,
                "hidden_size": hidden_size,
                "bottleneck_dim": bottleneck_dim,
                "out_dim": 1,
                "n_blocks": 2,
                "activation": nn.ReLU(),
                "loss": nn.MSELoss(),
                "lr": 1e-3,
                "epochs": 50,
                "batch_size": 64,}
                ],

        ["T=1 Dense", ResNet,
                {"generator": generator,
                 "in_dim": D,
                 "hidden_size": hidden_size,
                 "bottleneck_dim": None,
                 "n_blocks": 0,
                 "upsample_layer": "dense",}
                 ],

        ["T=1 SWIM Grad", ResNet,
                {"generator": generator,
                "in_dim": D,
                "hidden_size": hidden_size,
                "bottleneck_dim": None,
                "n_blocks": 0,
                "upsample_layer": "SWIM",}
                ],

        ["T=1 SWIM Unif", ResNet,
                {"generator": generator,
                "in_dim": D,
                "hidden_size": hidden_size,
                "bottleneck_dim": None,
                "n_blocks": 0,
                "upsample_layer": "SWIM",
                "sampling_method": "uniform",}
                ],
    ]

    # Residual networks with 2 and 4 blocks; the "T=" label counts the upsample layer too.
    for n_blocks in [2, 4]:
        model_list += [
        [f"T={n_blocks+1} ResSWIM Grad-dense", ResNet,
                {"generator": generator,
                "in_dim": D,
                "hidden_size": hidden_size,
                "bottleneck_dim": bottleneck_dim,
                "n_blocks": n_blocks,
                "upsample_layer": "SWIM",
                "res_layer1": "SWIM",
                "res_layer2": "dense",}
                ],

        [f"T={n_blocks+1} ResSWIM Grad-id", ResNet,
                {"generator": generator,
                "in_dim": D,
                "hidden_size": hidden_size,
                "bottleneck_dim": hidden_size,
                "n_blocks": n_blocks,
                "upsample_layer": "SWIM",
                "res_layer1": "SWIM",
                "res_layer2": "identity",}
                ],

        [f"T={n_blocks+1} ResDense", ResNet,
                {"generator": generator,
                "in_dim": D,
                "hidden_size": hidden_size,
                "bottleneck_dim": hidden_size,
                "n_blocks": n_blocks,
                "upsample_layer": "dense",
                "res_layer1": "dense",
                "res_layer2": "identity",}
                ],
        ]

    # Boosted random-feature models for depths 0, 5, ..., 45.
    for n_layers in range(0, 50, 5):
        model_list += [
        [f"StagewiseRandFeatBoost_{n_layers}", StagewiseRandFeatBoostRegression,
                {"generator": generator,
                "hidden_dim": hidden_size,
                "bottleneck_dim": bottleneck_dim,
                "out_dim": 1,
                "n_layers": n_layers,
                "activation": nn.Tanh(),
                "l2_reg": 1,  #TODO experiment with much higher l2reg than 0.01
                "feature_type": "SWIM",
                "boost_lr": 1.0,
                "upscale": "dense",}
                ],

        [f"GradientRandFeatBoost_{n_layers}", GradientRandFeatBoostRegression,
                {"generator": generator,
                "hidden_dim": hidden_size,
                "bottleneck_dim": bottleneck_dim,
                "out_dim": 1,
                "n_layers": n_layers,
                "activation": nn.Tanh(),
                "l2_reg": 1,  #TODO experiment with much higher l2reg than 0.01
                "feature_type": "SWIM",
                "boost_lr": 1.0,
                "upscale": "dense",}
                ],
        ]

    results = []
    model_names = []
    for name, model, model_args in model_list:
        _sync_device()
        t0 = time.perf_counter()
        model = model(**model_args).to(X_train.device)
        pred_train, _ = model.fit(X_train, y_train)
        _sync_device()
        t1 = time.perf_counter()
        pred_test = model(X_test)
        _sync_device()
        t2 = time.perf_counter()
        rmse_train = root_mean_squared_error(y_train.cpu(), pred_train.cpu().detach()) 
        rmse_test = root_mean_squared_error(y_test.cpu(), pred_test.cpu().detach())

        result = np.array( [rmse_train, rmse_test, t1-t0, t2-t1] )
        results.append( result )
        model_names.append( name )

    return model_names, results



def run_all_experiments(
        dataset_ids: List,
        name_save: str = "PLACEHOLDER",
        device="cuda",
        ):
    """Benchmark all models on each dataset and save the results as a pickle.

    Every dataset is loaded with ``split_seed=0`` and evaluated with a fresh
    generator seeded with 999. The combined table has one row per dataset ID
    and two-level ``(metric, model_name)`` columns, sorted by column label.
    It is printed, written to ``OpenML_reg_{name_save}.pkl`` and returned.

    Args:
        dataset_ids: OpenML dataset IDs to process.
        name_save: Suffix used in the output pickle file name.
        device: Device used for the data tensors and the random generator.

    Returns:
        The combined results DataFrame.
    """
    # Order of the values inside each per-model result array.
    attributes = ["RMSE_train", "RMSE_test", "t_fit", "t_feat"]

    # Run every model on every dataset; values are (model_names, results) tuples.
    experiments = {}
    for i, dataset_id in enumerate(dataset_ids):
        X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id, split_seed=0, device=device)
        generator = torch.Generator(device=device).manual_seed(999)
        experiments[dataset_id] = run_allmodels_1dataset(
            generator, X_train, y_train, X_test, y_test,
        )
        print(f" {i+1}/{len(dataset_ids)} Processed dataset {dataset_id}")

    # One single-row frame per dataset, keyed by (metric, model_name).
    frames = []
    for dataset_key, (model_names, results) in experiments.items():
        row = {
            (attribute, model_name): results[model_idx][attr_idx]
            for attr_idx, attribute in enumerate(attributes)
            for model_idx, model_name in enumerate(model_names)
        }
        frames.append(pd.DataFrame(row, index=[dataset_key]))

    # Stack the rows and order the columns by (metric, model_name).
    df = pd.concat(frames).sort_index(axis=1)
    print(df)
    df.to_pickle(f"OpenML_reg_{name_save}.pkl")
    return df

In [None]:
# Keep only datasets without categorical features, as sorted plain Python ints.
dataset_ids_not_categorical = sorted(
    int(dataset_id)
    for dataset_id in df_metadata.query("has_categorical == False").index.values
)
# Benchmark the first three of them and save under the given name.
run_all_experiments(dataset_ids_not_categorical[:3], name_save="FIRSTBOOST128TestingGradient")

In [3]:
# Reload the results pickle written by run_all_experiments for this name_save
# and display the test RMSE of every model (columns) on every dataset (rows).
df_reg = pd.read_pickle("OpenML_reg_FIRSTBOOST128TestingGradient.pkl")
df_reg["RMSE_test"]#.mean().sort_values()

Unnamed: 0,GradientRandFeatBoost_0,GradientRandFeatBoost_10,GradientRandFeatBoost_15,GradientRandFeatBoost_20,GradientRandFeatBoost_25,GradientRandFeatBoost_30,GradientRandFeatBoost_35,GradientRandFeatBoost_40,GradientRandFeatBoost_45,GradientRandFeatBoost_5,...,T=1 Dense,T=1 SWIM Grad,T=1 SWIM Unif,T=3 End2End,T=3 ResDense,T=3 ResSWIM Grad-dense,T=3 ResSWIM Grad-id,T=5 ResDense,T=5 ResSWIM Grad-dense,T=5 ResSWIM Grad-id
44957,0.674485,0.437022,0.349125,0.417815,0.433439,0.374685,0.416047,0.396203,0.378156,0.418616,...,0.42468,0.476077,0.500399,0.344883,0.42956,0.347232,0.376446,0.421611,0.431505,0.373735
44959,0.542131,0.378737,0.36435,0.351177,0.35876,0.351232,0.37559,0.361841,0.368902,0.362432,...,0.377728,0.423298,0.44655,0.40213,0.415131,0.391737,0.379706,0.422805,0.354249,0.353861
44960,0.304226,0.223144,0.224409,0.170269,0.196235,0.161322,0.179944,0.160644,0.146851,0.235378,...,0.200248,0.259649,0.264216,0.261491,0.218697,0.235113,0.248284,0.22824,0.17428,0.235032


In [None]:
# Rank the models within each dataset by test RMSE (rank 1 = lowest error),
# average the ranks over datasets, and list the models from best to worst.
df_reg["RMSE_test"].rank(axis=1).mean().sort_values()

In [None]:
# Mean training RMSE of each model across datasets, sorted ascending.
df_reg["RMSE_train"].mean().sort_values()

In [None]:
# Average within-dataset rank of each model by training RMSE (rank 1 = lowest error), sorted ascending.
df_reg["RMSE_train"].rank(axis=1).mean().sort_values()

In [None]:
# Switch df_reg to a different saved run (presumably written by run_all_experiments
# with name_save="FIRSTBOOST128" -- confirm) and show the mean test RMSE per model, ascending.
df_reg = pd.read_pickle("OpenML_reg_FIRSTBOOST128.pkl")
df_reg["RMSE_test"].mean().sort_values()

In [None]:
# Average within-dataset rank of each model by test RMSE (rank 1 = lowest error), sorted ascending.
df_reg["RMSE_test"].rank(axis=1).mean().sort_values()

In [None]:
# Mean training RMSE of each model across datasets, sorted ascending.
df_reg["RMSE_train"].mean().sort_values()

In [None]:
# Average within-dataset rank of each model by training RMSE (rank 1 = lowest error), sorted ascending.
df_reg["RMSE_train"].rank(axis=1).mean().sort_values()

In [None]:
# Switch df_reg to the "FIRSTBOOST512" run (presumably hidden size 512 -- confirm)
# and show the mean test RMSE per model, ascending.
df_reg = pd.read_pickle("OpenML_reg_FIRSTBOOST512.pkl")    # BAD TEST PERFORMANCE DUE TO REGULARIZATION ??
df_reg["RMSE_test"].mean().sort_values()

In [None]:
# Average within-dataset rank of each model by test RMSE (rank 1 = lowest error), sorted ascending.
df_reg["RMSE_test"].rank(axis=1).mean().sort_values()

In [None]:
# Mean training RMSE of each model across datasets, sorted ascending.
df_reg["RMSE_train"].mean().sort_values()

In [None]:
# Average within-dataset rank of each model by training RMSE (rank 1 = lowest error), sorted ascending.
df_reg["RMSE_train"].rank(axis=1).mean().sort_values()

In [None]:
# Switch df_reg to the "FIRSTBOOST512lambda1" run (presumably hidden size 512 with l2_reg = 1 -- confirm)
# and show the mean test RMSE per model, ascending.
df_reg = pd.read_pickle("OpenML_reg_FIRSTBOOST512lambda1.pkl")
df_reg["RMSE_test"].mean().sort_values()

In [None]:
# Average within-dataset rank of each model by test RMSE (rank 1 = lowest error), sorted ascending.
df_reg["RMSE_test"].rank(axis=1).mean().sort_values()

In [None]:
# Mean training RMSE of each model across datasets, sorted ascending.
df_reg["RMSE_train"].mean().sort_values()

In [None]:
# Average within-dataset rank of each model by training RMSE (rank 1 = lowest error), sorted ascending.
df_reg["RMSE_train"].rank(axis=1).mean().sort_values()