In [1]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time
import collections
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

from tqdm import tqdm
import openml
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor
from torch.nn.functional import relu
from torch.nn.functional import tanh
import pandas as pd
import numpy as np

from aeon.regression.sklearn import RotationForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from aeon.datasets.tser_datasets import tser_soton
from aeon.datasets import load_regression, load_classification
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split

from preprocessing.stream_transforms import normalize_mean_std_traindata, normalize_streams, augment_time, add_basepoint_zero
from utils.utils import print_name, print_shape

np.set_printoptions(precision=3, threshold=5) # Print options



# OpenML code

In [2]:
# Fetch the OpenML benchmark suite with ID 353 and collect one metadata
# record per dataset (size, number of features, target cardinality, ...).
collection = openml.study.get_suite(353)
dataset_ids = collection.data
metadata_list = []

# Fetch and process each dataset
for i, dataset_id in enumerate(dataset_ids):
    dataset = openml.datasets.get_dataset(dataset_id)
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        target=dataset.default_target_attribute
    )
    X = np.array(X)
    y = np.array(y)[..., None]

    # Determine if the dataset has categorical features
    has_categorical = any(categorical_indicator)

    # Count the distinct target values once and reuse the result below
    n_unique_y = len(np.unique(y))

    # Extract the required metadata
    metadata = {
        'dataset_id': dataset.id,
        'name': dataset.name,
        'n_obs': int(dataset.qualities['NumberOfInstances']),
        'n_features': int(dataset.qualities['NumberOfFeatures']),
        '%_unique_y': n_unique_y/len(y),
        'n_unique_y': n_unique_y,
        'has_categorical': has_categorical
    }

    metadata_list.append(metadata)
    print(f" {i+1}/{len(dataset_ids)} Processed dataset {dataset.id}: {dataset.name}")

# Create a DataFrame from the metadata list, most "regression-like" targets
# (highest fraction of unique y values) first, indexed by dataset id.
df_metadata = pd.DataFrame(metadata_list).sort_values('%_unique_y', ascending=False).set_index("dataset_id")

# Manual override of the categorical flag for dataset 44962.
# NOTE: .loc assignment would create a new row if this id were not in the index.
df_metadata.loc[44962, "has_categorical"] = True

# Display the metadata DataFrame
df_metadata

 1/35 Processed dataset 44956: abalone
 2/35 Processed dataset 44957: airfoil_self_noise
 3/35 Processed dataset 44958: auction_verification
 4/35 Processed dataset 44959: concrete_compressive_strength
 5/35 Processed dataset 44963: physiochemical_protein
 6/35 Processed dataset 44964: superconductivity
 7/35 Processed dataset 44965: geographical_origin_of_music
 8/35 Processed dataset 44966: solar_flare
 9/35 Processed dataset 44969: naval_propulsion_plant
 10/35 Processed dataset 44971: white_wine
 11/35 Processed dataset 44972: red_wine
 12/35 Processed dataset 44973: grid_stability
 13/35 Processed dataset 44974: video_transcoding
 14/35 Processed dataset 44975: wave_energy
 15/35 Processed dataset 44976: sarcos
 16/35 Processed dataset 44977: california_housing
 17/35 Processed dataset 44978: cpu_activity
 18/35 Processed dataset 44979: diamonds
 19/35 Processed dataset 44980: kin8nm
 20/35 Processed dataset 44981: pumadyn32nh
 21/35 Processed dataset 44983: miami_housing
 22/35 P

Unnamed: 0_level_0,name,n_obs,n_features,%_unique_y,n_unique_y,has_categorical
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
44973,grid_stability,10000,13,1.0,10000,False
44975,wave_energy,72000,49,0.999903,71993,False
44980,kin8nm,8192,9,0.999878,8191,False
44981,pumadyn32nh,8192,33,0.999878,8191,False
45402,space_ga,3107,7,0.999356,3105,False
44958,auction_verification,2043,8,0.998042,2039,True
44994,cars,804,18,0.992537,798,False
44957,airfoil_self_noise,1503,6,0.968729,1456,False
44970,QSAR_fish_toxicity,908,7,0.910793,827,False
44959,concrete_compressive_strength,1030,9,0.91068,938,False


# Download single dataset

In [3]:
def load_openml_dataset(dataset_id, 
                        normalize_X:bool = True,
                        normalize_y:bool = True,
                        train_test_size:float = 0.7,
                        split_seed:int = 0,
                        device="cpu",
                        ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
    """Download an OpenML dataset and return a train/test split as float32 tensors.

    Rows containing missing values are dropped. The dataset's default target
    attribute becomes y (with a trailing axis of size 1), all remaining
    columns become X.

    Args:
        dataset_id: OpenML dataset ID.
        normalize_X (bool): If True, pass the features through
            normalize_mean_std_traindata and clip the result to [-3, 3].
        normalize_y (bool): If True, pass the targets through
            normalize_mean_std_traindata.
        train_test_size (float): Forwarded to train_test_split as train_size.
        split_seed (int): Forwarded to train_test_split as random_state.
        device: Device the returned tensors are moved to.

    Returns:
        Tuple (X_train, X_test, y_train, y_test) of float32 tensors.
    """
    # Fetch dataset from OpenML by its ID
    dataset = openml.datasets.get_dataset(dataset_id)
    df, _, _, _ = dataset.get_data()
    # Non-inplace dropna: do not mutate the frame handed out by openml
    df = df.dropna()
    y = np.array(df.pop(dataset.default_target_attribute))[..., None]
    X = np.array(df)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_test_size, random_state=split_seed)

    #normalize
    # NOTE(review): presumably the statistics come from the train split only
    # (helper name suggests so) -- confirm in preprocessing.stream_transforms.
    if normalize_X:
        X_train, X_test = normalize_mean_std_traindata(X_train, X_test)
        X_train = np.clip(X_train, -3, 3)
        X_test = np.clip(X_test, -3, 3)
    if normalize_y:
        y_train, y_test = normalize_mean_std_traindata(y_train, y_test)

    return (Tensor(X_train.astype(np.float32)).to(device), 
            Tensor(X_test.astype(np.float32)).to(device), 
            Tensor(y_train.astype(np.float32)).to(device), 
            Tensor(y_test.astype(np.float32)).to(device))

# OpenML dataset ID to load. 44971 is white_wine in the suite listing above;
# 44970 (QSAR_fish_toxicity) is the alternative noted by the author.
dataset_id = 44971  # alternative: 44970
# The two positional False arguments are normalize_X and normalize_y,
# i.e. the raw (unnormalized) features and targets are used below.
X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id, False, False)


# nn.Module for sampled networks

In [4]:
#################################################################
##### Base classes                                          #####
##### - FittableModule: A nn.Module with .fit(X, y) support #####
##### - ResNetBase: which iteratively calls .fit(X, y)      #####
#################################################################

class FittableModule(nn.Module):
    """An nn.Module that additionally exposes a ``fit(X, y)`` method."""

    def __init__(self):
        super().__init__()


    def fit(self, 
            X: Optional[Tensor] = None, 
            y: Optional[Tensor] = None,
        ) -> Tuple[Optional[Tensor], Optional[Tensor]]:
        """Fit this module on the previous layer's activations and the labels.

        The base implementation has nothing to fit: it only runs the
        forward pass on X and hands the labels through untouched.

        Args:
            X (Optional[Tensor]): Forward-propagated activations of training data, shape (N, d).
            y (Optional[Tensor]): Training labels, shape (N, p).

        Returns:
            The pair (f(X), y) of forwarded activations and labels.
        """
        forwarded = self(X)
        return forwarded, y



class ResNetBase(FittableModule):
    def __init__(self,
                upsample:FittableModule,
                blocks:List[FittableModule],
                output_layer:FittableModule,
                ):
        """Residual Network base class, with fit method for non-SGD training/initialization.

        Args:
            upsample (FittableModule): First stage, applied to the raw input.
            blocks (List[FittableModule]): Stages applied one after another
                to the output of ``upsample``.
            output_layer (FittableModule): Final stage producing the output.
        """
        super().__init__()
        self.upsample = upsample
        self.blocks = nn.ModuleList(blocks)
        self.output_layer = output_layer


    def fit(self, X:Tensor, y:Tensor):
        """Fit every stage in order, feeding each stage the previous stage's output.

        Args:
            X (Tensor): Training inputs, shape (N, d).
            y (Tensor): Training labels, shape (N, p).

        Returns:
            The (X, y) pair returned by the output layer's ``fit``.
        """
        stages = [self.upsample, *self.blocks, self.output_layer]
        for stage in stages:
            X, y = stage.fit(X, y)
        return X, y


    def forward(self, x:Tensor) -> Tensor:
        """Run x, shape (N, d), through upsample, all blocks and the output layer."""
        for stage in (self.upsample, *self.blocks, self.output_layer):
            x = stage(x)
        return x

In [5]:
##############################
######## Dense Layer ########
#############################


class Dense(FittableModule):
    def __init__(self,
                 generator: torch.Generator,
                 in_dim: int,
                 out_dim: int,
                 ):
        """Dense MLP layer with LeCun weight initialization,
        Gaussian bias initialization.

        Args:
            generator (torch.Generator): PRNG object used when (re)sampling
                the parameters in ``fit``.
            in_dim (int): Input dimension.
            out_dim (int): Output dimension.
        """
        super().__init__()
        self.generator = generator
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.dense = nn.Linear(in_dim, out_dim)

    def fit(self, X:Tensor, y:Tensor):
        """Resample weights and bias from zero-mean Gaussians, then forward X.

        The weight std is in_dim**-0.5 and the bias std is in_dim**-0.25.
        The labels y are not used and are returned unchanged.
        """
        weight_std = self.in_dim**-0.5
        bias_std = self.in_dim**-0.25
        with torch.no_grad():
            nn.init.normal_(self.dense.weight, mean=0, std=weight_std, generator=self.generator)
            nn.init.normal_(self.dense.bias, mean=0, std=bias_std, generator=self.generator)
            forwarded = self(X)
        return forwarded, y

    def forward(self, X):
        """Apply the affine map to X."""
        return self.dense(X)
    

class Identity(FittableModule):
    """Pass-through module: both ``fit`` and ``forward`` leave X untouched."""

    def __init__(self):
        super().__init__()

    def fit(self, X:Tensor, y:Tensor):
        """Nothing to fit; return the inputs as they are."""
        return X, y

    def forward(self, X):
        """Return X unchanged."""
        return X


# Smoke test for Dense: project the D input features down to 3 outputs.
D = X_train.shape[1]  # number of input features
g1 = torch.Generator().manual_seed(0)  # fixed seed -> same sampled weights on every run
net = Dense(g1, D, 3)
net.fit(X_train, y_train)  # samples the weights; y_train is passed through unused
out = net(X_test)
print_name(out)  # helper from utils.utils; see the output below for what it prints
print(net)

torch.Size([1470, 3]) out torch.float32
tensor([[-15.0697,  -1.0931,  29.7537],
        [-11.4016,  -7.3552,  30.6810],
        [ -5.4838,  -9.1904,  20.7735],
        ...,
        [  0.9532, -14.4864,  15.1146],
        [  0.7565, -25.6293,  27.8604],
        [-11.9715,  -7.1950,  33.1178]], grad_fn=<AddmmBackward0>) 

Dense(
  (dense): Linear(in_features=11, out_features=3, bias=True)
)


In [6]:
###############################
#### Pair Sampled Networks ####
###############################


class PairSampledLinear(FittableModule):
    def __init__(self,
                 generator: torch.Generator,
                 in_dim: int, 
                 out_dim: int,
                 sampling_method: Literal['uniform', 'gradient'] = 'gradient'
                 ):
        """Dense MLP layer with pair sampled weights.

        Args:
            generator (torch.Generator): PRNG object.
            in_dim (int): Input dimension.
            out_dim (int): Output dimension.
            sampling_method (str): Pair sampling method. Uniform or gradient-weighted.
        """
        super(PairSampledLinear, self).__init__()
        self.generator = generator
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.dense = nn.Linear(in_dim, out_dim)
        self.sampling_method = sampling_method


    def fit(self, 
            X: Tensor, 
            y: Tensor,
        ) -> Tuple[Tensor, Tensor]:
        """Given forward-propagated training data X at the previous 
        hidden layer, and supervised target labels y, fit the weights
        by letting rows of the weight matrix be given by pairs of
        samples from X. See paper for more details.

        5*N candidate pairs (x1, x2) are drawn, then ``out_dim`` of them are
        selected with replacement, either uniformly or with probability
        proportional to ||y2 - y1|| / ||x2 - x1||. If the gradient weights
        are degenerate (zero or non-finite sum, e.g. a constant target),
        sampling falls back to uniform instead of failing in
        ``torch.multinomial``.

        Args:
            X (Tensor): Forward-propagated activations of training data, shape (N, d).
            y (Tensor): Training labels, shape (N, p).
        
        Returns:
            Forwarded activations and labels [f(X), y].

        Raises:
            ValueError: If ``sampling_method`` is neither 'uniform' nor 'gradient'.
        """
        with torch.no_grad():
            N, _ = X.shape
            dtype = X.dtype
            device = X.device
            # Lower bound on pair distances, so near-duplicate points cannot blow up the weights
            EPS = torch.tensor(0.1, dtype=dtype, device=device)

            #obtain pair indices
            n = 5*N
            idx1 = torch.arange(0, n, dtype=torch.int32, device=device) % N
            # delta is in [1, N-1], so idx2 never equals idx1
            delta = torch.randint(1, N, size=(n,), dtype=torch.int32, device=device, generator=self.generator)
            idx2 = (idx1 + delta) % N
            dx = X[idx2] - X[idx1]
            dists = torch.linalg.norm(dx, dim=1, keepdim=True)
            dists = torch.maximum(dists, EPS)
            
            if self.sampling_method=="gradient":
                #calculate 'gradients'
                dy = y[idx2] - y[idx1]
                y_norm = torch.linalg.norm(dy, dim=1, keepdim=True) #NOTE 2023 paper uses ord=inf instead of ord=2
                grad = (y_norm / dists).reshape(-1) 
                total = grad.sum()
                if torch.isfinite(total) and total > 0:
                    p = grad/total
                else:
                    # All pair gradients are zero (or not finite): grad/total would be
                    # NaN and torch.multinomial would raise. Sample uniformly instead.
                    import warnings
                    warnings.warn(
                        "PairSampledLinear: degenerate gradient weights, "
                        "falling back to uniform pair sampling."
                    )
                    p = torch.ones(n, dtype=dtype, device=device) / n
            elif self.sampling_method=="uniform":
                p = torch.ones(n, dtype=dtype, device=device) / n
            else:
                raise ValueError(f"sampling_method must be 'uniform' or 'gradient'. Given: {self.sampling_method}")

            #sample pairs
            selected_idx = torch.multinomial(
                p,
                self.out_dim,
                replacement=True,
                generator=self.generator
                )
            idx1 = idx1[selected_idx]
            dx = dx[selected_idx]
            dists = dists[selected_idx]

            #define weights and biases
            # With w = dx/dist^2 and b = -<w, x1> - 0.5 the pre-activation is -0.5 at x1
            # and +0.5 at x2 (whenever the distance was not clamped to EPS).
            weights = dx / (dists**2)
            biases = -torch.einsum('ij,ij->i', weights, X[idx1]) - 0.5
            self.dense.weight.data = weights
            self.dense.bias.data = biases
            return self(X), y
    

    def forward(self, X):
        """Apply the sampled affine map to X."""
        return self.dense(X)
    
    
# Smoke test for PairSampledLinear with the default gradient-weighted sampling.
D = X_train.shape[1]  # number of input features
g1 = torch.Generator().manual_seed(0)  # fixed seed -> same sampled pairs on every run
net = PairSampledLinear(g1, D, 3)
net.fit(X_train, y_train)  # builds 3 weight rows from sampled training pairs
out = net(X_test)
print_name(out)  # helper from utils.utils; see the output below for what it prints
print(net)

torch.Size([1470, 3]) out torch.float32
tensor([[-2.1594, -0.4847,  2.5945],
        [ 1.8891, -0.1554,  1.1345],
        [ 0.9068,  0.9833, -2.2117],
        ...,
        [ 2.5997,  1.8578, -5.1088],
        [ 9.5946,  0.9112, -3.4865],
        [ 2.3178, -0.3220,  1.5852]], grad_fn=<AddmmBackward0>) 

PairSampledLinear(
  (dense): Linear(in_features=11, out_features=3, bias=True)
)


In [7]:
###################################
#### Sampled Bottleneck ResNet ####
###################################


class SampledResBlock(FittableModule):
    def __init__(self,
                 generator: torch.Generator,
                 hidden_dim: int, 
                 activation_dim: int,
                 activation: nn.Module = nn.Tanh(),
                 sampling_method: Literal['uniform', 'gradient'] = 'gradient'
                 ):
        """Residual block: pair-sampled layer, activation, then a dense layer
        mapping back to the hidden size. The non-residual branch is
        equivalent to a 1-hidden-layer Sampled Neural Network.

        Args:
            generator (torch.Generator): PRNG object.
            hidden_dim (int): Size of the block's input and output.
            activation_dim (int): Output size of the sampled layer, i.e. the
                width at which the activation is applied.
            activation (nn.Module): Activation function.
            sampling_method (str): Pair sampling method. Uniform or gradient-weighted.
        """
        super().__init__()
        self.generator = generator
        self.sampled_linear = PairSampledLinear(generator, hidden_dim, activation_dim, sampling_method)
        self.activation = activation
        self.upscale = Dense(generator, activation_dim, hidden_dim)
    

    def fit(self, X: Tensor, y: Tensor) -> Tuple[Tensor, Tensor]:
        """Fit the sampled layer, then the dense layer; return (X + branch(X), y)."""
        with torch.no_grad():
            hidden, y = self.sampled_linear.fit(X, y)
            branch, y = self.upscale.fit(self.activation(hidden), y)
            return X + branch, y

    
    def forward(self, X):
        """Compute X + upscale(activation(sampled_linear(X)))."""
        branch = self.upscale(self.activation(self.sampled_linear(X)))
        return X + branch
    
    
# Smoke test for SampledResBlock: hidden size D with a bottleneck of width 3.
D = X_train.shape[1]  # number of input features
g1 = torch.Generator().manual_seed(0)  # fixed seed -> reproducible block
net = SampledResBlock(g1, D, 3)
net.fit(X_train, y_train)
out = net(X_test)  # residual output, same width as the input
print_name(out)  # helper from utils.utils; see the output below for what it prints
print(net)

torch.Size([1470, 11]) out torch.float32
tensor([[ 6.1840, -2.1159,  0.0658,  ...,  3.1860,  1.2747,  9.8982],
        [ 5.8258, -0.8377, -1.6582,  ...,  3.4401,  0.3498,  9.0981],
        [ 8.0582, -0.1792, -0.7648,  ...,  3.9503, -0.9390, 11.5953],
        ...,
        [ 6.4635,  0.1345, -1.4293,  ...,  3.9751, -1.0798, 12.2668],
        [ 6.5396,  0.0611, -1.4934,  ...,  4.1369, -1.0826, 11.0263],
        [ 5.6176, -0.6594, -1.4017,  ...,  3.6212,  0.3895, 11.4360]],
       grad_fn=<AddBackward0>) 

SampledResBlock(
  (sampled_linear): PairSampledLinear(
    (dense): Linear(in_features=11, out_features=3, bias=True)
  )
  (activation): Tanh()
  (upscale): Dense(
    (dense): Linear(in_features=3, out_features=11, bias=True)
  )
)


In [8]:
#####################
### RidgeCV Layer ###
#####################

class RidgeCVModule(FittableModule):
    def __init__(self, alphas=np.logspace(-1, 3, 10)):
        """RidgeCV layer using sklearn's RidgeCV. TODO dont use sklearn

        Args:
            alphas: Candidate regularization strengths passed to RidgeCV.
        """
        super(RidgeCVModule, self).__init__()
        self.ridge = RidgeCV(alphas=alphas)

    def fit(self, X: Tensor, y: Tensor) -> Tuple[Tensor, Tensor]:
        """Fit the RidgeCV model. TODO dont use sklearn

        Args:
            X (Tensor): Training features, shape (N, d).
            y (Tensor): Training labels, shape (N, p) or (N,).

        Returns:
            Predictions on X (shape (N, p)) and the unchanged labels.
        """
        X_np = X.detach().cpu().numpy().astype(np.float64)
        y_np = y.detach().cpu().numpy().astype(np.float64)
        # Drop only a singleton *target* axis. A blanket squeeze() would also
        # collapse the sample axis when N == 1.
        if y_np.ndim == 2 and y_np.shape[1] == 1:
            y_np = y_np[:, 0]
        self.ridge.fit(X_np, y_np)
        return self(X), y

    def forward(self, X: Tensor) -> Tensor:
        """Forward pass through the RidgeCV model. TODO dont use sklearn

        Returns:
            Predictions with shape (N, p); single-target predictions are
            returned as a column, shape (N, 1).
        """
        X_np = X.detach().cpu().numpy().astype(np.float64)
        y_pred_np = self.ridge.predict(X_np)
        y_pred = torch.tensor(y_pred_np, dtype=X.dtype, device=X.device)
        # sklearn returns (N,) for a single target and (N, p) for several targets;
        # only the former needs the extra axis to match the (N, p) label layout.
        if y_pred.ndim == 1:
            y_pred = y_pred.unsqueeze(1)
        return y_pred


# Baseline: ridge regression with cross-validated alpha on the raw tabular features.
# (RidgeCVModule is deterministic and takes no input dimension, so neither a
# generator nor D is needed here.)
net = RidgeCVModule()
out_train, _ = net.fit(X_train, y_train)
out = net(X_test)
print(net)

# Convert to numpy before calling sklearn metrics, so that this also works for
# tensors that live on a non-CPU device.
print("rmse test", root_mean_squared_error(y_test.detach().cpu().numpy(), out.detach().cpu().numpy()))
print("rmse train", root_mean_squared_error(y_train.detach().cpu().numpy(), out_train.detach().cpu().numpy()))

RidgeCVModule()
rmse test 0.7801164
rmse train 0.7448608


In [9]:
class SampledResNet(ResNetBase):
    def __init__(self,
                 generator: torch.Generator,
                 in_dim: int,
                 hidden_dim: int,
                 activation_dim: int, #rename to bottleneck dim?
                 n_blocks: int,
                 activation: nn.Module = nn.Tanh(),
                 upsample_module: Literal['dense', 'sampled', 'identity'] = 'dense',
                 sampling_method: Literal['uniform', 'gradient'] = 'gradient'
                 ):
        """A ResNet with sampled layers as bottleneck layers.

        Args:
            generator (torch.Generator): PRNG object shared by all sampled layers.
            in_dim (int): Input dimension.
            hidden_dim (int): Width of the residual stream.
            activation_dim (int): Width inside each residual block.
            n_blocks (int): Number of SampledResBlock stages.
            activation (nn.Module): Activation used inside the blocks.
            upsample_module (str): First layer type: 'dense', 'sampled' or 'identity'.
            sampling_method (str): Pair sampling method. Uniform or gradient-weighted.

        Raises:
            ValueError: If ``upsample_module`` is not one of the three options.
        """
        # Input layer mapping in_dim -> hidden_dim (no activation is applied here)
        if upsample_module=="identity":
            upsample = Identity()
        elif upsample_module=="sampled":
            upsample = PairSampledLinear(generator, in_dim, hidden_dim, sampling_method)
        elif upsample_module=="dense":
            upsample = Dense(generator, in_dim, hidden_dim)
        else:
            raise ValueError(f"upsample_module must be 'dense', 'sampled' or 'identity'. Given: {upsample_module}")

        # Residual stages, followed by a ridge regression read-out
        blocks = []
        for _ in range(n_blocks):
            blocks.append(
                SampledResBlock(generator, hidden_dim, activation_dim, activation, sampling_method)
            )
        ridge = RidgeCVModule()
        super().__init__(upsample, blocks, ridge)


D = X_train.shape[1]
# Fixed seed instead of a wall-clock seed, so that Restart & Run All
# reproduces the numbers printed below. Change the seed to see other draws.
g1 = torch.Generator().manual_seed(0)
net = SampledResNet(g1, D, 100*D, 100*D, 6, upsample_module='sampled', sampling_method='uniform')
out_train, _ = net.fit(X_train, y_train)
out = net(X_test)
print(net)

print("rmse test", root_mean_squared_error(y_test.detach().cpu().numpy(), out.detach().cpu().numpy()))
print("rmse train", root_mean_squared_error(y_train.detach().cpu().numpy(), out_train.detach().cpu().numpy()))
# Regularization strength selected by RidgeCV in the output layer
print(net.output_layer.ridge.alpha_)

SampledResNet(
  (upsample): PairSampledLinear(
    (dense): Linear(in_features=11, out_features=1100, bias=True)
  )
  (blocks): ModuleList(
    (0-5): 6 x SampledResBlock(
      (sampled_linear): PairSampledLinear(
        (dense): Linear(in_features=1100, out_features=1100, bias=True)
      )
      (activation): Tanh()
      (upscale): Dense(
        (dense): Linear(in_features=1100, out_features=1100, bias=True)
      )
    )
  )
  (output_layer): RidgeCVModule()
)
rmse test 0.77934337
rmse train 0.7257692
16.68100537200059


In [10]:
class SampledAndActivation(FittableModule):
    def __init__(self,
                 generator: torch.Generator,
                 in_dim: int,
                 out_dim: int, 
                 activation: nn.Module = nn.Tanh(),
                 sampling_method: Literal['uniform', 'gradient'] = 'gradient'
                 ):
        """A pair-sampled linear layer followed by an activation.

        Args:
            generator (torch.Generator): PRNG object.
            in_dim (int): Input dimension.
            out_dim (int): Output dimension.
            activation (nn.Module): Activation function.
            sampling_method (str): Pair sampling method. Uniform or gradient-weighted.
        """
        super().__init__()
        self.generator = generator
        self.sampled_linear = PairSampledLinear(generator, in_dim, out_dim, sampling_method)
        self.activation = activation
    

    def fit(self, X: Tensor, y: Tensor) -> Tuple[Tensor, Tensor]:
        """Fit the sampled layer and return (activation(layer(X)), y)."""
        with torch.no_grad():
            pre_activation, y = self.sampled_linear.fit(X, y)
            return self.activation(pre_activation), y

    
    def forward(self, X):
        """Compute activation(sampled_linear(X))."""
        return self.activation(self.sampled_linear(X))

# Smoke test for SampledAndActivation: map the D input features to 2*D activations.
D = X_train.shape[1]  # number of input features
g1 = torch.Generator().manual_seed(0)  # fixed seed -> reproducible layer
net = SampledAndActivation(g1, D, 2*D)
net.fit(X_train, y_train)
out = net(X_test)
print_name(out)  # helper from utils.utils; see the output below for what it prints
print(net)


torch.Size([1470, 22]) out torch.float32
tensor([[-0.9737, -0.4500,  0.9889,  ..., -1.0000,  0.8248,  0.8880],
        [ 0.9553, -0.1542,  0.8126,  ..., -1.0000,  0.9140,  0.9160],
        [ 0.7196,  0.7545, -0.9763,  ..., -0.9693, -0.4671, -0.3324],
        ...,
        [ 0.9890,  0.9525, -0.9999,  ...,  0.1547, -0.9100, -0.8899],
        [ 1.0000,  0.7217, -0.9981,  ..., -1.0000,  0.9407,  0.7410],
        [ 0.9808, -0.3113,  0.9194,  ..., -1.0000,  0.9560,  0.9517]],
       grad_fn=<TanhBackward0>) 

SampledAndActivation(
  (sampled_linear): PairSampledLinear(
    (dense): Linear(in_features=11, out_features=22, bias=True)
  )
  (activation): Tanh()
)


In [11]:
class SampledODEBlock(FittableModule):
    def __init__(self,
                 generator: torch.Generator,
                 hidden_dim: int, 
                 activation: nn.Module = nn.Tanh(),
                 sampling_method: Literal['uniform', 'gradient'] = 'gradient'
                 ):
        """Residual block x -> x + activation(sampled_linear(x)), where the
        sampled layer keeps the hidden size.

        Args:
            generator (torch.Generator): PRNG object.
            hidden_dim (int): Hidden size.
            activation (nn.Module): Activation function.
            sampling_method (str): Pair sampling method. Uniform or gradient-weighted.
        """
        super().__init__()
        self.generator = generator
        self.sampled_linear = PairSampledLinear(generator, hidden_dim, hidden_dim, sampling_method)
        self.activation = activation
    

    def fit(self, X: Tensor, y: Tensor) -> Tuple[Tensor, Tensor]:
        """Fit the sampled layer on (X, y) and return (X + activation(layer(X)), y)."""
        with torch.no_grad():
            pre_activation, y = self.sampled_linear.fit(X, y)
            return X + self.activation(pre_activation), y

    
    def forward(self, X):
        """Compute X + activation(sampled_linear(X))."""
        update = self.activation(self.sampled_linear(X))
        return X + update
    
    
# Smoke test for SampledODEBlock: the block keeps the width, so the hidden size is D.
D = X_train.shape[1]  # number of input features
g1 = torch.Generator().manual_seed(0)  # fixed seed -> reproducible block
net = SampledODEBlock(g1, D)
net.fit(X_train, y_train)
out = net(X_test)
print_name(out)  # helper from utils.utils; see the output below for what it prints
print(net)

torch.Size([1470, 11]) out torch.float32
tensor([[ 6.3263, -0.1300,  1.3389,  ...,  4.2321,  1.4003, 10.2280],
        [ 7.9553,  0.1558,  1.0726,  ...,  4.1209,  1.3236,  9.2552],
        [ 8.3196,  0.8945, -0.2363,  ...,  2.0734,  1.1889, 11.4235],
        ...,
        [ 6.9890,  1.1525, -0.7399,  ...,  2.1400,  0.7127, 12.4013],
        [ 7.1000,  0.9517, -0.7281,  ...,  2.7937, -0.3981,  9.8153],
        [ 7.8808,  0.2687,  1.4994,  ...,  4.2782,  1.3030, 11.5181]],
       grad_fn=<AddBackward0>) 

SampledODEBlock(
  (sampled_linear): PairSampledLinear(
    (dense): Linear(in_features=11, out_features=11, bias=True)
  )
  (activation): Tanh()
)


In [12]:
class SampledEulerODE(ResNetBase):
    def __init__(self,
                 generator: torch.Generator,
                 in_dim: int,
                 hidden_dim: int,
                 n_blocks: int,
                 activation: nn.Module = nn.Tanh(),
                 upsample_module: Literal['dense', 'sampled', 'identity'] = 'dense',
                 sampling_method: Literal['uniform', 'gradient'] = 'gradient'
                 ):
        """A ResNet built from an upsample layer, ``n_blocks`` SampledODEBlock
        stages of constant width, and a RidgeCV output layer.

        Args:
            generator (torch.Generator): PRNG object shared by all sampled layers.
            in_dim (int): Input dimension.
            hidden_dim (int): Width of the residual stream.
            n_blocks (int): Number of SampledODEBlock stages.
            activation (nn.Module): Activation function.
            upsample_module (str): First layer type: 'dense', 'sampled' or 'identity'.
            sampling_method (str): Pair sampling method. Uniform or gradient-weighted.

        Raises:
            ValueError: If ``upsample_module`` is not one of the three options.
        """
        # Input layer; only the 'sampled' variant applies the activation
        if upsample_module=="identity":
            upsample = Identity()
        elif upsample_module=="sampled":
            upsample = SampledAndActivation(generator, in_dim, hidden_dim, activation, sampling_method)
        elif upsample_module=="dense":
            upsample = Dense(generator, in_dim, hidden_dim)
        else:
            raise ValueError(f"upsample_module must be 'dense', 'sampled' or 'identity'. Given: {upsample_module}")
        
        # Residual stages, followed by a ridge regression read-out
        blocks = []
        for _ in range(n_blocks):
            blocks.append(SampledODEBlock(generator, hidden_dim, activation, sampling_method))
        ridge = RidgeCVModule()
        super().__init__(upsample, blocks, ridge)


D = X_train.shape[1]
# Fixed seed instead of a wall-clock seed, so that Restart & Run All
# reproduces the numbers printed below. Change the seed to see other draws.
g1 = torch.Generator().manual_seed(0)
net = SampledEulerODE(g1, D, 100*D, 6, upsample_module='sampled', sampling_method='gradient')
out_train, _ = net.fit(X_train, y_train)
out = net(X_test)
print(net)

print("rmse test", root_mean_squared_error(y_test.detach().cpu().numpy(), out.detach().cpu().numpy()))
print("rmse train", root_mean_squared_error(y_train.detach().cpu().numpy(), out_train.detach().cpu().numpy()))
# Regularization strength selected by RidgeCV in the output layer
print(net.output_layer.ridge.alpha_)

SampledEulerODE(
  (upsample): SampledAndActivation(
    (sampled_linear): PairSampledLinear(
      (dense): Linear(in_features=11, out_features=1100, bias=True)
    )
    (activation): Tanh()
  )
  (blocks): ModuleList(
    (0-5): 6 x SampledODEBlock(
      (sampled_linear): PairSampledLinear(
        (dense): Linear(in_features=1100, out_features=1100, bias=True)
      )
      (activation): Tanh()
    )
  )
  (output_layer): RidgeCVModule()
)
rmse test 0.79153246
rmse train 0.6922127
2.1544346900318834


In [16]:
#############################
### Random Feature ResNet ###
#############################
class LinearAndActivation(FittableModule):
    def __init__(self,
                 generator: torch.Generator,
                 in_dim: int,
                 out_dim: int, 
                 activation: nn.Module = nn.Tanh(),
                 ):
        """A randomly initialized Dense layer followed by an activation.

        Args:
            generator (torch.Generator): PRNG object.
            in_dim (int): Input dimension.
            out_dim (int): Output dimension.
            activation (nn.Module): Activation function.
        """
        super().__init__()
        self.generator = generator
        self.linear = Dense(generator, in_dim, out_dim)
        self.activation = activation
    

    def fit(self, X: Tensor, y: Tensor) -> Tuple[Tensor, Tensor]:
        """Fit the Dense layer and return (activation(linear(X)), y)."""
        with torch.no_grad():
            pre_activation, y = self.linear.fit(X, y)
            return self.activation(pre_activation), y

    
    def forward(self, X):
        """Compute activation(linear(X))."""
        return self.activation(self.linear(X))
    

class RandomFeatBlock(FittableModule):
    def __init__(self,
                 generator: torch.Generator,
                 hidden_dim: int, 
                 activation: nn.Module = nn.Tanh(),
                 ):
        """Residual block x -> x + activation(linear(x)) with a randomly
        initialized Dense layer that keeps the hidden size.

        Args:
            generator (torch.Generator): PRNG object.
            hidden_dim (int): Hidden size.
            activation (nn.Module): Activation function.
        """
        super().__init__()
        self.generator = generator
        self.linear = Dense(generator, hidden_dim, hidden_dim)
        self.activation = activation
    

    def fit(self, X: Tensor, y: Tensor) -> Tuple[Tensor, Tensor]:
        """Fit the Dense layer and return (X + activation(linear(X)), y)."""
        with torch.no_grad():
            pre_activation, y = self.linear.fit(X, y)
            return X + self.activation(pre_activation), y

    
    def forward(self, X):
        """Compute X + activation(linear(X))."""
        update = self.activation(self.linear(X))
        return X + update


class RandomFeatureODE(ResNetBase):
    def __init__(self,
                 generator: torch.Generator,
                 in_dim: int,
                 hidden_dim: int,
                 n_blocks: int,
                 activation: nn.Module = nn.Tanh(),
                 ):
        """A ResNet random feature MLP.

        Args:
            generator (torch.Generator): PRNG object shared by all layers.
            in_dim (int): Input dimension.
            hidden_dim (int): Width of the residual stream.
            n_blocks (int): Number of RandomFeatBlock stages.
            activation (nn.Module): Activation function.
        """
        # Input layer, then the residual stages, then a ridge regression read-out
        upsample = LinearAndActivation(generator, in_dim, hidden_dim, activation)
        blocks = []
        for _ in range(n_blocks):
            blocks.append(RandomFeatBlock(generator, hidden_dim, activation))
        ridge = RidgeCVModule()
        super().__init__(upsample, blocks, ridge)


D = X_train.shape[1]
# Fixed seed instead of a wall-clock seed, so that Restart & Run All
# reproduces the numbers printed below. Change the seed to see other draws.
g1 = torch.Generator().manual_seed(0)
net = RandomFeatureODE(g1, D, 100*D, 6)
out_train, _ = net.fit(X_train, y_train)
out = net(X_test)
print(net)

print("rmse test", root_mean_squared_error(y_test.detach().cpu().numpy(), out.detach().cpu().numpy()))
print("rmse train", root_mean_squared_error(y_train.detach().cpu().numpy(), out_train.detach().cpu().numpy()))
# Regularization strength selected by RidgeCV in the output layer
print(net.output_layer.ridge.alpha_)

RandomFeatureODE(
  (upsample): LinearAndActivation(
    (linear): Dense(
      (dense): Linear(in_features=11, out_features=1100, bias=True)
    )
    (activation): Tanh()
  )
  (blocks): ModuleList(
    (0-5): 6 x RandomFeatBlock(
      (linear): Dense(
        (dense): Linear(in_features=1100, out_features=1100, bias=True)
      )
      (activation): Tanh()
    )
  )
  (output_layer): RidgeCVModule()
)
rmse test 0.7666988
rmse train 0.6866741
46.41588833612777


# Plot Activations

In [None]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def get_activation(name, activations):
    """Build a forward hook that stores a module's detached output.

    Args:
        name: Key under which the output is stored.
        activations: Dict that receives ``activations[name] = output.detach()``
            every time the hooked module runs.

    Returns:
        A function with the forward-hook signature (module, input, output).
    """
    def hook(model, input, output):
        activations[name] = output.detach()
    return hook


def register_hooks(model, activations):
    """Attach an activation-recording forward hook to the modules of ``model``.

    Every module yielded by ``model.named_modules()`` (including the root
    module, whose name is the empty string) gets a hook, except modules
    whose qualified name contains ".dense". Each visited name is printed.

    Args:
        model: The nn.Module to instrument.
        activations: Dict that the hooks write their outputs into.

    Returns:
        List of hook handles; call ``handle.remove()`` on each of them to
        detach the hooks again.
    """
    handles = []
    for name, layer in model.named_modules():
        print(name)
        if ".dense" not in name:
            handles.append(layer.register_forward_hook(get_activation(name, activations)))
    return handles



def neuron_distribution_for_each_layer(X_train, y_train, X_test):
    """Fit a small SampledEulerODE and plot histograms of its activations.

    Shows one figure with the overlaid value distributions of X_train and
    X_test, then one figure per recorded module output for a forward pass
    on X_test.

    Args:
        X_train: Training inputs, shape (N, D).
        y_train: Training labels.
        X_test: Test inputs that are pushed through the fitted model.
    """
    def density_hist(values, label):
        # Histogram trace over all entries of `values`, normalized to a density
        return go.Histogram(x=values.flatten().cpu().numpy(), nbinsx=50, name=label, histnorm='probability density', opacity=0.5)

    n_layers = 2
    in_dim = X_train.shape[1]
    rng = torch.Generator().manual_seed(0)
    model = SampledEulerODE(rng, in_dim, 10*in_dim, n_layers, upsample_module='sampled', sampling_method='gradient')
    #model = SampledResNet(g1, D, 10*D, 10*D, n_layers, upsample_module='sampled', sampling_method='gradient')
    model.fit(X_train, y_train)

    # Record module outputs during a single forward pass on the test data
    activations = {}
    register_hooks(model, activations)
    model(X_test)

    # Plot input data distribution
    fig = make_subplots(rows=1, cols=1)
    for label, data in (('Train', X_train), ('Test', X_test)):
        fig.add_trace(density_hist(data, label))
    fig.update_layout(title_text='Input Data Distribution', xaxis_title='Input Feature Value', yaxis_title='Probability Density', barmode='overlay')
    fig.show()

    # Plot activations, one figure per hooked module
    for name, activation in activations.items():
        fig = make_subplots(rows=1, cols=1)
        fig.add_trace(density_hist(activation, 'Activation'))
        fig.update_layout(title_text=f'Activations at Layer: {name}', xaxis_title='Activation Value', yaxis_title='Probability Density', barmode='overlay')
        fig.show()


# Plot the input and per-layer activation histograms for the dataset loaded above.
neuron_distribution_for_each_layer(X_train, y_train, X_test)

# Fit on a dataset

In [17]:
def run_allmodels_1dataset(
        generator: torch.Generator,
        X_train: Tensor,
        y_train: Tensor,
        X_test: Tensor,
        y_test: Tensor,
        ):
    """Fit every benchmark model on one train/test split and score it.

    Models are built and fitted in the order of ``model_list``; all models
    that take a generator receive the same ``generator`` object.

    Args:
        generator: Random generator forwarded to the model constructors.
        X_train: Training inputs; ``X_train.shape[1]`` is used as ``in_dim``
            and ``X_train.device`` is where each model is moved.
        y_train: Training targets.
        X_test: Test inputs.
        y_test: Test targets.

    Returns:
        ``(model_names, results)`` where ``results[i]`` is
        ``np.array([rmse_train, rmse_test, t_fit, t_predict])`` for
        ``model_names[i]``. ``t_fit`` covers model construction plus
        ``fit``; ``t_predict`` covers the forward pass on ``X_test``.
    """
    D = X_train.shape[1]
    hidden_size = 512
    activation_dim = 2*hidden_size
    n_blocks = 3

    def _wait_for_device():
        # CUDA work is queued asynchronously, so the wall clock can be read
        # before the kernels have finished. Synchronise first so the timings
        # reflect the actual compute. No-op for CPU tensors.
        if X_train.is_cuda:
            torch.cuda.synchronize(X_train.device)

    # Arguments shared by every generator-based model.
    base = {"generator": generator, "in_dim": D, "hidden_dim": hidden_size}

    # (name, model class, kwargs)
    model_list = [
        ["Tabular RidgeCV", RidgeCVModule, {}],

        ["Rand Proj", SampledResNet,
                {**base,
                "activation_dim": None,
                "n_blocks": 0,
                "upsample_module": "dense"}],

        ["1-Layer SampledNet Uniform", SampledResNet,  ####TODO change to SampledAndActivation
                {**base,
                "activation_dim": None,
                "n_blocks": 0,
                "upsample_module": "sampled",
                "sampling_method": "uniform"}],

        ["1-Layer SampledNet Gradient", SampledResNet,
                {**base,
                "activation_dim": None,
                "n_blocks": 0,
                "upsample_module": "sampled",
                "sampling_method": "gradient"}],

        ["Sampled ResNet Uniform", SampledResNet,
                {**base,
                "activation_dim": activation_dim,
                "n_blocks": n_blocks,
                "upsample_module": "sampled",
                "sampling_method": "uniform"}],

        ["Sampled ResNet Gradient", SampledResNet,
                {**base,
                "activation_dim": activation_dim,
                "n_blocks": n_blocks,
                "upsample_module": "sampled",
                "sampling_method": "gradient"}],

        ["Sampled EulerODE Uniform", SampledEulerODE,
                {**base,
                "n_blocks": n_blocks,
                "upsample_module": "sampled",
                "sampling_method": "uniform"}],

        ["Sampled EulerODE Gradient", SampledEulerODE,
                {**base,
                "n_blocks": n_blocks,
                "upsample_module": "sampled",
                "sampling_method": "gradient"}],

        ["Random Feature ODE", RandomFeatureODE,
                {**base,
                "n_blocks": n_blocks}],
    ]

    results = []
    model_names = []
    for name, model_cls, model_args in model_list:
        _wait_for_device()
        t0 = time.perf_counter()
        model = model_cls(**model_args).to(X_train.device)
        pred_train, _ = model.fit(X_train, y_train)
        _wait_for_device()
        t1 = time.perf_counter()
        pred_test = model(X_test)
        _wait_for_device()
        t2 = time.perf_counter()

        rmse_train = root_mean_squared_error(y_train.cpu(), pred_train.cpu())
        rmse_test = root_mean_squared_error(y_test.cpu(), pred_test.cpu())

        results.append(np.array([rmse_train, rmse_test, t1-t0, t2-t1]))
        model_names.append(name)

    return model_names, results



def run_all_experiments(
        dataset_ids: List,
        name_save: str = "PLACEHOLDER",
        device: Optional[str] = None,
        ):
    """Run every model on every dataset and save the results as one table.

    For each id the data is loaded with ``load_openml_dataset(..., split_seed=0)``,
    a generator seeded with 0 is created on ``device``, and
    ``run_allmodels_1dataset`` is called.

    Args:
        dataset_ids: Dataset ids to process (e.g. the ids of an OpenML suite).
        name_save: Suffix of the output file ``OpenML_reg_{name_save}.pkl``,
            written to the current working directory.
        device: Torch device string for data and generator. ``None`` (default)
            selects ``"cuda"`` when available and falls back to ``"cpu"``
            otherwise, instead of failing on machines without a GPU.

    Returns:
        DataFrame indexed by dataset id, with two-level columns
        ``(attribute, model_name)`` where attribute is one of
        ``RMSE_train``, ``RMSE_test``, ``t_fit``, ``t_feat``; columns are
        sorted. The same frame is printed and pickled.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Fetch and process each dataset
    experiments = {}
    for i, dataset_id in enumerate(dataset_ids):
        print(dataset_id)
        X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id, split_seed=0, device=device)
        generator = torch.Generator(device=device).manual_seed(0)
        results = run_allmodels_1dataset(
            generator, X_train, y_train, X_test, y_test,
            )
        experiments[dataset_id] = results
        print(f" {i+1}/{len(dataset_ids)} Processed dataset {dataset_id}")

    # `experiments` maps dataset id -> (model_names, results), where
    # results[model_idx][attr_idx] follows the order of `attributes`.
    attributes = ["RMSE_train", "RMSE_test", "t_fit", "t_feat"]
    data_list = []
    for dataset_name, (model_names, results) in experiments.items():
        # Tuple keys become a two-level column index in the one-row frame.
        dataset_data = {
            (attribute, model_name): results[model_idx][attr_idx]
            for attr_idx, attribute in enumerate(attributes)
            for model_idx, model_name in enumerate(model_names)
        }
        data_list.append(pd.DataFrame(dataset_data, index=[dataset_name]))

    # Combine all datasets into a single DataFrame
    df = pd.concat(data_list)
    df = df.sort_index(axis=1)
    print(df)
    df.to_pickle(f"OpenML_reg_{name_save}.pkl")
    return df

In [18]:
# Select the rows of df_metadata flagged as having no categorical features and
# turn their index values into plain Python ints before using them as dataset ids.
dataset_ids_not_categorical = [
    int(dataset_id)
    for dataset_id in df_metadata.query("has_categorical == False").index.values
]
run_all_experiments(dataset_ids_not_categorical)

44973
 1/20 Processed dataset 44973
44975
 2/20 Processed dataset 44975
44980
 3/20 Processed dataset 44980
44981
 4/20 Processed dataset 44981
45402
 5/20 Processed dataset 45402
44994
 6/20 Processed dataset 44994
44957
 7/20 Processed dataset 44957
44970
 8/20 Processed dataset 44970
44959
 9/20 Processed dataset 44959
44960
 10/20 Processed dataset 44960
44963
 11/20 Processed dataset 44963
44976
 12/20 Processed dataset 44976
44977
 13/20 Processed dataset 44977
44983
 14/20 Processed dataset 44983
44964
 15/20 Processed dataset 44964
44965
 16/20 Processed dataset 44965
44978
 17/20 Processed dataset 44978
44969
 18/20 Processed dataset 44969
44972
 19/20 Processed dataset 44972
44971
 20/20 Processed dataset 44971
                        RMSE_test                                       \
      1-Layer SampledNet Gradient 1-Layer SampledNet Uniform Rand Proj   
44973                    0.595157                   0.595159  0.595159   
44975                    0.006491              

Unnamed: 0_level_0,RMSE_test,RMSE_test,RMSE_test,RMSE_test,RMSE_test,RMSE_test,RMSE_test,RMSE_test,RMSE_test,RMSE_train,...,t_feat,t_fit,t_fit,t_fit,t_fit,t_fit,t_fit,t_fit,t_fit,t_fit
Unnamed: 0_level_1,1-Layer SampledNet Gradient,1-Layer SampledNet Uniform,Rand Proj,Random Feature ODE,Sampled EulerODE Gradient,Sampled EulerODE Uniform,Sampled ResNet Gradient,Sampled ResNet Uniform,Tabular RidgeCV,1-Layer SampledNet Gradient,...,Tabular RidgeCV,1-Layer SampledNet Gradient,1-Layer SampledNet Uniform,Rand Proj,Random Feature ODE,Sampled EulerODE Gradient,Sampled EulerODE Uniform,Sampled ResNet Gradient,Sampled ResNet Uniform,Tabular RidgeCV
44973,0.595157,0.595159,0.595159,0.34058,0.265267,0.262816,0.274106,0.273519,0.595158,0.594543,...,0.000989,1.348948,1.153801,1.269893,1.649985,0.974434,1.013453,1.656375,1.336837,0.021568
44975,0.006491,0.006491,0.006491,0.140689,0.011393,0.013024,0.009447,0.010734,0.006491,0.006807,...,0.015274,6.640887,6.86154,7.586293,6.671903,6.953238,6.668116,6.732856,6.719438,0.567887
44980,0.771309,0.771331,0.771314,0.422342,0.425118,0.440368,0.392726,0.457173,0.771311,0.761382,...,0.000885,0.684261,0.643247,0.704401,0.69529,0.67937,0.64101,0.69614,0.718928,0.023043
44981,0.904397,0.904818,0.904248,0.906131,0.905082,0.904194,0.903146,0.903209,0.904478,0.917619,...,0.001368,0.683061,0.706151,0.71502,0.633572,0.672336,0.689704,0.699537,0.759174,0.061516
45402,0.707004,0.706447,0.706434,0.555939,0.593257,0.596652,0.593615,0.615447,0.70669,0.63666,...,0.000312,0.357183,0.384122,0.352507,0.336089,0.387642,0.345023,0.350022,0.42107,0.011551
44994,0.29674,0.296623,0.29676,0.215857,0.225381,0.226724,0.311644,0.256188,0.296725,0.298746,...,0.000326,0.169758,0.177873,0.225159,0.186932,0.184228,0.133652,0.192889,0.188467,0.021471
44957,0.674052,0.674364,0.674497,0.332879,0.295786,0.38202,0.291586,0.34677,0.674484,0.689486,...,0.000398,0.226856,0.258456,0.258957,0.265192,0.199013,0.242993,0.237008,0.225692,0.003265
44970,0.667694,0.666439,0.666413,0.609266,0.632369,0.632836,0.6255,0.627493,0.666021,0.623745,...,0.000282,0.169073,0.210603,0.192638,0.164729,0.1735,0.174261,0.162065,0.250307,0.002859
44959,0.542102,0.542124,0.542091,0.362881,0.33121,0.347704,0.346756,0.360977,0.542088,0.587679,...,0.000287,0.196756,0.218997,0.197776,0.196601,0.179508,0.181714,0.169238,0.247462,0.003169
44960,0.30528,0.306052,0.304332,0.104895,0.112107,0.155929,0.099897,0.114104,0.304327,0.287332,...,0.000273,0.20735,0.18688,0.118883,0.161881,0.210007,0.240221,0.217899,0.166685,0.002824


In [22]:
# Reload the results table written by run_all_experiments with its default
# name_save ("PLACEHOLDER") and show each model's test RMSE averaged over datasets.
# NOTE(review): unpickling can execute arbitrary code — only load files produced locally.
df_reg = pd.read_pickle("OpenML_reg_PLACEHOLDER.pkl")
df_reg["RMSE_test"].mean()

1-Layer SampledNet Gradient    0.582503
1-Layer SampledNet Uniform     0.582828
Rand Proj                      0.578715
Random Feature ODE             0.445731
Sampled EulerODE Gradient      0.434101
Sampled EulerODE Uniform       0.447811
Sampled ResNet Gradient        0.437458
Sampled ResNet Uniform         0.446454
Tabular RidgeCV                0.579533
dtype: float64

In [21]:
# Rank the models within each dataset row by test RMSE (lowest RMSE gets rank 1),
# then average each model's rank over all datasets.
df_reg["RMSE_test"].rank(axis=1).mean()

1-Layer SampledNet Gradient    6.90
1-Layer SampledNet Uniform     7.45
Rand Proj                      6.60
Random Feature ODE             3.25
Sampled EulerODE Gradient      3.20
Sampled EulerODE Uniform       3.80
Sampled ResNet Gradient        3.15
Sampled ResNet Uniform         4.00
Tabular RidgeCV                6.65
dtype: float64

In [None]:
# Train RMSE of each model averaged over datasets.
df_reg["RMSE_train"].mean()

In [None]:
# Rank the models within each dataset row by train RMSE (lowest RMSE gets rank 1),
# then average each model's rank over all datasets.
df_reg["RMSE_train"].rank(axis=1).mean()

In [None]:
# Full per-dataset rank table on test RMSE (one row per dataset, one column per model).
df_reg["RMSE_test"].rank(axis=1)