In [6]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time
import collections
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor, tensor
import pandas as pd
import openml

#from aeon.regression.sklearn import RotationForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

from preprocessing.stream_transforms import normalize_mean_std_traindata, normalize_streams, augment_time, add_basepoint_zero
from utils.utils import print_name, print_shape
from models import ResNet, NeuralEulerODE, RidgeCVModule, E2EResNet, StagewiseRandFeatBoostRegression

np.set_printoptions(precision=3, threshold=5) # Print options

# OpenML code

In [7]:
# Fetch the collection with ID 353
collection = openml.study.get_suite(353)
dataset_ids = collection.data
metadata_list = []

# Fetch and process each dataset
for i, dataset_id in enumerate(dataset_ids):
    dataset = openml.datasets.get_dataset(dataset_id)
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        target=dataset.default_target_attribute
    )
    X = np.array(X)
    y = np.array(y)[..., None]
    print(X.shape)
    print(y.shape)
    
    # Determine if the dataset has categorical features
    has_categorical = any(categorical_indicator)
    
    # Extract the required metadata
    metadata = {
        'dataset_id': dataset.id,
        'name': dataset.name,
        'n_obs': int(dataset.qualities['NumberOfInstances']),
        'n_features': int(dataset.qualities['NumberOfFeatures']),
        '%_unique_y': len(np.unique(y))/len(y),
        'n_unique_y': len(np.unique(y)),
        'has_categorical': has_categorical
    }
    
    metadata_list.append(metadata)
    print(f" {i+1}/{len(dataset_ids)} Processed dataset {dataset.id}: {dataset.name}")

# Create a DataFrame from the metadata list
df_metadata = pd.DataFrame(metadata_list).sort_values('%_unique_y', ascending=False).set_index("dataset_id").sort_index()
df_metadata.sort_values('%_unique_y', ascending=True)

# Display the metadata DataFrame
df_metadata.loc[44962, "has_categorical"] = True
df_metadata

(4177, 8)
(4177, 1)
 1/35 Processed dataset 44956: abalone
(1503, 5)
(1503, 1)
 2/35 Processed dataset 44957: airfoil_self_noise
(2043, 7)
(2043, 1)
 3/35 Processed dataset 44958: auction_verification
(1030, 8)
(1030, 1)
 4/35 Processed dataset 44959: concrete_compressive_strength
(45730, 9)
(45730, 1)
 5/35 Processed dataset 44963: physiochemical_protein
(21263, 81)
(21263, 1)
 6/35 Processed dataset 44964: superconductivity
(1059, 116)
(1059, 1)
 7/35 Processed dataset 44965: geographical_origin_of_music
(1066, 10)
(1066, 1)
 8/35 Processed dataset 44966: solar_flare
(11934, 14)
(11934, 1)
 9/35 Processed dataset 44969: naval_propulsion_plant
(4898, 11)
(4898, 1)
 10/35 Processed dataset 44971: white_wine
(1599, 11)
(1599, 1)
 11/35 Processed dataset 44972: red_wine
(10000, 12)
(10000, 1)
 12/35 Processed dataset 44973: grid_stability
(68784, 18)
(68784, 1)
 13/35 Processed dataset 44974: video_transcoding
(72000, 48)
(72000, 1)
 14/35 Processed dataset 44975: wave_energy
(48933, 21)

Unnamed: 0_level_0,name,n_obs,n_features,%_unique_y,n_unique_y,has_categorical
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
41021,Moneyball,1232,15,0.303571,374,True
44956,abalone,4177,9,0.006703,28,True
44957,airfoil_self_noise,1503,6,0.968729,1456,False
44958,auction_verification,2043,8,0.998042,2039,True
44959,concrete_compressive_strength,1030,9,0.91068,938,False
44960,energy_efficiency,768,9,0.764323,587,False
44962,forest_fires,517,13,0.485493,251,True
44963,physiochemical_protein,45730,10,0.347759,15903,False
44964,superconductivity,21263,82,0.141419,3007,False
44965,geographical_origin_of_music,1059,117,0.029273,31,False


In [8]:
dataset_ids_no_categorical = list(df_metadata.query("has_categorical == False").index.values)
dataset_ids_no_categorical = sorted([int(x) for x in dataset_ids_no_categorical])
len(dataset_ids_no_categorical)

20

# Download single dataset

In [9]:
def np_load_openml_dataset(dataset_id, 
                        normalize_X:bool = True,
                        normalize_y:bool = True,
                        ) -> Tuple[np.ndarray, np.ndarray]:
    # Fetch dataset from OpenML by its ID
    dataset = openml.datasets.get_dataset(dataset_id)
    df, _, categorical_indicator, attribute_names = dataset.get_data()
    y = np.array(df.pop(dataset.default_target_attribute)).astype(np.float32)
    X = np.array(df).astype(np.float32)

    #normalize
    if normalize_X:
        X = X - X.mean(axis=0, keepdims=True)
        X = X / (X.std(axis=0, keepdims=True) + 1e-5)
        X = np.clip(X, -3, 3)
    if normalize_y:
        y = y - y.mean()
        y = y / (y.std() + 1e-5)
        y = np.clip(y, -3, 3)

    return (X, y)

#dataset_id = 44971  # Replace with the dataset ID you want
dataset_id = 44971 #44970
X, y = np_load_openml_dataset(dataset_id, False, False)

# Optuna

The procedure is the following: Take in tabular dataset X shape (N, D) and y shape (N, d). 
Create a 5 fold (stratified? how does this work with regression targets) cross validation.

From each 80% train set, run a specified optuna objective which itself uses an inner 5-foldCV,
to obtain hyperparameters. 

Use these hyperparams to train on the full 80% train set, and test on the 20% test set.

Repeat for each fold

In [10]:
cv_seed = 42
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error
import xgboost as xgb
import optuna


def objective_xgboost_cv_reg(
        trial, 
        X_train: np.ndarray, 
        y_train: np.ndarray, 
        k_folds: int,
        cv_seed: int,
        ):
    params = {
        "random_state": 42,
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "lambda": trial.suggest_float("lambda", 1e-3, 10.0, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
    }

    inner_cv = KFold(n_splits=k_folds, shuffle=True, random_state=cv_seed)
    rmse_list = []

    for inner_train_idx, inner_valid_idx in inner_cv.split(X_train):
        X_inner_train, X_inner_valid = X_train[inner_train_idx], X_train[inner_valid_idx]
        y_inner_train, y_inner_valid = y_train[inner_train_idx], y_train[inner_valid_idx]

        model = xgb.XGBRegressor(**params)
        model.fit(X_inner_train, y_inner_train)

        preds = model.predict(X_inner_valid)
        rmse = root_mean_squared_error(y_inner_valid, preds)
        rmse_list.append(rmse)

    return np.mean(rmse_list)



def evaluate_xgboost_kfoldcv(
        X: np.ndarray, 
        y: np.ndarray, 
        k_folds: int = 5, 
        cv_seed: int = 42,
        n_optuna_trials: int = 50,
    ):
    """Evaluates an XGBoost model using k-fold cross-validation.
    Hyperparameters are tuned with Optuna using an inner k-fold CV.
    The model is then trained on the whole fold train set and evaluated 
    on the fold test set.

    Returns the train RMSE, test RMSE, chosen params, training times, and test set inference times
    for each fold.
    """
    outer_cv = KFold(n_splits=k_folds, shuffle=True, random_state=cv_seed)
    outer_train_rmse_scores = []
    outer_test_rmse_scores = []
    chosen_params = []
    fit_times = []
    transform_times = []

    for train_idx, test_idx in outer_cv.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        #hyperparameter tuning with Optuna
        study = optuna.create_study(direction="minimize", )
        objective = lambda trial: objective_xgboost_cv_reg(trial, X_train, y_train, k_folds, cv_seed)
        study.optimize(objective, n_trials=n_optuna_trials)

        #fit model with optimal hyperparams
        t0 = time.perf_counter()
        model = xgb.XGBRegressor(**study.best_params)
        model.fit(X_train, y_train)

        #predict and evaluate
        t1 = time.perf_counter()
        preds_train = model.predict(X_train)
        rmse_train = root_mean_squared_error(y_train, preds_train)
        preds_test = model.predict(X_test)
        rmse_test = root_mean_squared_error(y_test, preds_test)
        t2 = time.perf_counter()

        outer_train_rmse_scores.append(rmse_train)
        outer_test_rmse_scores.append(rmse_test)
        chosen_params.append(study.best_params.copy())
        fit_times.append(t1-t0)
        transform_times.append(t2-t1)


    return (np.array(outer_train_rmse_scores),
            np.array(outer_test_rmse_scores),
            chosen_params,
            np.array(fit_times),
            np.array(transform_times))

In [7]:
def run_all_openMLreg_xgboost(
        dataset_ids: List,
        name_save: str = "XGBoost_OpenML_reg.pkl",
        k_folds: int = 5, 
        cv_seed: int = 42,
        n_optuna_trials: int = 2,
        ):
    # Fetch and process each dataset
    experiments = {}
    for i, dataset_id in enumerate(dataset_ids):
        X, y = np_load_openml_dataset(dataset_id)
        results = evaluate_xgboost_kfoldcv(X, y, k_folds, cv_seed, n_optuna_trials)
        experiments[dataset_id] = results
        print(f" {i+1}/{len(dataset_ids)} Processed dataset {dataset_id}")

    # Save results
    attributes = ["RMSE_train", "RMSE_test", "hyperparams", "t_fit", "t_inference"]
    data_list = []
    for dataset_name, results in experiments.items():
        dataset_data = {}
        print(dataset_name)
        print(results)
        for i, attrib in enumerate(attributes):
            dataset_data[(attrib, "XGBoost")] = [results[i]]
        data_list.append(pd.DataFrame(dataset_data, index=[dataset_name]))

    # Combine all datasets into a single DataFrame
    df = pd.concat(data_list)
    df = df.sort_index(axis=1)
    print(df)
    df.to_pickle(name_save)

run_all_openMLreg_xgboost(dataset_ids_no_categorical[0:2], "XGBoost_OpenML_reg.pkl")

[I 2024-11-23 16:21:36,416] A new study created in memory with name: no-name-3fed73c0-2e7c-4ac2-a76e-ad454218f97f
[I 2024-11-23 16:21:36,970] Trial 0 finished with value: 0.24816958606243134 and parameters: {'lambda': 2.602558873138932, 'learning_rate': 0.20856213756415415, 'n_estimators': 253, 'max_depth': 7, 'subsample': 0.6254345847349718, 'colsample_bytree': 0.6985599798630221}. Best is trial 0 with value: 0.24816958606243134.
[I 2024-11-23 16:21:37,310] Trial 1 finished with value: 0.6025902032852173 and parameters: {'lambda': 0.03896510883216185, 'learning_rate': 0.016584704653584373, 'n_estimators': 133, 'max_depth': 8, 'subsample': 0.6686722986166862, 'colsample_bytree': 0.5254499468821247}. Best is trial 0 with value: 0.24816958606243134.
[I 2024-11-23 16:21:37,421] A new study created in memory with name: no-name-bfeed2c9-d97a-40f1-8414-832fcf2b6c41
[I 2024-11-23 16:21:37,706] Trial 0 finished with value: 0.3610062003135681 and parameters: {'lambda': 0.7647218975959239, 'lear

 1/2 Processed dataset 44957


[I 2024-11-23 16:21:41,745] Trial 0 finished with value: 0.30283084511756897 and parameters: {'lambda': 4.39561835804349, 'learning_rate': 0.017625826053495358, 'n_estimators': 291, 'max_depth': 6, 'subsample': 0.9883916082293243, 'colsample_bytree': 0.6993587354448407}. Best is trial 0 with value: 0.30283084511756897.
[I 2024-11-23 16:21:42,082] Trial 1 finished with value: 0.27498066425323486 and parameters: {'lambda': 1.4800796813968342, 'learning_rate': 0.08115209041513519, 'n_estimators': 232, 'max_depth': 4, 'subsample': 0.9845188546443302, 'colsample_bytree': 0.8935350211764244}. Best is trial 1 with value: 0.27498066425323486.
[I 2024-11-23 16:21:42,183] A new study created in memory with name: no-name-f592c126-946b-4f81-b11d-2acd23ccfe48
[I 2024-11-23 16:21:42,748] Trial 0 finished with value: 0.3172334134578705 and parameters: {'lambda': 0.9328108245910985, 'learning_rate': 0.0761017209854222, 'n_estimators': 77, 'max_depth': 9, 'subsample': 0.6577082518839107, 'colsample_byt

 2/2 Processed dataset 44959
44957
(array([0.084, 0.058, 0.07 , 0.167, 0.208], dtype=float32), array([0.24 , 0.242, 0.231, 0.309, 0.35 ], dtype=float32), [{'lambda': 2.602558873138932, 'learning_rate': 0.20856213756415415, 'n_estimators': 253, 'max_depth': 7, 'subsample': 0.6254345847349718, 'colsample_bytree': 0.6985599798630221}, {'lambda': 0.04006088660291853, 'learning_rate': 0.10922985130306125, 'n_estimators': 109, 'max_depth': 10, 'subsample': 0.9794250868382514, 'colsample_bytree': 0.6271267886035236}, {'lambda': 2.076169892719885, 'learning_rate': 0.2497722601612211, 'n_estimators': 224, 'max_depth': 6, 'subsample': 0.9563702179506413, 'colsample_bytree': 0.9197870764997941}, {'lambda': 2.457794387912405, 'learning_rate': 0.07318034751688561, 'n_estimators': 65, 'max_depth': 8, 'subsample': 0.9131049499556777, 'colsample_bytree': 0.9117058456176644}, {'lambda': 0.0625587894638664, 'learning_rate': 0.038386876671209436, 'n_estimators': 132, 'max_depth': 7, 'subsample': 0.580243

In [None]:
# or do i want a json/big array?     results[rmse_test][model][dataset][fold]. could work minus the parameters

In [11]:
df_reg = pd.read_pickle("XGBoost_OpenML_reg.pkl")
df_reg["RMSE_train"]#.mean().sort_values()

Unnamed: 0,XGBoost
44957,"[0.0838222, 0.058214106, 0.0700473, 0.16652176..."
44959,"[0.14165348, 0.095980756, 0.14441806, 0.088690..."


In [12]:
df_reg["RMSE_test"]#.mean().sort_values()

Unnamed: 0,XGBoost
44957,"[0.2402587, 0.24241394, 0.2314812, 0.30912963,..."
44959,"[0.27749225, 0.33525988, 0.27638143, 0.2324987..."


# Optuna PyTorch

In [47]:
def pytorch_load_openml_dataset(
        dataset_id, 
        normalize_X:bool = True,
        normalize_y:bool = True,
        device: str = "cpu",
        ) -> Tuple[Tensor, Tensor]:
    X, y = np_load_openml_dataset(dataset_id, normalize_X, normalize_y)
    X = torch.from_numpy(X).to(device)
    y = torch.from_numpy(y).to(device)

    if y.dim() == 1:
        y = y.unsqueeze(1)
    
    return X, y

In [None]:
from models import GradientRandFeatBoostRegression

###################################################################  |
#####  Boilerplate code for tabular PyTorch model evaluation  #####  |
#####  with Optuna hyperparameter tuning inner kfoldcv        #####  |
###################################################################  V


def get_pytorch_optuna_cv_rmse_objective(
        trial,
        ModelClass: Callable,
        get_optuna_params: Callable,
        X_train: Tensor, 
        y_train: Tensor, 
        k_folds: int,
        cv_seed: int,
        ):
    """The objective to be minimized in Optuna's 'study.optimize(objective, n_trials)' function."""
    
    params = get_optuna_params(trial)

    inner_cv = KFold(n_splits=k_folds, shuffle=True, random_state=cv_seed)
    rmse_list = []
    for inner_train_idx, inner_valid_idx in inner_cv.split(X_train):
        X_inner_train, X_inner_valid = X_train[inner_train_idx], X_train[inner_valid_idx]
        y_inner_train, y_inner_valid = y_train[inner_train_idx], y_train[inner_valid_idx]

        model = ModelClass(**params)
        model.fit(X_inner_train, y_inner_train)

        preds = model(X_inner_valid)
        rmse = torch.sqrt(nn.functional.mse_loss(y_inner_valid, preds))
        rmse_list.append(rmse.item())

    return np.mean(rmse_list)



def evaluate_pytorch_model_kfoldcv(
        ModelClass : Callable,
        get_optuna_params : Callable,
        X: Tensor,
        y: Tensor,
        k_folds: int,
        cv_seed: int,
        n_optuna_trials: int,
        device: Literal["cpu", "cuda"],
        ):
    """
    Evaluates a PyTorch model using k-fold cross-validation,
    with an inner Optuna hyperparameter tuning loop for each fold.
    The model is then trained on the whole fold train set and evaluated
    on the fold test set.

    Inner and outer kFoldCV use the same number of folds.
    """
    outer_cv = KFold(n_splits=k_folds, shuffle=True, random_state=cv_seed)
    outer_train_rmse_scores = []
    outer_test_rmse_scores = []
    chosen_params = []
    fit_times = []
    transform_times = []

    for train_idx, test_idx in outer_cv.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        #hyperparameter tuning with Optuna
        study = optuna.create_study(direction="minimize", )
        objective = lambda trial: get_pytorch_optuna_cv_rmse_objective(
            trial, ModelClass, get_optuna_params, X_train, y_train, k_folds, cv_seed
            )
        study.optimize(objective, n_trials=n_optuna_trials)

        #fit model with optimal hyperparams
        t0 = time.perf_counter()
        print("best params", study.best_params)
        model = ModelClass(**study.best_params).to(device)
        model.fit(X_train, y_train)

        #predict and evaluate
        t1 = time.perf_counter()
        preds_train = model(X_train)
        rmse_train = torch.sqrt(nn.functional.mse_loss(y_train, preds_train))
        preds_test = model(X_test)
        rmse_test = torch.sqrt(nn.functional.mse_loss(y_test, preds_test))
        t2 = time.perf_counter()

        outer_train_rmse_scores.append(rmse_train.item())
        outer_test_rmse_scores.append(rmse_test.item())
        chosen_params.append(study.best_params.copy())
        fit_times.append(t1-t0)
        transform_times.append(t2-t1)
    
    return (np.array(outer_train_rmse_scores),
            np.array(outer_test_rmse_scores),
            chosen_params,
            np.array(fit_times),
            np.array(transform_times))

    

##############################################################  |
##### Create "evalute_MODELHERE" function for each model #####  |
##############################################################  V


def evaluate_GRFBoost(
        X: Tensor,
        y: Tensor,
        k_folds: int,
        cv_seed: int,
        n_optuna_trials: int,
        device: Literal["cpu", "cuda"],
        ):
    ModelClass = GradientRandFeatBoostRegression
    get_optuna_params = lambda trial : {
        "seed": trial.suggest_int("seed", 42, 42),                              # Fixed value
        "hidden_dim": trial.suggest_int("hidden_dim", X.size(1), 128, log=True),
        "bottleneck_dim": trial.suggest_int("bottleneck_dim", 64, 128, log=True),
        "out_dim": trial.suggest_int("out_dim", y.size(1), y.size(1)),          # Fixed value
        "n_layers": trial.suggest_int("n_layers", 1, 50, log=True),
        "l2_reg": trial.suggest_float("l2_reg", 1e-6, 0.1, log=True),
        "boost_lr": trial.suggest_float("boost_lr", 0.1, 1.0, log=True),
        "feature_type": trial.suggest_categorical("feature_type", ["SWIM"]),    # Fixed value
        "upscale": trial.suggest_categorical("upscale", ["dense"]),             # Fixed value
    }

    return evaluate_pytorch_model_kfoldcv(
        ModelClass, get_optuna_params,
        X, y, k_folds, cv_seed, n_optuna_trials, device,
        )


# how i will actually run the models

In [58]:
def run_all_openMLreg_with_model(
        dataset_ids: List,
        evaluate_model_func: Callable,
        name_save: str, #"GRFBoost_OpenML_reg.pkl",
        k_folds: int = 5,
        cv_seed: int = 42,
        n_optuna_trials: int = 2,
        device: Literal["cpu", "cuda"] = "cuda",
        ):
    # Fetch and process each dataset
    experiments = {}
    for i, dataset_id in enumerate(dataset_ids):
        X, y = pytorch_load_openml_dataset(dataset_id)
        results = evaluate_model_func(
            X, y, k_folds, cv_seed, n_optuna_trials, device
            )
        experiments[dataset_id] = results
        print(f" {i+1}/{len(dataset_ids)} Processed dataset {dataset_id}")
    
    # Save results
    attributes = ["RMSE_train", "RMSE_test", "hyperparams", "t_fit", "t_inference"]
    data_list = []
    for dataset_name, results in experiments.items():
        dataset_data = {}
        for i, attrib in enumerate(attributes):
            dataset_data[(attrib, "GRFBoost")] = [results[i]]
        data_list.append(pd.DataFrame(dataset_data, index=[dataset_name]))

    # Combine all datasets into a single DataFrame
    df = pd.concat(data_list)
    df = df.sort_index(axis=1)
    print(df)
    df.to_pickle(name_save)

In [59]:
run_all_openMLreg_with_model(
    dataset_ids_no_categorical[0:2], 
    evaluate_GRFBoost, 
    "GRFBoost_OpenML_reg.pkl",
    )

[I 2024-11-23 18:13:15,693] A new study created in memory with name: no-name-ef04ed0e-32ae-40e2-9365-3c3a903504f5
[I 2024-11-23 18:13:15,981] Trial 0 finished with value: 0.5036354780197143 and parameters: {'hidden_dim': 6, 'bottleneck_dim': 151, 'n_layers': 9, 'l2_reg': 0.03237018578943198, 'boost_lr': 0.11095071061899496}. Best is trial 0 with value: 0.5036354780197143.
[I 2024-11-23 18:13:17,304] Trial 1 finished with value: 0.4271220028400421 and parameters: {'hidden_dim': 10, 'bottleneck_dim': 150, 'n_layers': 44, 'l2_reg': 0.0020083719022068363, 'boost_lr': 0.8269997726713386}. Best is trial 1 with value: 0.4271220028400421.


best params {'hidden_dim': 10, 'bottleneck_dim': 150, 'n_layers': 44, 'l2_reg': 0.0020083719022068363, 'boost_lr': 0.8269997726713386}


TypeError: GradientRandFeatBoostRegression.__init__() missing 1 required positional argument: 'generator'