# PMLBmini experiments

This notebook runs the PMLBmini experiments, and compares RANDOM FEATURE BOOSTING and END2END to the saved PMLBmini models

NOTE: we assume that tabmini (https://github.com/RicardoKnauer/TabMini) is installed in the current working directory.

Running this notebook, i.e. running all models and datasets sequentially on a single CPU core, should take no more than 30 minutes.

In [1]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import os
import pickle
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
from torch import Tensor, tensor
import pandas as pd
import tabmini
import aeon
from aeon.visualisation import plot_critical_difference, plot_significance
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from models.gridsearch_wrapper import SKLearnWrapper
from models.random_feature_representation_boosting import GradientRFRBoostClassifier
from models.end2end import End2EndMLPResNet

In [2]:
class Config:
    """Notebook-wide settings."""

    # Directory (below the current working directory) used for cached data and results.
    save_dir = Path.cwd().joinpath("results", "PMLBmini")

In [3]:
##############################################
#####      Equal/Random Guessing        ######
##############################################


class EqualGuessing(BaseEstimator, ClassifierMixin):
    """Baseline binary classifier that assigns probability 0.5 to both classes."""

    def fit(self, X, y):
        """Record the distinct labels found in ``y``; ``X`` is not used."""
        self.classes_ = np.unique(y)
        return self

    def predict_proba(self, X):
        """Return an ``(X.shape[0], 2)`` array in which every entry is 0.5."""
        n_rows = X.shape[0]
        return np.full((n_rows, 2), 0.5)

    def decision_function(self, X):
        """Return the log-ratio of the class-1 to the class-0 probability per row.

        A small constant is added to numerator and denominator before the
        division, exactly as in the probability-based wrappers of this notebook.
        """
        eps = 1e-10
        p_class0, p_class1 = self.predict_proba(X).T
        return np.log((p_class1 + eps) / (p_class0 + eps))
    

##################################################
############# Grid Search wrapper    #############
############# for custom estimators  #############
##################################################


class WrapperGridSearch(BaseEstimator, ClassifierMixin):
    """Binary classifier that tunes a torch model through ``GridSearchCV``.

    Inputs are min-max scaled with a scaler fitted on the training data,
    converted to float tensors and handed to ``SKLearnWrapper`` for a
    stratified 3-fold grid search.

    Parameters
    ----------
    param_grid : dict mapping parameter names to lists of candidate values.
    out_name : name of the grid entry holding the output dimension:
        'n_classes' for GBRFRBoost, 'out_dim' for E2E_MLP_ResNet.
    verbose : verbosity level forwarded to ``GridSearchCV``.
    """

    def __init__(self, param_grid:Dict[str, List], out_name = "n_classes", verbose=0):
        self.param_grid = param_grid
        self.out_name = out_name # 'n_classes' for GBRFRBoost, 'out_dim' for E2E_MLP_ResNet
        self.verbose = verbose


    def fit(self, X, y):
        """
        Performs a stratified 3-fold CV for hyperparameter tuning
        based on self.param_grid, and keeps the best estimator found.

        ``X`` and ``y`` must expose ``.values`` (e.g. pandas DataFrame / Series).
        Returns ``self``.
        """
        self.classes_ = np.unique(y)
        n_samples, n_features = X.values.shape

        # Fit the scaler on the training data only; predict_proba reuses it.
        self.scaler = MinMaxScaler()
        X_scaled = torch.tensor(self.scaler.fit_transform(X.values)).float()
        y_column = torch.tensor(y.values)[..., None].float()

        # Fixed entries are appended to the user grid: seed, input width, output size.
        param_grid = {
            **self.param_grid,
            "seed": [42],
            "in_dim": [n_features],
            self.out_name: [2],
        }
        if self.out_name == 'out_dim': # end2end has other param names
            # Otherwise we can get a batch size of 1, which errors with batch norm.
            # NOTE(review): 4/9 presumably equals (2/3)^2, i.e. the share of samples
            # left after two nested 3-fold splits -- confirm.
            param_grid["batch_size"] = [
                max(int(n_samples*4/9-1), self.param_grid["batch_size"][0])
            ]
            param_grid[self.out_name] = [1]

        grid_search = GridSearchCV(
            estimator=SKLearnWrapper(),
            param_grid=param_grid,
            cv=StratifiedKFold(n_splits=3), #3-fold since PMLBmini uses 3-fold
            verbose=self.verbose,
        )
        grid_search.fit(X_scaled, y_column)

        # Keep the best estimator and switch it to evaluation mode.
        best_model = grid_search.best_estimator_
        print("Best params:", grid_search.best_params_)
        best_model.set_model_eval()
        self.model = best_model
        return self


    def predict_proba(self, X):
        """Return an ``(n_samples, 2)`` array ``[1 - p, p]`` with ``p = sigmoid(model output)``.

        The scaler fitted in :meth:`fit` is applied with ``transform``; it must
        not be re-fitted here, otherwise prediction data would be scaled with
        its own min/max instead of the training statistics.
        """
        X_scaled = torch.tensor(self.scaler.transform(X.values)).float()
        proba_pos = torch.sigmoid(self.model.predict(X_scaled)).cpu().detach().numpy()
        return np.concatenate((1 - proba_pos, proba_pos), axis=1)


    def decision_function(self, X):
        """Return the log-ratio of the second to the first probability column.

        A small constant keeps the ratio finite when a probability is 0.
        """
        proba = self.predict_proba(X)
        return np.log((proba[:, 1] + 1e-10) / (proba[:, 0] + 1e-10))

In [4]:
#########################
#### Run given model ####
#########################


def test_on_PMLBmini(
        estimator: BaseEstimator,
        estimator_name: str, 
        dataset_save_path = Config.save_dir / 'PMLBmini_dataset.pkl',
        other_saved_methods = None, #{'XGBoost'},
        ):
    """Run ``tabmini.compare`` for ``estimator`` on the (cached) PMLBmini dataset.

    Parameters
    ----------
    estimator : the estimator handed to ``tabmini.compare``.
    estimator_name : name under which the estimator is reported.
    dataset_save_path : pickle file used to cache the downloaded dataset.
    other_saved_methods : collection forwarded as ``methods`` to
        ``tabmini.compare``; ``None`` (the default) forwards an empty ``{}``.

    Returns
    -------
    ``(train_results, test_results)`` -- note that this is the reverse of the
    order in which ``tabmini.compare`` returns them.
    """
    # A fresh object per call avoids sharing one mutable default between calls.
    methods = {} if other_saved_methods is None else other_saved_methods
    dataset_save_path = Path(dataset_save_path)

    # Download the dataset once, then reuse the cached pickle.
    if dataset_save_path.exists():
        print("Dataset found, loading")
        # NOTE: pickle executes arbitrary code on load; only point this at a
        # cache file written by this function.
        with open(dataset_save_path, 'rb') as f:
            dataset = pickle.load(f)
    else:
        print("Dataset not found, downloading")
        dataset = tabmini.load_dataset(reduced=False)
        os.makedirs(Config.save_dir, exist_ok=True)
        # A custom cache path may live outside Config.save_dir.
        dataset_save_path.parent.mkdir(parents=True, exist_ok=True)
        with open(dataset_save_path, 'wb') as f:
            pickle.dump(dataset, f)

    # Perform the comparison
    test_results, train_results = tabmini.compare(
        estimator_name,
        estimator,
        dataset,
        working_directory = Config.save_dir,
        scoring_method="roc_auc",
        methods= methods,
        cv=5,
        time_limit=3600,
        device="cpu",
        n_jobs=1,
    )
    return train_results, test_results

In [None]:
# Baseline run: constant 0.5 guessing, with "XGBoost" requested as an additional method.
train_guessing_and_xgboost, test_guessing_and_xgboost = test_on_PMLBmini(
    estimator=EqualGuessing(),
    estimator_name="EqualGuessing",
    other_saved_methods={"XGBoost"},
)

In [None]:
# train_logistic, test_logistic = test_on_PMLBmini(
#     WrapperGridSearch(param_grid = {
#                 'modelClass': [GradientRFRBoostClassifier],
#                 'l2_cls': [1, 0.1, 0.001, 0.0001],
#                 'n_layers': [0],
#                 'upscale_type': ["identity"],
#                 'use_batchnorm': [False],
#                 'lbfgs_max_iter': [300],
#                 'lbfgs_lr': [1.0],
#             },
#             verbose=3),
#     'Logistic (mine)',
#     )

In [7]:
# TODO: fix this; maybe enable batch norm and run a grid search.

In [8]:
# train_GRFRBoost_exp, test_GRFRBoost_exp = test_on_PMLBmini(
#     WrapperGridSearch(param_grid = {
#                 'modelClass': [GradientRFRBoostClassifier],
#                 'l2_cls': [0.001],
#                 'l2_ghat': [0.01],
#                 'n_layers': [2],
#                 'randfeat_xt_dim': [512],
#                 'randfeat_x0_dim': [512],
#                 'hidden_dim': [128],
#                 # 'SWIM_scale': [1.0],
#                 'use_batchnorm': [False],
#             }),
#     'GRFRBoost exp',
#     other_saved_methods={},
#     )

In [None]:
# GradientRFRBoost with upscale_type "identity" and feature_type "SWIM";
# the grid varies l2_cls and l2_ghat only.
# Entries that were commented out in this grid: 'hidden_dim': [512] / [128],
# 'upscale_type': ["identity"] with 'feature_type': ["iid"],
# 'SWIM_scale': [1.0], 'activation': ['relu'].
train_GRFRBoostID, test_GRFRBoostID = test_on_PMLBmini(
    estimator=WrapperGridSearch(
        param_grid={
            "modelClass": [GradientRFRBoostClassifier],
            "l2_cls": [10, 1, 0.1, 0.001, 0.0001],
            "l2_ghat": [10, 1, 0.1, 0.01],
            "n_layers": [1],
            "randfeat_xt_dim": [512],
            "randfeat_x0_dim": [512],
            "upscale_type": ["identity"],
            "feature_type": ["SWIM"],
            "use_batchnorm": [False],
            "boost_lr": [1.0],
        },
        verbose=3,
    ),
    estimator_name="GRFRBoostID",
    other_saved_methods={},
)

In [None]:
# GradientRFRBoost with upscale_type "iid" (iid_scale 10.0) and feature_type "SWIM";
# the grid varies l2_cls and l2_ghat only.
# Entries that were commented out in this grid: 'upscale_type': ["identity"],
# 'feature_type': ["iid"], 'hidden_dim': [128], 'SWIM_scale': [1.0],
# 'activation': ['relu'].
train_GRFRBoost, test_GRFRBoost = test_on_PMLBmini(
    estimator=WrapperGridSearch(
        param_grid={
            "modelClass": [GradientRFRBoostClassifier],
            "l2_cls": [10, 1, 0.1, 0.001, 0.0001],
            "l2_ghat": [10, 1, 0.1, 0.01],
            "n_layers": [1],
            "randfeat_xt_dim": [512],
            "randfeat_x0_dim": [512],
            "hidden_dim": [512],
            "upscale_type": ["iid"],
            "iid_scale": [10.0],
            "feature_type": ["SWIM"],
            "use_batchnorm": [False],
            "boost_lr": [1.0],
        },
        verbose=3,
    ),
    estimator_name="GRFRBoost",
    other_saved_methods={},
)

In [None]:
# Random feature neural network: the same classifier class with n_layers set to 0,
# tuned over l2_cls.
train_RFNN, test_RFNN = test_on_PMLBmini(
    estimator=WrapperGridSearch(
        param_grid={
            "modelClass": [GradientRFRBoostClassifier],
            "l2_cls": [1, 0.1, 0.001, 0.0001],
            "hidden_dim": [512],
            "n_layers": [0],
        },
    ),
    estimator_name="RFNN",
    other_saved_methods={},
)

In [None]:
# train_GRFRBoost.join(test_RFNN).join(test_GRFRBoost, lsuffix='_train', rsuffix='_test').join(test_logistic)

In [None]:
# End-to-end trained MLP ResNet; only the learning rate is varied (two log-spaced values).
# out_name='out_dim' selects the end2end parameter naming inside WrapperGridSearch.
train_E2E, test_E2E = test_on_PMLBmini(
    estimator=WrapperGridSearch(
        param_grid={
            "modelClass": [End2EndMLPResNet],
            "lr": np.logspace(-2, -1, 2),
            "hidden_dim": [32],
            "bottleneck_dim": [32],
            "n_blocks": [2],
            "loss": ["bce"],
            "n_epochs": [30],
            "end_lr_factor": [0.01],
            "weight_decay": [0.00001],
            "batch_size": [32],
            "activation": [nn.ReLU()],
        },
        out_name="out_dim",
    ),
    estimator_name="E2E_MLP_ResNet",
    other_saved_methods={},
)

In [None]:
# Fetch the published TabMini test scores (semicolon-separated); the first
# column becomes the index, and the index name is cleared afterwards.
saved_results = pd.read_csv(
    'https://raw.githubusercontent.com/RicardoKnauer/TabMini/master/plotting/results/test_scores_wide_3600.csv',
    delimiter=";",
    index_col=0,
)
saved_results.index.name = None
saved_results

In [50]:
# Start from the published scores and inner-join our own test scores onto them.
combined_results = saved_results.copy()
# combined_results = pd.read_csv(Config.save_dir / "combined_results.csv", index_col=0)
for df in (
    test_guessing_and_xgboost,
    test_RFNN,
    # test_E2E,
    test_GRFRBoost,
    # test_logistic,
):
    combined_results = combined_results.join(df, how="inner")

# PMLBmini's results are rounded to 2 decimals, so round ours too for a fair comparison.
combined_results = combined_results.round(decimals=2)

# Persist the combined table.
combined_results.to_csv(Config.save_dir.joinpath("combined_results.csv"))


In [None]:
# Reload the combined score table from disk; the first column is the index.
combined_results = pd.read_csv(
    Config.save_dir.joinpath("combined_results.csv"),
    index_col=0,
)
combined_results

In [None]:
#combined_results[["GRFRBoost (ours)", "Logistic (mine)", "RFNN"]]

In [None]:
import matplotlib.pyplot as plt

# Generate the critical-difference plot from the combined score table:
# one row per dataset, one column (label) per method; higher scores are better.
plot = plot_critical_difference(combined_results.values,
                                combined_results.columns.tolist(), 
                                alpha=0.05, 
                                lower_better=False)

# Retrieve the figure from the first element of the returned value.
# NOTE(review): whether plot[0] is a Figure or an Axes depends on the installed
# aeon version -- confirm; `ax` is not used again in this cell.
fig = plot[0].figure
ax = plot[0]

# Adjust figure size (width, height in inches)
fig.set_size_inches(6, 3)

# Adjust layout
fig.tight_layout()

# Save the figure in both EPS and PNG format
fig.savefig(Config.save_dir / "PMLBmini_critical_difference.eps", bbox_inches='tight')
fig.savefig(Config.save_dir / "PMLBmini_critical_difference.png", bbox_inches='tight')

In [None]:
import pandas as pd
import numpy as np

def create_latex_table(
    df,
    *,
    caption="Test accuracies on the concentric circles task.",
    label="tab:concentric-circles",
    metric_name="Acc",
):
    """Render per-column mean and standard deviation of ``df`` as a LaTeX table.

    Parameters
    ----------
    df : DataFrame with one column of scores per model; the column name is
        used as the model name.
    caption : text placed inside ``\\caption{...}``.
    label : text placed inside ``\\label{...}``.
    metric_name : metric abbreviation used in the "Mean ..." column heading.

    The keyword defaults reproduce the original hard-coded table, which
    describes a different experiment; pass explicit values when the scores
    are not accuracies on the concentric circles task.

    Returns
    -------
    str : the complete ``table`` environment, one row per column of ``df``.
        The standard deviation is ``np.std`` with its default ``ddof=0``.
    """
    header = "\n".join([
        "",
        "\\begin{table}[t]",
        "\\caption{" + caption + "}",
        "\\label{" + label + "}",
        "\\vskip 0.15in",
        "\\begin{center}",
        "\\begin{small}",
        "\\begin{sc}",
        "\\begin{tabular}{lcc}",
        "\\toprule",
        "Model & Mean " + metric_name + " & Std Dev \\\\",
        "\\midrule",
        "",
    ])
    footer = "\n".join([
        "",
        "\\bottomrule",
        "\\end{tabular}",
        "\\end{sc}",
        "\\end{small}",
        "\\end{center}",
        "\\vskip -0.1in",
        "\\end{table}",
        "",
    ])

    # One "name & mean & std \\" row per model column.
    rows = [
        f"{model_name} & {np.mean(df[model_name]):.4f} & {np.std(df[model_name]):.4f} \\\\\n"
        for model_name in df.columns
    ]
    return header + "".join(rows) + footer

# Render the combined PMLBmini results (one column per model) as a LaTeX table and print it.
# NOTE(review): the caption and label emitted by create_latex_table refer to the
# concentric circles task, not to PMLBmini -- confirm before using the output.
latex_table = create_latex_table(combined_results)
print(latex_table)

# Experiment on a single dataset

In [1]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import os
import pickle
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
from torch import Tensor, tensor
import pandas as pd
import tabmini
import aeon
from aeon.visualisation import plot_critical_difference, plot_significance
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from models.base import LogisticRegression
from models.gridsearch_wrapper import SKLearnWrapper
from models.random_feature_representation_boosting import GradientRFRBoostClassifier
from models.end2end import End2EndMLPResNet

class Config:
    """Settings shared by the single-dataset experiment cells."""

    # Directory (below the current working directory) used for cached data and results.
    save_dir = Path.cwd().joinpath("results", "PMLBmini")

In [None]:
# Experiment on a single dataset: obtain the PMLBmini collection, using a
# pickle file below Config.save_dir as an on-disk cache.
dataset_save_path = Config.save_dir / 'PMLBmini_dataset.pkl'
if dataset_save_path.exists():
    print("Dataset found, loading")
    with dataset_save_path.open('rb') as f:
        dataset = pickle.load(f)
else:
    print("Dataset not found, downloading")
    dataset = tabmini.load_dataset(reduced=False)
    Config.save_dir.mkdir(parents=True, exist_ok=True)
    with dataset_save_path.open('wb') as f:
        pickle.dump(dataset, f)

In [3]:
from sklearn.model_selection import train_test_split

# Pick one dataset, convert it to float tensors (labels as a column vector)
# and hold out 30% of the rows for testing.
# NOTE(review): the earlier comment here read "10: parity5", which does not
# match the dataset key used below -- confirm which dataset is intended.
X, y = dataset["analcatdata_asbestos"]
X = torch.tensor(X.values).to(torch.float32)
y = torch.tensor(y.values).unsqueeze(-1).to(torch.float32)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
from sklearn.metrics import roc_auc_score

# Alternative model kept for reference:
# model = LogisticRegression(
#     n_classes=2,
#     l2_lambda=0.0001,
#     max_iter = 300,
# )

# Seed both random number generators before the model is constructed.
torch.manual_seed(42)
np.random.seed(42)

model = GradientRFRBoostClassifier(
    in_dim=X.shape[1],
    n_classes=2,
    l2_cls=0.1,
    l2_ghat=10000,
    n_layers=1,
    randfeat_xt_dim=128,
    randfeat_x0_dim=128,
    hidden_dim=128,
    upscale_type="SWIM",
    feature_type="SWIM",
    use_batchnorm=False,
    boost_lr=1.0,
    activation="relu",
)
model.fit(X_train, y_train)

# Held-out split: print raw outputs, thresholded predictions next to the labels, and the AUC.
logits = model(X_test)
print("logits", logits)
probs = torch.sigmoid(logits)
print("out and y", torch.cat((logits, y_test), dim=1))
print("binary class pred and y", torch.cat((probs > 0.5, y_test), dim=1))
auc = roc_auc_score(y_test.numpy(), probs.detach().numpy())
print("AUC:", auc)

# Training split: AUC only.
logits = model(X_train)
probs = torch.sigmoid(logits)
auc = roc_auc_score(y_train.numpy(), probs.detach().numpy())
print("train AUC:", auc)