# PMLBmini experiments

This notebook runs the PMLBmini experiments and compares Random Feature Boosting and end-to-end trained networks (END2END) against the saved PMLBmini model results.

NOTE: we assume that tabmini (https://github.com/RicardoKnauer/TabMini) is installed in the current working directory (cwd).

Running this notebook should take no more than 30 minutes, i.e. the time needed to run all models and datasets sequentially on a single CPU core.

In [4]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import os
import pickle
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
from torch import Tensor, tensor
import pandas as pd
import tabmini
import aeon
from aeon.visualisation import plot_critical_difference, plot_significance
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from models.gridsearch_wrapper import SKLearnWrapper
from models.random_feature_representation_boosting import GradientRFRBoostClassifier
from models.end2end import End2EndMLPResNet
from PMLBmini import test_on_PMLBmini

In [2]:
##############################################
#####      Equal/Random Guessing        ######
##############################################


class EqualGuessing(BaseEstimator, ClassifierMixin):
    """Trivial binary baseline that assigns probability 0.5 to both classes.

    Nothing is learned from the features: ``fit`` only records the distinct
    labels, and every prediction is the same for every sample.
    """

    def fit(self, X, y):
        """Store the distinct labels of ``y`` in ``classes_`` and return ``self``."""
        self.classes_ = np.unique(y)
        return self

    def predict_proba(self, X):
        """Guess probability 0.5 for each class.

        Returns an array of shape ``(X.shape[0], 2)`` filled with 0.5.
        """
        # Guess [0.5, 0.5]
        return np.full((X.shape[0], 2), 0.5)

    def predict(self, X):
        """Return one hard label per row of ``X``.

        Both classes are equally likely, so the tie is broken in favour of the
        first entry of ``classes_`` (the column an argmax over
        ``predict_proba`` would select). Requires ``fit`` to have been called.
        Defined so that helpers relying on ``predict`` (such as the ``score``
        method inherited from ``ClassifierMixin``) work with this baseline.
        """
        return np.full(X.shape[0], self.classes_[0])

    def decision_function(self, X):
        """Return the log-odds of column 1 versus column 0 of ``predict_proba``.

        With the constant 0.5 guesses this evaluates to 0 for every sample.
        """
        # Get the probabilities from predict_proba
        proba = self.predict_proba(X)
        # Log of the probability ratio; the 1e-10 terms guard against log(0) and division by zero
        decision = np.log((proba[:, 1] + 1e-10) / (proba[:, 0] + 1e-10))
        return decision
    


In [None]:
# Results folder, resolved against the current working directory instead of a
# machine-specific absolute path. Kept as a str with a trailing separator so the
# value has the same form as before when the notebook is run from the repo root.
save_dir = os.path.join(os.getcwd(), "results", "PMLBmini", "")

# Evaluate the constant 0.5 baseline on dataset indices 0..43.
# NOTE(review): other_saved_methods presumably pulls in the stored XGBoost
# results for comparison, and the two return values are presumably train and
# test score tables — confirm against PMLBmini.test_on_PMLBmini.
train_guessing_and_xgboost, test_guessing_and_xgboost = test_on_PMLBmini(
    EqualGuessing(),
    'EqualGuessing',
    list(range(44)),
    save_dir,
    other_saved_methods={"XGBoost"},
    )

In [21]:
!python PMLBmini.py \
    --models E2E_MLP_ResNet \
    --dataset_indices 0 \
    --save_dir /home/nikita/Code/random-feature-boosting/results/PMLBmini/ \
    --seed 42

Dataset found, loading
Evaluating E2E_MLP_ResNet against {}...
Comparing E2E_MLP_ResNet on 0: analcatdata_aids
estimated name is not in methods: AutoGluon
estimated name is not in methods: AutoPrognosis
estimated name is not in methods: TabPFN
estimated name is not in methods: HyperFast
estimated name is not in methods: LightGBM
estimated name is not in methods: XGBoost
estimated name is not in methods: CatBoost
Testing E2E_MLP_ResNet
Fitting 5 folds for each of 10 candidates, totalling 50 fits
100%|█████████████████████████████████████████| 30/30 [00:00<00:00, 1576.06it/s]
[CV 1/5] END activation=ReLU(), batch_size=32, bottleneck_dim=32, end_lr_factor=0.01, hidden_dim=32, in_dim=4, loss=bce, lr=1e-05, modelClass=<class 'models.end2end.End2EndMLPResNet'>, n_blocks=1, n_epochs=30, out_dim=1, seed=42, weight_decay=1e-05;, score=0.000 total time=   0.7s
100%|█████████████████████████████████████████| 30/30 [00:00<00:00, 1857.80it/s]
[CV 2/5] END activation=ReLU(), batch_size=32, bottlenec

In [22]:
!python PMLBmini.py \
    --models "Logistic(ours)" \
    --dataset_indices 0 \
    --save_dir /home/nikita/Code/random-feature-boosting/results/PMLBmini/ \
    --seed 42

Dataset found, loading
Evaluating Logistic(ours) against {}...
Comparing Logistic(ours) on 0: analcatdata_aids
estimated name is not in methods: AutoGluon
estimated name is not in methods: AutoPrognosis
estimated name is not in methods: TabPFN
estimated name is not in methods: HyperFast
estimated name is not in methods: LightGBM
estimated name is not in methods: XGBoost
estimated name is not in methods: CatBoost
Testing Logistic(ours)
Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV 1/5] END hidden_dim=32, in_dim=4, l2_cls=100, modelClass=<class 'models.random_feature_representation_boosting.GradientRFRBoostClassifier'>, n_classes=2, n_layers=0, seed=42, upscale_type=identity;, score=0.417 total time=   0.6s
[CV 2/5] END hidden_dim=32, in_dim=4, l2_cls=100, modelClass=<class 'models.random_feature_representation_boosting.GradientRFRBoostClassifier'>, n_classes=2, n_layers=0, seed=42, upscale_type=identity;, score=0.667 total time=   0.0s
[CV 3/5] END hidden_dim=32, in_di

In [15]:
# Fetch the published TabMini score table (semicolon-separated); the first column
# becomes the index and its header name is dropped.
saved_results = pd.read_csv(
    'https://raw.githubusercontent.com/RicardoKnauer/TabMini/master/plotting/results/test_scores_wide_3600.csv',
    sep=";",
    index_col=0,
).rename_axis(None)
saved_results

Unnamed: 0,AutoPrognosis,AutoGluon,TabPFN,Logistic regression,HyperFast
parity5,0.27,1.0,0.02,0.17,0.02
analcatdata_fraud,0.86,0.8,0.79,0.77,0.73
analcatdata_aids,0.73,0.77,0.63,0.61,0.53
analcatdata_bankruptcy,0.98,0.98,0.96,0.97,0.88
analcatdata_japansolvent,0.85,0.88,0.91,0.85,0.91
labor,0.88,0.94,0.99,0.97,0.98
analcatdata_asbestos,0.87,0.84,0.85,0.86,0.87
lupus,0.84,0.79,0.82,0.85,0.79
postoperative_patient_data,0.49,0.55,0.44,0.38,0.34
analcatdata_cyyoung9302,0.89,0.85,0.87,0.87,0.84


In [26]:
# Read the dataset-0 test scores saved for the two models and show them side by side
t1, t2 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "results/PMLBmini/E2E_MLP_ResNet/test_0.csv",
        "results/PMLBmini/Logistic(ours)/test_0.csv",
    )
)

t1.join(t2)

Unnamed: 0,E2E_MLP_ResNet,Logistic(ours)
analcatdata_aids,0.531829,0.585069


# old

In [29]:
# Start from the published baseline table and append one column block per method.
# NOTE(review): test_RFNN, test_E2E and test_GRFRBoost are not created in any
# visible cell, and Config is only defined further down — this cell cannot run
# on a fresh kernel until those are produced first.
combined_results = saved_results.copy()
# combined_results = pd.read_csv(Config.save_dir / "combined_results.csv", index_col=0)
for df in [test_guessing_and_xgboost, 
           test_RFNN,
           #test_RFNN_iid,
           test_E2E, 
           test_GRFRBoost,
           #test_GRFRBoostID,
           #test_GRFRBoost_iid,
           #test_GRFRBoostID_iid,
           #test_logistic,
           ]:
    # inner join: keep only the datasets present in every table
    combined_results = combined_results.join(df, how='inner')
combined_results = combined_results.round(2) # since PMLBmini's results are rounded to 2 decimals, for fair comparison

# Save first, then end the cell with the table: only the last expression of a cell is rendered
combined_results.to_csv(Config.save_dir / "combined_results.csv")
combined_results


In [None]:
# Reload the merged score table from the results folder and display it
combined_results = pd.read_csv(
    Config.save_dir.joinpath("combined_results.csv"),
    index_col=0,
)
# combined_results.drop(columns=['GRFRBoostID', 'GRFRBoost iid', 'GRFRBoostID iid', 'RFNN'], inplace=True)
combined_results

In [21]:
#combined_results[["GRFRBoost (ours)", "Logistic (mine)", "RFNN"]]

In [None]:
import matplotlib.pyplot as plt

# Critical-difference diagram over all methods (columns) and datasets (rows);
# lower_better=False because higher scores are better.
plot = plot_critical_difference(combined_results.values,
                                combined_results.columns.tolist(), 
                                alpha=0.05, 
                                lower_better=False)

# aeon documents the return value as (fig, ax[, p_values]); unpack the first two
# so that `fig` is the Figure and `ax` is the Axes.
fig, ax = plot[:2]

# Adjust figure size
fig.set_size_inches(6, 3)

# Adjust layout
fig.tight_layout()

# Save the figure in both vector (eps) and raster (png) form
fig.savefig(Config.save_dir / "PMLBmini_critical_difference.eps", bbox_inches='tight')
fig.savefig(Config.save_dir / "PMLBmini_critical_difference.png", bbox_inches='tight')

In [None]:
import pandas as pd
import numpy as np

def create_latex_table(
    df,
    caption="Test accuracies on the concentric circles task.",
    label="tab:concentric-circles",
):
    """Render per-column mean and standard deviation of ``df`` as a LaTeX table.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per model; each column holds that model's scores.
    caption : str, optional
        Text placed in the table caption. The default is the text that was
        previously hard-coded.
        NOTE(review): the default mentions the concentric circles task while
        this notebook is about PMLBmini — pass an explicit caption if needed.
    label : str, optional
        LaTeX label of the table. The default is the previously hard-coded one.

    Returns
    -------
    str
        The complete ``table`` environment. Each row lists the column name,
        ``np.mean`` and ``np.std`` of the column, formatted to 4 decimals.
    """
    import re  # local import so the function does not depend on the notebook's import cell

    header_lines = [
        "",
        "\\begin{table}[t]",
        f"\\caption{{{caption}}}",
        f"\\label{{{label}}}",
        "\\vskip 0.15in",
        "\\begin{center}",
        "\\begin{small}",
        "\\begin{sc}",
        "\\begin{tabular}{lcc}",
        "\\toprule",
        "Model & Mean Acc & Std Dev \\\\",
        "\\midrule",
    ]
    # No blank line before the bottom rule: an empty line inside a tabular is a
    # paragraph break and makes the following rule command fail to compile.
    footer_lines = [
        "\\bottomrule",
        "\\end{tabular}",
        "\\end{sc}",
        "\\end{small}",
        "\\end{center}",
        "\\vskip -0.1in",
        "\\end{table}",
    ]

    rows = []
    for model_name in df.columns:
        accs = df[model_name]
        mean_acc = np.mean(accs)
        std_acc = np.std(accs)
        # Escape bare underscores (e.g. "E2E_MLP_ResNet"); already-escaped ones are left untouched
        safe_name = re.sub(r"(?<!\\)_", lambda _match: "\\_", str(model_name))
        rows.append(f"{safe_name} & {mean_acc:.4f} & {std_acc:.4f} \\\\\n")

    return "\n".join(header_lines) + "\n" + "".join(rows) + "\n".join(footer_lines) + "\n"

# Build the LaTeX summary of the merged results (one row per column of
# `combined_results`) and print the raw text so it can be copied into a paper.
latex_table = create_latex_table(df=combined_results)
print(latex_table)

# Experiment on a single dataset

In [24]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import os
import pickle
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
from torch import Tensor, tensor
import pandas as pd
import tabmini
import aeon
from aeon.visualisation import plot_critical_difference, plot_significance
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from models.base import LogisticRegression
from models.gridsearch_wrapper import SKLearnWrapper
from models.random_feature_representation_boosting import GradientRFRBoostClassifier
from models.end2end import End2EndMLPResNet

class Config:
    """Notebook-wide settings."""

    # Results folder: <current working directory>/results/PMLBmini
    save_dir = Path.cwd().joinpath("results", "PMLBmini")

In [None]:
# experiment on a single dataset

# Load the dataset collection from the local pickle cache if present;
# otherwise download it once via tabmini and write the cache.
dataset_save_path = Config.save_dir / 'PMLBmini_dataset.pkl'
if os.path.exists(dataset_save_path):
    print("Dataset found, loading")
    # NOTE: unpickling runs arbitrary code — only load a cache file written by this notebook
    with dataset_save_path.open('rb') as f:
        dataset = pickle.load(f)
else:
    print("Dataset not found, downloading")
    dataset = tabmini.load_dataset(reduced=False)
    Config.save_dir.mkdir(parents=True, exist_ok=True)
    with dataset_save_path.open('wb') as f:
        pickle.dump(dataset, f)

In [26]:
from sklearn.model_selection import train_test_split

# Pick one dataset from the collection (swap the key to try another one)
X_raw, y_raw = dataset["analcatdata_asbestos"]
# X_raw, y_raw = dataset["parity5"]

# Split BEFORE scaling: fitting the scaler on all rows would leak test-set
# statistics into training. The same random_state and sample count give the
# same partition as splitting after scaling.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X_raw, y_raw, test_size=0.3, random_state=42
)

# Fit the min-max scaling on the training rows only, then apply it everywhere
minmax = MinMaxScaler()
X_train = torch.tensor(minmax.fit_transform(X_train_raw)).float()
X_test = torch.tensor(minmax.transform(X_test_raw)).float()

# Targets as float tensors with a trailing singleton dimension
y_train = torch.tensor(y_train_raw.values)[..., None].float()
y_test = torch.tensor(y_test_raw.values)[..., None].float()

# Full (unsplit) tensors, kept because later cells read X.shape and display y
X = torch.tensor(minmax.transform(X_raw)).float()
y = torch.tensor(y_raw.values)[..., None].float()


In [None]:
y

In [None]:
from sklearn.metrics import roc_auc_score

# Alternative model tried earlier, kept for reference:
# model = LogisticRegression(
#     n_classes=2,
#     l2_lambda=0.0001,
#     max_iter = 300,
# )

# Seed numpy and torch before the model is built and fitted
np.random.seed(42)
torch.manual_seed(42)

# Hyperparameters for this single-dataset run, gathered in one place
boost_kwargs = dict(
    in_dim=X.shape[1],
    n_classes=2,
    l2_cls=0.01,
    l2_ghat=0.001,
    n_layers=1,
    randfeat_xt_dim=128,
    randfeat_x0_dim=128,
    hidden_dim=128,
    upscale_type="identity",
    feature_type="SWIM",
    use_batchnorm=False,
    boost_lr=1.0,
    activation="tanh",
)
model = GradientRFRBoostClassifier(**boost_kwargs)
model.fit(X_train, y_train)

# --- held-out split: raw outputs, thresholded predictions and AUC ---
logits = model(X_test)
probs = nn.functional.sigmoid(logits)
print("logits", logits)
print("out and y", torch.cat((logits, y_test), dim=1))
print("binary class pred and y", torch.cat((probs > 0.5, y_test), dim=1))
y_true = y_test.numpy()
y_score = probs.detach().numpy()
auc = roc_auc_score(y_true, y_score)
print("test AUC:", auc)

# --- training split: AUC only ---
logits = model(X_train)
probs = nn.functional.sigmoid(logits)
y_true = y_train.numpy()
y_score = probs.detach().numpy()
auc = roc_auc_score(y_true, y_score)
print("train AUC:", auc)