# PMLBmini experiments

This notebook runs the PMLBmini experiments and compares random feature boosting and end-to-end trained models against the saved PMLBmini baseline results.

NOTE: we assume that TabMini (https://github.com/RicardoKnauer/TabMini) is installed in the current working directory.

Running this notebook, i.e. running all models and datasets sequentially on a single CPU core, should take no more than 30 minutes.

In [21]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import os
import pickle
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
from torch import Tensor, tensor
import pandas as pd
import tabmini
import aeon
from aeon.visualisation import plot_critical_difference, plot_significance
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from models.gridsearch_wrapper import SKLearnWrapper
from models.random_feature_representation_boosting import GradientRFRBoostClassifier
from models.end2end import End2EndMLPResNet
from PMLBmini import test_on_PMLBmini

In [8]:
##############################################
#####      Equal/Random Guessing        ######
##############################################


class EqualGuessing(BaseEstimator, ClassifierMixin):
    """Baseline classifier that assigns probability 0.5 to each of two classes."""

    def fit(self, X, y):
        """Record the distinct labels found in ``y``; ``X`` is not used."""
        self.classes_ = np.unique(y)
        return self

    def predict_proba(self, X):
        """Return an ``(X.shape[0], 2)`` array whose entries are all 0.5."""
        n_rows = X.shape[0]
        return 0.5 * np.ones((n_rows, 2))

    def decision_function(self, X):
        """Return, per row, the log of the second-column over first-column probability ratio."""
        proba = self.predict_proba(X)
        eps = 1e-10  # added to both terms so the ratio and its log stay finite
        numerator = proba[:, 1] + eps
        denominator = proba[:, 0] + eps
        return np.log(numerator / denominator)
    


In [22]:
# save_dir = "/home/nikita/Code/random-feature-boosting/results/PMLBmini_rocauc/"
# Directory holding the per-model result CSVs and the exported figures.
# Later cells build paths by plain string formatting, e.g. f"{save_dir}{model_name}/test_{i}.csv",
# so the value must end with a trailing slash.
# NOTE(review): this is an absolute, machine-specific path; adjust it before running elsewhere.
save_dir = "/home/nikita/Code/random-feature-boosting/results/PMLBmini_scoring_experiments/"
# train_guessing_and_xgboost, test_guessing_and_xgboost = test_on_PMLBmini(
#     EqualGuessing(),
#     'EqualGuessing',
#     [i for i in range(44)],
#     save_dir, 
#     other_saved_methods={"XGBoost"},
#     )

In [None]:
# !python PMLBmini.py \
#     --models E2E_MLP_ResNet \
#     --dataset_indices 0 \
#     --save_dir /home/nikita/Code/random-feature-boosting/results/PMLBmini/ \
#     --seed 42

In [None]:
# !python PMLBmini.py \
#     --models "Logistic(ours)" \
#     --save_dir /home/nikita/Code/random-feature-boosting/results/PMLBmini/ \
#     --seed 42

In [None]:
# # TODO wrong results for both built in rocauc and neglogloss
# for i in range(44):
#     !python PMLBmini.py \
#         --models "Logistic(ours)" \
#         --dataset_indices {i} \
#         --save_dir /home/nikita/Code/random-feature-boosting/results/PMLBmini/ \
#         --seed 42

In [None]:
# !python PMLBmini.py \
#     --models GRFRBoost_featSWIM_upiid_linesearchTrue_freezeFalse \
#     --dataset_indices 0 \
#     --save_dir /home/nikita/Code/random-feature-boosting/results/PMLBmini/ \
#     --seed 42

In [23]:
# Published TabMini baseline test scores: the first CSV column becomes the (unnamed) row index
tabmini_results_url = 'https://raw.githubusercontent.com/RicardoKnauer/TabMini/master/plotting/results/test_scores_wide_3600.csv'
saved_results = pd.read_csv(tabmini_results_url, sep=";", index_col=0)
saved_results.index.name = None
saved_results

Unnamed: 0,AutoPrognosis,AutoGluon,TabPFN,Logistic regression,HyperFast
parity5,0.27,1.0,0.02,0.17,0.02
analcatdata_fraud,0.86,0.8,0.79,0.77,0.73
analcatdata_aids,0.73,0.77,0.63,0.61,0.53
analcatdata_bankruptcy,0.98,0.98,0.96,0.97,0.88
analcatdata_japansolvent,0.85,0.88,0.91,0.85,0.91
labor,0.88,0.94,0.99,0.97,0.98
analcatdata_asbestos,0.87,0.84,0.85,0.86,0.87
lupus,0.84,0.79,0.82,0.85,0.79
postoperative_patient_data,0.49,0.55,0.44,0.38,0.34
analcatdata_cyyoung9302,0.89,0.85,0.87,0.87,0.84


In [10]:
# join all the models
import itertools
# Enumerate every model name: for each scoring suffix, the two baselines
# followed by the full grid of GRFRBoost variants.
model_names = []
for scoring in ["auc", "acc", "crossentropy"]:
    model_names += [f"E2E_MLP_ResNet_{scoring}", f"RFNN_{scoring}"]
    boost_grid = itertools.product(
        ["SWIM"],                     # feat
        ["identity", "SWIM", "iid"],  # up
        [True, False],                # linesearch
        [False],                      # freeze
        ["tanh", "relu"],             # activation
    )
    # product() iterates with the rightmost factor varying fastest, i.e. in nested-loop order
    for feat, up, linesearch, freeze, activation in boost_grid:
        name = f"GRFRBoost_feat{feat}_up{up}_linesearch{linesearch}_freeze{freeze}_{activation}_{scoring}"
        model_names.append(name)

In [40]:
# join all the models
import itertools
# Active selection: the two crossentropy baselines plus one GRFRBoost variant.
# Other scoring suffixes used in this notebook: "auc", "acc".
scoring = "crossentropy"
model_names = [f"E2E_MLP_ResNet_{scoring}", f"RFNN_{scoring}"]

# GRFRBoost variants that are currently disabled:
#   GRFRBoost_featSWIM_upidentity_linesearchTrue_freezeFalse_relu_crossentropy
#   GRFRBoost_featSWIM_upidentity_linesearchFalse_freezeFalse_relu_crossentropy
#   GRFRBoost_featSWIM_upSWIM_linesearchFalse_freezeFalse_tanh_auc
model_names += ["GRFRBoost_featSWIM_upidentity_linesearchFalse_freezeFalse_tanh_crossentropy"]

In [46]:
# # join all the models
# import itertools
# model_names = ["E2E_MLP_ResNet", "RFNN", #"RFNN_relu",
#                 "GRFRBoost_featSWIM_upidentity_linesearchTrue_freezeFalse_relu",  
#                 #"GRFRBoost_featSWIM_upiid_linesearchFalse_freezeFalse",    
#                 #"GRFRBoost_featSWIM_upidentity_linesearchFalse_freezeFalse_relu", 
#                 #"GRFRBoost_featSWIM_upSWIM_linesearchFalse_freezeFalse",           
#                 #"GRFRBoost_featSWIM_upiid_linesearchFalse_freezeFalse_relu",  
#                 #"GRFRBoost_featSWIM_upSWIM_linesearchFalse_freezeTrue",
#                ]
# # for feat, up, linesearch, freeze, relu in itertools.product(["SWIM"], ["SWIM"], [False], [False], ["_relu"]):
# #     name = f"GRFRBoost_feat{feat}_up{up}_linesearch{linesearch}_freeze{freeze}{relu}"
# #     model_names.append(name)

In [None]:
# ["SWIM"] # feat
# ["SWIM", "iid", "identity"] # up
# [False, True] # linesearch
# [False] # freeze
# ["", "relu"] # relu
# ["auc", "crossentropy", "acc"] # scoring
# #["ridge", "ridgecv"] # ghat module        
# 1*3*2*1*2*3

In [None]:
def _load_model_scores(model_name):
    """Read the 44 per-dataset test CSVs of one model and stack them row-wise."""
    per_dataset_frames = []
    for i in range(44):
        csv_path = f"{save_dir}{model_name}/test_{i}.csv"
        per_dataset_frames.append(pd.read_csv(csv_path, index_col=0))
    return pd.concat(per_dataset_frames, axis=0)


# Start from the published baselines and join each of our models on the shared row index
combined_results = saved_results.copy()
for model_name in model_names:
    combined_results = combined_results.join(_load_model_scores(model_name))
combined_results = combined_results.round(2)
combined_results

In [None]:
# Column-wise mean of each method's scores over all rows, sorted in descending order
combined_results.mean(axis=0).sort_values(ascending=False)

In [None]:
# Rank the methods within each row (rank 1 = highest score), then average the ranks
# per method; the method with the lowest mean rank is listed first
combined_results.rank(axis=1, ascending=False).mean(axis=0).sort_values(ascending=True)

In [None]:
# Methods (columns) that have at least one missing score
nan_columns = combined_results.columns[combined_results.isna().any()].tolist()
print(nan_columns)
# Show the incomplete columns before removing them. A bare expression is only rendered
# when it is the last line of a cell, so the frame is printed explicitly here.
print(combined_results[nan_columns])
# Keep only the methods with a score on every dataset; later cells use this filtered frame
combined_results = combined_results.drop(columns=nan_columns)

In [None]:
# Rows where RFNN_auc scores strictly higher than the GRFRBoost SWIM/SWIM relu auc variant.
# NOTE(review): needs both *_auc columns in combined_results; the most recent model_names
# cell only lists crossentropy models, so confirm which model_names cell was run.
combined_results.query("RFNN_auc > GRFRBoost_featSWIM_upSWIM_linesearchFalse_freezeFalse_relu_auc")

In [None]:
import matplotlib.pyplot as plt

# Critical-difference diagram over all columns of combined_results (higher score = better)
cd_scores = combined_results.values
cd_labels = combined_results.columns.tolist()
plot = plot_critical_difference(cd_scores, cd_labels, alpha=0.05, lower_better=False)

# Take the first element of the returned value; its `.figure` attribute is the figure to resize and save
ax = plot[0]
fig = ax.figure

# Wide, short canvas, then re-flow the contents
fig.set_size_inches(20, 3)
fig.tight_layout()

# Export the figure as both EPS and PNG into save_dir
for extension in ("eps", "png"):
    fig.savefig(f"{save_dir}PMLBmini_critical_difference.{extension}", bbox_inches='tight')

In [None]:
import pandas as pd
import numpy as np

# Characters with a special meaning in LaTeX text mode, mapped to their escaped forms.
_LATEX_ESCAPES = str.maketrans({
    "\\": "\\textbackslash{}",
    "&": "\\&",
    "%": "\\%",
    "$": "\\$",
    "#": "\\#",
    "_": "\\_",
    "{": "\\{",
    "}": "\\}",
    "~": "\\textasciitilde{}",
    "^": "\\textasciicircum{}",
})


def _escape_latex(text):
    """Escape LaTeX special characters so ``text`` is typeset literally."""
    # translate() works in a single pass, so inserted backslashes are never re-escaped
    return str(text).translate(_LATEX_ESCAPES)


def create_latex_table(df,
                       caption="Test accuracies on the concentric circles task.",
                       label="tab:concentric-circles"):
    """Render one table row per column of ``df`` with its mean and standard deviation.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per model; the column values are that model's scores.
    caption : str, optional
        LaTeX caption, inserted verbatim. The default is the original
        hard-coded text, kept for backward compatibility.
    label : str, optional
        LaTeX label, inserted verbatim. The default is the original
        hard-coded label, kept for backward compatibility.

    Returns
    -------
    str
        A complete ``table`` environment (booktabs rules, three columns:
        model, mean, standard deviation) with values formatted to 4 decimals.

    Notes
    -----
    Model names are escaped, because names such as ``RFNN_auc`` contain
    underscores that LaTeX rejects in text mode. No blank line is emitted
    before ``\\bottomrule``.
    """
    lines = [
        "",  # the table starts with a newline
        "\\begin{table}[t]",
        f"\\caption{{{caption}}}",
        f"\\label{{{label}}}",
        "\\vskip 0.15in",
        "\\begin{center}",
        "\\begin{small}",
        "\\begin{sc}",
        "\\begin{tabular}{lcc}",
        "\\toprule",
        "Model & Mean Acc & Std Dev \\\\",
        "\\midrule",
    ]
    for model_name in df.columns:
        accs = df[model_name]
        mean_acc = np.mean(accs)
        std_acc = np.std(accs)
        lines.append(f"{_escape_latex(model_name)} & {mean_acc:.4f} & {std_acc:.4f} \\\\")
    lines += [
        "\\bottomrule",
        "\\end{tabular}",
        "\\end{sc}",
        "\\end{small}",
        "\\end{center}",
        "\\vskip -0.1in",
        "\\end{table}",
        "",  # the table ends with a newline
    ]
    return "\n".join(lines)

# Build the LaTeX summary table from combined_results and print it for copy-pasting.
# NOTE(review): the generated caption and label refer to the "concentric circles" task;
# adjust them before using this table for the PMLBmini results.
latex_table = create_latex_table(combined_results)
print(latex_table)

# Experiment on a single dataset

In [8]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import os
import pickle
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
from torch import Tensor, tensor
import pandas as pd
import tabmini
import aeon
from aeon.visualisation import plot_critical_difference, plot_significance
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score

from models.base import LogisticRegression
from models.gridsearch_wrapper import SKLearnWrapper
from models.random_feature_representation_boosting import GradientRFRBoostClassifier
from models.end2end import End2EndMLPResNet

class Config:
    """Settings for the single-dataset experiment cells."""

    # Directory used for the cached dataset: <current working directory>/results/PMLBmini
    save_dir = Path.cwd().joinpath("results", "PMLBmini")



# Load the dataset from the local pickle cache, downloading and caching it on first use
dataset_save_path = Config.save_dir / 'PMLBmini_dataset.pkl'
if os.path.exists(dataset_save_path):
    print("Dataset found, loading")
    # NOTE: unpickling can execute arbitrary code; only load a cache file written by this notebook
    with open(dataset_save_path, 'rb') as cache_file:
        dataset = pickle.load(cache_file)
else:
    print("Dataset not found, downloading")
    dataset = tabmini.load_dataset(reduced=False)
    os.makedirs(Config.save_dir, exist_ok=True)
    with open(dataset_save_path, 'wb') as cache_file:
        pickle.dump(dataset, cache_file)

Dataset found, loading


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Dataset under test: analcatdata_fraud (parity5 is the commented-out alternative)
X_raw, y_raw = dataset["analcatdata_fraud"]
# X_raw, y_raw = dataset["parity5"]

# Split BEFORE scaling: fitting the scaler on all rows would let statistics of the
# held-out rows leak into the training features. The same test_size/random_state as
# before are used, so the rows assigned to each split are unchanged.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X_raw, y_raw, test_size=0.3, random_state=42
)

scaler = MinMaxScaler()
#scaler = StandardScaler()
# Fit on the training rows only, then apply the same transform to the test rows
X_train = torch.tensor(scaler.fit_transform(X_train_raw)).float()
X_test = torch.tensor(scaler.transform(X_test_raw)).float()
# Targets become float column vectors of shape (n, 1)
y_train = torch.tensor(y_train_raw.values)[..., None].float()
y_test = torch.tensor(y_test_raw.values)[..., None].float()

# Full-dataset tensors, kept because later cells read X.shape; scaled with the train-fitted scaler
X = torch.tensor(scaler.transform(X_raw)).float()
y = torch.tensor(y_raw.values)[..., None].float()


In [None]:
# Shape of the full feature tensor
X.shape

In [2]:
# Alternative baseline (disabled):
# model = LogisticRegression(
#     n_classes=2,
#     l2_lambda=0.0001,
#     max_iter = 300,
# )

# Fix both RNGs before constructing and fitting the model
np.random.seed(42)
torch.manual_seed(42)

model = GradientRFRBoostClassifier(
    in_dim=X.shape[1],
    n_classes=2,
    l2_cls=0.01,
    l2_ghat=0.0001,
    n_layers=1,
    randfeat_xt_dim=512,
    randfeat_x0_dim=512,
    hidden_dim=512,
    upscale_type="SWIM",
    feature_type="SWIM",
    use_batchnorm=False,
    boost_lr=1,
    activation="tanh",
    do_linesearch=False,
    freeze_top_at_t=2,
    ghat_ridge_solver="ridgecv",
)
model.fit(X_train, y_train)

from sklearn.metrics import roc_auc_score


def report_binary_metrics(fitted_model, split_name, features, targets, show_logits=False):
    """Print AUC, accuracy and cross-entropy of ``fitted_model`` on one data split.

    Parameters
    ----------
    fitted_model : callable
        Maps ``features`` to logits shaped like ``targets``.
    split_name : str
        Prefix of the printed lines, e.g. ``"test"`` or ``"train"``.
    features, targets : torch.Tensor
        Inputs and float 0/1 targets of the split.
    show_logits : bool, optional
        When True, also print the raw logits.
    """
    logits = fitted_model(features)
    if show_logits:
        print("logits", logits)
    probs = torch.sigmoid(logits)
    auc = roc_auc_score(targets.numpy(), probs.detach().numpy())
    print(f"{split_name} AUC:", auc)
    print(f"{split_name} accuracy:", (probs > 0.5).eq(targets).float().mean().item())
    print(f"{split_name} cross-entropy:", nn.functional.binary_cross_entropy_with_logits(logits, targets).item())


# Switch to evaluation mode once, before BOTH evaluations, so that test and train
# metrics are computed under the same model mode.
model.eval()
report_binary_metrics(model, "test", X_test, y_test, show_logits=True)
report_binary_metrics(model, "train", X_train, y_train)

NameError: name 'X' is not defined

In [None]:
# TODO cross entropy loss
# TODO ridgeCV


# bugfix

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Dataset under test: analcatdata_fraud (parity5 is the commented-out alternative)
X, y = dataset["analcatdata_fraud"]
# X, y = dataset["parity5"]
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
# No scaling is applied in this cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [12]:
# Shape of the feature table loaded in the previous cell
X.shape

(42, 11)

In [20]:
from PMLBmini import WrapperGridSearch

# Hyper-parameter grid: only l2_cls takes several values, every other entry is a single choice
grfrboost_param_grid = dict(
    modelClass=[GradientRFRBoostClassifier],
    l2_cls=[10, 1, 0.1, 0.01, 0.001, 0.0001],
    l2_ghat=[0.000001],
    n_layers=[1],
    randfeat_xt_dim=[512],
    randfeat_x0_dim=[512],
    hidden_dim=[512],
    upscale_type=["SWIM"],
    feature_type=["SWIM"],
    use_batchnorm=[False],
    boost_lr=[1],
    activation=["relu"],
    do_linesearch=[True],
    freeze_top_at_t=[2],
    ghat_ridge_solver=["solve"],
)

model = WrapperGridSearch(
    param_grid=grfrboost_param_grid,
    verbose=3,
    scaler=MinMaxScaler(),
    seed=42,
    scoring="neg_log_loss",
)
model.fit(X_train, y_train)

# Class probabilities for both splits; column 1 is passed to roc_auc_score as the score
pred_test = model.predict_proba(X_test)
pred_train = model.predict_proba(X_train)
for split_name, y_true, y_prob in (("test", y_test, pred_test), ("train", y_train, pred_train)):
    print(f"{split_name} AUC:", roc_auc_score(y_true, y_prob[:, 1]))

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END activation=relu, boost_lr=1, do_linesearch=True, feature_type=SWIM, freeze_top_at_t=2, ghat_ridge_solver=solve, hidden_dim=512, in_dim=11, l2_cls=10, l2_ghat=1e-06, modelClass=<class 'models.random_feature_representation_boosting.GradientRFRBoostClassifier'>, n_classes=2, n_layers=1, randfeat_x0_dim=512, randfeat_xt_dim=512, seed=42, upscale_type=SWIM, use_batchnorm=False;, score=-1.489 total time=   0.1s
[CV 2/3] END activation=relu, boost_lr=1, do_linesearch=True, feature_type=SWIM, freeze_top_at_t=2, ghat_ridge_solver=solve, hidden_dim=512, in_dim=11, l2_cls=10, l2_ghat=1e-06, modelClass=<class 'models.random_feature_representation_boosting.GradientRFRBoostClassifier'>, n_classes=2, n_layers=1, randfeat_x0_dim=512, randfeat_xt_dim=512, seed=42, upscale_type=SWIM, use_batchnorm=False;, score=-0.648 total time=   0.1s
[CV 3/3] END activation=relu, boost_lr=1, do_linesearch=True, feature_type=SWIM, freeze_top_at_t