This file is for evaluating the performance of methods on large tabular datasets.

In [1]:
!echo $HOSTNAME




In [2]:
import matplotlib.pyplot as plt

from mothernet.evaluation.baselines import tabular_baselines

import seaborn as sns
import numpy as np
import warnings
warnings.simplefilter("ignore", FutureWarning)  # openml deprecation of array return type
from mothernet.datasets import load_openml_list, open_cc_valid_dids, open_cc_dids, open_cc_large_dids
from mothernet.evaluation import tabular_metrics
from mothernet.prediction.tabpfn import TabPFNClassifier
import os
from mothernet.evaluation.baselines.distill_mlp import DistilledTabPFNMLP
from mothernet.prediction.mothernet import MotherNetClassifier
from functools import partial
from mothernet.evaluation.tabular_evaluation import eval_on_datasets
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingClassifier
from mothernet.prediction.mothernet import ShiftClassifier, EnsembleMeta, MotherNetClassifier
from sklearn.impute import SimpleImputer
from mothernet.prediction.mothernet_additive import MotherNetAdditiveClassifier

from interpret.glassbox import ExplainableBoostingClassifier


from hyperfast import HyperFastClassifier

# transformers don't have max times
import warnings
import pandas as pd

import datetime

import pickle

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Datasets

In [3]:
cc_test_datasets_multiclass, cc_test_datasets_multiclass_df = load_openml_list(
    open_cc_large_dids, 
    # [1491, 41986, 42343],
    multiclass=True,
    shuffled=True, 
    filter_for_nan=False, 
    max_samples = 1000000, 
    num_feats=5000, 
    return_capped=True,
    max_num_classes=100,
)

Number of datasets: 19


In [4]:
eval_positions = [5000000]
max_features = 5000
n_samples = 1000000
base_path = os.path.join('../')
overwrite = False
# max_times only affect non-nn models, nn models are not affected by max_times
# for non-nn models, when the runtime is longer than the max_time, it should stop
max_times = [1, 15, 30, 60, 60 * 5, 60 * 15, 60*60] 
metric_used = tabular_metrics.auc_metric
task_type = 'multiclass'

In [4]:
!mkdir -p {base_path}/results
!mkdir -p {base_path}/results/tabular/
!mkdir -p {base_path}/results/tabular/multiclass/

KeyboardInterrupt: 

In [None]:
cc_test_datasets_multiclass_df['isNumeric'] = (
    cc_test_datasets_multiclass_df.NumberOfSymbolicFeatures == 1
    ) & (cc_test_datasets_multiclass_df.NumberOfInstancesWithMissingValues == 0)

In [None]:
cc_test_datasets_multiclass_df['NumberOfInstances'] =  cc_test_datasets_multiclass_df['NumberOfInstances'].astype(int)
cc_test_datasets_multiclass_df['NumberOfFeatures'] =  cc_test_datasets_multiclass_df['NumberOfFeatures'].astype(int)
cc_test_datasets_multiclass_df['NumberOfClasses'] =  cc_test_datasets_multiclass_df['NumberOfClasses'].astype(int)

print(cc_test_datasets_multiclass_df[['did', 'name', 'NumberOfFeatures', 'NumberOfInstances', 'NumberOfClasses']].rename(columns={'NumberOfFeatures': "d", "NumberOfInstances":"n", "NumberOfClasses": "k"}).to_latex(index=False))

\begin{tabular}{rlrrr}
\toprule
did & name & d & n & k \\
\midrule
137 & BNG(tic-tac-toe) & 10 & 39366 & 2 \\
843 & house_8L & 9 & 22784 & 2 \\
846 & elevators & 19 & 16599 & 2 \\
981 & kdd_internet_usage & 69 & 10108 & 2 \\
1220 & Click_prediction_small & 10 & 39948 & 2 \\
1459 & artificial-characters & 8 & 10218 & 10 \\
1461 & bank-marketing & 17 & 45211 & 2 \\
1531 & volcanoes-b1 & 4 & 10176 & 5 \\
1532 & volcanoes-b2 & 4 & 10668 & 5 \\
1590 & adult & 15 & 48842 & 2 \\
4135 & Amazon_employee_access & 10 & 32769 & 2 \\
4534 & PhishingWebsites & 31 & 11055 & 2 \\
23512 & higgs & 29 & 98050 & 2 \\
40668 & connect-4 & 43 & 67557 & 3 \\
41027 & jungle_chess_2pcs_raw_endgame_complete & 7 & 44819 & 3 \\
41162 & kick & 33 & 72983 & 2 \\
41168 & jannis & 55 & 83733 & 4 \\
42733 & Click_prediction_small & 12 & 39948 & 2 \\
42734 & okcupid-stem & 20 & 50789 & 3 \\
\bottomrule
\end{tabular}



In [None]:
preview = cc_test_datasets_multiclass_df[['did', 'name', 'NumberOfFeatures', 'NumberOfInstances', 'NumberOfClasses']].reset_index(drop=True)
preview

Unnamed: 0,did,name,NumberOfFeatures,NumberOfInstances,NumberOfClasses
0,137,BNG(tic-tac-toe),10,39366,2
1,843,house_8L,9,22784,2
2,846,elevators,19,16599,2
3,981,kdd_internet_usage,69,10108,2
4,1220,Click_prediction_small,10,39948,2
5,1459,artificial-characters,8,10218,10
6,1461,bank-marketing,17,45211,2
7,1531,volcanoes-b1,4,10176,5
8,1532,volcanoes-b2,4,10668,5
9,1590,adult,15,48842,2


# Method Evaluation
This section runs baselines and saves results locally.

## ResNet

In [None]:
max_times = [60 * 60]
# these will all be evaluated on CPU because they are given as callables, which is a weird way to do it.
clf_dict= {
    'resnet_gpu': resnet_metric
}

results_resnet = [
    eval_on_datasets(
        'multiclass', 
        model, 
        model_name, 
        cc_test_datasets_multiclass, 
        eval_positions=eval_positions, 
        max_times=max_times,
        metric_used=metric_used, 
        split_numbers=[1, 2, 3, 4, 5], # seed indicate how to split the dataset
        n_samples=n_samples, 
        base_path=base_path, 
        n_jobs=1, 
        device="cuda", 
        verbose=0
    )
    for model_name, model in clf_dict.items()
]

evaluating resnet_gpu on cuda


  0%|          | 0/95 [00:00<?, ?it/s]

In [None]:
max_times = [60*60]
split_numbers = [1, 2, 3, 4, 5]

max_times = [10000000]
split_numbers = [1]
fetch_only = False # get the existing result or not 

clf_dict= {
    'knn': knn_metric,
    'rf_new_params': random_forest_metric,
    'xgb': xgb_metric,
    'logistic': logistic_metric,
    'mlp': mlp_metric
}

test_baselines = []

results_baselines = [
    eval_on_datasets(
    'multiclass', 
    model, 
    model_name, 
    cc_test_datasets_multiclass, 
    eval_positions=eval_positions, 
    max_times=max_times,
    metric_used=metric_used, 
    split_numbers=split_numbers,
    n_samples=n_samples, 
    base_path=base_path, 
    fetch_only=fetch_only,
    ) for model_name, model in clf_dict.items()
]

evaluating knn on cpu


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 96 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  19 | elapsed: 18.8min remaining: 25.8min


# TabPFN

In [11]:
max_times = [10000000]
device = "cuda"

results_tabpfn = []

tabpfn_models = {
    # 'ssm_tabpfn_b4_largedatasetTrue_modellinear_attention_nsamples50000_08_01_2024_22_05_50': ('TabFast', '110'),
    # 'prior_diff_real_checkpoint_n_0': ('TabPFN', '100'),
    'ssm_tabpfn_b4_maxnumclasses100_modellinear_attention_numfeatures1000_n1024_validdatanew_warm_08_23_2024_19_25_40': ('TabFlex', '1410'),
    # 'ssm_tabpfn_modellinear_attention_08_28_2024_19_00_44': ('TabSmall', '1210'),
}

for model_string in tabpfn_models:
    model = TabPFNClassifier(
        device=device, 
        model_string=model_string, 
        epoch=tabpfn_models[model_string][1], 
        N_ensemble_configurations=3,
    )

    result = eval_on_datasets(
        'multiclass', 
        model, 
        model_string, 
        cc_test_datasets_multiclass, 
        eval_positions=eval_positions, 
        max_times=max_times,
        metric_used=metric_used, 
        split_numbers=[1, 2, 3, 4, 5],
        n_samples=n_samples, 
        base_path=base_path, 
        overwrite=False, 
        fetch_only=False,
        n_jobs=1, 
        device=device,
    )
    
    results_tabpfn.append(result)

# store the results as pickle
with open(f'{base_path}/results/tabular/multiclass/tabpfn_summary.pkl', 'wb') as f:
    pickle.dump(results_tabpfn, f)
    

evaluating ssm_tabpfn_b4_maxnumclasses100_modellinear_attention_numfeatures1000_n1024_validdatanew_warm_08_23_2024_19_25_40 on cuda


evaluating ssm_tabpfn_b4_maxnumclasses100_modellinear_attention_numfeatures1000_n1024_validdatanew_warm_08_23_2024_19_25_40 on cuda BNG(tic-tac-toe):   0%|          | 0/95 [00:00<?, ?it/s]

Number of parameters in backbone:  25234432


evaluating ssm_tabpfn_b4_maxnumclasses100_modellinear_attention_numfeatures1000_n1024_validdatanew_warm_08_23_2024_19_25_40 on cuda higgs:  63%|██████▎   | 60/95 [01:39<00:57,  1.65s/it]                 


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.12 GiB (GPU 0; 39.56 GiB total capacity; 6.09 GiB already allocated; 1.03 GiB free; 7.20 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# Results Preview

In [6]:
flat_results = []
for per_dataset in results_tabpfn:
    for result in per_dataset:
        row = {}
        for key in ['dataset', 'model', 'mean_metric', 'split', 'max_time']:
            row[key] = result[key]
        try:
            best_configs_key, = [k for k in result.keys() if "best_configs" in k]
        except:
            print(per_dataset)
            continue
        if result[best_configs_key][0] is not None:
            row.update(result[best_configs_key][0])
        row['mean_metric'] = float(row["mean_metric"].numpy())
        flat_results.append(row)

results_df = pd.DataFrame(flat_results)

[{'metric_used': 'roc_auc_ovo', 'n_samples': 1000000, 'eval_positions': [5000000], 'mean_metric': tensor(nan), 'model': 'ssm_tabpfn_b4_maxnumclasses100_modellinear_attention_numfeatures1000_n1024_validdatanew_warm_08_23_2024_19_25_40', 'dataset': 'BNG(tic-tac-toe)', 'split': 1, 'max_time': 10000000}, {'metric_used': 'roc_auc_ovo', 'n_samples': 1000000, 'eval_positions': [5000000], 'mean_metric': tensor(nan), 'model': 'ssm_tabpfn_b4_maxnumclasses100_modellinear_attention_numfeatures1000_n1024_validdatanew_warm_08_23_2024_19_25_40', 'dataset': 'BNG(tic-tac-toe)', 'split': 2, 'max_time': 10000000}, {'metric_used': 'roc_auc_ovo', 'n_samples': 1000000, 'eval_positions': [5000000], 'mean_metric': tensor(nan), 'model': 'ssm_tabpfn_b4_maxnumclasses100_modellinear_attention_numfeatures1000_n1024_validdatanew_warm_08_23_2024_19_25_40', 'dataset': 'BNG(tic-tac-toe)', 'split': 3, 'max_time': 10000000}, {'metric_used': 'roc_auc_ovo', 'n_samples': 1000000, 'eval_positions': [5000000], 'mean_metric':

In [7]:
results_view = results_df[['model', 'dataset', 'fit_time', 'inference_time', 'mean_metric']].copy()
# compute the mean of fit time and inference time across different splits when the model and datasets are the same
# and the rows should be reduced
results_view = results_view.groupby(['model', 'dataset']).mean().reset_index()
results_view



Unnamed: 0,model,dataset,fit_time,inference_time,mean_metric
0,prior_diff_real_checkpoint_n_0,Amazon_employee_access,0.001786,2.587608,0.627332
1,prior_diff_real_checkpoint_n_0,BNG(tic-tac-toe),0.05132,3.574261,0.83555
2,prior_diff_real_checkpoint_n_0,Click_prediction_small,0.002196,3.660487,0.652219
3,prior_diff_real_checkpoint_n_0,PhishingWebsites,0.000731,0.69302,0.992808
4,prior_diff_real_checkpoint_n_0,adult,0.003019,5.569379,0.900397
5,prior_diff_real_checkpoint_n_0,artificial-characters,0.001109,0.39722,0.959423
6,prior_diff_real_checkpoint_n_0,bank-marketing,0.002371,4.913199,0.897934
7,prior_diff_real_checkpoint_n_0,connect-4,0.004754,11.628309,0.632694
8,prior_diff_real_checkpoint_n_0,elevators,0.001308,1.017949,0.946148
9,prior_diff_real_checkpoint_n_0,higgs,0.005151,23.481521,0.733612


In [8]:
cols = ['fit_time', 'inference_time', 'mean_metric']

# Pivot the table
pivot_table = results_view.pivot(index='dataset', columns='model', values=cols)

# Flatten column names
pivot_table.columns = [f'{col[1]}_{col[0]}' for col in pivot_table.columns]

# Reset index to make 'dataset' a column
pivot_table = pivot_table.reset_index()

columns_mapping = {f'{model_string}_{col}': f'{tabpfn_models[model_string][0]}_{col}' for model_string in tabpfn_models for col in cols}

# Rename columns for clarity
pivot_table = pivot_table.rename(columns=columns_mapping)
pivot_table

Unnamed: 0,dataset,TabPFN_fit_time,TabFast_fit_time,TabSmall_fit_time,TabPFN_inference_time,TabFast_inference_time,TabSmall_inference_time,TabPFN_mean_metric,TabFast_mean_metric,TabSmall_mean_metric
0,Amazon_employee_access,0.001786,0.001644,0.001324,2.587608,0.820782,0.785074,0.627332,0.627648,0.546619
1,BNG(tic-tac-toe),0.05132,0.149949,0.078171,3.574261,0.961387,0.936264,0.83555,0.834778,0.808015
2,Click_prediction_small,0.002196,0.001807,0.001906,3.660487,0.88564,0.892747,0.652219,0.659269,0.648059
3,PhishingWebsites,0.000731,0.000845,0.000786,0.69302,0.552802,0.518347,0.992808,0.990364,0.986796
4,adult,0.003019,0.002626,0.00231,5.569379,1.304464,1.289713,0.900397,0.905406,0.899496
5,artificial-characters,0.001109,0.001224,0.000948,0.39722,0.268928,0.254098,0.959423,0.950158,0.935887
6,bank-marketing,0.002371,0.001902,0.002053,4.913199,1.296291,1.281062,0.897934,0.901807,0.889341
7,connect-4,0.004754,0.003742,0.003847,11.628309,2.997064,2.990771,0.632694,0.687039,0.656626
8,elevators,0.001308,0.000966,0.000947,1.017949,0.562852,0.563888,0.946148,0.948136,0.947898
9,higgs,0.005151,0.005305,0.004821,23.481521,4.18843,4.183557,0.733612,0.753547,0.743553


In [9]:
# Calculate averages
metrics = ['fit_time', 'inference_time', 'mean_metric']
methods = ['TabFast', 'TabPFN', 'TabSmall', 'TabFlex']

averages = {metric: {method: pivot_table[f'{method}_{metric}'].mean() for method in methods} for metric in metrics}

# Prepare for plotting
fig, axes = plt.subplots(1, 3, figsize=(20, 8))
width = 0.2
x = np.arange(len(methods))

# Color map for consistency across subplots
colors = plt.cm.get_cmap('Set3')(np.linspace(0, 1, len(methods)))

# Plot each metric in a separate subplot
for i, metric in enumerate(metrics):
    ax = axes[i]
    values = [averages[metric][method] for method in methods]
    bars = ax.bar(x, values, width, alpha=0.8, color=colors)
    
    ax.set_ylabel(f'Average {metric.replace("_", " ").title()}', fontsize=25)
    ax.tick_params(axis='y', labelsize=25)
    # ax.set_title(f'Comparison of {metric.replace("_", " ").title()}', fontsize=25)
    ax.set_xticks(x, fontsize=25)
    ax.set_xticklabels(methods, rotation=45, ha='right', fontsize=25)
    ax.yaxis.grid(True)
    
    if i == 2: ax.set_ylim(0.75, 0.85)
    
    # Add value labels on top of each bar
    for bar in bars:
        height = bar.get_height()
        # ax.text(bar.get_x() + bar.get_width()/2., height,
        #         f'{height:.6f}',
        #         ha='center', va='bottom', rotation=90)

# Adjust layout and display the plot
plt.tight_layout()
plt.suptitle('Comparison of Average Metrics across Different Methods', fontsize=25, y=1.05)
plt.show()

KeyError: 'TabFlex_fit_time'