# Review errors in the results files `metadataset_errors*.csv` for GPU-algs-1

In [1]:
import pandas as pd

# Load the full table of recorded errors.
errors_df = pd.read_csv("/home/shared/tabzilla/TabSurvey/metadataset_errors.csv")

# filter by experiment: keep rows whose experiment name mentions either GPU run tag
exp_names = errors_df["exp_name"]
is_gpu_experiment = exp_names.str.contains("algs-gpu-1") | exp_names.str.contains("gpu-expt")
errors_df = errors_df.loc[is_gpu_experiment]

errors_df.head()

Unnamed: 0,results_bucket_path,dataset_name,alg_name,hparam_source,trial_number,alg_hparam_id,exp_name,exception
30,results/openml__APSFailure__168868/TabNet/gpu-...,openml__APSFailure__168868,TabNet,random_2_s0,2,TabNet__seed_0__trial_2,gpu-expt-a_092222_065049_7185.zip,"Traceback (most recent call last):\n File ""/h..."
31,results/openml__APSFailure__168868/TabNet/gpu-...,openml__APSFailure__168868,TabNet,random_4_s0,4,TabNet__seed_0__trial_4,gpu-expt-a_092222_065049_7185.zip,"Traceback (most recent call last):\n File ""/h..."
32,results/openml__APSFailure__168868/VIME/gpu-ex...,openml__APSFailure__168868,VIME,default,0,VIME__seed_0__trial_0,gpu-expt-a_092422_190242_be2c.zip,"Traceback (most recent call last):\n File ""/h..."
33,results/openml__APSFailure__168868/VIME/gpu-ex...,openml__APSFailure__168868,VIME,random_1_s0,1,VIME__seed_0__trial_1,gpu-expt-a_092422_190242_be2c.zip,"Traceback (most recent call last):\n File ""/h..."
34,results/openml__APSFailure__168868/VIME/gpu-ex...,openml__APSFailure__168868,VIME,random_2_s0,2,VIME__seed_0__trial_2,gpu-expt-a_092422_190242_be2c.zip,"Traceback (most recent call last):\n File ""/h..."


In [2]:
# how many errors by dataset?
# Tally the error rows per dataset name before printing the summary.
dataset_error_counts = errors_df['dataset_name'].value_counts()
print(f"errors by dataset:\n {dataset_error_counts}")

errors by dataset:
 openml__solar-flare__2068                  150
openml__Census-Income__168340               76
openml__poker-hand__9890                    75
openml__ldpa__9974                          75
openml__walking-activity__9945              74
                                          ... 
openml__bank-marketing__14965                2
openml__adult__7592                          2
openml__letter__6                            1
openml__christine__168908                    1
openml__Internet-Advertisements__167125      1
Name: dataset_name, Length: 119, dtype: int64


In [3]:
# by alg...
# Tally the error rows per algorithm name before printing the summary.
alg_error_counts = errors_df['alg_name'].value_counts()
print(f"errors by alg:\n {alg_error_counts}")

errors by alg:
 VIME        2634
MLP         2520
TabNet       154
CatBoost     122
XGBoost       13
Name: alg_name, dtype: int64


In [4]:
# Substrings used to recognise failure modes that are already understood.
# Later cells count how many unique tracebacks contain each of these and
# exclude those tracebacks when looking for errors that still need triage.
known_err_str = [
    "RuntimeError: CUDA out of memory.",
    "CUDA error: invalid configuration argument",
    "TimeoutException",  # our timeout exception
    "Cannot allocate memory",
]

# Triage errors by alg

In [5]:
# Build a lookup from algorithm name to the array of exception strings
# recorded for that algorithm.
alg_column = errors_df["alg_name"]
errors_by_alg = {}
for alg_name in alg_column.unique():
    is_this_alg = alg_column == alg_name
    errors_by_alg[alg_name] = errors_df.loc[is_this_alg, "exception"].values

## VIME

In [6]:
alg_name = "VIME"

# De-duplicate the exception strings once and reuse the result for both the
# printed count and the list that later cells filter.
alg_errors = errors_by_alg[alg_name]
unique_errors = list(set(alg_errors))

print(f"number of errors for alg {alg_name}: {len(alg_errors)}")
print(f"number of unique errors: {len(unique_errors)}")


number of errors for alg VIME: 2634
number of unique errors: 21


In [7]:
# get some known errors out of the way

# Extend the shared list with one extra substring tracked in the issue below.
tmp_known_err_str = known_err_str + [
    "Target size",  # issue https://github.com/naszilla/tabzilla/issues/85
]

# For each known substring, report how many unique tracebacks contain it.
for err_str in tmp_known_err_str:
    print(f"error type: {err_str}")
    num_matching = sum(err_str in e for e in unique_errors)
    print(f"num of unique errors with this string: {num_matching} of {len(unique_errors)}")


error type: RuntimeError: CUDA out of memory.
num of unique errors with this string: 0 of 21
error type: CUDA error: invalid configuration argument
num of unique errors with this string: 0 of 21
error type: TimeoutException
num of unique errors with this string: 17 of 21
error type: Cannot allocate memory
num of unique errors with this string: 2 of 21
error type: Target size
num of unique errors with this string: 1 of 21


In [8]:
# what are the remaining errors?

# Keep only the unique tracebacks that contain none of the known substrings.
remaining_errors = [
    e for e in unique_errors
    if not any(known in e for known in tmp_known_err_str)
]

In [9]:
# Report how many unique tracebacks matched none of the known substrings.
print(f"{len(remaining_errors)} remaining errors")

1 remaining errors


In [10]:
# Show the first unmatched traceback for manual inspection.
print(remaining_errors[0])

Traceback (most recent call last):
  File "/home/shared/tabzilla/TabSurvey/tabzilla_experiment.py", line 137, in __call__
    result = cross_validation(model, self.dataset, self.time_limit)
  File "/home/shared/tabzilla/TabSurvey/tabzilla_utils.py", line 237, in cross_validation
    loss_history, val_loss_history = curr_model.fit(
  File "/home/shared/tabzilla/TabSurvey/models/vime.py", line 47, in fit
    self.fit_self(X_unlab, p_m=self.params["p_m"], alpha=self.params["alpha"])
  File "/home/shared/tabzilla/TabSurvey/models/vime.py", line 151, in fit_self
    loss_mask = loss_func_mask(out_mask, batch_mask.to(self.device))
  File "/opt/conda/envs/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/opt/conda/envs/torch/lib/python3.10/site-packages/torch/nn/modules/loss.py", line 612, in forward
    return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
  File "/o

In [11]:
# Count how often the leftover error (identified by a distinctive substring)
# occurs for the current algorithm, then see which algorithms and datasets it
# affects across the whole filtered table.
err_str = "Expected all"
num_matching = sum(err_str in e for e in errors_by_alg[alg_name])
print(f"number of Expected errors: {num_matching}")

# Match the substring literally (regex=False) so this mask uses the same
# semantics as the `in` test above, and treat missing exception text as a
# non-match (na=False) so the boolean mask never contains NaN.
has_err_str = errors_df['exception'].str.contains(err_str, regex=False, na=False)

# which algs?
print(f"algs: {errors_df[has_err_str]['alg_name'].value_counts()}")

# which datasets?
print(f"datasets: {errors_df[has_err_str]['dataset_name'].value_counts()}")


number of Expected errors: 2460
algs: VIME    2460
Name: alg_name, dtype: int64
datasets: openml__Census-Income__168340       30
openml__nursery__9892               30
openml__mushroom__24                30
openml__monks-problems-2__146065    30
openml__meta__4729                  30
                                    ..
openml__colic__25                   30
openml__cleveland__2285             30
openml__cjs__14967                  30
openml__chscase_foot__5012          30
openml__yeast__145793               30
Name: dataset_name, Length: 82, dtype: int64


Note: this was probably a bug with the experiments.

## MLP

In [12]:
alg_name = "MLP"

# Collapse duplicate exception strings once; the same list feeds the count
# printed here and the filtering done in later cells.
alg_errors = errors_by_alg[alg_name]
unique_errors = list(set(alg_errors))

print(f"number of errors for alg {alg_name}: {len(alg_errors)}")
print(f"number of unique errors: {len(unique_errors)}")


number of errors for alg MLP: 2520
number of unique errors: 3


In [13]:
# get some known errors out of the way

# Shared known substrings plus the one tracked in the issue below.
tmp_known_err_str = known_err_str + [
    "Target size",  # issue https://github.com/naszilla/tabzilla/issues/85
]

# Report, per known substring, how many unique tracebacks contain it.
for err_str in tmp_known_err_str:
    print(f"error type: {err_str}")
    num_matching = sum(err_str in e for e in unique_errors)
    print(f"num of unique errors with this string: {num_matching} of {len(unique_errors)}")


error type: RuntimeError: CUDA out of memory.
num of unique errors with this string: 0 of 3
error type: CUDA error: invalid configuration argument
num of unique errors with this string: 0 of 3
error type: TimeoutException
num of unique errors with this string: 0 of 3
error type: Cannot allocate memory
num of unique errors with this string: 1 of 3
error type: Target size
num of unique errors with this string: 1 of 3


In [14]:
# what are the remaining errors?

# Drop every unique traceback that contains at least one known substring.
remaining_errors = [
    e for e in unique_errors
    if not any(known in e for known in tmp_known_err_str)
]

In [15]:
# Number of unique tracebacks that matched none of the known substrings.
len(remaining_errors)

1

In [16]:
# Show the first unmatched traceback for manual inspection.
print(remaining_errors[0])

Traceback (most recent call last):
  File "/home/shared/tabzilla/TabSurvey/tabzilla_experiment.py", line 137, in __call__
    result = cross_validation(model, self.dataset, self.time_limit)
  File "/home/shared/tabzilla/TabSurvey/tabzilla_utils.py", line 237, in cross_validation
    loss_history, val_loss_history = curr_model.fit(
  File "/home/shared/tabzilla/TabSurvey/models/mlp.py", line 31, in fit
    return super().fit(X, y, X_val, y_val)
  File "/home/shared/tabzilla/TabSurvey/models/basemodel_torch.py", line 94, in fit
    out = self.model(batch_X.to(self.device))
  File "/opt/conda/envs/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/opt/conda/envs/torch/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 154, in forward
    raise RuntimeError("module must have its parameters and buffers "
RuntimeError: module must have its parameters and buffers on device cuda:0 (devi

This was a bug in an experiment, not with the implementation.

## TabNet

In [17]:
alg_name = "TabNet"

# De-duplicate this algorithm's exception strings a single time and reuse it.
alg_errors = errors_by_alg[alg_name]
unique_errors = list(set(alg_errors))

print(f"number of errors for alg {alg_name}: {len(alg_errors)}")
print(f"number of unique errors: {len(unique_errors)}")


number of errors for alg TabNet: 154
number of unique errors: 17


In [18]:
# get some known errors out of the way

# Only the shared known substrings are checked for this algorithm.
for err_str in known_err_str:
    print(f"error type: {err_str}")
    num_matching = sum(err_str in e for e in unique_errors)
    print(f"num of unique errors with this string: {num_matching} of {len(unique_errors)}")


error type: RuntimeError: CUDA out of memory.
num of unique errors with this string: 0 of 17
error type: CUDA error: invalid configuration argument
num of unique errors with this string: 0 of 17
error type: TimeoutException
num of unique errors with this string: 16 of 17
error type: Cannot allocate memory
num of unique errors with this string: 0 of 17


In [19]:
# what are the remaining errors?

# Keep the unique tracebacks that contain none of the shared known substrings.
remaining_errors = [
    e for e in unique_errors
    if not any(known in e for known in known_err_str)
]

In [20]:
# Report how many unique tracebacks matched none of the known substrings.
print(f"{len(remaining_errors)} remaining errros")

1 remaining errros


In [21]:
# Show the first unmatched traceback for manual inspection.
print(remaining_errors[0])

Traceback (most recent call last):
  File "/home/shared/tabzilla/TabSurvey/tabzilla_experiment.py", line 136, in __call__
    result = cross_validation(model, self.dataset, self.time_limit)
  File "/home/shared/tabzilla/TabSurvey/tabzilla_utils.py", line 237, in cross_validation
    loss_history, val_loss_history = curr_model.fit(
  File "/home/shared/tabzilla/TabSurvey/models/tabnet.py", line 45, in fit
    self.model.fit(
  File "/opt/conda/envs/torch/lib/python3.10/site-packages/pytorch_tabnet/abstract_model.py", line 223, in fit
    self._train_epoch(train_dataloader)
  File "/opt/conda/envs/torch/lib/python3.10/site-packages/pytorch_tabnet/abstract_model.py", line 434, in _train_epoch
    batch_logs = self._train_batch(X, y)
  File "/opt/conda/envs/torch/lib/python3.10/site-packages/pytorch_tabnet/abstract_model.py", line 469, in _train_batch
    output, M_loss = self.network(X)
  File "/opt/conda/envs/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1110, in _

In [22]:
# Count how often the leftover error (identified by a distinctive substring)
# occurs for the current algorithm, then see which algorithms and datasets it
# affects across the whole filtered table.
err_str = "Expected more than"
num_matching = sum(err_str in e for e in errors_by_alg[alg_name])
print(f"number of Expected errors: {num_matching}")

# Literal matching (regex=False) keeps this mask consistent with the `in` test
# above; na=False makes rows with missing exception text count as non-matches
# so the mask is strictly boolean.
has_err_str = errors_df['exception'].str.contains(err_str, regex=False, na=False)

# which algs?
print(f"algs: {errors_df[has_err_str]['alg_name'].value_counts()}")

# which datasets?
print(f"datasets: {errors_df[has_err_str]['dataset_name'].value_counts()}")


number of Expected errors: 30
algs: TabNet    30
Name: alg_name, dtype: int64
datasets: openml__sulfur__360966    30
Name: dataset_name, dtype: int64


## CatBoost

In [23]:
alg_name = "CatBoost"

alg_errors = errors_by_alg[alg_name]

print(f"number of errors for alg {alg_name}: {len(alg_errors)}")
print(f"number of unique errors: {len(set(alg_errors))}")

# Sort the de-duplicated tracebacks. A bare list(set(...)) of strings has no
# stable order between interpreter sessions, so positional lookups into lists
# derived from unique_errors (such as remaining_errors[1]) would pick a
# different traceback from run to run.
unique_errors = sorted(set(alg_errors))


number of errors for alg CatBoost: 122
number of unique errors: 11


In [24]:

# Report, per shared known substring, how many unique tracebacks contain it.
for err_str in known_err_str:
    print(f"error type: {err_str}")
    num_matching = sum(err_str in e for e in unique_errors)
    print(f"num of unique errors with this string: {num_matching} of {len(unique_errors)}")


error type: RuntimeError: CUDA out of memory.
num of unique errors with this string: 0 of 11
error type: CUDA error: invalid configuration argument
num of unique errors with this string: 0 of 11
error type: TimeoutException
num of unique errors with this string: 9 of 11
error type: Cannot allocate memory
num of unique errors with this string: 0 of 11


In [25]:
# what are the remaining errors?

# Discard unique tracebacks that contain any of the shared known substrings.
remaining_errors = [
    e for e in unique_errors
    if not any(known in e for known in known_err_str)
]

In [26]:
# Report how many unique tracebacks matched none of the known substrings.
print(f"{len(remaining_errors)} remaining errros")

2 remaining errros


In [27]:
# Show the unmatched traceback at index 1 (the second entry) for inspection.
# NOTE(review): remaining_errors inherits its order from unique_errors; if that
# list comes from an unordered set, which traceback sits at index 1 is not
# guaranteed to be the same on every run. The entry at index 0 is not shown.
print(remaining_errors[1])

Traceback (most recent call last):
  File "/home/shared/tabzilla/TabSurvey/tabzilla_experiment.py", line 137, in __call__
    result = cross_validation(model, self.dataset, self.time_limit)
  File "/home/shared/tabzilla/TabSurvey/tabzilla_utils.py", line 236, in cross_validation
    loss_history, val_loss_history = curr_model.fit(
  File "/home/shared/tabzilla/TabSurvey/models/tree_models.py", line 151, in fit
    self.model.fit(X, y, eval_set=(X_val, y_val))
  File "/opt/conda/envs/gbdt/lib/python3.9/site-packages/catboost/core.py", line 4716, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline, use_best_model,
  File "/opt/conda/envs/gbdt/lib/python3.9/site-packages/catboost/core.py", line 2021, in _fit
    train_params = self._prepare_train_params(
  File "/opt/conda/envs/gbdt/lib/python3.9/site-packages/catboost/core.py", line 1953, in _prepare_train_params
    _check_train_params(params)
  File "_catboos

In [28]:
# Count how often the leftover error (identified by a distinctive substring)
# occurs for the current algorithm, then see which algorithms and datasets it
# affects across the whole filtered table.
# The printed label still says "Expected errors"; it is kept unchanged here.
err_str = "classes_count parameter"
num_matching = sum(err_str in e for e in errors_by_alg[alg_name])
print(f"number of Expected errors: {num_matching}")

# Literal matching (regex=False) keeps this mask consistent with the `in` test
# above; na=False makes rows with missing exception text count as non-matches
# so the mask is strictly boolean.
has_err_str = errors_df['exception'].str.contains(err_str, regex=False, na=False)

# which algs?
print(f"algs: {errors_df[has_err_str]['alg_name'].value_counts()}")

# which datasets?
print(f"datasets: {errors_df[has_err_str]['dataset_name'].value_counts()}")


number of Expected errors: 90
algs: CatBoost    90
Name: alg_name, dtype: int64
datasets: openml__solar-flare__2068    90
Name: dataset_name, dtype: int64


## XGBoost

In [29]:
alg_name = "XGBoost"

# No algorithm-specific substrings are enabled here, so this is just a fresh
# copy of the shared list. Disabled candidate, kept for reference:
#   "Singleton array"  (https://github.com/naszilla/tabzilla/issues/83)
tmp_known_err_str = list(known_err_str)

# De-duplicate this algorithm's exception strings once and reuse the result.
alg_errors = errors_by_alg[alg_name]
unique_errors = list(set(alg_errors))

print(f"number of errors for alg {alg_name}: {len(alg_errors)}")
print(f"number of unique errors: {len(unique_errors)}")


number of errors for alg XGBoost: 13
number of unique errors: 8


In [30]:

# Report, per known substring, how many unique tracebacks contain it.
for err_str in tmp_known_err_str:
    print(f"error type: {err_str}")
    num_matching = sum(err_str in e for e in unique_errors)
    print(f"num of unique errors with this string: {num_matching} of {len(unique_errors)}")


error type: RuntimeError: CUDA out of memory.
num of unique errors with this string: 0 of 8
error type: CUDA error: invalid configuration argument
num of unique errors with this string: 0 of 8
error type: TimeoutException
num of unique errors with this string: 8 of 8
error type: Cannot allocate memory
num of unique errors with this string: 0 of 8


In [31]:
# what are the remaining errors?

# Filter out every unique traceback that contains one of the known substrings,
# then report how many are left.
remaining_errors = [
    e for e in unique_errors
    if not any(known in e for known in tmp_known_err_str)
]
print(f"{len(remaining_errors)} remaining errros")

0 remaining errros
