# Review errors in the results files `metadataset_errors*.csv` for GPU-algs-2

In [2]:
import pandas as pd

errors_df = pd.read_csv("/home/shared/tabzilla/TabSurvey/metadataset_errors.csv")

# filter by experiment
errors_df = errors_df.loc[errors_df["exp_name"].str.contains("algs-gpu-2")]

errors_df.head()

Unnamed: 0,results_bucket_path,dataset_name,alg_name,hparam_source,trial_number,alg_hparam_id,exp_name,exception
0,results/openml__APSFailure__168868/DANet/algs-...,openml__APSFailure__168868,DANet,default,0,DANet__seed_0__trial_0,algs-gpu-2-datasets-a_120622_092527_3c1c.zip,"Traceback (most recent call last):\n File ""/h..."
1,results/openml__APSFailure__168868/DANet/algs-...,openml__APSFailure__168868,DANet,random_1_s0,1,DANet__seed_0__trial_1,algs-gpu-2-datasets-a_120622_092527_3c1c.zip,"Traceback (most recent call last):\n File ""/h..."
2,results/openml__APSFailure__168868/DANet/algs-...,openml__APSFailure__168868,DANet,random_2_s0,2,DANet__seed_0__trial_2,algs-gpu-2-datasets-a_120622_092527_3c1c.zip,"Traceback (most recent call last):\n File ""/h..."
3,results/openml__APSFailure__168868/DANet/algs-...,openml__APSFailure__168868,DANet,random_3_s0,3,DANet__seed_0__trial_3,algs-gpu-2-datasets-a_120622_092527_3c1c.zip,"Traceback (most recent call last):\n File ""/h..."
4,results/openml__APSFailure__168868/DANet/algs-...,openml__APSFailure__168868,DANet,random_4_s0,4,DANet__seed_0__trial_4,algs-gpu-2-datasets-a_120622_092527_3c1c.zip,"Traceback (most recent call last):\n File ""/h..."


In [3]:
# how many errors by dataset?
print(f"errors by dataset:\n {errors_df['dataset_name'].value_counts()}")

errors by dataset:
 openml__albert__189356                     82
openml__skin-segmentation__9965            77
openml__Internet-Advertisements__167125    73
openml__helena__168329                     71
openml__primary-tumor__146032              70
                                           ..
openml__chscase_foot__5012                  2
openml__churn__167141                       2
openml__hill-valley__145847                 1
openml__anneal__2867                        1
openml__ada_agnostic__3896                  1
Name: dataset_name, Length: 165, dtype: int64


In [6]:
# by alg...
print(f"errors by alg:\n {errors_df['alg_name'].value_counts()}")

errors by alg:
 rtdl_FTTransformer    2640
TabTransformer        1686
NODE                   516
SAINT                  161
DANet                  116
STG                     90
NAM                     65
DeepFM                  50
Name: alg_name, dtype: int64


In [89]:
known_err_str = [
    "RuntimeError: CUDA out of memory.",
    "CUDA error: invalid configuration argument",
    "TimeoutException",  # our timeout exception
    "Cannot allocate memory",
]

# Triage errors by alg

In [78]:
errors_by_alg = {}
for alg_name in errors_df["alg_name"].unique():
    errors_by_alg[alg_name] = errors_df.loc[errors_df["alg_name"] == alg_name, "exception"].values

## TabTransformer

In [79]:
alg_name = "TabTransformer"

print(f"number of errors for alg {alg_name}: {len(errors_by_alg[alg_name])}")
print(f"number of unique errors: {len(set(errors_by_alg[alg_name]))}")

unique_errors = list(set(errors_by_alg[alg_name]))


number of errors for alg TabTransformer: 1686
number of unique errors: 109


In [80]:
# get some known errors out of the way

tmp_known_err_str = known_err_str + [
    "must be the same as input size", # this is an issue with TabTransformer: https://github.com/naszilla/tabzilla/issues/78
    "Dimension out of range", # another known issue with TabTransformer: https://github.com/naszilla/tabzilla/issues/77
    "self must be a matrix", # another known issue with TabTransformer: https://github.com/naszilla/tabzilla/issues/79
]

for err_str in tmp_known_err_str:
    print(f"error type: {err_str}")
    print(f"num of unique errors with this string: {len([e for e in unique_errors if err_str in e])} of {len(unique_errors)}")


error type: RuntimeError: CUDA out of memory.
num of unique errors with this string: 91 of 109
error type: CUDA error: invalid configuration argument
num of unique errors with this string: 1 of 109
error type: TimeoutException
num of unique errors with this string: 8 of 109
error type: Cannot allocate memory
num of unique errors with this string: 1 of 109
error type: must be the same as input size
num of unique errors with this string: 6 of 109
error type: Dimension out of range
num of unique errors with this string: 1 of 109
error type: self must be a matrix
num of unique errors with this string: 1 of 109


In [81]:
# what are the remaining errors?

remaining_errors = [e for e in unique_errors if all([s not in e for s in tmp_known_err_str])]

In [82]:
print(f"{len(remaining_errors)} remaining errors")

0 remaining errors


# NODE

In [83]:
alg_name = "NODE"

print(f"number of errors for alg {alg_name}: {len(errors_by_alg[alg_name])}")
print(f"number of unique errors: {len(set(errors_by_alg[alg_name]))}")

unique_errors = list(set(errors_by_alg[alg_name]))


number of errors for alg NODE: 516
number of unique errors: 143


In [84]:
# get some known errors out of the way

for err_str in known_err_str:
    print(f"error type: {err_str}")
    print(f"num of unique errors with this string: {len([e for e in unique_errors if err_str in e])} of {len(unique_errors)}")


error type: RuntimeError: CUDA out of memory.
num of unique errors with this string: 134 of 143
error type: CUDA error: invalid configuration argument
num of unique errors with this string: 0 of 143
error type: TimeoutException
num of unique errors with this string: 9 of 143
error type: Cannot allocate memory
num of unique errors with this string: 0 of 143


In [85]:
# what are the remaining errors?

remaining_errors = [e for e in unique_errors if all([s not in e for s in known_err_str])]

In [86]:
len(remaining_errors)

0

# STG

In [90]:
alg_name = "STG"

print(f"number of errors for alg {alg_name}: {len(errors_by_alg[alg_name])}")
print(f"number of unique errors: {len(set(errors_by_alg[alg_name]))}")

unique_errors = list(set(errors_by_alg[alg_name]))


number of errors for alg STG: 90
number of unique errors: 10


In [91]:
# get some known errors out of the way

for err_str in known_err_str:
    print(f"error type: {err_str}")
    print(f"num of unique errors with this string: {len([e for e in unique_errors if err_str in e])} of {len(unique_errors)}")


error type: RuntimeError: CUDA out of memory.
num of unique errors with this string: 0 of 10
error type: CUDA error: invalid configuration argument
num of unique errors with this string: 0 of 10
error type: TimeoutException
num of unique errors with this string: 9 of 10
error type: Cannot allocate memory
num of unique errors with this string: 0 of 10


In [92]:
# what are the remaining errors?

remaining_errors = [e for e in unique_errors if all([s not in e for s in known_err_str])]

In [93]:
print(f"{len(remaining_errors)} remaining errros")

1 remaining errros


In [94]:
print(remaining_errors[0])

Traceback (most recent call last):
  File "/home/shared/tabzilla/TabSurvey/tabzilla_experiment.py", line 137, in __call__
    result = cross_validation(model, self.dataset, self.time_limit)
  File "/home/shared/tabzilla/TabSurvey/tabzilla_utils.py", line 264, in cross_validation
    scorers["train"].eval(
  File "/home/shared/tabzilla/TabSurvey/utils/scorer.py", line 49, in eval
    mse = mean_squared_error(y_true, y_prediction)
  File "/opt/conda/envs/torch/lib/python3.10/site-packages/sklearn/metrics/_regression.py", line 442, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "/opt/conda/envs/torch/lib/python3.10/site-packages/sklearn/metrics/_regression.py", line 102, in _check_reg_targets
    y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  File "/opt/conda/envs/torch/lib/python3.10/site-packages/sklearn/utils/validation.py", line 899, in check_array
    _assert_all_finite(
  File "/opt/conda/envs/torch/lib/python3.10/site-packag

In [95]:
err_str = "Input contains NaN"
print(f"number of NaN errors: {len([e for e in errors_by_alg[alg_name] if err_str in e])}")

# which algs?
print(f"algs: {errors_df[errors_df['exception'].str.contains(err_str)]['alg_name'].value_counts()}")

# which datasets?
print(f"datasets: {errors_df[errors_df['exception'].str.contains(err_str)]['dataset_name'].value_counts()}")


number of NaN errors: 20
algs: STG    20
Name: alg_name, dtype: int64
datasets: openml__EgyptianSkulls__5040      2
openml__Wine__190420              2
openml__bodyfat__5514             2
openml__chscase_foot__5012        2
openml__cpu_small__4883           2
openml__dataset_sales__190418     2
openml__liver-disorders__52948    2
openml__meta__4729                2
openml__veteran__4828             2
openml__aloi__12732               1
openml__mv__4774                  1
Name: dataset_name, dtype: int64


# NAM

In [96]:
alg_name = "NAM"

print(f"number of errors for alg {alg_name}: {len(errors_by_alg[alg_name])}")
print(f"number of unique errors: {len(set(errors_by_alg[alg_name]))}")

unique_errors = list(set(errors_by_alg[alg_name]))


number of errors for alg NAM: 65
number of unique errors: 9


In [97]:

for err_str in known_err_str:
    print(f"error type: {err_str}")
    print(f"num of unique errors with this string: {len([e for e in unique_errors if err_str in e])} of {len(unique_errors)}")


error type: RuntimeError: CUDA out of memory.
num of unique errors with this string: 0 of 9
error type: CUDA error: invalid configuration argument
num of unique errors with this string: 0 of 9
error type: TimeoutException
num of unique errors with this string: 9 of 9
error type: Cannot allocate memory
num of unique errors with this string: 0 of 9


Done.

# DeepFM

In [100]:
tmp_known_err_str = known_err_str + [
    "Singleton array",  # https://github.com/naszilla/tabzilla/issues/83
    ]

alg_name = "DeepFM"

print(f"number of errors for alg {alg_name}: {len(errors_by_alg[alg_name])}")
print(f"number of unique errors: {len(set(errors_by_alg[alg_name]))}")

unique_errors = list(set(errors_by_alg[alg_name]))


number of errors for alg DeepFM: 50
number of unique errors: 36


In [101]:

for err_str in tmp_known_err_str:
    print(f"error type: {err_str}")
    print(f"num of unique errors with this string: {len([e for e in unique_errors if err_str in e])} of {len(unique_errors)}")


error type: RuntimeError: CUDA out of memory.
num of unique errors with this string: 0 of 36
error type: CUDA error: invalid configuration argument
num of unique errors with this string: 0 of 36
error type: TimeoutException
num of unique errors with this string: 6 of 36
error type: Cannot allocate memory
num of unique errors with this string: 0 of 36
error type: Singleton array
num of unique errors with this string: 30 of 36


In [102]:
# what are the remaining errors?

remaining_errors = [e for e in unique_errors if all([s not in e for s in tmp_known_err_str])]
print(f"{len(remaining_errors)} remaining errros")

0 remaining errros


In [103]:
# err_str = "Singleton array"
# print(f"number of Singleton errors: {len([e for e in errors_by_alg[alg_name] if err_str in e])}")

# # which algs?
# print(f"algs: {errors_df[errors_df['exception'].str.contains(err_str)]['alg_name'].value_counts()}")

# # which datasets?
# print(f"datasets: {errors_df[errors_df['exception'].str.contains(err_str)]['dataset_name'].value_counts()}")


# SAINT

In [104]:
alg_name = "SAINT"

print(f"number of errors for alg {alg_name}: {len(errors_by_alg[alg_name])}")
print(f"number of unique errors: {len(set(errors_by_alg[alg_name]))}")

unique_errors = list(set(errors_by_alg[alg_name]))


number of errors for alg SAINT: 161
number of unique errors: 47


In [105]:
# get some known errors out of the way

known_err_str = [
    "RuntimeError: CUDA out of memory.",
    "TimeoutException",  # our timeout exception
    "Cannot allocate memory",
]

for err_str in known_err_str:
    print(f"error type: {err_str}")
    print(f"num of unique errors with this string: {len([e for e in unique_errors if err_str in e])} of {len(unique_errors)}")


error type: RuntimeError: CUDA out of memory.
num of unique errors with this string: 38 of 47
error type: TimeoutException
num of unique errors with this string: 9 of 47
error type: Cannot allocate memory
num of unique errors with this string: 0 of 47


In [106]:
# what are the remaining errors?

remaining_errors = [e for e in unique_errors if all([s not in e for s in known_err_str])]
print(f"{len(remaining_errors)} remaining errros")

0 remaining errros


Done.

# rtdl_FTTransformer

In [123]:
alg_name = "rtdl_FTTransformer"

print(f"number of errors for alg {alg_name}: {len(errors_by_alg[alg_name])}")
print(f"number of unique errors: {len(set(errors_by_alg[alg_name]))}")

unique_errors = list(set(errors_by_alg[alg_name]))


number of errors for alg rtdl_FTTransformer: 2640
number of unique errors: 17


In [120]:
tmp_known_err_str = known_err_str + [
    "If self.num_tokenizer is", # issue: https://github.com/naszilla/tabzilla/issues/84
    "If self.cat_tokenizer is", # issue: https://github.com/naszilla/tabzilla/issues/84
]


for err_str in tmp_known_err_str:
    print(f"error type: {err_str}")
    print(f"num of unique errors with this string: {len([e for e in unique_errors if err_str in e])} of {len(unique_errors)}")


error type: RuntimeError: CUDA out of memory.
num of unique errors with this string: 13 of 17
error type: TimeoutException
num of unique errors with this string: 0 of 17
error type: Cannot allocate memory
num of unique errors with this string: 0 of 17
error type: If self.num_tokenizer is
num of unique errors with this string: 2 of 17
error type: If self.cat_tokenizer is
num of unique errors with this string: 2 of 17


In [121]:
# what are the remaining errors?

remaining_errors = [e for e in unique_errors if all([s not in e for s in tmp_known_err_str])]
print(f"{len(remaining_errors)} remaining errros")

0 remaining errros


In [122]:
# err_str = "If self.num_tokenizer is"
# print(f"number of tokenizer errors: {len([e for e in errors_by_alg[alg_name] if err_str in e])}")

# # which algs?
# print(f"algs: {errors_df[errors_df['exception'].str.contains(err_str)]['alg_name'].value_counts()}")

# # which datasets?
# print(f"datasets: {errors_df[errors_df['exception'].str.contains(err_str)]['dataset_name'].value_counts()}")


# DANet

In [124]:
alg_name = "DANet"

print(f"number of errors for alg {alg_name}: {len(errors_by_alg[alg_name])}")
print(f"number of unique errors: {len(set(errors_by_alg[alg_name]))}")

unique_errors = list(set(errors_by_alg[alg_name]))


number of errors for alg DANet: 116
number of unique errors: 9


In [125]:
tmp_known_err_str = known_err_str + [
    # "If self.num_tokenizer is", # issue: https://github.com/naszilla/tabzilla/issues/84
    # "If self.cat_tokenizer is", # issue: https://github.com/naszilla/tabzilla/issues/84
]


for err_str in tmp_known_err_str:
    print(f"error type: {err_str}")
    print(f"num of unique errors with this string: {len([e for e in unique_errors if err_str in e])} of {len(unique_errors)}")


error type: RuntimeError: CUDA out of memory.
num of unique errors with this string: 0 of 9
error type: TimeoutException
num of unique errors with this string: 9 of 9
error type: Cannot allocate memory
num of unique errors with this string: 0 of 9


In [126]:
# what are the remaining errors?

remaining_errors = [e for e in unique_errors if all([s not in e for s in tmp_known_err_str])]
print(f"{len(remaining_errors)} remaining errros")

0 remaining errros


Done.