# Review errors in the results files `metadataset_errors*.csv` for GPU-algs-2

In [1]:
import pandas as pd

# Load the aggregated error records for all experiments.
# Shared-server copy, kept for reference:
# errors_df = pd.read_csv("/home/shared/tabzilla/TabSurvey/metadataset_errors.csv")
errors_df = pd.read_csv("../TabSurvey/metadataset_errors.csv")

# filter by experiment
# Match the experiment tag literally (regex=False) and treat a missing exp_name
# as "no match" (na=False), so a NaN value cannot end up in the boolean mask
# and make the .loc lookup fail.
is_gpu2_experiment = errors_df["exp_name"].str.contains("algs-gpu-2", regex=False, na=False)
errors_df = errors_df.loc[is_gpu2_experiment]

errors_df.head()

Unnamed: 0,results_bucket_path,dataset_name,alg_name,hparam_source,trial_number,alg_hparam_id,exp_name,exception
0,results/openml__APSFailure__168868/DANet/algs-...,openml__APSFailure__168868,DANet,default,0,DANet__seed_0__trial_0,algs-gpu-2-datasets-a_120622_092527_3c1c.zip,"Traceback (most recent call last):\n File ""/h..."
1,results/openml__APSFailure__168868/DANet/algs-...,openml__APSFailure__168868,DANet,random_1_s0,1,DANet__seed_0__trial_1,algs-gpu-2-datasets-a_120622_092527_3c1c.zip,"Traceback (most recent call last):\n File ""/h..."
2,results/openml__APSFailure__168868/DANet/algs-...,openml__APSFailure__168868,DANet,random_2_s0,2,DANet__seed_0__trial_2,algs-gpu-2-datasets-a_120622_092527_3c1c.zip,"Traceback (most recent call last):\n File ""/h..."
3,results/openml__APSFailure__168868/DANet/algs-...,openml__APSFailure__168868,DANet,random_3_s0,3,DANet__seed_0__trial_3,algs-gpu-2-datasets-a_120622_092527_3c1c.zip,"Traceback (most recent call last):\n File ""/h..."
4,results/openml__APSFailure__168868/DANet/algs-...,openml__APSFailure__168868,DANet,random_4_s0,4,DANet__seed_0__trial_4,algs-gpu-2-datasets-a_120622_092527_3c1c.zip,"Traceback (most recent call last):\n File ""/h..."


In [2]:
# Tally the error rows per dataset and print the resulting counts.
dataset_error_counts = errors_df["dataset_name"].value_counts()
print(f"errors by dataset:\n {dataset_error_counts}")

errors by dataset:
 openml__dionis__189355                                       109
openml__sylvine__168912                                       92
openml__sulfur__360966                                        92
openml__walking-activity__9945                                84
openml__skin-segmentation__9965                               82
openml__albert__189356                                        82
openml__Census-Income__168340                                 79
openml__audiology__7                                          77
openml__jungle_chess_2pcs_raw_endgame_complete__167119        74
openml__chess__3952                                           74
openml__Internet-Advertisements__167125                       73
openml__kropt__2076                                           72
openml__helena__168329                                        71
openml__soybean__41                                           70
openml__connect-4__146195                                     70
openm

In [3]:
# Same tally, grouped by algorithm instead of dataset.
alg_error_counts = errors_df["alg_name"].value_counts()
print(f"errors by alg:\n {alg_error_counts}")

errors by alg:
 rtdl_FTTransformer    4057
TabTransformer        1686
NODE                   516
SAINT                  333
DANet                  177
STG                     90
NAM                     74
rtdl_ResNet             60
DeepFM                  50
rtdl_MLP                30
Name: alg_name, dtype: int64


In [4]:
# Substrings identifying failure modes that are already understood.
# Later cells test each traceback for these with a plain `in` check and
# extend the list with algorithm-specific signatures.
known_err_str = [
    # GPU ran out of memory
    "RuntimeError: CUDA out of memory.",
    # invalid CUDA launch configuration
    "CUDA error: invalid configuration argument",
    # our timeout exception
    "TimeoutException",
    # host memory allocation failure
    "Cannot allocate memory",
]

# Triage errors by alg

In [5]:
# Build a lookup from algorithm name to the array of its exception strings.
errors_by_alg = {}
for alg_name in errors_df["alg_name"].unique():
    is_this_alg = errors_df["alg_name"] == alg_name
    errors_by_alg[alg_name] = errors_df.loc[is_this_alg, "exception"].values

## TabTransformer

In [13]:
# Select the algorithm under review and de-duplicate its tracebacks.
alg_name = "TabTransformer"
alg_errors = errors_by_alg[alg_name]
unique_errors = list(set(alg_errors))

print(f"number of errors for alg {alg_name}: {len(alg_errors)}")
print(f"number of unique errors: {len(unique_errors)}")


number of errors for alg TabTransformer: 1686
number of unique errors: 109


In [14]:
# Shared known signatures plus three TabTransformer-specific ones.
tmp_known_err_str = [
    *known_err_str,
    # this is an issue with TabTransformer: https://github.com/naszilla/tabzilla/issues/78
    "must be the same as input size",
    # another known issue with TabTransformer: https://github.com/naszilla/tabzilla/issues/77
    "Dimension out of range",
    # another known issue with TabTransformer: https://github.com/naszilla/tabzilla/issues/79
    "self must be a matrix",
]

# For every signature, report how many unique tracebacks contain it.
n_unique = len(unique_errors)
for err_str in tmp_known_err_str:
    n_matching = sum(1 for e in unique_errors if err_str in e)
    print(f"error type: {err_str}")
    print(f"num of unique errors with this string: {n_matching} of {n_unique}")


error type: RuntimeError: CUDA out of memory.
num of unique errors with this string: 91 of 109
error type: CUDA error: invalid configuration argument
num of unique errors with this string: 1 of 109
error type: TimeoutException
num of unique errors with this string: 8 of 109
error type: Cannot allocate memory
num of unique errors with this string: 1 of 109
error type: must be the same as input size
num of unique errors with this string: 6 of 109
error type: Dimension out of range
num of unique errors with this string: 1 of 109
error type: self must be a matrix
num of unique errors with this string: 1 of 109


In [15]:
# Keep only the tracebacks that match none of the known signatures.
remaining_errors = [
    e for e in unique_errors
    if not any(s in e for s in tmp_known_err_str)
]

In [16]:
# Report how many unique tracebacks are still unexplained after removing the known signatures.
print(f"{len(remaining_errors)} remaining errors")

0 remaining errors


## NODE

In [17]:
# Select the algorithm under review and de-duplicate its tracebacks.
alg_name = "NODE"
alg_errors = errors_by_alg[alg_name]
unique_errors = list(set(alg_errors))

print(f"number of errors for alg {alg_name}: {len(alg_errors)}")
print(f"number of unique errors: {len(unique_errors)}")


number of errors for alg NODE: 516
number of unique errors: 143


In [18]:
# For every shared known signature, report how many unique tracebacks contain it.
n_unique = len(unique_errors)
for err_str in known_err_str:
    n_matching = sum(1 for e in unique_errors if err_str in e)
    print(f"error type: {err_str}")
    print(f"num of unique errors with this string: {n_matching} of {n_unique}")


error type: RuntimeError: CUDA out of memory.
num of unique errors with this string: 134 of 143
error type: CUDA error: invalid configuration argument
num of unique errors with this string: 0 of 143
error type: TimeoutException
num of unique errors with this string: 9 of 143
error type: Cannot allocate memory
num of unique errors with this string: 0 of 143


In [19]:
# Keep only the tracebacks that match none of the known signatures.
remaining_errors = [
    e for e in unique_errors
    if not any(s in e for s in known_err_str)
]

In [20]:
# Bare last expression: the notebook displays the number of unexplained unique tracebacks.
len(remaining_errors)

0

## STG

In [21]:
# Select the algorithm under review and de-duplicate its tracebacks.
alg_name = "STG"
alg_errors = errors_by_alg[alg_name]
unique_errors = list(set(alg_errors))

print(f"number of errors for alg {alg_name}: {len(alg_errors)}")
print(f"number of unique errors: {len(unique_errors)}")


number of errors for alg STG: 90
number of unique errors: 10


In [22]:
# For every shared known signature, report how many unique tracebacks contain it.
n_unique = len(unique_errors)
for err_str in known_err_str:
    n_matching = sum(1 for e in unique_errors if err_str in e)
    print(f"error type: {err_str}")
    print(f"num of unique errors with this string: {n_matching} of {n_unique}")


error type: RuntimeError: CUDA out of memory.
num of unique errors with this string: 0 of 10
error type: CUDA error: invalid configuration argument
num of unique errors with this string: 0 of 10
error type: TimeoutException
num of unique errors with this string: 9 of 10
error type: Cannot allocate memory
num of unique errors with this string: 0 of 10


In [23]:
# Keep only the tracebacks that match none of the known signatures.
remaining_errors = [
    e for e in unique_errors
    if not any(s in e for s in known_err_str)
]

In [24]:
# Report how many unique tracebacks are still unexplained.
# NOTE(review): "errros" is a typo in the printed message; it is runtime text and is left as-is here.
print(f"{len(remaining_errors)} remaining errros")

1 remaining errros


In [25]:
# Show the first unexplained traceback in full (raises IndexError if remaining_errors is empty).
print(remaining_errors[0])

Traceback (most recent call last):
  File "/home/shared/tabzilla/TabSurvey/tabzilla_experiment.py", line 137, in __call__
    result = cross_validation(model, self.dataset, self.time_limit)
  File "/home/shared/tabzilla/TabSurvey/tabzilla_utils.py", line 264, in cross_validation
    scorers["train"].eval(
  File "/home/shared/tabzilla/TabSurvey/utils/scorer.py", line 49, in eval
    mse = mean_squared_error(y_true, y_prediction)
  File "/opt/conda/envs/torch/lib/python3.10/site-packages/sklearn/metrics/_regression.py", line 442, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "/opt/conda/envs/torch/lib/python3.10/site-packages/sklearn/metrics/_regression.py", line 102, in _check_reg_targets
    y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  File "/opt/conda/envs/torch/lib/python3.10/site-packages/sklearn/utils/validation.py", line 899, in check_array
    _assert_all_finite(
  File "/opt/conda/envs/torch/lib/python3.10/site-packag

In [26]:
# Drill into the NaN-prediction failure found above.
err_str = "Input contains NaN"
n_nan_errors = sum(1 for e in errors_by_alg[alg_name] if err_str in e)
print(f"number of NaN errors: {n_nan_errors}")

# Rows (across all algorithms) whose traceback matches the marker.
has_err = errors_df['exception'].str.contains(err_str)

# which algs?
print(f"algs: {errors_df.loc[has_err, 'alg_name'].value_counts()}")

# which datasets?
print(f"datasets: {errors_df.loc[has_err, 'dataset_name'].value_counts()}")


number of NaN errors: 20
algs: STG    20
Name: alg_name, dtype: int64
datasets: openml__chscase_foot__5012        2
openml__EgyptianSkulls__5040      2
openml__bodyfat__5514             2
openml__liver-disorders__52948    2
openml__Wine__190420              2
openml__cpu_small__4883           2
openml__veteran__4828             2
openml__dataset_sales__190418     2
openml__meta__4729                2
openml__aloi__12732               1
openml__mv__4774                  1
Name: dataset_name, dtype: int64


## NAM

In [27]:
# Select the algorithm under review and de-duplicate its tracebacks.
alg_name = "NAM"
alg_errors = errors_by_alg[alg_name]
unique_errors = list(set(alg_errors))

print(f"number of errors for alg {alg_name}: {len(alg_errors)}")
print(f"number of unique errors: {len(unique_errors)}")


number of errors for alg NAM: 74
number of unique errors: 9


In [28]:

# For every shared known signature, report how many unique tracebacks contain it.
n_unique = len(unique_errors)
for err_str in known_err_str:
    n_matching = sum(1 for e in unique_errors if err_str in e)
    print(f"error type: {err_str}")
    print(f"num of unique errors with this string: {n_matching} of {n_unique}")


error type: RuntimeError: CUDA out of memory.
num of unique errors with this string: 0 of 9
error type: CUDA error: invalid configuration argument
num of unique errors with this string: 0 of 9
error type: TimeoutException
num of unique errors with this string: 9 of 9
error type: Cannot allocate memory
num of unique errors with this string: 0 of 9


Done: all 9 unique NAM errors are timeouts (`TimeoutException`).

## DeepFM

In [6]:
# Shared known signatures plus the DeepFM-specific one.
tmp_known_err_str = [
    *known_err_str,
    # https://github.com/naszilla/tabzilla/issues/83
    "Singleton array",
]

# Select the algorithm under review and de-duplicate its tracebacks.
alg_name = "DeepFM"
alg_errors = errors_by_alg[alg_name]
unique_errors = list(set(alg_errors))

print(f"number of errors for alg {alg_name}: {len(alg_errors)}")
print(f"number of unique errors: {len(unique_errors)}")


number of errors for alg DeepFM: 50
number of unique errors: 36


In [7]:

# For every known signature, report how many unique tracebacks contain it.
n_unique = len(unique_errors)
for err_str in tmp_known_err_str:
    n_matching = sum(1 for e in unique_errors if err_str in e)
    print(f"error type: {err_str}")
    print(f"num of unique errors with this string: {n_matching} of {n_unique}")


error type: RuntimeError: CUDA out of memory.
num of unique errors with this string: 0 of 36
error type: CUDA error: invalid configuration argument
num of unique errors with this string: 0 of 36
error type: TimeoutException
num of unique errors with this string: 6 of 36
error type: Cannot allocate memory
num of unique errors with this string: 0 of 36
error type: Singleton array
num of unique errors with this string: 30 of 36


In [31]:
# Keep only the tracebacks that match none of the known signatures, then report the count.
remaining_errors = [
    e for e in unique_errors
    if not any(s in e for s in tmp_known_err_str)
]
n_remaining = len(remaining_errors)
print(f"{n_remaining} remaining errros")

0 remaining errros


In [9]:
# Drill into the "Singleton array" failure (tabzilla issue #83).
err_str = "Singleton array"
n_singleton_errors = sum(1 for e in errors_by_alg[alg_name] if err_str in e)
print(f"number of Singleton errors: {n_singleton_errors}")

# Rows (across all algorithms) whose traceback matches the marker.
has_err = errors_df['exception'].str.contains(err_str)

# which algs?
print(f"algs: {errors_df.loc[has_err, 'alg_name'].value_counts()}")

# which datasets?
print(f"datasets: {errors_df.loc[has_err, 'dataset_name'].value_counts()}")


number of Singleton errors: 30
algs: DeepFM    30
Name: alg_name, dtype: int64
datasets: openml__sulfur__360966    30
Name: dataset_name, dtype: int64


## SAINT

In [32]:
# Select the algorithm under review and de-duplicate its tracebacks.
alg_name = "SAINT"
alg_errors = errors_by_alg[alg_name]
unique_errors = list(set(alg_errors))

print(f"number of errors for alg {alg_name}: {len(alg_errors)}")
print(f"number of unique errors: {len(unique_errors)}")


number of errors for alg SAINT: 333
number of unique errors: 73


In [34]:
# get some known errors out of the way

# NOTE(review): this REBINDS the notebook-level `known_err_str` defined near the
# top of the notebook and drops its "CUDA error: invalid configuration argument"
# entry. Every cell executed after this one (including the `known_err_str + [...]`
# lists in later sections) sees this shorter three-item list, so results depend
# on cell execution order. Consider a SAINT-specific name instead.
known_err_str = [
    "RuntimeError: CUDA out of memory.",
    "TimeoutException",  # our timeout exception
    "Cannot allocate memory",
]

# For every known signature, report how many unique SAINT tracebacks contain it.
for err_str in known_err_str:
    print(f"error type: {err_str}")
    print(f"num of unique errors with this string: {len([e for e in unique_errors if err_str in e])} of {len(unique_errors)}")


error type: RuntimeError: CUDA out of memory.
num of unique errors with this string: 64 of 73
error type: TimeoutException
num of unique errors with this string: 9 of 73
error type: Cannot allocate memory
num of unique errors with this string: 0 of 73


In [35]:
# Keep only the tracebacks that match none of the known signatures, then report the count.
remaining_errors = [
    e for e in unique_errors
    if not any(s in e for s in known_err_str)
]
n_remaining = len(remaining_errors)
print(f"{n_remaining} remaining errros")

0 remaining errros


In [42]:
# Collect the unique tracebacks that report a CUDA out-of-memory failure.
oom_marker = "CUDA out of memory"
es = [trace for trace in unique_errors if oom_marker in trace]

In [43]:
# Show one representative out-of-memory traceback in full (raises IndexError if `es` is empty).
print(es[0])

Traceback (most recent call last):
  File "/home/shared/tabzilla/TabSurvey/tabzilla_experiment.py", line 137, in __call__
    result = cross_validation(model, self.dataset, self.time_limit)
  File "/home/shared/tabzilla/TabSurvey/tabzilla_utils.py", line 236, in cross_validation
    loss_history, val_loss_history = curr_model.fit(
  File "/home/shared/tabzilla/TabSurvey/models/saint.py", line 142, in fit
    loss.backward()
  File "/opt/conda/envs/torch/lib/python3.10/site-packages/torch/_tensor.py", line 363, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/opt/conda/envs/torch/lib/python3.10/site-packages/torch/autograd/__init__.py", line 173, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: CUDA out of memory. Tried to allocate 242.00 MiB (GPU 0; 14.76 GiB total capacity; 13.26 GiB already allocated; 43.75 MiB free; 13.65 GiB reserved in total by PyT

In [52]:
# List every dataset on which SAINT raised a CUDA out-of-memory error, one name per line.
is_oom = errors_df['exception'].str.contains('CUDA out of memory')
is_saint = errors_df['alg_name'] == 'SAINT'
d = list(errors_df[is_oom & is_saint]['dataset_name'].unique())
for i in d:
    print(i)

openml__Amazon_employee_access__34539
openml__Click_prediction_small__190408
openml__GesturePhaseSegmentationProcessed__14969
openml__JapaneseVowels__3510
openml__LED-display-domain-7digit__125921
openml__Wine__190420
openml__Wisconsin-breast-cancer-cytology-features__361003
openml__adult-census__3953
openml__arrhythmia__5
openml__artificial-characters__14964
openml__breast-cancer__145799
openml__breast-w__15
openml__california__361089
openml__cjs__14967
openml__cmc__23
openml__dermatology__35
openml__diabetes__37
openml__dresses-sales__125920
openml__ecoli__145977
openml__eeg-eye-state__14951
openml__eye_movements__3897
openml__fertility__9984
openml__glass__40
openml__har__14970
openml__heart-c__48
openml__heart-h__50
openml__ilpd__9971
openml__ionosphere__145984
openml__kin8nm__2280
openml__kr-vs-kp__3
openml__madelon__9976
openml__nursery__9892
openml__page-blocks__30
openml__postoperative-patient-data__146210
openml__profb__3561
openml__semeion__9964
openml__solar-flare__2068
open

Done: all 73 unique SAINT errors are CUDA out-of-memory errors (64) or timeouts (9).

## rtdl_FTTransformer

In [29]:
# Select the algorithm under review and de-duplicate its tracebacks.
alg_name = "rtdl_FTTransformer"
alg_errors = errors_by_alg[alg_name]
unique_errors = list(set(alg_errors))

print(f"number of errors for alg {alg_name}: {len(alg_errors)}")
print(f"number of unique errors: {len(unique_errors)}")


number of errors for alg rtdl_FTTransformer: 4057
number of unique errors: 30


In [42]:
# Shared known signatures plus the FT-Transformer-specific ones.
tmp_known_err_str = [
    *known_err_str,
    # issue: https://github.com/naszilla/tabzilla/issues/84
    "If self.num_tokenizer is",
    # issue: https://github.com/naszilla/tabzilla/issues/84
    "If self.cat_tokenizer is",
    "FileNotFoundError: [Errno 2] No such file or directory: ",
]

# For every signature, report how many unique tracebacks contain it.
n_unique = len(unique_errors)
for err_str in tmp_known_err_str:
    n_matching = sum(1 for e in unique_errors if err_str in e)
    print(f"error type: {err_str}")
    print(f"num of unique errors with this string: {n_matching} of {n_unique}")


error type: RuntimeError: CUDA out of memory.
num of unique errors with this string: 19 of 30
error type: TimeoutException
num of unique errors with this string: 0 of 30
error type: Cannot allocate memory
num of unique errors with this string: 0 of 30
error type: If self.num_tokenizer is
num of unique errors with this string: 2 of 30
error type: If self.cat_tokenizer is
num of unique errors with this string: 2 of 30
error type: FileNotFoundError: [Errno 2] No such file or directory: 
num of unique errors with this string: 7 of 30


In [43]:
# Keep only the tracebacks that match none of the known signatures, then report the count.
remaining_errors = [
    e for e in unique_errors
    if not any(s in e for s in tmp_known_err_str)
]
n_remaining = len(remaining_errors)
print(f"{n_remaining} remaining errros")

0 remaining errros


In [54]:
# Drill into the tokenizer failures (tabzilla issue #84). "If self." is the
# common prefix of the two tokenizer signatures, "If self.num_tokenizer is" and
# "If self.cat_tokenizer is".
err_str = "If self."
n_tokenizer_errors = sum(1 for e in errors_by_alg[alg_name] if err_str in e)
# The label previously read "filenotfound", copied from another check; this
# count is for the tokenizer marker above.
print(f"number of tokenizer errors: {n_tokenizer_errors}")

# Match the marker literally: with the default regex=True the trailing "."
# would match any character rather than a literal dot.
has_err = errors_df["exception"].str.contains(err_str, regex=False)

# which algs?
print(f"algs: {errors_df.loc[has_err, 'alg_name'].value_counts()}")

# which datasets?
print(f"datasets: {errors_df.loc[has_err, 'dataset_name'].value_counts()}")


number of filenotfound errors: 3780
algs: rtdl_FTTransformer    3780
Name: alg_name, dtype: int64
datasets: openml__APSFailure__168868    30
openml__mushroom__24          30
openml__poker-hand__9890      30
openml__phoneme__9952         30
openml__philippine__190410    30
                              ..
openml__dna__167140           30
openml__dionis__189355        30
openml__dilbert__168909       30
openml__diabetes__37          30
openml__yeast__145793         30
Name: dataset_name, Length: 126, dtype: int64


In [57]:
# Per-dataset counts for rows matching `err_str`, which must already be set by an earlier cell.
has_err = errors_df['exception'].str.contains(err_str)
print(errors_df.loc[has_err, 'dataset_name'].value_counts())


openml__APSFailure__168868    30
openml__mushroom__24          30
openml__poker-hand__9890      30
openml__phoneme__9952         30
openml__philippine__190410    30
                              ..
openml__dna__167140           30
openml__dionis__189355        30
openml__dilbert__168909       30
openml__diabetes__37          30
openml__yeast__145793         30
Name: dataset_name, Length: 126, dtype: int64


## DANet

In [32]:
# Select the algorithm under review and de-duplicate its tracebacks.
alg_name = "DANet"
alg_errors = errors_by_alg[alg_name]
unique_errors = list(set(alg_errors))

print(f"number of errors for alg {alg_name}: {len(alg_errors)}")
print(f"number of unique errors: {len(unique_errors)}")


number of errors for alg DANet: 160
number of unique errors: 12


In [36]:
# Shared known signatures plus the DANet-specific one.
tmp_known_err_str = [
    *known_err_str,
    # issue: https://github.com/naszilla/tabzilla/issues/84
    # NOTE(review): same link as the FT-Transformer tokenizer entries; confirm this is the right issue.
    "ValueError: Expected more than",
]

# For every signature, report how many unique tracebacks contain it.
n_unique = len(unique_errors)
for err_str in tmp_known_err_str:
    n_matching = sum(1 for e in unique_errors if err_str in e)
    print(f"error type: {err_str}")
    print(f"num of unique errors with this string: {n_matching} of {n_unique}")


error type: RuntimeError: CUDA out of memory.
num of unique errors with this string: 0 of 12
error type: TimeoutException
num of unique errors with this string: 9 of 12
error type: Cannot allocate memory
num of unique errors with this string: 0 of 12
error type: ValueError: Expected more than
num of unique errors with this string: 3 of 12


In [37]:
# Keep only the tracebacks that match none of the known signatures, then report the count.
remaining_errors = [
    e for e in unique_errors
    if not any(s in e for s in tmp_known_err_str)
]
n_remaining = len(remaining_errors)
print(f"{n_remaining} remaining errros")

0 remaining errros


In [44]:
# Disabled drill-down for the DANet "ValueError: Expected more than" failure.
# This cell contains no live code; uncomment the lines below to list the
# affected datasets and algorithms, or delete the cell if it is no longer needed.
# # which datasets?
# err_str = "ValueError: Expected more than"
# print(f"datasets: {errors_df[errors_df['exception'].str.contains(err_str)]['dataset_name'].value_counts()}")

# # which algs?
# print(f"algs: {errors_df[errors_df['exception'].str.contains(err_str)]['alg_name'].value_counts()}")



Done: the 12 unique DANet errors are timeouts (9) or `ValueError: Expected more than` (3).