-
Notifications
You must be signed in to change notification settings - Fork 38
Open
Description
I was using the Colab notebook to train a model with wav2vec2forclassification. In the preprocessing step, I run the following code:
train_dataset = train_dataset.map(
preprocess_function,
batch_size=10,
batched=True,
)
eval_dataset = eval_dataset.map(
preprocess_function,
batch_size=10,
batched=True,
)
I get the following error. I suspect it has something to do with Hugging Face Datasets:
0%| | 0/1765 [00:00<?, ?ba/s]
---------------------------------------------------------------------------
ArrowInvalid Traceback (most recent call last)
/tmp/ipykernel_29222/3011913806.py in <module>
2
3
----> 4 train_dataset = train_dataset.map(
5 preprocess_function,
6 batch_size=10,
~/.pyenv/versions/3.8.7/envs/bg_classifier/lib/python3.8/site-packages/datasets/arrow_dataset.py in map(self, function, with_indices, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)
1667
1668 if num_proc is None or num_proc == 1:
-> 1669 return self._map_single(
1670 function=function,
1671 with_indices=with_indices,
~/.pyenv/versions/3.8.7/envs/bg_classifier/lib/python3.8/site-packages/datasets/arrow_dataset.py in wrapper(*args, **kwargs)
183 }
184 # apply actual function
--> 185 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
186 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
187 # re-apply format to the output
~/.pyenv/versions/3.8.7/envs/bg_classifier/lib/python3.8/site-packages/datasets/fingerprint.py in wrapper(*args, **kwargs)
395 # Call actual function
396
--> 397 out = func(self, *args, **kwargs)
398
399 # Update fingerprint of in-place transforms + update in-place history of transforms
~/.pyenv/versions/3.8.7/envs/bg_classifier/lib/python3.8/site-packages/datasets/arrow_dataset.py in _map_single(self, function, with_indices, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset, disable_tqdm, desc)
2036 else:
2037 batch = cast_to_python_objects(batch)
-> 2038 writer.write_batch(batch)
2039 if update_data and writer is not None:
2040 writer.finalize() # close_stream=bool(buf_writer is None)) # We only close if we are writing in a file
~/.pyenv/versions/3.8.7/envs/bg_classifier/lib/python3.8/site-packages/datasets/arrow_writer.py in write_batch(self, batch_examples, writer_batch_size)
401 typed_sequence = OptimizedTypedSequence(batch_examples[col], type=col_type, try_type=col_try_type, col=col)
402 typed_sequence_examples[col] = typed_sequence
--> 403 pa_table = pa.Table.from_pydict(typed_sequence_examples)
404 self.write_table(pa_table, writer_batch_size)
405
~/.pyenv/versions/3.8.7/envs/bg_classifier/lib/python3.8/site-packages/pyarrow/table.pxi in pyarrow.lib.Table.from_pydict()
~/.pyenv/versions/3.8.7/envs/bg_classifier/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib.asarray()
~/.pyenv/versions/3.8.7/envs/bg_classifier/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib.array()
~/.pyenv/versions/3.8.7/envs/bg_classifier/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib._handle_arrow_array_protocol()
~/.pyenv/versions/3.8.7/envs/bg_classifier/lib/python3.8/site-packages/datasets/arrow_writer.py in __arrow_array__(self, type)
105 out = numpy_to_pyarrow_listarray(self.data)
106 else:
--> 107 out = pa.array(self.data, type=type)
108 if trying_type and out[0].as_py() != self.data[0]:
109 raise TypeError(
~/.pyenv/versions/3.8.7/envs/bg_classifier/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib.array()
~/.pyenv/versions/3.8.7/envs/bg_classifier/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib._sequence_to_array()
~/.pyenv/versions/3.8.7/envs/bg_classifier/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.pyarrow_internal_check_status()
~/.pyenv/versions/3.8.7/envs/bg_classifier/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status()
ArrowInvalid: Can only convert 1-dimensional array values
Metadata
Metadata
Assignees
Labels
No labels