-
-
Notifications
You must be signed in to change notification settings - Fork 18.9k
Closed
Labels
Dependencies: Required and optional dependencies
Description
When reading a parquet file from S3 with pd.read_parquet() after that file has been overwritten directly in S3 from another terminal, we get the error "ArrowIOError: Invalid parquet file. Corrupt footer."
The code to replicate this error is:
import pandas as pd
pd.show_versions()
n = 5
df = pd.DataFrame()
for i in range(n):
    df[str(i)] = [1, 2, 3]
bucket = 'XXXXXX'
file_name = f's3://{bucket}/dataframe.pqt'
df.to_parquet(file_name)
df = pd.read_parquet(file_name)
Then upload a modified DataFrame with the same name from another terminal:
import pandas as pd
n = 6
df = pd.DataFrame()
for i in range(n):
    df[str(i)] = [1, 2, 3]
bucket = 'XXXXXX'
file_name = f's3://{bucket}/dataframe.pqt'
df.to_parquet(file_name)
Then, after trying to read it again from the first terminal:
df = pd.read_parquet(file_name)
We get an error:
----> 1 df = pd.read_parquet(file_name)
~/venv/lib/python3.6/site-packages/pandas/io/parquet.py in read_parquet(path, engine, columns, **kwargs)
313
314 impl = get_engine(engine)
--> 315 return impl.read(path, columns=columns, **kwargs)
~/venv/lib/python3.6/site-packages/pandas/io/parquet.py in read(self, path, columns, **kwargs)
129 def read(self, path, columns=None, **kwargs):
130 parquet_ds = self.api.parquet.ParquetDataset(
--> 131 path, filesystem=get_fs_for_path(path), **kwargs
132 )
133 kwargs["columns"] = columns
~/venv/lib/python3.6/site-packages/pyarrow/parquet.py in __init__(self, path_or_paths, filesystem, schema, metadata, split_row_groups, validate_schema, filters, metadata_nthreads, read_dictionary, memory_map, buffer_size)
1058
1059 if validate_schema:
-> 1060 self.validate_schemas()
1061
1062 def equals(self, other):
~/venv/lib/python3.6/site-packages/pyarrow/parquet.py in validate_schemas(self)
1090 self.schema = self.common_metadata.schema
1091 else:
-> 1092 self.schema = self.pieces[0].get_metadata().schema
1093 elif self.schema is None:
1094 self.schema = self.metadata.schema
~/venv/lib/python3.6/site-packages/pyarrow/parquet.py in get_metadata(self)
558 metadata : FileMetaData
559 """
--> 560 f = self.open()
561 return f.metadata
562
~/venv/lib/python3.6/site-packages/pyarrow/parquet.py in open(self)
565 Returns instance of ParquetFile
566 """
--> 567 reader = self.open_file_func(self.path)
568 if not isinstance(reader, ParquetFile):
569 reader = ParquetFile(reader, **self.file_options)
~/venv/lib/python3.6/site-packages/pyarrow/parquet.py in _open_dataset_file(dataset, path, meta)
940 read_dictionary=dataset.read_dictionary,
941 common_metadata=dataset.common_metadata,
--> 942 buffer_size=dataset.buffer_size
943 )
944
~/venv/lib/python3.6/site-packages/pyarrow/parquet.py in __init__(self, source, metadata, common_metadata, read_dictionary, memory_map, buffer_size)
135 self.reader.open(source, use_memory_map=memory_map,
136 buffer_size=buffer_size,
--> 137 read_dictionary=read_dictionary, metadata=metadata)
138 self.common_metadata = common_metadata
139 self._nested_paths_by_prefix = self._build_nested_paths()
~/venv/lib/python3.6/site-packages/pyarrow/_parquet.pyx in pyarrow._parquet.ParquetReader.open()
~/venv/lib/python3.6/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status()
ArrowIOError: Invalid parquet file. Corrupt footer.
The output from pd.show_versions() is:
INSTALLED VERSIONS
------------------
commit : None
python : 3.6.8.final.0
python-bits : 64
OS : Linux
OS-release : 5.3.0-61-generic
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8
pandas : 1.0.4
numpy : 1.17.2
pytz : 2019.3
dateutil : 2.8.0
pip : 20.1.1
setuptools : 41.4.0
Cython : None
pytest : 5.2.1
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 2.11.2
IPython : 7.8.0
pandas_datareader: None
bs4 : None
bottleneck : None
fastparquet : None
gcsfs : None
lxml.etree : None
matplotlib : 3.2.2
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : 0.15.0
pytables : None
pytest : 5.2.1
pyxlsb : None
s3fs : 0.3.5
scipy : 1.4.1
sqlalchemy : 1.3.10
tables : None
tabulate : 0.8.5
xarray : None
xlrd : None
xlwt : None
xlsxwriter : None
numba : 0.46.0
To reproduce the error, the DataFrame in S3 must be modified from a different terminal; uploading the modified DataFrame and reading it again from the same terminal will not raise the error.
bradenkinard and spkjess
Metadata
Metadata
Assignees
Labels
Dependencies: Required and optional dependencies