Backport fastparquet 0.7 compat (PR #42954 and #42919) (#42987)
lithomas1 committed Aug 12, 2021
1 parent 0130d77 commit 255b796
Showing 6 changed files with 62 additions and 38 deletions.
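In short: the fastparquet <0.7.0 upper bound is dropped from the CI and dev dependency files, the fastparquet engine now passes pandas_nulls=False to fastparquet 0.7.1+ so nullable-dtype conversion stays disabled pending discussion, and test_use_nullable_dtypes moves to the shared Base class so it runs per engine.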
4 changes: 2 additions & 2 deletions ci/deps/actions-37-db.yaml
@@ -15,7 +15,7 @@ dependencies:
   - beautifulsoup4
   - botocore>=1.11
   - dask
-  - fastparquet>=0.4.0, < 0.7.0
+  - fastparquet>=0.4.0
   - fsspec>=0.7.4, <2021.6.0
   - gcsfs>=0.6.0
   - geopandas
@@ -25,7 +25,7 @@ dependencies:
   - flask
   - nomkl
   - numexpr
-  - numpy=1.17.*
+  - numpy=1.18.*
   - odfpy
   - openpyxl
   - pandas-gbq
2 changes: 1 addition & 1 deletion ci/deps/azure-windows-38.yaml
@@ -15,7 +15,7 @@ dependencies:
   # pandas dependencies
   - blosc
   - bottleneck
-  - fastparquet>=0.4.0, <0.7.0
+  - fastparquet>=0.4.0
   - flask
   - fsspec>=0.8.0, <2021.6.0
   - matplotlib=3.1.3
2 changes: 1 addition & 1 deletion environment.yml
@@ -99,7 +99,7 @@ dependencies:
   - xlwt
   - odfpy
 
-  - fastparquet>=0.3.2, <0.7.0  # pandas.read_parquet, DataFrame.to_parquet
+  - fastparquet>=0.3.2  # pandas.read_parquet, DataFrame.to_parquet
   - pyarrow>=0.17.0  # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather
   - python-snappy  # required by pyarrow
 
9 changes: 7 additions & 2 deletions pandas/io/parquet.py
@@ -309,14 +309,17 @@ def write(
     def read(
         self, path, columns=None, storage_options: StorageOptions = None, **kwargs
     ):
+        parquet_kwargs: dict[str, Any] = {}
         use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
+        if Version(self.api.__version__) >= Version("0.7.1"):
+            # We are disabling nullable dtypes for fastparquet pending discussion
+            parquet_kwargs["pandas_nulls"] = False
         if use_nullable_dtypes:
             raise ValueError(
                 "The 'use_nullable_dtypes' argument is not supported for the "
                 "fastparquet engine"
             )
         path = stringify_path(path)
-        parquet_kwargs = {}
         handles = None
         if is_fsspec_url(path):
             fsspec = import_optional_dependency("fsspec")
@@ -337,6 +340,7 @@
                 path, "rb", is_text=False, storage_options=storage_options
             )
             path = handles.handle
+
         parquet_file = self.api.ParquetFile(path, **parquet_kwargs)
 
         result = parquet_file.to_pandas(columns=columns, **kwargs)
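Taken together, these hunks version-gate the keyword arguments handed to fastparquet. A minimal standalone sketch of the same gate (illustrative only: the path is a placeholder, and packaging.version stands in for pandas' vendored Version):

# Sketch of the compatibility gate above, outside pandas' engine class.
from packaging.version import Version

import fastparquet

parquet_kwargs = {}
if Version(fastparquet.__version__) >= Version("0.7.1"):
    # fastparquet 0.7.1 converts nullable columns to pd.NA-backed dtypes by
    # default; pandas opts out here pending discussion.
    parquet_kwargs["pandas_nulls"] = False

pf = fastparquet.ParquetFile("data.parquet", **parquet_kwargs)  # placeholder path
df = pf.to_pandas()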
@@ -470,7 +474,8 @@ def read_parquet(
     use_nullable_dtypes : bool, default False
         If True, use dtypes that use ``pd.NA`` as missing value indicator
-        for the resulting DataFrame (only applicable for ``engine="pyarrow"``).
+        for the resulting DataFrame. (only applicable for the ``pyarrow``
+        engine)
         As new dtypes are added that support ``pd.NA`` in the future, the
         output with this option will change to use those dtypes.
         Note: this is an experimental option, and behaviour (e.g. additional
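For reference, a sketch of the documented pyarrow-only behavior (placeholder path; writing the file with pyarrow directly avoids pandas metadata that would otherwise round-trip the extension dtype, which is also why the test below writes with pq.write_table):

# How use_nullable_dtypes changes the dtype of a column containing nulls.
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"a": pa.array([1, 2, None], type="int64")})
pq.write_table(table, "example.parquet")  # placeholder path

plain = pd.read_parquet("example.parquet", engine="pyarrow")
nullable = pd.read_parquet("example.parquet", engine="pyarrow", use_nullable_dtypes=True)
print(plain["a"].dtype)     # float64 -- NaN stands in for the missing value
print(nullable["a"].dtype)  # Int64 -- pd.NA-backed nullable integer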
81 changes: 50 additions & 31 deletions pandas/tests/io/test_parquet.py
@@ -575,6 +575,47 @@ def test_write_column_index_nonstring(self, pa):
         msg = r"parquet must have string column names"
         self.check_error_on_write(df, engine, ValueError, msg)
 
+    def test_use_nullable_dtypes(self, engine):
+        import pyarrow.parquet as pq
+
+        if engine == "fastparquet":
+            # We are manually disabling fastparquet's
+            # nullable dtype support pending discussion
+            pytest.skip("Fastparquet nullable dtype support is disabled")
+
+        table = pyarrow.table(
+            {
+                "a": pyarrow.array([1, 2, 3, None], "int64"),
+                "b": pyarrow.array([1, 2, 3, None], "uint8"),
+                "c": pyarrow.array(["a", "b", "c", None]),
+                "d": pyarrow.array([True, False, True, None]),
+                # Test that nullable dtypes used even in absence of nulls
+                "e": pyarrow.array([1, 2, 3, 4], "int64"),
+            }
+        )
+        with tm.ensure_clean() as path:
+            # write manually with pyarrow to write integers
+            pq.write_table(table, path)
+            result1 = read_parquet(path, engine=engine)
+            result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True)
+
+        assert result1["a"].dtype == np.dtype("float64")
+        expected = pd.DataFrame(
+            {
+                "a": pd.array([1, 2, 3, None], dtype="Int64"),
+                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
+                "c": pd.array(["a", "b", "c", None], dtype="string"),
+                "d": pd.array([True, False, True, None], dtype="boolean"),
+                "e": pd.array([1, 2, 3, 4], dtype="Int64"),
+            }
+        )
+        if engine == "fastparquet":
+            # Fastparquet doesn't support string columns yet
+            # Only int and boolean
+            result2 = result2.drop("c", axis=1)
+            expected = expected.drop("c", axis=1)
+        tm.assert_frame_equal(result2, expected)
+
 
 @pytest.mark.filterwarnings("ignore:CategoricalBlock is deprecated:DeprecationWarning")
 class TestParquetPyArrow(Base):
@@ -829,35 +870,6 @@ def test_additional_extension_types(self, pa):
         )
         check_round_trip(df, pa)
 
-    @td.skip_if_no("pyarrow")
-    def test_use_nullable_dtypes(self, pa):
-        import pyarrow.parquet as pq
-
-        table = pyarrow.table(
-            {
-                "a": pyarrow.array([1, 2, 3, None], "int64"),
-                "b": pyarrow.array([1, 2, 3, None], "uint8"),
-                "c": pyarrow.array(["a", "b", "c", None]),
-                "d": pyarrow.array([True, False, True, None]),
-            }
-        )
-        with tm.ensure_clean() as path:
-            # write manually with pyarrow to write integers
-            pq.write_table(table, path)
-            result1 = read_parquet(path)
-            result2 = read_parquet(path, use_nullable_dtypes=True)
-
-        assert result1["a"].dtype == np.dtype("float64")
-        expected = pd.DataFrame(
-            {
-                "a": pd.array([1, 2, 3, None], dtype="Int64"),
-                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
-                "c": pd.array(["a", "b", "c", None], dtype="string"),
-                "d": pd.array([True, False, True, None], dtype="boolean"),
-            }
-        )
-        tm.assert_frame_equal(result2, expected)
-
     def test_timestamp_nanoseconds(self, pa):
         # with version 2.0, pyarrow defaults to writing the nanoseconds, so
         # this should work without error
@@ -928,7 +940,9 @@ def test_duplicate_columns(self, fp):
     def test_bool_with_none(self, fp):
         df = pd.DataFrame({"a": [True, None, False]})
         expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
-        check_round_trip(df, fp, expected=expected)
+        # Fastparquet bug in 0.7.1 makes it so that this dtype becomes
+        # float64
+        check_round_trip(df, fp, expected=expected, check_dtype=False)
 
     def test_unsupported(self, fp):
 
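The check_dtype=False relaxation can be seen in isolation; a hedged sketch, taking the comment's word that fastparquet 0.7.1 returns float64 here:

# Values still compare equal; only the dtype check is relaxed.
import numpy as np
import pandas as pd
import pandas._testing as tm

expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
result = expected.astype("float64")  # what fastparquet 0.7.1 reportedly returns
tm.assert_frame_equal(result, expected, check_dtype=False)  # passes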
@@ -1049,9 +1063,14 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list):
         expected.index.name = "index"
         check_round_trip(df, fp, expected=expected)
 
-    def test_use_nullable_dtypes_not_supported(self, fp):
+    def test_use_nullable_dtypes_not_supported(self, monkeypatch, fp):
         df = pd.DataFrame({"a": [1, 2]})
 
+        # This is supported now in fastparquet 0.7.1 and above actually
+        # Still need to ensure that this raises in all versions below
+        import fastparquet as fp
+
+        monkeypatch.setattr(fp, "__version__", "0.4")
         with tm.ensure_clean() as path:
             df.to_parquet(path)
             with pytest.raises(ValueError, match="not supported for the fastparquet"):
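From the user's side, the error path pinned down by this test looks like the following (placeholder path; per the engine change above, the check runs before the file is opened):

import pandas as pd

# Raises: ValueError: The 'use_nullable_dtypes' argument is not supported
# for the fastparquet engine
pd.read_parquet("example.parquet", engine="fastparquet", use_nullable_dtypes=True)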
2 changes: 1 addition & 1 deletion requirements-dev.txt
@@ -64,7 +64,7 @@ xlrd
 xlsxwriter
 xlwt
 odfpy
-fastparquet>=0.3.2, <0.7.0
+fastparquet>=0.3.2
 pyarrow>=0.17.0
 python-snappy
 pyqt5>=5.9.2
