From db9a456ef0f88812f341e5a2e401becc8303706a Mon Sep 17 00:00:00 2001 From: mrastgoo Date: Sat, 13 May 2023 16:09:19 +0200 Subject: [PATCH 1/6] filters parameters in pd.read_parquet --- pandas/io/parquet.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index e8670757e1669..e4c9dab3ebb5a 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -228,6 +228,7 @@ def read( self, path, columns=None, + filters=None, use_nullable_dtypes: bool = False, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, storage_options: StorageOptions = None, @@ -257,7 +258,11 @@ def read( ) try: pa_table = self.api.parquet.read_table( - path_or_handle, columns=columns, filesystem=filesystem, **kwargs + path_or_handle, + columns=columns, + filesystem=filesystem, + filters=filters, + **kwargs, ) result = pa_table.to_pandas(**to_pandas_kwargs) @@ -335,6 +340,7 @@ def read( self, path, columns=None, + filters=None, storage_options: StorageOptions = None, filesystem=None, **kwargs, @@ -375,7 +381,7 @@ def read( try: parquet_file = self.api.ParquetFile(path, **parquet_kwargs) - return parquet_file.to_pandas(columns=columns, **kwargs) + return parquet_file.to_pandas(columns=columns, filters=filters, **kwargs) finally: if handles is not None: handles.close() @@ -483,6 +489,7 @@ def read_parquet( path: FilePath | ReadBuffer[bytes], engine: str = "auto", columns: list[str] | None = None, + filters: list[tuple] | list[list[tuple]] | None = None, storage_options: StorageOptions = None, use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, @@ -517,6 +524,21 @@ def read_parquet( if you wish to use its implementation. columns : list, default=None If not None, only these columns will be read from the file. + filters : List[Tuple] or List[List[Tuple]], default None + To filter out data. 
+ Filter syntax: [[(column, op, val), ...],...] + where op is [==, =, >, >=, <, <=, !=, in, not in] + The innermost tuples are transposed into a set of filters applied + through an `AND` operation. + The outer list combines these sets of filters through an `OR` + operation. + A single list of tuples can also be used, meaning that no `OR` + operation between set of filters is to be conducted. + + Using this argument will NOT result in row-wise filtering of the final + partitions unless ``engine="pyarrow"`` is also specified. For + other engines, filtering is only performed at the partition level, that is, + to prevent the loading of some row-groups and/or files. {storage_options} @@ -555,6 +577,8 @@ def read_parquet( Returns ------- DataFrame + + """ impl = get_engine(engine) @@ -575,6 +599,7 @@ def read_parquet( return impl.read( path, columns=columns, + filters=filters, storage_options=storage_options, use_nullable_dtypes=use_nullable_dtypes, dtype_backend=dtype_backend, From 4624beedd1b61e854a2338b29bcf602a75136cb3 Mon Sep 17 00:00:00 2001 From: mrastgoo Date: Sat, 13 May 2023 17:38:56 +0200 Subject: [PATCH 2/6] linter --- pandas/io/parquet.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index e4c9dab3ebb5a..671d8d4333f69 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -578,7 +578,6 @@ def read_parquet( ------- DataFrame - """ impl = get_engine(engine) From 9b4439d9abb1fb3dd08670e9a2901b78faf8be0a Mon Sep 17 00:00:00 2001 From: mrastgoo Date: Sun, 14 May 2023 10:44:59 +0200 Subject: [PATCH 3/6] docstring validation --- pandas/io/parquet.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 671d8d4333f69..986bbf686839d 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -577,7 +577,6 @@ def read_parquet( Returns ------- DataFrame - """ impl = get_engine(engine) From 4e94179e49d4d33f5c2a8fa18a67d65581e24aa0 Mon Sep 17 00:00:00 2001 From: 
mrastgoo Date: Wed, 5 Jul 2023 21:54:13 +0200 Subject: [PATCH 4/6] test for filter args in pd.read_parquet --- pandas/tests/io/test_parquet.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 35bf75d3928f8..490f783cbb5a5 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -424,6 +424,20 @@ def test_read_columns(self, engine): check_round_trip( df, engine, expected=expected, read_kwargs={"columns": ["string"]} ) + + def test_read_filters(self, engine, tmp_path): + df = pd.DataFrame({"int": list(range(4)), "part": list("aabb"),}) + + expected = pd.DataFrame({"int": [0, 1]}) + check_round_trip( + df, + engine, + path=tmp_path, + expected=expected, + write_kwargs={"partition_cols": ["part"]}, + read_kwargs={"filters": [("part", "==", "a")], "columns":["int"]}, + repeat=1, + ) def test_write_index(self, engine, using_copy_on_write, request): check_names = engine != "fastparquet" From 36cbfe2b3b943bbdae272092cee03e8fc05eec39 Mon Sep 17 00:00:00 2001 From: mrastgoo Date: Wed, 5 Jul 2023 23:59:05 +0200 Subject: [PATCH 5/6] black --- pandas/tests/io/test_parquet.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 490f783cbb5a5..a38c6be12ed89 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -424,9 +424,14 @@ def test_read_columns(self, engine): check_round_trip( df, engine, expected=expected, read_kwargs={"columns": ["string"]} ) - + def test_read_filters(self, engine, tmp_path): - df = pd.DataFrame({"int": list(range(4)), "part": list("aabb"),}) + df = pd.DataFrame( + { + "int": list(range(4)), + "part": list("aabb"), + } + ) expected = pd.DataFrame({"int": [0, 1]}) check_round_trip( @@ -435,7 +440,7 @@ def test_read_filters(self, engine, tmp_path): path=tmp_path, expected=expected, write_kwargs={"partition_cols": 
["part"]}, - read_kwargs={"filters": [("part", "==", "a")], "columns":["int"]}, + read_kwargs={"filters": [("part", "==", "a")], "columns": ["int"]}, repeat=1, ) From d324e1b9db9f33918416abb30ef7a72aaec480d2 Mon Sep 17 00:00:00 2001 From: mrastgoo Date: Wed, 2 Aug 2023 09:53:39 +0200 Subject: [PATCH 6/6] addressing reviews --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/io/parquet.py | 36 ++++++++++++++++++---------------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 6c91c4b512f41..4aa6630fe909f 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -669,6 +669,7 @@ I/O ^^^ - :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`) - :meth:`DataFrame.to_sql` now raising ``ValueError`` when the name param is left empty while using SQLAlchemy to connect (:issue:`52675`) +- Added ``filters`` parameter to :func:`read_parquet` to filter out data, compatible with both ``engines`` (:issue:`53212`) - Bug in :func:`json_normalize`, fix json_normalize cannot parse metadata fields list type (:issue:`37782`) - Bug in :func:`read_csv` where it would error when ``parse_dates`` was set to a list or dictionary with ``engine="pyarrow"`` (:issue:`47961`) - Bug in :func:`read_csv`, with ``engine="pyarrow"`` erroring when specifying a ``dtype`` with ``index_col`` (:issue:`53229`) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 141fc421a2ed6..90d59b0dfcfc8 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -489,11 +489,11 @@ def read_parquet( path: FilePath | ReadBuffer[bytes], engine: str = "auto", columns: list[str] | None = None, - filters: list[tuple] | list[list[tuple]] | None = None, storage_options: StorageOptions | None = None, use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, filesystem: Any = None, + filters: 
list[tuple] | list[list[tuple]] | None = None, **kwargs, ) -> DataFrame: """ @@ -524,22 +524,6 @@ def read_parquet( if you wish to use its implementation. columns : list, default=None If not None, only these columns will be read from the file. - filters : List[Tuple] or List[List[Tuple]], default None - To filter out data. - Filter syntax: [[(column, op, val), ...],...] - where op is [==, =, >, >=, <, <=, !=, in, not in] - The innermost tuples are transposed into a set of filters applied - through an `AND` operation. - The outer list combines these sets of filters through an `OR` - operation. - A single list of tuples can also be used, meaning that no `OR` - operation between set of filters is to be conducted. - - Using this argument will NOT result in row-wise filtering of the final - partitions unless ``engine="pyarrow"`` is also specified. For - other engines, filtering is only performed at the partition level, that is, - to prevent the loading of some row-groups and/or files. - {storage_options} .. versionadded:: 1.3.0 @@ -572,6 +556,24 @@ def read_parquet( .. versionadded:: 2.1.0 + filters : List[Tuple] or List[List[Tuple]], default None + To filter out data. + Filter syntax: [[(column, op, val), ...],...] + where op is [==, =, >, >=, <, <=, !=, in, not in] + The innermost tuples are transposed into a set of filters applied + through an `AND` operation. + The outer list combines these sets of filters through an `OR` + operation. + A single list of tuples can also be used, meaning that no `OR` + operation between set of filters is to be conducted. + + Using this argument will NOT result in row-wise filtering of the final + partitions unless ``engine="pyarrow"`` is also specified. For + other engines, filtering is only performed at the partition level, that is, + to prevent the loading of some row-groups and/or files. + + .. versionadded:: 2.1.0 + **kwargs Any additional kwargs are passed to the engine.