From 5d6e3528a0b2cf3bdbb6caa6086218bbefcd0f03 Mon Sep 17 00:00:00 2001 From: Shoham Debnath Date: Fri, 10 Sep 2021 00:51:27 +0530 Subject: [PATCH] Backport PR #43150 on branch 1.3.x (BUG: GroupBy.quantile fails with pd.NA) (#43417) --- doc/source/whatsnew/v1.3.3.rst | 1 + pandas/core/groupby/groupby.py | 4 +++ pandas/tests/groupby/test_quantile.py | 41 +++++++++++++++++++++++++++ 3 files changed, 46 insertions(+) diff --git a/doc/source/whatsnew/v1.3.3.rst b/doc/source/whatsnew/v1.3.3.rst index c066d015f5e62..1ecae3d344f79 100644 --- a/doc/source/whatsnew/v1.3.3.rst +++ b/doc/source/whatsnew/v1.3.3.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed regression in :class:`DataFrame` constructor failing to broadcast for defined :class:`Index` and len one list of :class:`Timestamp` (:issue:`42810`) - Performance regression in :meth:`core.window.ewm.ExponentialMovingWindow.mean` (:issue:`42333`) - Fixed regression in :meth:`.GroupBy.agg` incorrectly raising in some cases (:issue:`42390`) +- Fixed regression in :meth:`.GroupBy.quantile` which was failing with ``pandas.NA`` (:issue:`42849`) - Fixed regression in :meth:`.GroupBy.apply` where ``nan`` values were dropped even with ``dropna=False`` (:issue:`43205`) - Fixed regression in :meth:`merge` where ``on`` columns with ``ExtensionDtype`` or ``bool`` data types were cast to ``object`` in ``right`` and ``outer`` merge (:issue:`40073`) - Fixed regression in :meth:`RangeIndex.where` and :meth:`RangeIndex.putmask` raising ``AssertionError`` when result did not represent a :class:`RangeIndex` (:issue:`43240`) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index edc5d5c6903b4..ede5f5298b129 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -63,6 +63,7 @@ class providing the base-class of operations. from pandas.core.dtypes.common import ( is_bool_dtype, is_datetime64_dtype, + is_float_dtype, is_integer_dtype, is_numeric_dtype, is_object_dtype, @@ -2450,6 +2451,9 @@ def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, np.dtype | None]: elif is_timedelta64_dtype(vals.dtype): inference = np.dtype("timedelta64[ns]") out = np.asarray(vals).astype(float) + elif isinstance(vals, ExtensionArray) and is_float_dtype(vals): + inference = np.dtype(np.float64) + out = vals.to_numpy(dtype=float, na_value=np.nan) else: out = np.asarray(vals) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 90437b9139594..ebcc31226b895 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -248,6 +248,47 @@ def test_groupby_quantile_skips_invalid_dtype(q): tm.assert_frame_equal(result, expected) +def test_groupby_quantile_NA_float(any_float_allowed_nullable_dtype): + # GH#42849 + df = DataFrame( + {"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_allowed_nullable_dtype + ) + result = df.groupby("x")["y"].quantile(0.5) + expected = pd.Series([0.2], dtype=float, index=Index(df["x"][:1]), name="y") + tm.assert_series_equal(expected, result) + + result = df.groupby("x")["y"].quantile([0.5, 0.75]) + expected = pd.Series( + [0.2] * 2, + index=pd.MultiIndex.from_arrays( + [Index(df["x"]), [0.5, 0.75]], names=["x", None] + ), + name="y", + ) + tm.assert_series_equal(result, expected) + + +def test_groupby_quantile_NA_int(any_nullable_int_dtype): + # GH#42849 + df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_nullable_int_dtype) + result = df.groupby("x")["y"].quantile(0.5) + expected = pd.Series([3.5], dtype=float, index=Index(df["x"][:1]), name="y") + tm.assert_series_equal(expected, result) + + result = df.groupby("x").quantile(0.5) + expected = DataFrame({"y": 3.5}, index=Index(df["x"][:1])) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["Float64", "Float32"]) +def test_groupby_quantile_allNA_column(dtype): + # GH#42849 + df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype) + result = df.groupby("x")["y"].quantile(0.5) + expected = pd.Series([np.nan], dtype=float, index=Index(df["x"][:1]), name="y") + tm.assert_series_equal(expected, result) + + def test_groupby_timedelta_quantile(): # GH: 29485 df = DataFrame(