From dc830eab5516f6911c894abdb1e3782cb18c503e Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 13 Jul 2023 17:40:08 +0100 Subject: [PATCH] ENH: better dtype inference when doing DataFrame reductions (#52788) * ENH: better dtype inference when doing DataFrame reductions * precommit issues * fix failures * fix failures * mypy + some docs * doc linting linting * refactor to use _reduce_with_wrap * docstring linting * pyarrow failure + linting * pyarrow failure + linting * linting * doc stuff * linting fixes * fix fix doc string * remove _wrap_na_result * doc string example * pyarrow + categorical * silence bugs * silence errors * silence errors II * fix errors III * various fixups * various fixups * delay fixing windows and 32bit failures * BUG: Adding a columns to a Frame with RangeIndex columns using a non-scalar key (#52877) * DOC: Update whatsnew (#52882) * CI: Change development python version to 3.10 (#51133) * CI: Change development python version to 3.10 * Update checks * Remove strict * Remove strict * Fixes * Add dt * Switch python to 3.9 * Remove * Fix * Try attribute * Adjust * Fix mypy * Try fixing doc build * Fix mypy * Fix stubtest * Remove workflow file * Rename back * Update * Rename * Rename * Change python version * Remove * Fix doc errors * Remove pypy * Update ci/deps/actions-pypy-39.yaml Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> * Revert pypy removal * Remove again * Fix * Change to 3.9 * Address --------- Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> * update * update * add docs * fix windows tests * fix windows tests * remove guards for 32bit linux * add bool tests + fix 32-bit failures * fix pre-commit failures * fix mypy failures * rename _reduce_with -> _reduce_and_wrap * assert missing attributes * reduction dtypes on windows and 32bit systems * add tests for min_count=0 * PERF:median with axis=1 * median with axis=1 fix * streamline Block.reduce * fix comments * FIX preserve 
dtype with datetime columns of different resolution when merging (#53213) * BUG Merge not behaving correctly when having `MultiIndex` with a single level (#53215) * fix merge when MultiIndex with single level * resolved conversations * fixed code style * BUG: preserve dtype for right/outer merge of datetime with different resolutions (#53233) * remove special BooleanArray.sum method * remove BooleanArray.prod * fixes * Update doc/source/whatsnew/v2.1.0.rst Co-authored-by: Joris Van den Bossche * Update pandas/core/array_algos/masked_reductions.py Co-authored-by: Joris Van den Bossche * small cleanup * small cleanup * only reduce 1d * fix after #53418 * update according to comments * revome note * update _minmax * REF: add keepdims parameter to ExtensionArray._reduce + remove ExtensionArray._reduce_and_wrap * REF: add keepdims parameter to ExtensionArray._reduce + remove ExtensionArray._reduce_and_wrap * fix whatsnew * fix _reduce call * simplify test * add tests for any/all --------- Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> Co-authored-by: Guillaume Lemaitre Co-authored-by: Yao Xiao <108576690+Charlie-XIAO@users.noreply.github.com> Co-authored-by: Joris Van den Bossche --- doc/source/user_guide/integer_na.rst | 3 +- doc/source/whatsnew/v2.1.0.rst | 40 ++++++ pandas/core/array_algos/masked_reductions.py | 6 +- pandas/core/arrays/arrow/array.py | 16 ++- pandas/core/arrays/base.py | 19 ++- pandas/core/arrays/categorical.py | 9 ++ pandas/core/arrays/masked.py | 89 ++++++++++--- pandas/core/arrays/sparse/array.py | 11 +- pandas/core/frame.py | 30 +++-- pandas/core/internals/blocks.py | 3 +- pandas/core/nanops.py | 8 +- .../arrays/categorical/test_analytics.py | 14 ++ pandas/tests/arrays/integer/test_reduction.py | 120 ++++++++++++++++++ pandas/tests/extension/base/dim2.py | 26 ++-- pandas/tests/extension/base/reduce.py | 10 ++ pandas/tests/extension/decimal/array.py 
| 37 +++--- .../tests/extension/decimal/test_decimal.py | 43 +++++++ pandas/tests/extension/masked_shared.py | 37 ++++++ pandas/tests/extension/test_arrow.py | 34 +++++ pandas/tests/extension/test_boolean.py | 25 ++++ pandas/tests/extension/test_numpy.py | 4 + pandas/tests/frame/test_reductions.py | 108 +++++++++++++++- pandas/tests/groupby/test_apply.py | 3 +- pandas/tests/reshape/merge/test_merge.py | 2 +- 24 files changed, 619 insertions(+), 78 deletions(-) create mode 100644 pandas/tests/arrays/integer/test_reduction.py diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index a3ccb5b0d4019..1a727cd78af09 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -126,10 +126,11 @@ These dtypes can be merged, reshaped & casted. pd.concat([df[["A"]], df[["B", "C"]]], axis=1).dtypes df["A"].astype(float) -Reduction and groupby operations such as 'sum' work as well. +Reduction and groupby operations such as :meth:`~DataFrame.sum` work as well. .. ipython:: python + df.sum(numeric_only=True) df.sum() df.groupby("B").A.sum() diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index b377ed4bfad60..b8b3e11cda63d 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -14,6 +14,46 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +.. _whatsnew_210.enhancements.reduction_extension_dtypes: + +DataFrame reductions preserve extension dtypes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions of pandas, the results of DataFrame reductions +(:meth:`DataFrame.sum` :meth:`DataFrame.mean` etc.) had numpy dtypes, even when the DataFrames +were of extension dtypes. Pandas can now keep the dtypes when doing reductions over Dataframe +columns with a common dtype (:issue:`52788`). + +*Old Behavior* + +.. 
code-block:: ipython + + In [1]: df = pd.DataFrame({"a": [1, 1, 2, 1], "b": [np.nan, 2.0, 3.0, 4.0]}, dtype="Int64") + In [2]: df.sum() + Out[2]: + a 5 + b 9 + dtype: int64 + In [3]: df = df.astype("int64[pyarrow]") + In [4]: df.sum() + Out[4]: + a 5 + b 9 + dtype: int64 + +*New Behavior* + +.. ipython:: python + + df = pd.DataFrame({"a": [1, 1, 2, 1], "b": [np.nan, 2.0, 3.0, 4.0]}, dtype="Int64") + df.sum() + df = df.astype("int64[pyarrow]") + df.sum() + +Notice that the dtype is now a masked dtype and pyarrow dtype, respectively, while previously it was a numpy integer dtype. + +To allow DataFrame reductions to preserve extension dtypes, :meth:`ExtensionArray._reduce` has gotten a new keyword parameter ``keepdims``. Calling :meth:`ExtensionArray._reduce` with ``keepdims=True`` should return an array of length 1 along the reduction axis. In order to maintain backward compatibility, the parameter is not required, but it will become required in the future. If the parameter is not found in the signature, DataFrame reductions cannot preserve extension dtypes. Also, if the parameter is not found, a ``FutureWarning`` will be emitted and type checkers like mypy may complain about the signature not being compatible with :meth:`ExtensionArray._reduce`. + ..
_whatsnew_210.enhancements.cow: Copy-on-Write improvements diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 60a8d349984b9..335fa1afc0f4e 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -52,7 +52,7 @@ def _reductions( axis : int, optional, default None """ if not skipna: - if mask.any(axis=axis) or check_below_min_count(values.shape, None, min_count): + if mask.any() or check_below_min_count(values.shape, None, min_count): return libmissing.NA else: return func(values, axis=axis, **kwargs) @@ -119,11 +119,11 @@ def _minmax( # min/max with empty array raise in numpy, pandas returns NA return libmissing.NA else: - return func(values) + return func(values, axis=axis) else: subset = values[~mask] if subset.size: - return func(subset) + return func(subset, axis=axis) else: # min/max with empty array raise in numpy, pandas returns NA return libmissing.NA diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 6ad07eb04753a..106ec28a93f80 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1508,7 +1508,9 @@ def pyarrow_meth(data, skip_nulls, **kwargs): return result - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): """ Return a scalar result of performing the reduction operation. 
@@ -1532,12 +1534,16 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): ------ TypeError : subclass does not define reductions """ - result = self._reduce_pyarrow(name, skipna=skipna, **kwargs) + pa_result = self._reduce_pyarrow(name, skipna=skipna, **kwargs) - if pc.is_null(result).as_py(): - return self.dtype.na_value + if keepdims: + result = pa.array([pa_result.as_py()], type=pa_result.type) + return type(self)(result) - return result.as_py() + if pc.is_null(pa_result).as_py(): + return self.dtype.na_value + else: + return pa_result.as_py() def _explode(self): """ diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c6c2b2f14239d..34b2f03681c6a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1535,7 +1535,9 @@ def _accumulate( """ raise NotImplementedError(f"cannot perform {name} with type {self.dtype}") - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): """ Return a scalar result of performing the reduction operation. @@ -1547,6 +1549,15 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): std, var, sem, kurt, skew }. skipna : bool, default True If True, skip NaN values. + keepdims : bool, default False + If False, a scalar is returned. + If True, the result has dimension with size one along the reduced axis. + + .. versionadded:: 2.1 + + This parameter is not required in the _reduce signature to keep backward + compatibility, but will become required in the future. If the parameter + is not found in the method signature, a FutureWarning will be emitted. **kwargs Additional keyword arguments passed to the reduction function. Currently, `ddof` is the only supported kwarg. 
@@ -1565,7 +1576,11 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): f"'{type(self).__name__}' with dtype {self.dtype} " f"does not support reduction '{name}'" ) - return meth(skipna=skipna, **kwargs) + result = meth(skipna=skipna, **kwargs) + if keepdims: + result = np.array([result]) + + return result # https://github.com/python/typeshed/issues/2148#issuecomment-520783318 # Incompatible types in assignment (expression has type "None", base class diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 46e2b64cb60c6..33cd5fe147d2e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2319,6 +2319,15 @@ def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]: # ------------------------------------------------------------------ # Reductions + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + result = super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + if keepdims: + return type(self)(result, dtype=self.dtype) + else: + return result + def min(self, *, skipna: bool = True, **kwargs): """ The minimum value of the object. 
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 53afba19222da..b5c686f53597f 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -32,6 +32,10 @@ Shape, npt, ) +from pandas.compat import ( + IS64, + is_platform_windows, +) from pandas.errors import AbstractMethodError from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs @@ -1081,21 +1085,31 @@ def _quantile( # ------------------------------------------------------------------ # Reductions - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): if name in {"any", "all", "min", "max", "sum", "prod", "mean", "var", "std"}: - return getattr(self, name)(skipna=skipna, **kwargs) - - data = self._data - mask = self._mask - - # median, skew, kurt, sem - op = getattr(nanops, f"nan{name}") - result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) + result = getattr(self, name)(skipna=skipna, **kwargs) + else: + # median, skew, kurt, sem + data = self._data + mask = self._mask + op = getattr(nanops, f"nan{name}") + axis = kwargs.pop("axis", None) + result = op(data, axis=axis, skipna=skipna, mask=mask, **kwargs) + + if keepdims: + if isna(result): + return self._wrap_na_result(name=name, axis=0, mask_size=(1,)) + else: + result = result.reshape(1) + mask = np.zeros(1, dtype=bool) + return self._maybe_mask_result(result, mask) - if np.isnan(result): + if isna(result): return libmissing.NA - - return result + else: + return result def _wrap_reduction_result(self, name: str, result, *, skipna, axis): if isinstance(result, np.ndarray): @@ -1108,6 +1122,32 @@ def _wrap_reduction_result(self, name: str, result, *, skipna, axis): return self._maybe_mask_result(result, mask) return result + def _wrap_na_result(self, *, name, axis, mask_size): + mask = np.ones(mask_size, dtype=bool) + + float_dtyp = "float32" if self.dtype 
== "Float32" else "float64" + if name in ["mean", "median", "var", "std", "skew"]: + np_dtype = float_dtyp + elif name in ["min", "max"] or self.dtype.itemsize == 8: + np_dtype = self.dtype.numpy_dtype.name + else: + is_windows_or_32bit = is_platform_windows() or not IS64 + int_dtyp = "int32" if is_windows_or_32bit else "int64" + uint_dtyp = "uint32" if is_windows_or_32bit else "uint64" + np_dtype = {"b": int_dtyp, "i": int_dtyp, "u": uint_dtyp, "f": float_dtyp}[ + self.dtype.kind + ] + + value = np.array([1], dtype=np_dtype) + return self._maybe_mask_result(value, mask=mask) + + def _wrap_min_count_reduction_result( + self, name: str, result, *, skipna, min_count, axis + ): + if min_count == 0 and isinstance(result, np.ndarray): + return self._maybe_mask_result(result, np.zeros(result.shape, dtype=bool)) + return self._wrap_reduction_result(name, result, skipna=skipna, axis=axis) + def sum( self, *, @@ -1125,7 +1165,9 @@ def sum( min_count=min_count, axis=axis, ) - return self._wrap_reduction_result("sum", result, skipna=skipna, axis=axis) + return self._wrap_min_count_reduction_result( + "sum", result, skipna=skipna, min_count=min_count, axis=axis + ) def prod( self, @@ -1136,6 +1178,7 @@ def prod( **kwargs, ): nv.validate_prod((), kwargs) + result = masked_reductions.prod( self._data, self._mask, @@ -1143,7 +1186,9 @@ def prod( min_count=min_count, axis=axis, ) - return self._wrap_reduction_result("prod", result, skipna=skipna, axis=axis) + return self._wrap_min_count_reduction_result( + "prod", result, skipna=skipna, min_count=min_count, axis=axis + ) def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): nv.validate_mean((), kwargs) @@ -1183,23 +1228,25 @@ def std( def min(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): nv.validate_min((), kwargs) - return masked_reductions.min( + result = masked_reductions.min( self._data, self._mask, skipna=skipna, axis=axis, ) + return self._wrap_reduction_result("min", result, 
skipna=skipna, axis=axis) def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): nv.validate_max((), kwargs) - return masked_reductions.max( + result = masked_reductions.max( self._data, self._mask, skipna=skipna, axis=axis, ) + return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis) - def any(self, *, skipna: bool = True, **kwargs): + def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): """ Return whether any element is truthy. @@ -1218,6 +1265,7 @@ def any(self, *, skipna: bool = True, **kwargs): If `skipna` is False, the result will still be True if there is at least one element that is truthy, otherwise NA will be returned if there are NA's present. + axis : int, optional, default 0 **kwargs : any, default None Additional keywords have no effect but might be accepted for compatibility with NumPy. @@ -1261,7 +1309,6 @@ def any(self, *, skipna: bool = True, **kwargs): >>> pd.array([0, 0, pd.NA]).any(skipna=False) """ - kwargs.pop("axis", None) nv.validate_any((), kwargs) values = self._data.copy() @@ -1280,7 +1327,7 @@ def any(self, *, skipna: bool = True, **kwargs): else: return self.dtype.na_value - def all(self, *, skipna: bool = True, **kwargs): + def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): """ Return whether all elements are truthy. @@ -1299,6 +1346,7 @@ def all(self, *, skipna: bool = True, **kwargs): If `skipna` is False, the result will still be False if there is at least one element that is falsey, otherwise NA will be returned if there are NA's present. + axis : int, optional, default 0 **kwargs : any, default None Additional keywords have no effect but might be accepted for compatibility with NumPy. 
@@ -1342,7 +1390,6 @@ def all(self, *, skipna: bool = True, **kwargs): >>> pd.array([1, 0, pd.NA]).all(skipna=False) False """ - kwargs.pop("axis", None) nv.validate_all((), kwargs) values = self._data.copy() @@ -1352,7 +1399,7 @@ def all(self, *, skipna: bool = True, **kwargs): # bool, int, float, complex, str, bytes, # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" np.putmask(values, self._mask, self._truthy_value) # type: ignore[arg-type] - result = values.all() + result = values.all(axis=axis) if skipna: return result diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 796758ead0c62..18eef6faf88bf 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1384,7 +1384,9 @@ def nonzero(self) -> tuple[npt.NDArray[np.int32]]: # Reductions # ------------------------------------------------------------------------ - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): method = getattr(self, name, None) if method is None: @@ -1395,7 +1397,12 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): else: arr = self.dropna() - return getattr(arr, name)(**kwargs) + result = getattr(arr, name)(**kwargs) + + if keepdims: + return type(self)([result], dtype=self.dtype) + else: + return result def all(self, axis=None, *args, **kwargs): """ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3a2ad225ae495..e6a93387ebce3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -20,6 +20,7 @@ Sequence, ) import functools +from inspect import signature from io import StringIO import itertools import operator @@ -10893,7 +10894,18 @@ def blk_func(values, axis: Axis = 1): self._mgr, ArrayManager ): return values._reduce(name, axis=1, skipna=skipna, **kwds) - return values._reduce(name, skipna=skipna, **kwds) + sign = signature(values._reduce) + if "keepdims" 
in sign.parameters: + return values._reduce(name, skipna=skipna, keepdims=True, **kwds) + else: + warnings.warn( + f"{type(values)}._reduce will require a `keepdims` parameter " + "in the future", + FutureWarning, + stacklevel=find_stack_level(), + ) + result = values._reduce(name, skipna=skipna, **kwds) + return np.array([result]) else: return op(values, axis=axis, skipna=skipna, **kwds) @@ -10934,11 +10946,11 @@ def _get_data() -> DataFrame: # simple case where we can use BlockManager.reduce res = df._mgr.reduce(blk_func) out = df._constructor_from_mgr(res, axes=res.axes).iloc[0] - if out_dtype is not None: + if out_dtype is not None and out.dtype != "boolean": out = out.astype(out_dtype) - elif (df._mgr.get_dtypes() == object).any(): + elif (df._mgr.get_dtypes() == object).any() and name not in ["any", "all"]: out = out.astype(object) - elif len(self) == 0 and name in ("sum", "prod"): + elif len(self) == 0 and out.dtype == object and name in ("sum", "prod"): # Even if we are object dtype, follow numpy and return # float64, see test_apply_funcs_over_empty out = out.astype(np.float64) @@ -11199,10 +11211,9 @@ def idxmin( ) indices = res._values - # indices will always be np.ndarray since axis is not None and + # indices will always be 1d array since axis is not None and # values is a 2d array for DataFrame - # error: Item "int" of "Union[int, Any]" has no attribute "__iter__" - assert isinstance(indices, np.ndarray) # for mypy + assert isinstance(indices, (np.ndarray, ExtensionArray)) # for mypy index = data._get_axis(axis) result = [index[i] if i >= 0 else np.nan for i in indices] @@ -11229,10 +11240,9 @@ def idxmax( ) indices = res._values - # indices will always be np.ndarray since axis is not None and + # indices will always be 1d array since axis is not None and # values is a 2d array for DataFrame - # error: Item "int" of "Union[int, Any]" has no attribute "__iter__" - assert isinstance(indices, np.ndarray) # for mypy + assert isinstance(indices, (np.ndarray,
ExtensionArray)) # for mypy index = data._get_axis(axis) result = [index[i] if i >= 0 else np.nan for i in indices] diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b4c42804d7484..ae77bd09f8995 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -369,8 +369,7 @@ def reduce(self, func) -> list[Block]: result = func(self.values) if self.values.ndim == 1: - # TODO(EA2D): special case not needed with 2D EAs - res_values = np.array([[result]]) + res_values = result else: res_values = result.reshape(-1, 1) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 59520350e0dc1..3778f2658bbcc 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -805,7 +805,13 @@ def get_median(x, _mask=None): warnings.filterwarnings( "ignore", "All-NaN slice encountered", RuntimeWarning ) - res = np.nanmedian(values, axis) + if (values.shape[1] == 1 and axis == 0) or ( + values.shape[0] == 1 and axis == 1 + ): + # GH52788: fastpath when squeezable, nanmedian for 2D array slow + res = np.nanmedian(np.squeeze(values), keepdims=True) + else: + res = np.nanmedian(values, axis=axis) else: # must return the correct shape, but median is not defined for the diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 057005b30ae20..c42364d4d4377 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -9,6 +9,7 @@ from pandas import ( Categorical, CategoricalDtype, + DataFrame, Index, NaT, Series, @@ -56,6 +57,19 @@ def test_min_max_ordered(self, index_or_series_or_array): assert np.minimum.reduce(obj) == "d" assert np.maximum.reduce(obj) == "a" + def test_min_max_reduce(self): + # GH52788 + cat = Categorical(["a", "b", "c", "d"], ordered=True) + df = DataFrame(cat) + + result_max = df.agg("max") + expected_max = Series(Categorical(["d"], dtype=cat.dtype)) + 
tm.assert_series_equal(result_max, expected_max) + + result_min = df.agg("min") + expected_min = Series(Categorical(["a"], dtype=cat.dtype)) + tm.assert_series_equal(result_min, expected_min) + @pytest.mark.parametrize( "categories,expected", [ diff --git a/pandas/tests/arrays/integer/test_reduction.py b/pandas/tests/arrays/integer/test_reduction.py new file mode 100644 index 0000000000000..5326a8cb0356b --- /dev/null +++ b/pandas/tests/arrays/integer/test_reduction.py @@ -0,0 +1,120 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Series, + array, +) +import pandas._testing as tm + + +@pytest.mark.parametrize( + "op, expected", + [ + ["sum", np.int64(3)], + ["prod", np.int64(2)], + ["min", np.int64(1)], + ["max", np.int64(2)], + ["mean", np.float64(1.5)], + ["median", np.float64(1.5)], + ["var", np.float64(0.5)], + ["std", np.float64(0.5**0.5)], + ["skew", pd.NA], + ["any", True], + ["all", True], + ], +) +def test_series_reductions(op, expected): + ser = Series([1, 2], dtype="Int64") + result = getattr(ser, op)() + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "op, expected", + [ + ["sum", Series([3], index=["a"], dtype="Int64")], + ["prod", Series([2], index=["a"], dtype="Int64")], + ["min", Series([1], index=["a"], dtype="Int64")], + ["max", Series([2], index=["a"], dtype="Int64")], + ["mean", Series([1.5], index=["a"], dtype="Float64")], + ["median", Series([1.5], index=["a"], dtype="Float64")], + ["var", Series([0.5], index=["a"], dtype="Float64")], + ["std", Series([0.5**0.5], index=["a"], dtype="Float64")], + ["skew", Series([pd.NA], index=["a"], dtype="Float64")], + ["any", Series([True], index=["a"], dtype="boolean")], + ["all", Series([True], index=["a"], dtype="boolean")], + ], +) +def test_dataframe_reductions(op, expected): + df = DataFrame({"a": array([1, 2], dtype="Int64")}) + result = getattr(df, op)() + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + 
"op, expected", + [ + ["sum", array([1, 3], dtype="Int64")], + ["prod", array([1, 3], dtype="Int64")], + ["min", array([1, 3], dtype="Int64")], + ["max", array([1, 3], dtype="Int64")], + ["mean", array([1, 3], dtype="Float64")], + ["median", array([1, 3], dtype="Float64")], + ["var", array([pd.NA], dtype="Float64")], + ["std", array([pd.NA], dtype="Float64")], + ["skew", array([pd.NA], dtype="Float64")], + ["any", array([True, True], dtype="boolean")], + ["all", array([True, True], dtype="boolean")], + ], +) +def test_groupby_reductions(op, expected): + df = DataFrame( + { + "A": ["a", "b", "b"], + "B": array([1, None, 3], dtype="Int64"), + } + ) + result = getattr(df.groupby("A"), op)() + expected = DataFrame(expected, index=pd.Index(["a", "b"], name="A"), columns=["B"]) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "op, expected", + [ + ["sum", Series([4, 4], index=["B", "C"], dtype="Float64")], + ["prod", Series([3, 3], index=["B", "C"], dtype="Float64")], + ["min", Series([1, 1], index=["B", "C"], dtype="Float64")], + ["max", Series([3, 3], index=["B", "C"], dtype="Float64")], + ["mean", Series([2, 2], index=["B", "C"], dtype="Float64")], + ["median", Series([2, 2], index=["B", "C"], dtype="Float64")], + ["var", Series([2, 2], index=["B", "C"], dtype="Float64")], + ["std", Series([2**0.5, 2**0.5], index=["B", "C"], dtype="Float64")], + ["skew", Series([pd.NA, pd.NA], index=["B", "C"], dtype="Float64")], + ["any", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")], + ["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")], + ], +) +def test_mixed_reductions(op, expected): + df = DataFrame( + { + "A": ["a", "b", "b"], + "B": [1, None, 3], + "C": array([1, None, 3], dtype="Int64"), + } + ) + + # series + result = getattr(df.C, op)() + tm.assert_equal(result, expected["C"]) + + # frame + if op in ["any", "all"]: + result = getattr(df, op)() + else: + result = getattr(df, op)(numeric_only=True) + 
tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index 6847c5c183267..b9706f87ab7d3 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -206,13 +206,19 @@ def test_reductions_2d_axis_none(self, data, method): assert is_matching_na(result, expected) or result == expected @pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"]) - def test_reductions_2d_axis0(self, data, method): + @pytest.mark.parametrize("min_count", [0, 1]) + def test_reductions_2d_axis0(self, data, method, min_count): + if min_count == 1 and method not in ["sum", "prod"]: + pytest.skip(f"min_count not relevant for {method}") + arr2d = data.reshape(1, -1) kwargs = {} if method in ["std", "var"]: # pass ddof=0 so we get all-zero std instead of all-NA std kwargs["ddof"] = 0 + elif method in ["prod", "sum"]: + kwargs["min_count"] = min_count try: result = getattr(arr2d, method)(axis=0, **kwargs) @@ -236,20 +242,22 @@ def get_reduction_result_dtype(dtype): # i.e. 
dtype.kind == "u" return NUMPY_INT_TO_DTYPE[np.dtype(np.uint)] - if method in ["median", "sum", "prod"]: + if method in ["sum", "prod"]: # std and var are not dtype-preserving expected = data - if method in ["sum", "prod"] and data.dtype.kind in "iub": + if data.dtype.kind in "iub": dtype = get_reduction_result_dtype(data.dtype) - expected = data.astype(dtype) - if data.dtype.kind == "b" and method in ["sum", "prod"]: - # We get IntegerArray instead of BooleanArray - pass - else: - assert type(expected) == type(data), type(expected) assert dtype == expected.dtype + if min_count == 0: + fill_value = 1 if method == "prod" else 0 + expected = expected.fillna(fill_value) + + self.assert_extension_array_equal(result, expected) + elif method == "median": + # std and var are not dtype-preserving + expected = data self.assert_extension_array_equal(result, expected) elif method in ["mean", "std", "var"]: if is_integer_dtype(data) or is_bool_dtype(data): diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index cf161a7f4b906..8f3c919cb0957 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -4,6 +4,7 @@ import pandas as pd import pandas._testing as tm +from pandas.api.types import is_numeric_dtype from pandas.tests.extension.base.base import BaseExtensionTests @@ -66,6 +67,15 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna): warnings.simplefilter("ignore", RuntimeWarning) self.check_reduce(s, op_name, skipna) + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_frame(self, data, all_numeric_reductions, skipna): + op_name = all_numeric_reductions + s = pd.Series(data) + if not is_numeric_dtype(s): + pytest.skip("not numeric dtype") + + self.check_reduce_frame(s, op_name, skipna) + class BaseBooleanReduceTests(BaseReduceTests): @pytest.mark.parametrize("skipna", [True, False]) diff --git a/pandas/tests/extension/decimal/array.py 
b/pandas/tests/extension/decimal/array.py index 393c01488c234..fc579a50fef78 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -235,24 +235,29 @@ def _formatter(self, boxed=False): def _concat_same_type(cls, to_concat): return cls(np.concatenate([x._data for x in to_concat])) - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): - if skipna: + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + if skipna and self.isna().any(): # If we don't have any NAs, we can ignore skipna - if self.isna().any(): - other = self[~self.isna()] - return other._reduce(name, **kwargs) - - if name == "sum" and len(self) == 0: + other = self[~self.isna()] + result = other._reduce(name, **kwargs) + elif name == "sum" and len(self) == 0: # GH#29630 avoid returning int 0 or np.bool_(False) on old numpy - return decimal.Decimal(0) - - try: - op = getattr(self.data, name) - except AttributeError as err: - raise NotImplementedError( - f"decimal does not support the {name} operation" - ) from err - return op(axis=0) + result = decimal.Decimal(0) + else: + try: + op = getattr(self.data, name) + except AttributeError as err: + raise NotImplementedError( + f"decimal does not support the {name} operation" + ) from err + result = op(axis=0) + + if keepdims: + return type(self)([result]) + else: + return result def _cmp_method(self, other, op): # For use with OpsMixin diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index afd04817f05c7..3f6b1ec8d20dd 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -115,6 +115,49 @@ def check_reduce(self, s, op_name, skipna): expected = getattr(np.asarray(s), op_name)() tm.assert_almost_equal(result, expected) + def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): + arr = ser.array + df = pd.DataFrame({"a": arr}) + 
+ if op_name in ["count", "kurt", "sem", "skew", "median"]: + assert not hasattr(arr, op_name) + pytest.skip(f"{op_name} not an array method") + + result1 = arr._reduce(op_name, skipna=skipna, keepdims=True) + result2 = getattr(df, op_name)(skipna=skipna).array + + tm.assert_extension_array_equal(result1, result2) + + if not skipna and ser.isna().any(): + expected = DecimalArray([pd.NA]) + else: + exp_value = getattr(ser.dropna(), op_name)() + expected = DecimalArray([exp_value]) + + tm.assert_extension_array_equal(result1, expected) + + def test_reduction_without_keepdims(self): + # GH52788 + # test _reduce without keepdims + + class DecimalArray2(DecimalArray): + def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + # no keepdims in signature + return super()._reduce(name, skipna=skipna) + + arr = DecimalArray2([decimal.Decimal(2) for _ in range(100)]) + + ser = pd.Series(arr) + result = ser.agg("sum") + expected = decimal.Decimal(200) + assert result == expected + + df = pd.DataFrame({"a": arr, "b": arr}) + with tm.assert_produces_warning(FutureWarning): + result = df.agg("sum") + expected = pd.Series({"a": 200, "b": 200}, dtype=object) + tm.assert_series_equal(result, expected) + class TestNumericReduce(Reduce, base.BaseNumericReduceTests): pass diff --git a/pandas/tests/extension/masked_shared.py b/pandas/tests/extension/masked_shared.py index 4c6ce20379419..ebbc14d27026c 100644 --- a/pandas/tests/extension/masked_shared.py +++ b/pandas/tests/extension/masked_shared.py @@ -64,6 +64,43 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): expected = pd.NA tm.assert_almost_equal(result, expected) + def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): + if op_name in ["count", "kurt", "sem"]: + assert not hasattr(ser.array, op_name) + pytest.skip(f"{op_name} not an array method") + + arr = ser.array + df = pd.DataFrame({"a": arr}) + + is_windows_or_32bit = is_platform_windows() or not IS64 + + if 
tm.is_float_dtype(arr.dtype): + cmp_dtype = arr.dtype.name + elif op_name in ["mean", "median", "var", "std", "skew"]: + cmp_dtype = "Float64" + elif op_name in ["max", "min"]: + cmp_dtype = arr.dtype.name + elif arr.dtype in ["Int64", "UInt64"]: + cmp_dtype = arr.dtype.name + elif tm.is_signed_integer_dtype(arr.dtype): + cmp_dtype = "Int32" if is_windows_or_32bit else "Int64" + elif tm.is_unsigned_integer_dtype(arr.dtype): + cmp_dtype = "UInt32" if is_windows_or_32bit else "UInt64" + else: + raise TypeError("not supposed to reach this") + + if not skipna and ser.isna().any(): + expected = pd.array([pd.NA], dtype=cmp_dtype) + else: + exp_value = getattr(ser.dropna().astype(cmp_dtype), op_name)() + expected = pd.array([exp_value], dtype=cmp_dtype) + + result1 = arr._reduce(op_name, skipna=skipna, keepdims=True) + result2 = getattr(df, op_name)(skipna=skipna).array + + tm.assert_extension_array_equal(result1, result2) + tm.assert_extension_array_equal(result2, expected) + class Accumulation(base.BaseAccumulateTests): @pytest.mark.parametrize("skipna", [True, False]) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 56e35d30ad83c..f622ef770b63f 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -508,6 +508,40 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna, request): request.node.add_marker(xfail_mark) super().test_reduce_series(data, all_numeric_reductions, skipna) + def check_reduce_frame(self, ser, op_name, skipna): + arr = ser.array + + if op_name in ["count", "kurt", "sem", "skew"]: + assert not hasattr(arr, op_name) + return + + kwargs = {"ddof": 1} if op_name in ["var", "std"] else {} + + if op_name in ["max", "min"]: + cmp_dtype = arr.dtype + elif arr.dtype.name == "decimal128(7, 3)[pyarrow]": + if op_name not in ["median", "var", "std"]: + cmp_dtype = arr.dtype + else: + cmp_dtype = "float64[pyarrow]" + elif op_name in ["median", "var", "std", 
"mean", "skew"]: + cmp_dtype = "float64[pyarrow]" + else: + cmp_dtype = { + "i": "int64[pyarrow]", + "u": "uint64[pyarrow]", + "f": "float64[pyarrow]", + }[arr.dtype.kind] + result = arr._reduce(op_name, skipna=skipna, keepdims=True, **kwargs) + + if not skipna and ser.isna().any(): + expected = pd.array([pd.NA], dtype=cmp_dtype) + else: + exp_value = getattr(ser.dropna().astype(cmp_dtype), op_name)(**kwargs) + expected = pd.array([exp_value], dtype=cmp_dtype) + + tm.assert_extension_array_equal(result, expected) + @pytest.mark.parametrize("typ", ["int64", "uint64", "float64"]) def test_median_not_approximate(self, typ): # GH 52679 diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index c9fa28a507745..63ae2b629e549 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -370,6 +370,31 @@ def check_reduce(self, s, op_name, skipna): expected = bool(expected) tm.assert_almost_equal(result, expected) + def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): + arr = ser.array + + if op_name in ["count", "kurt", "sem"]: + assert not hasattr(arr, op_name) + return + + if op_name in ["mean", "median", "var", "std", "skew"]: + cmp_dtype = "Float64" + elif op_name in ["min", "max"]: + cmp_dtype = "boolean" + elif op_name in ["sum", "prod"]: + is_windows_or_32bit = is_platform_windows() or not IS64 + cmp_dtype = "Int32" if is_windows_or_32bit else "Int64" + else: + raise TypeError("not supposed to reach this") + + result = arr._reduce(op_name, skipna=skipna, keepdims=True) + if not skipna and ser.isna().any(): + expected = pd.array([pd.NA], dtype=cmp_dtype) + else: + exp_value = getattr(ser.dropna().astype(cmp_dtype), op_name)() + expected = pd.array([exp_value], dtype=cmp_dtype) + tm.assert_extension_array_equal(result, expected) + class TestBooleanReduce(base.BaseBooleanReduceTests): pass diff --git a/pandas/tests/extension/test_numpy.py 
b/pandas/tests/extension/test_numpy.py index 16b05be2e0bb9..0392597769930 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -323,6 +323,10 @@ def check_reduce(self, s, op_name, skipna): expected = getattr(s.astype(s.dtype._dtype), op_name)(skipna=skipna) tm.assert_almost_equal(result, expected) + @pytest.mark.skip("tests not written yet") + def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): + pass + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series(self, data, all_boolean_reductions, skipna): super().test_reduce_series(data, all_boolean_reductions, skipna) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index b4a4324593d22..555d8f1b18797 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -6,7 +6,10 @@ import numpy as np import pytest -from pandas.compat import is_platform_windows +from pandas.compat import ( + IS64, + is_platform_windows, +) import pandas.util._test_decorators as td import pandas as pd @@ -29,6 +32,8 @@ nanops, ) +is_windows_or_is32 = is_platform_windows() or not IS64 + def assert_stat_op_calc( opname, @@ -935,7 +940,7 @@ def test_mean_extensionarray_numeric_only_true(self): arr = np.random.randint(1000, size=(10, 5)) df = DataFrame(arr, dtype="Int64") result = df.mean(numeric_only=True) - expected = DataFrame(arr).mean() + expected = DataFrame(arr).mean().astype("Float64") tm.assert_series_equal(result, expected) def test_stats_mixed_type(self, float_string_frame): @@ -1668,6 +1673,101 @@ def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method): getattr(np, method)(df, axis=0) +class TestEmptyDataFrameReductions: + @pytest.mark.parametrize( + "opname, dtype, exp_value, exp_dtype", + [ + ("sum", np.int8, 0, np.int64), + ("prod", np.int8, 1, np.int_), + ("sum", np.int64, 0, np.int64), + ("prod", np.int64, 1, np.int64), + ("sum", np.uint8, 0, 
np.uint64), + ("prod", np.uint8, 1, np.uint), + ("sum", np.uint64, 0, np.uint64), + ("prod", np.uint64, 1, np.uint64), + ("sum", np.float32, 0, np.float32), + ("prod", np.float32, 1, np.float32), + ("sum", np.float64, 0, np.float64), + ], + ) + def test_df_empty_min_count_0(self, opname, dtype, exp_value, exp_dtype): + df = DataFrame({0: [], 1: []}, dtype=dtype) + result = getattr(df, opname)(min_count=0) + + expected = Series([exp_value, exp_value], dtype=exp_dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "opname, dtype, exp_dtype", + [ + ("sum", np.int8, np.float64), + ("prod", np.int8, np.float64), + ("sum", np.int64, np.float64), + ("prod", np.int64, np.float64), + ("sum", np.uint8, np.float64), + ("prod", np.uint8, np.float64), + ("sum", np.uint64, np.float64), + ("prod", np.uint64, np.float64), + ("sum", np.float32, np.float32), + ("prod", np.float32, np.float32), + ("sum", np.float64, np.float64), + ], + ) + def test_df_empty_min_count_1(self, opname, dtype, exp_dtype): + df = DataFrame({0: [], 1: []}, dtype=dtype) + result = getattr(df, opname)(min_count=1) + + expected = Series([np.nan, np.nan], dtype=exp_dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "opname, dtype, exp_value, exp_dtype", + [ + ("sum", "Int8", 0, ("Int32" if is_windows_or_is32 else "Int64")), + ("prod", "Int8", 1, ("Int32" if is_windows_or_is32 else "Int64")), + ("prod", "Int8", 1, ("Int32" if is_windows_or_is32 else "Int64")), + ("sum", "Int64", 0, "Int64"), + ("prod", "Int64", 1, "Int64"), + ("sum", "UInt8", 0, ("UInt32" if is_windows_or_is32 else "UInt64")), + ("prod", "UInt8", 1, ("UInt32" if is_windows_or_is32 else "UInt64")), + ("sum", "UInt64", 0, "UInt64"), + ("prod", "UInt64", 1, "UInt64"), + ("sum", "Float32", 0, "Float32"), + ("prod", "Float32", 1, "Float32"), + ("sum", "Float64", 0, "Float64"), + ], + ) + def test_df_empty_nullable_min_count_0(self, opname, dtype, exp_value, exp_dtype): + df = DataFrame({0: 
[], 1: []}, dtype=dtype) + result = getattr(df, opname)(min_count=0) + + expected = Series([exp_value, exp_value], dtype=exp_dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "opname, dtype, exp_dtype", + [ + ("sum", "Int8", ("Int32" if is_windows_or_is32 else "Int64")), + ("prod", "Int8", ("Int32" if is_windows_or_is32 else "Int64")), + ("sum", "Int64", "Int64"), + ("prod", "Int64", "Int64"), + ("sum", "UInt8", ("UInt32" if is_windows_or_is32 else "UInt64")), + ("prod", "UInt8", ("UInt32" if is_windows_or_is32 else "UInt64")), + ("sum", "UInt64", "UInt64"), + ("prod", "UInt64", "UInt64"), + ("sum", "Float32", "Float32"), + ("prod", "Float32", "Float32"), + ("sum", "Float64", "Float64"), + ], + ) + def test_df_empty_nullable_min_count_1(self, opname, dtype, exp_dtype): + df = DataFrame({0: [], 1: []}, dtype=dtype) + result = getattr(df, opname)(min_count=1) + + expected = Series([pd.NA, pd.NA], dtype=exp_dtype) + tm.assert_series_equal(result, expected) + + def test_sum_timedelta64_skipna_false(using_array_manager, request): # GH#17235 if using_array_manager: @@ -1720,7 +1820,9 @@ def test_minmax_extensionarray(method, numeric_only): df = DataFrame({"Int64": ser}) result = getattr(df, method)(numeric_only=numeric_only) expected = Series( - [getattr(int64_info, method)], index=Index(["Int64"], dtype="object") + [getattr(int64_info, method)], + dtype="Int64", + index=Index(["Int64"], dtype="object"), ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 832192d8a33e6..a9912d75c8978 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -890,8 +890,7 @@ def test_apply_multi_level_name(category): if category: b = pd.Categorical(b, categories=[1, 2, 3]) expected_index = pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3], name="B") - # GH#40669 - summing an empty frame gives float dtype - expected_values = [20.0, 25.0, 0.0] 
+ expected_values = [20, 25, 0] else: expected_index = Index([1, 2], name="B") expected_values = [20, 25] diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index a09e29b6eea98..4db88d80ce113 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2819,7 +2819,7 @@ def test_merge_datetime_different_resolution(tz, how): def test_merge_multiindex_single_level(): - # GH #52331 + # GH52331 df = DataFrame({"col": ["A", "B"]}) df2 = DataFrame( data={"b": [100]},