Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure valid Block mutation in SeriesBinGrouper. #32561

Merged
merged 9 commits into from
Mar 11, 2020
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ Fixed regressions
- Fixed regression in :meth:`pandas.core.groupby.RollingGroupby.apply` where the ``raw`` parameter was ignored (:issue:`31754`)
- Fixed regression in :meth:`rolling(..).corr() <pandas.core.window.Rolling.corr>` when using a time offset (:issue:`31789`)
- Fixed regression in :meth:`DataFrameGroupBy.nunique` which was modifying the original values if ``NaN`` values were present (:issue:`31950`)
- Fixed regression in ``DataFrame.groupby`` raising a ``ValueError`` from an internal operation (:issue:`31802`)
jreback marked this conversation as resolved.
Show resolved Hide resolved
- Fixed regression where :func:`read_pickle` raised a ``UnicodeDecodeError`` when reading a py27 pickle with :class:`MultiIndex` column (:issue:`31988`).
- Fixed regression in :class:`DataFrame` arithmetic operations with mis-matched columns (:issue:`31623`)
- Fixed regression in :meth:`GroupBy.agg` calling a user-provided function an extra time on an empty input (:issue:`31760`)
Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/reduction.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,8 @@ cdef class _BaseGrouper:
object.__setattr__(cached_ityp, '_index_data', islider.buf)
cached_ityp._engine.clear_mapping()
object.__setattr__(cached_typ._data._block, 'values', vslider.buf)
object.__setattr__(cached_typ._data._block, 'mgr_locs',
slice(len(vslider.buf)))
TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
object.__setattr__(cached_typ, '_index', cached_ityp)
object.__setattr__(cached_typ, 'name', self.name)

Expand Down
25 changes: 25 additions & 0 deletions pandas/tests/groupby/test_bin_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from pandas.core.dtypes.common import ensure_int64

import pandas as pd
from pandas import Index, Series, isna
import pandas._testing as tm

Expand Down Expand Up @@ -51,6 +52,30 @@ def test_series_bin_grouper():
tm.assert_almost_equal(counts, exp_counts)


def assert_block_lengths(x):
assert len(x) == len(x._data.blocks[0].mgr_locs)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this fails, the assertion errors bubbles up?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah

diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py
index 152086c241..b6518c1962 100644
--- a/pandas/tests/groupby/test_bin_groupby.py
+++ b/pandas/tests/groupby/test_bin_groupby.py
@@ -53,7 +53,7 @@ def test_series_bin_grouper():
 
 
 def assert_block_lengths(x):
-    assert len(x) == len(x._data.blocks[0].mgr_locs)
+    assert len(x) == len(x._data.blocks[0].mgr_locs) + 1
     return 0
___________________________________________________________________ test_mgr_locs_updated[assert_block_lengths] ____________________________________________________________________

func = <function assert_block_lengths at 0x122354b90>

    @pytest.mark.parametrize("func", [cumsum_max, assert_block_lengths])
    def test_mgr_locs_updated(func):
        # https://github.com/pandas-dev/pandas/issues/31802
        # Some operations may require creating new blocks, which requires
        # valid mgr_locs
        df = pd.DataFrame({"A": ["a", "a", "a"], "B": ["a", "b", "b"], "C": [1, 1, 1]})
>       result = df.groupby(["A", "B"]).agg(func)

pandas/tests/groupby/test_bin_groupby.py:71:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
pandas/core/groupby/generic.py:939: in aggregate
    return self._python_agg_general(func, *args, **kwargs)
pandas/core/groupby/groupby.py:926: in _python_agg_general
    result, counts = self.grouper.agg_series(obj, f)
pandas/core/groupby/ops.py:640: in agg_series
    return self._aggregate_series_fast(obj, func)
pandas/core/groupby/ops.py:665: in _aggregate_series_fast
    result, counts = grouper.get_result()
pandas/_libs/reduction.pyx:377: in pandas._libs.reduction.SeriesGrouper.get_result
    res, initialized = self._apply_to_group(cached_typ, cached_ityp,
pandas/_libs/reduction.pyx:195: in pandas._libs.reduction._BaseGrouper._apply_to_group
    res = self.f(cached_typ)
pandas/core/groupby/groupby.py:913: in <lambda>
    f = lambda x: func(x, *args, **kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

x = Series([], Name: C, dtype: int64)

    def assert_block_lengths(x):
>       assert len(x) == len(x._data.blocks[0].mgr_locs) + 1
E       assert 1 == (1 + 1)
E        +  where 1 = len(0    1\nName: C, dtype: int64)
E        +  and   1 = len(BlockPlacement(slice(0, 1, 1)))
E        +    where BlockPlacement(slice(0, 1, 1)) = IntBlock: 1 dtype: int64.mgr_locs

pandas/tests/groupby/test_bin_groupby.py:56: AssertionError
==================================================================== 1 failed, 1 passed, 9 deselected in 0.24s =====================================================================

return 0


def cumsum_max(x):
x.cumsum().max()
return 0
jbrockmendel marked this conversation as resolved.
Show resolved Hide resolved


@pytest.mark.parametrize("func", [cumsum_max, assert_block_lengths])
def test_mgr_locs_updated(func):
# https://github.com/pandas-dev/pandas/issues/31802
# Some operations may require creating new blocks, which requires
# valid mgr_locs
df = pd.DataFrame({"A": ["a", "a", "a"], "B": ["a", "b", "b"], "C": [1, 1, 1]})
result = df.groupby(["A", "B"]).agg(func)
expected = pd.DataFrame(
{"C": [0, 0]},
index=pd.MultiIndex.from_product([["a"], ["a", "b"]], names=["A", "B"]),
)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"binner,closed,expected",
[
Expand Down