From 5007426f782daecd37f9a8716bee47bcef7d9898 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 9 Mar 2020 14:12:45 -0500 Subject: [PATCH 1/5] REGR: Expand ValueError catching in series aggregate Closes https://github.com/pandas-dev/pandas/issues/31802 This "fixes" #31802 by expanding the number of cases where we swallow an exception in libreduction. Currently, we're creating an invalid Series in SeriesBinGrouper where the `.mgr_locs` doesn't match the values. See https://github.com/pandas-dev/pandas/issues/31802#issuecomment-595954511 for more. For now, we simply catch more cases that fall back to Python. I've gone with a minimal change which addresses only issues hitting this exact exception. We might want to go broader, but that's not clear. --- doc/source/whatsnew/v1.0.2.rst | 1 + pandas/core/groupby/ops.py | 9 +++++++- pandas/tests/groupby/test_bin_groupby.py | 27 ++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index eec471f989037..e35f519a2fc55 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -20,6 +20,7 @@ Fixed regressions - Fixed regression in :meth:`pandas.core.groupby.RollingGroupby.apply` where the ``raw`` parameter was ignored (:issue:`31754`) - Fixed regression in :meth:`rolling(..).corr() ` when using a time offset (:issue:`31789`) - Fixed regression in :meth:`DataFrameGroupBy.nunique` which was modifying the original values if ``NaN`` values were present (:issue:`31950`) +- Fixed regression in ``DataFrame.groupby`` raising a ``ValueError`` from an internal operation (:issue:`31802`) - Fixed regression where :func:`read_pickle` raised a ``UnicodeDecodeError`` when reading a py27 pickle with :class:`MultiIndex` column (:issue:`31988`). - Fixed regression in :class:`DataFrame` arithmetic operations with mis-matched columns (:issue:`31623`) - Fixed regression in :meth:`GroupBy.agg` calling a user-provided function an extra time on an empty input (:issue:`31760`) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7259268ac3f2b..2a2eb135e5472 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -639,9 +639,16 @@ def agg_series(self, obj: Series, func): try: return self._aggregate_series_fast(obj, func) except ValueError as err: - if "Function does not reduce" in str(err): + msg = str(err) + if "Function does not reduce" in msg: # raised in libreduction pass + elif "Wrong number of items" in msg: + # https://github.com/pandas-dev/pandas/issues/31802 + # libreduction.SeriesGrouper can create invalid Series / + # Blocks, which might raise arbitrary exceptions when + # operated upon. + pass else: raise return self._aggregate_series_pure_python(obj, func) diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index ff74d374e5e3f..570ea0dc7d07b 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -5,6 +5,7 @@ from pandas.core.dtypes.common import ensure_int64 +import pandas as pd from pandas import Index, Series, isna import pandas._testing as tm @@ -51,6 +52,32 @@ def test_series_bin_grouper(): tm.assert_almost_equal(counts, exp_counts) +def assert_block_lengths(x): + assert len(x) == len(x._data.blocks[0].mgr_locs) + return 0 + + +def cumsum_max(x): + x.cumsum().max() # triggers the ValueError when creating a block + return 0 + + +@pytest.mark.parametrize( + "func", + [ + cumsum_max, + pytest.param(assert_block_lengths, marks=pytest.mark.xfail(reason="debatable")), + ], +) +def test_operation_on_invalid_block_passes(func): + # https://github.com/pandas-dev/pandas/issues/31802 + # SeriesBinGrouper creates an invalid block, which may + # raise arbitrary exceptions. + df = pd.DataFrame({"A": ["a", "a", "a"], "B": ["a", "b", "b"], "C": [1, 1, 1]}) + result = df.groupby(["A", "B"]).agg(func) + assert isinstance(result, pd.DataFrame) + + @pytest.mark.parametrize( "binner,closed,expected", [ From ad746ba68752dbf0d398dbd7572efe702edd2657 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 9 Mar 2020 16:27:51 -0500 Subject: [PATCH 2/5] update mgr_locs --- pandas/_libs/reduction.pyx | 2 ++ pandas/tests/groupby/test_bin_groupby.py | 20 +++++++++----------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index b27072aa66708..29a5a73ef08d0 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -177,6 +177,8 @@ cdef class _BaseGrouper: object.__setattr__(cached_ityp, '_index_data', islider.buf) cached_ityp._engine.clear_mapping() object.__setattr__(cached_typ._data._block, 'values', vslider.buf) + object.__setattr__(cached_typ._data._block, 'mgr_locs', + slice(len(vslider.buf))) object.__setattr__(cached_typ, '_index', cached_ityp) object.__setattr__(cached_typ, 'name', self.name) diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index 570ea0dc7d07b..5d0d3be183a52 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -62,20 +62,18 @@ def cumsum_max(x): return 0 -@pytest.mark.parametrize( - "func", - [ - cumsum_max, - pytest.param(assert_block_lengths, marks=pytest.mark.xfail(reason="debatable")), - ], -) -def test_operation_on_invalid_block_passes(func): +@pytest.mark.parametrize("func", [cumsum_max, assert_block_lengths]) +def test_mgr_locs_updated(func): # https://github.com/pandas-dev/pandas/issues/31802 - # SeriesBinGrouper creates an invalid block, which may - # raise arbitrary exceptions. + # Some operations may require creating new blocks, which requires + # valid mgr_locs df = pd.DataFrame({"A": ["a", "a", "a"], "B": ["a", "b", "b"], "C": [1, 1, 1]}) result = df.groupby(["A", "B"]).agg(func) - assert isinstance(result, pd.DataFrame) + expected = pd.DataFrame( + {"C": [0, 0]}, + index=pd.MultiIndex.from_product([["a"], ["a", "b"]], names=["A", "B"]), + ) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( From 922b30de38716b9fec88c67b9f06813c0889899b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 9 Mar 2020 20:33:24 -0500 Subject: [PATCH 3/5] revert --- pandas/core/groupby/ops.py | 9 +-------- pandas/tests/groupby/test_bin_groupby.py | 2 +- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 2a2eb135e5472..7259268ac3f2b 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -639,16 +639,9 @@ def agg_series(self, obj: Series, func): try: return self._aggregate_series_fast(obj, func) except ValueError as err: - msg = str(err) - if "Function does not reduce" in msg: + if "Function does not reduce" in str(err): # raised in libreduction pass - elif "Wrong number of items" in msg: - # https://github.com/pandas-dev/pandas/issues/31802 - # libreduction.SeriesGrouper can create invalid Series / - # Blocks, which might raise arbitrary exceptions when - # operated upon. - pass else: raise return self._aggregate_series_pure_python(obj, func) diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index 5d0d3be183a52..152086c241a52 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -58,7 +58,7 @@ def assert_block_lengths(x): def cumsum_max(x): - x.cumsum().max() # triggers the ValueError when creating a block + x.cumsum().max() return 0 From f63acd3d0bb41ba2b349163156de2684ed8da4d1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 9 Mar 2020 13:17:15 -0700 Subject: [PATCH 4/5] TST: separate out pd.crosstab tests from test_pivot (#32536) --- pandas/tests/reshape/test_crosstab.py | 700 +++++++++++++++++++++++++ pandas/tests/reshape/test_pivot.py | 707 +------------------------- 2 files changed, 701 insertions(+), 706 deletions(-) create mode 100644 pandas/tests/reshape/test_crosstab.py diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py new file mode 100644 index 0000000000000..8795af2e11122 --- /dev/null +++ b/pandas/tests/reshape/test_crosstab.py @@ -0,0 +1,700 @@ +import numpy as np +import pytest + +from pandas import CategoricalIndex, DataFrame, Index, MultiIndex, Series, crosstab +import pandas._testing as tm + + +class TestCrosstab: + def setup_method(self, method): + df = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) + + self.df = df.append(df, ignore_index=True) + + def test_crosstab_single(self): + df = self.df + result = crosstab(df["A"], df["C"]) + expected = df.groupby(["A", "C"]).size().unstack() + tm.assert_frame_equal(result, expected.fillna(0).astype(np.int64)) + + def test_crosstab_multiple(self): + df = self.df + + result = crosstab(df["A"], [df["B"], df["C"]]) + expected = df.groupby(["A", "B", "C"]).size() + expected = expected.unstack("B").unstack("C").fillna(0).astype(np.int64) + tm.assert_frame_equal(result, expected) + + result = crosstab([df["B"], df["C"]], df["A"]) + expected = df.groupby(["B", "C", "A"]).size() + expected = expected.unstack("A").fillna(0).astype(np.int64) + tm.assert_frame_equal(result, expected) + + def test_crosstab_ndarray(self): + a = np.random.randint(0, 5, size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 10, size=100) + + df = DataFrame({"a": a, "b": b, "c": c}) + + result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c")) + expected = crosstab(df["a"], [df["b"], df["c"]]) + tm.assert_frame_equal(result, expected) + + result = crosstab([b, c], a, colnames=["a"], rownames=("b", "c")) + expected = crosstab([df["b"], df["c"]], df["a"]) + tm.assert_frame_equal(result, expected) + + # assign arbitrary names + result = crosstab(self.df["A"].values, self.df["C"].values) + assert result.index.name == "row_0" + assert result.columns.name == "col_0" + + def test_crosstab_non_aligned(self): + # GH 17005 + a = Series([0, 1, 1], index=["a", "b", "c"]) + b = Series([3, 4, 3, 4, 3], index=["a", "b", "c", "d", "f"]) + c = np.array([3, 4, 3]) + + expected = DataFrame( + [[1, 0], [1, 1]], + index=Index([0, 1], name="row_0"), + columns=Index([3, 4], name="col_0"), + ) + + result = crosstab(a, b) + tm.assert_frame_equal(result, expected) + + result = crosstab(a, c) + tm.assert_frame_equal(result, expected) + + def test_crosstab_margins(self): + a = np.random.randint(0, 7, size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 5, size=100) + + df = DataFrame({"a": a, "b": b, "c": c}) + + result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"), margins=True) + + assert result.index.names == ("a",) + assert result.columns.names == ["b", "c"] + + all_cols = result["All", ""] + exp_cols = df.groupby(["a"]).size().astype("i8") + # to keep index.name + exp_margin = Series([len(df)], index=Index(["All"], name="a")) + exp_cols = exp_cols.append(exp_margin) + exp_cols.name = ("All", "") + + tm.assert_series_equal(all_cols, exp_cols) + + all_rows = result.loc["All"] + exp_rows = df.groupby(["b", "c"]).size().astype("i8") + exp_rows = exp_rows.append(Series([len(df)], index=[("All", "")])) + exp_rows.name = "All" + + exp_rows = exp_rows.reindex(all_rows.index) + exp_rows = exp_rows.fillna(0).astype(np.int64) + tm.assert_series_equal(all_rows, exp_rows) + + def test_crosstab_margins_set_margin_name(self): + # GH 15972 + a = np.random.randint(0, 7, size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 5, size=100) + + df = DataFrame({"a": a, "b": b, "c": c}) + + result = crosstab( + a, + [b, c], + rownames=["a"], + colnames=("b", "c"), + margins=True, + margins_name="TOTAL", + ) + + assert result.index.names == ("a",) + assert result.columns.names == ["b", "c"] + + all_cols = result["TOTAL", ""] + exp_cols = df.groupby(["a"]).size().astype("i8") + # to keep index.name + exp_margin = Series([len(df)], index=Index(["TOTAL"], name="a")) + exp_cols = exp_cols.append(exp_margin) + exp_cols.name = ("TOTAL", "") + + tm.assert_series_equal(all_cols, exp_cols) + + all_rows = result.loc["TOTAL"] + exp_rows = df.groupby(["b", "c"]).size().astype("i8") + exp_rows = exp_rows.append(Series([len(df)], index=[("TOTAL", "")])) + exp_rows.name = "TOTAL" + + exp_rows = exp_rows.reindex(all_rows.index) + exp_rows = exp_rows.fillna(0).astype(np.int64) + tm.assert_series_equal(all_rows, exp_rows) + + msg = "margins_name argument must be a string" + for margins_name in [666, None, ["a", "b"]]: + with pytest.raises(ValueError, match=msg): + crosstab( + a, + [b, c], + rownames=["a"], + colnames=("b", "c"), + margins=True, + margins_name=margins_name, + ) + + def test_crosstab_pass_values(self): + a = np.random.randint(0, 7, size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 5, size=100) + values = np.random.randn(100) + + table = crosstab( + [a, b], c, values, aggfunc=np.sum, rownames=["foo", "bar"], colnames=["baz"] + ) + + df = DataFrame({"foo": a, "bar": b, "baz": c, "values": values}) + + expected = df.pivot_table( + "values", index=["foo", "bar"], columns="baz", aggfunc=np.sum + ) + tm.assert_frame_equal(table, expected) + + def test_crosstab_dropna(self): + # GH 3820 + a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object) + b = np.array(["one", "one", "two", "one", "two", "two", "two"], dtype=object) + c = np.array( + ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object + ) + res = crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=False) + m = MultiIndex.from_tuples( + [("one", "dull"), ("one", "shiny"), ("two", "dull"), ("two", "shiny")], + names=["b", "c"], + ) + tm.assert_index_equal(res.columns, m) + + def test_crosstab_no_overlap(self): + # GS 10291 + + s1 = Series([1, 2, 3], index=[1, 2, 3]) + s2 = Series([4, 5, 6], index=[4, 5, 6]) + + actual = crosstab(s1, s2) + expected = DataFrame() + + tm.assert_frame_equal(actual, expected) + + def test_margin_dropna(self): + # GH 12577 + # pivot_table counts null into margin ('All') + # when margins=true and dropna=true + + df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) + actual = crosstab(df.a, df.b, margins=True, dropna=True) + expected = DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]]) + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3, 4, "All"], name="b") + tm.assert_frame_equal(actual, expected) + + df = DataFrame( + {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]} + ) + actual = crosstab(df.a, df.b, margins=True, dropna=True) + expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3.0, 4.0, "All"], name="b") + tm.assert_frame_equal(actual, expected) + + df = DataFrame( + {"a": [1, np.nan, np.nan, np.nan, np.nan, 2], "b": [3, 3, 4, 4, 4, 4]} + ) + actual = crosstab(df.a, df.b, margins=True, dropna=True) + expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3, 4, "All"], name="b") + tm.assert_frame_equal(actual, expected) + + # GH 12642 + # _add_margins raises KeyError: Level None not found + # when margins=True and dropna=False + df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) + actual = crosstab(df.a, df.b, margins=True, dropna=False) + expected = DataFrame([[1, 0, 1], [1, 3, 4], [2, 4, 6]]) + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3, 4, "All"], name="b") + tm.assert_frame_equal(actual, expected) + + df = DataFrame( + {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]} + ) + actual = crosstab(df.a, df.b, margins=True, dropna=False) + expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 4, 6]]) + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3.0, 4.0, "All"], name="b") + tm.assert_frame_equal(actual, expected) + + a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object) + b = np.array(["one", "one", "two", "one", "two", np.nan, "two"], dtype=object) + c = np.array( + ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object + ) + + actual = crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], margins=True, dropna=False + ) + m = MultiIndex.from_arrays( + [ + ["one", "one", "two", "two", "All"], + ["dull", "shiny", "dull", "shiny", ""], + ], + names=["b", "c"], + ) + expected = DataFrame( + [[1, 0, 1, 0, 2], [2, 0, 1, 1, 5], [3, 0, 2, 1, 7]], columns=m + ) + expected.index = Index(["bar", "foo", "All"], name="a") + tm.assert_frame_equal(actual, expected) + + actual = crosstab( + [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=False + ) + m = MultiIndex.from_arrays( + [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]], + names=["a", "b"], + ) + expected = DataFrame( + [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 2, 7]], index=m + ) + expected.columns = Index(["dull", "shiny", "All"], name="c") + tm.assert_frame_equal(actual, expected) + + actual = crosstab( + [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=True + ) + m = MultiIndex.from_arrays( + [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]], + names=["a", "b"], + ) + expected = DataFrame( + [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 1, 6]], index=m + ) + expected.columns = Index(["dull", "shiny", "All"], name="c") + tm.assert_frame_equal(actual, expected) + + def test_crosstab_normalize(self): + # Issue 12578 + df = DataFrame( + {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]} + ) + + rindex = Index([1, 2], name="a") + cindex = Index([3, 4], name="b") + full_normal = DataFrame([[0.2, 0], [0.2, 0.6]], index=rindex, columns=cindex) + row_normal = DataFrame([[1.0, 0], [0.25, 0.75]], index=rindex, columns=cindex) + col_normal = DataFrame([[0.5, 0], [0.5, 1.0]], index=rindex, columns=cindex) + + # Check all normalize args + tm.assert_frame_equal(crosstab(df.a, df.b, normalize="all"), full_normal) + tm.assert_frame_equal(crosstab(df.a, df.b, normalize=True), full_normal) + tm.assert_frame_equal(crosstab(df.a, df.b, normalize="index"), row_normal) + tm.assert_frame_equal(crosstab(df.a, df.b, normalize="columns"), col_normal) + tm.assert_frame_equal( + crosstab(df.a, df.b, normalize=1), + crosstab(df.a, df.b, normalize="columns"), + ) + tm.assert_frame_equal( + crosstab(df.a, df.b, normalize=0), crosstab(df.a, df.b, normalize="index"), + ) + + row_normal_margins = DataFrame( + [[1.0, 0], [0.25, 0.75], [0.4, 0.6]], + index=Index([1, 2, "All"], name="a", dtype="object"), + columns=Index([3, 4], name="b", dtype="object"), + ) + col_normal_margins = DataFrame( + [[0.5, 0, 0.2], [0.5, 1.0, 0.8]], + index=Index([1, 2], name="a", dtype="object"), + columns=Index([3, 4, "All"], name="b", dtype="object"), + ) + + all_normal_margins = DataFrame( + [[0.2, 0, 0.2], [0.2, 0.6, 0.8], [0.4, 0.6, 1]], + index=Index([1, 2, "All"], name="a", dtype="object"), + columns=Index([3, 4, "All"], name="b", dtype="object"), + ) + tm.assert_frame_equal( + crosstab(df.a, df.b, normalize="index", margins=True), row_normal_margins + ) + tm.assert_frame_equal( + crosstab(df.a, df.b, normalize="columns", margins=True), col_normal_margins, + ) + tm.assert_frame_equal( + crosstab(df.a, df.b, normalize=True, margins=True), all_normal_margins + ) + + # Test arrays + crosstab( + [np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])], np.array([1, 2, 1, 2]) + ) + + # Test with aggfunc + norm_counts = DataFrame( + [[0.25, 0, 0.25], [0.25, 0.5, 0.75], [0.5, 0.5, 1]], + index=Index([1, 2, "All"], name="a", dtype="object"), + columns=Index([3, 4, "All"], name="b"), + ) + test_case = crosstab( + df.a, df.b, df.c, aggfunc="count", normalize="all", margins=True + ) + tm.assert_frame_equal(test_case, norm_counts) + + df = DataFrame( + {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [0, 4, np.nan, 3, 3]} + ) + + norm_sum = DataFrame( + [[0, 0, 0.0], [0.4, 0.6, 1], [0.4, 0.6, 1]], + index=Index([1, 2, "All"], name="a", dtype="object"), + columns=Index([3, 4, "All"], name="b", dtype="object"), + ) + test_case = crosstab( + df.a, df.b, df.c, aggfunc=np.sum, normalize="all", margins=True + ) + tm.assert_frame_equal(test_case, norm_sum) + + def test_crosstab_with_empties(self): + # Check handling of empties + df = DataFrame( + { + "a": [1, 2, 2, 2, 2], + "b": [3, 3, 4, 4, 4], + "c": [np.nan, np.nan, np.nan, np.nan, np.nan], + } + ) + + empty = DataFrame( + [[0.0, 0.0], [0.0, 0.0]], + index=Index([1, 2], name="a", dtype="int64"), + columns=Index([3, 4], name="b"), + ) + + for i in [True, "index", "columns"]: + calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=i) + tm.assert_frame_equal(empty, calculated) + + nans = DataFrame( + [[0.0, np.nan], [0.0, 0.0]], + index=Index([1, 2], name="a", dtype="int64"), + columns=Index([3, 4], name="b"), + ) + + calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=False) + tm.assert_frame_equal(nans, calculated) + + def test_crosstab_errors(self): + # Issue 12578 + + df = DataFrame( + {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]} + ) + + error = "values cannot be used without an aggfunc." + with pytest.raises(ValueError, match=error): + crosstab(df.a, df.b, values=df.c) + + error = "aggfunc cannot be used without values" + with pytest.raises(ValueError, match=error): + crosstab(df.a, df.b, aggfunc=np.mean) + + error = "Not a valid normalize argument" + with pytest.raises(ValueError, match=error): + crosstab(df.a, df.b, normalize="42") + + with pytest.raises(ValueError, match=error): + crosstab(df.a, df.b, normalize=42) + + error = "Not a valid margins argument" + with pytest.raises(ValueError, match=error): + crosstab(df.a, df.b, normalize="all", margins=42) + + def test_crosstab_with_categorial_columns(self): + # GH 8860 + df = DataFrame( + { + "MAKE": ["Honda", "Acura", "Tesla", "Honda", "Honda", "Acura"], + "MODEL": ["Sedan", "Sedan", "Electric", "Pickup", "Sedan", "Sedan"], + } + ) + categories = ["Sedan", "Electric", "Pickup"] + df["MODEL"] = df["MODEL"].astype("category").cat.set_categories(categories) + result = crosstab(df["MAKE"], df["MODEL"]) + + expected_index = Index(["Acura", "Honda", "Tesla"], name="MAKE") + expected_columns = CategoricalIndex( + categories, categories=categories, ordered=False, name="MODEL" + ) + expected_data = [[2, 0, 0], [2, 0, 1], [0, 1, 0]] + expected = DataFrame( + expected_data, index=expected_index, columns=expected_columns + ) + tm.assert_frame_equal(result, expected) + + def test_crosstab_with_numpy_size(self): + # GH 4003 + df = DataFrame( + { + "A": ["one", "one", "two", "three"] * 6, + "B": ["A", "B", "C"] * 8, + "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, + "D": np.random.randn(24), + "E": np.random.randn(24), + } + ) + result = crosstab( + index=[df["A"], df["B"]], + columns=[df["C"]], + margins=True, + aggfunc=np.size, + values=df["D"], + ) + expected_index = MultiIndex( + levels=[["All", "one", "three", "two"], ["", "A", "B", "C"]], + codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]], + names=["A", "B"], + ) + expected_column = Index(["bar", "foo", "All"], dtype="object", name="C") + expected_data = np.array( + [ + [2.0, 2.0, 4.0], + [2.0, 2.0, 4.0], + [2.0, 2.0, 4.0], + [2.0, np.nan, 2.0], + [np.nan, 2.0, 2.0], + [2.0, np.nan, 2.0], + [np.nan, 2.0, 2.0], + [2.0, np.nan, 2.0], + [np.nan, 2.0, 2.0], + [12.0, 12.0, 24.0], + ] + ) + expected = DataFrame( + expected_data, index=expected_index, columns=expected_column + ) + tm.assert_frame_equal(result, expected) + + def test_crosstab_dup_index_names(self): + # GH 13279 + s = Series(range(3), name="foo") + + result = crosstab(s, s) + expected_index = Index(range(3), name="foo") + expected = DataFrame( + np.eye(3, dtype=np.int64), index=expected_index, columns=expected_index + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]]) + def test_crosstab_tuple_name(self, names): + s1 = Series(range(3), name=names[0]) + s2 = Series(range(1, 4), name=names[1]) + + mi = MultiIndex.from_arrays([range(3), range(1, 4)], names=names) + expected = Series(1, index=mi).unstack(1, fill_value=0) + + result = crosstab(s1, s2) + tm.assert_frame_equal(result, expected) + + def test_crosstab_both_tuple_names(self): + # GH 18321 + s1 = Series(range(3), name=("a", "b")) + s2 = Series(range(3), name=("c", "d")) + + expected = DataFrame( + np.eye(3, dtype="int64"), + index=Index(range(3), name=("a", "b")), + columns=Index(range(3), name=("c", "d")), + ) + result = crosstab(s1, s2) + tm.assert_frame_equal(result, expected) + + def test_crosstab_unsorted_order(self): + df = DataFrame({"b": [3, 1, 2], "a": [5, 4, 6]}, index=["C", "A", "B"]) + result = crosstab(df.index, [df.b, df.a]) + e_idx = Index(["A", "B", "C"], name="row_0") + e_columns = MultiIndex.from_tuples([(1, 4), (2, 6), (3, 5)], names=["b", "a"]) + expected = DataFrame( + [[1, 0, 0], [0, 1, 0], [0, 0, 1]], index=e_idx, columns=e_columns + ) + tm.assert_frame_equal(result, expected) + + def test_crosstab_normalize_multiple_columns(self): + # GH 15150 + df = DataFrame( + { + "A": ["one", "one", "two", "three"] * 6, + "B": ["A", "B", "C"] * 8, + "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, + "D": [0] * 24, + "E": [0] * 24, + } + ) + result = crosstab( + [df.A, df.B], + df.C, + values=df.D, + aggfunc=np.sum, + normalize=True, + margins=True, + ) + expected = DataFrame( + np.array([0] * 29 + [1], dtype=float).reshape(10, 3), + columns=Index(["bar", "foo", "All"], dtype="object", name="C"), + index=MultiIndex.from_tuples( + [ + ("one", "A"), + ("one", "B"), + ("one", "C"), + ("three", "A"), + ("three", "B"), + ("three", "C"), + ("two", "A"), + ("two", "B"), + ("two", "C"), + ("All", ""), + ], + names=["A", "B"], + ), + ) + tm.assert_frame_equal(result, expected) + + def test_margin_normalize(self): + # GH 27500 + df = DataFrame( + { + "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], + "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], + "C": [ + "small", + "large", + "large", + "small", + "small", + "large", + "small", + "small", + "large", + ], + "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], + } + ) + # normalize on index + result = crosstab( + [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0 + ) + expected = DataFrame( + [[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]] + ) + expected.index = MultiIndex( + levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], + codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], + names=["A", "B"], + ) + expected.columns = Index(["large", "small"], dtype="object", name="C") + tm.assert_frame_equal(result, expected) + + # normalize on columns + result = crosstab( + [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1 + ) + expected = DataFrame( + [ + [0.25, 0.2, 0.222222], + [0.25, 0.2, 0.222222], + [0.5, 0.2, 0.333333], + [0, 0.4, 0.222222], + ] + ) + expected.columns = Index( + ["large", "small", "Sub-Total"], dtype="object", name="C" + ) + expected.index = MultiIndex( + levels=[["bar", "foo"], ["one", "two"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=["A", "B"], + ) + tm.assert_frame_equal(result, expected) + + # normalize on both index and column + result = crosstab( + [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True + ) + expected = DataFrame( + [ + [0.111111, 0.111111, 0.222222], + [0.111111, 0.111111, 0.222222], + [0.222222, 0.111111, 0.333333], + [0.000000, 0.222222, 0.222222], + [0.444444, 0.555555, 1], + ] + ) + expected.columns = Index( + ["large", "small", "Sub-Total"], dtype="object", name="C" + ) + expected.index = MultiIndex( + levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], + codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], + names=["A", "B"], + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index e09a2a7907177..75c3c565e9d58 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -17,7 +17,7 @@ ) import pandas._testing as tm from pandas.api.types import CategoricalDtype as CDT -from pandas.core.reshape.pivot import crosstab, pivot_table +from pandas.core.reshape.pivot import pivot_table @pytest.fixture(params=[True, False]) @@ -2064,708 +2064,3 @@ def agg(l): ) with pytest.raises(KeyError, match="notpresent"): foo.pivot_table("notpresent", "X", "Y", aggfunc=agg) - - -class TestCrosstab: - def setup_method(self, method): - df = DataFrame( - { - "A": [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], - "B": [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], - "C": [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - "D": np.random.randn(11), - "E": np.random.randn(11), - "F": np.random.randn(11), - } - ) - - self.df = df.append(df, ignore_index=True) - - def test_crosstab_single(self): - df = self.df - result = crosstab(df["A"], df["C"]) - expected = df.groupby(["A", "C"]).size().unstack() - tm.assert_frame_equal(result, expected.fillna(0).astype(np.int64)) - - def test_crosstab_multiple(self): - df = self.df - - result = crosstab(df["A"], [df["B"], df["C"]]) - expected = df.groupby(["A", "B", "C"]).size() - expected = expected.unstack("B").unstack("C").fillna(0).astype(np.int64) - tm.assert_frame_equal(result, expected) - - result = crosstab([df["B"], df["C"]], df["A"]) - expected = df.groupby(["B", "C", "A"]).size() - expected = expected.unstack("A").fillna(0).astype(np.int64) - tm.assert_frame_equal(result, expected) - - def test_crosstab_ndarray(self): - a = np.random.randint(0, 5, size=100) - b = np.random.randint(0, 3, size=100) - c = np.random.randint(0, 10, size=100) - - df = DataFrame({"a": a, "b": b, "c": c}) - - result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c")) - expected = crosstab(df["a"], [df["b"], df["c"]]) - tm.assert_frame_equal(result, expected) - - result = crosstab([b, c], a, colnames=["a"], rownames=("b", "c")) - expected = crosstab([df["b"], df["c"]], df["a"]) - tm.assert_frame_equal(result, expected) - - # assign arbitrary names - result = crosstab(self.df["A"].values, self.df["C"].values) - assert result.index.name == "row_0" - assert result.columns.name == "col_0" - - def test_crosstab_non_aligned(self): - # GH 17005 - a = pd.Series([0, 1, 1], index=["a", "b", "c"]) - b = pd.Series([3, 4, 3, 4, 3], index=["a", "b", "c", "d", "f"]) - c = np.array([3, 4, 3]) - - expected = pd.DataFrame( - [[1, 0], [1, 1]], - index=Index([0, 1], name="row_0"), - columns=Index([3, 4], name="col_0"), - ) - - result = crosstab(a, b) - tm.assert_frame_equal(result, expected) - - result = crosstab(a, c) - tm.assert_frame_equal(result, expected) - - def test_crosstab_margins(self): - a = np.random.randint(0, 7, size=100) - b = np.random.randint(0, 3, size=100) - c = np.random.randint(0, 5, size=100) - - df = DataFrame({"a": a, "b": b, "c": c}) - - result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"), margins=True) - - assert result.index.names == ("a",) - assert result.columns.names == ["b", "c"] - - all_cols = result["All", ""] - exp_cols = df.groupby(["a"]).size().astype("i8") - # to keep index.name - exp_margin = Series([len(df)], index=Index(["All"], name="a")) - exp_cols = exp_cols.append(exp_margin) - exp_cols.name = ("All", "") - - tm.assert_series_equal(all_cols, exp_cols) - - all_rows = result.loc["All"] - exp_rows = df.groupby(["b", "c"]).size().astype("i8") - exp_rows = exp_rows.append(Series([len(df)], index=[("All", "")])) - exp_rows.name = "All" - - exp_rows = exp_rows.reindex(all_rows.index) - exp_rows = exp_rows.fillna(0).astype(np.int64) - tm.assert_series_equal(all_rows, exp_rows) - - def test_crosstab_margins_set_margin_name(self): - # GH 15972 - a = np.random.randint(0, 7, size=100) - b = np.random.randint(0, 3, size=100) - c = np.random.randint(0, 5, size=100) - - df = DataFrame({"a": a, "b": b, "c": c}) - - result = crosstab( - a, - [b, c], - rownames=["a"], - colnames=("b", "c"), - margins=True, - margins_name="TOTAL", - ) - - assert result.index.names == ("a",) - assert result.columns.names == ["b", "c"] - - all_cols = result["TOTAL", ""] - exp_cols = df.groupby(["a"]).size().astype("i8") - # to keep index.name - exp_margin = Series([len(df)], index=Index(["TOTAL"], name="a")) - exp_cols = exp_cols.append(exp_margin) - exp_cols.name = ("TOTAL", "") - - tm.assert_series_equal(all_cols, exp_cols) - - all_rows = result.loc["TOTAL"] - exp_rows = df.groupby(["b", "c"]).size().astype("i8") - exp_rows = exp_rows.append(Series([len(df)], index=[("TOTAL", "")])) - exp_rows.name = "TOTAL" - - exp_rows = exp_rows.reindex(all_rows.index) - exp_rows = exp_rows.fillna(0).astype(np.int64) - tm.assert_series_equal(all_rows, exp_rows) - - msg = "margins_name argument must be a string" - for margins_name in [666, None, ["a", "b"]]: - with pytest.raises(ValueError, match=msg): - crosstab( - a, - [b, c], - rownames=["a"], - colnames=("b", "c"), - margins=True, - margins_name=margins_name, - ) - - def test_crosstab_pass_values(self): - a = np.random.randint(0, 7, size=100) - b = np.random.randint(0, 3, size=100) - c = np.random.randint(0, 5, size=100) - values = np.random.randn(100) - - table = crosstab( - [a, b], c, values, aggfunc=np.sum, rownames=["foo", "bar"], colnames=["baz"] - ) - - df = DataFrame({"foo": a, "bar": b, "baz": c, "values": values}) - - expected = df.pivot_table( - "values", index=["foo", "bar"], columns="baz", aggfunc=np.sum - ) - tm.assert_frame_equal(table, expected) - - def test_crosstab_dropna(self): - # GH 3820 - a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object) - b = np.array(["one", "one", "two", "one", "two", "two", "two"], dtype=object) - c = np.array( - ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object - ) - res = pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=False) - m = MultiIndex.from_tuples( - [("one", "dull"), ("one", "shiny"), ("two", "dull"), ("two", "shiny")], - names=["b", "c"], - ) - tm.assert_index_equal(res.columns, m) - - def test_crosstab_no_overlap(self): - # GS 10291 - - s1 = pd.Series([1, 2, 3], index=[1, 2, 3]) - s2 = pd.Series([4, 5, 6], index=[4, 5, 6]) - - actual = crosstab(s1, s2) - expected = pd.DataFrame() - - tm.assert_frame_equal(actual, expected) - - def test_margin_dropna(self): - # GH 12577 - # pivot_table counts null into margin ('All') - # when margins=true and dropna=true - - df = pd.DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) - actual = pd.crosstab(df.a, df.b, margins=True, dropna=True) - expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]]) - expected.index = Index([1.0, 2.0, "All"], name="a") - expected.columns = Index([3, 4, "All"], name="b") - tm.assert_frame_equal(actual, expected) - - df = DataFrame( - {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]} - ) - actual = pd.crosstab(df.a, df.b, margins=True, dropna=True) - expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) - expected.index = Index([1.0, 2.0, "All"], name="a") - expected.columns = Index([3.0, 4.0, "All"], name="b") - tm.assert_frame_equal(actual, expected) - - df = DataFrame( - {"a": [1, np.nan, np.nan, np.nan, np.nan, 2], "b": [3, 3, 4, 4, 4, 4]} - ) - actual = pd.crosstab(df.a, df.b, margins=True, dropna=True) - expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) - expected.index = Index([1.0, 2.0, "All"], name="a") - expected.columns = Index([3, 4, "All"], name="b") - tm.assert_frame_equal(actual, expected) - - # GH 12642 - # _add_margins raises KeyError: Level None not found - # when margins=True and dropna=False - df = pd.DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) - actual = pd.crosstab(df.a, df.b, margins=True, dropna=False) - expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 4, 6]]) - expected.index = Index([1.0, 2.0, "All"], name="a") - expected.columns = Index([3, 4, "All"], name="b") - tm.assert_frame_equal(actual, expected) - - df = DataFrame( - {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]} - ) - actual = pd.crosstab(df.a, df.b, margins=True, dropna=False) - expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 4, 6]]) - expected.index = Index([1.0, 2.0, "All"], name="a") - expected.columns = Index([3.0, 4.0, "All"], name="b") - tm.assert_frame_equal(actual, expected) - - a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object) - b = np.array(["one", "one", "two", "one", "two", np.nan, "two"], dtype=object) - c = np.array( - ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object - ) - - actual = pd.crosstab( - a, [b, c], rownames=["a"], colnames=["b", "c"], margins=True, dropna=False - ) - m = MultiIndex.from_arrays( - [ - ["one", "one", "two", "two", "All"], - ["dull", "shiny", "dull", "shiny", ""], - ], - names=["b", "c"], - ) - expected = DataFrame( - [[1, 0, 1, 0, 2], [2, 0, 1, 1, 5], [3, 0, 2, 1, 7]], columns=m - ) - expected.index = Index(["bar", "foo", "All"], name="a") - tm.assert_frame_equal(actual, expected) - - actual = pd.crosstab( - [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=False - ) - m = MultiIndex.from_arrays( - [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]], - names=["a", "b"], - ) - expected = DataFrame( - [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 2, 7]], index=m - ) - expected.columns = Index(["dull", "shiny", "All"], name="c") - tm.assert_frame_equal(actual, expected) - - actual = pd.crosstab( - [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=True - ) - m = MultiIndex.from_arrays( - [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]], - names=["a", "b"], - ) - expected = DataFrame( - [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 1, 6]], index=m - ) - expected.columns = Index(["dull", "shiny", "All"], name="c") - tm.assert_frame_equal(actual, expected) - - def test_crosstab_normalize(self): - # Issue 12578 - df = pd.DataFrame( - {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]} - ) - - rindex = pd.Index([1, 2], name="a") - cindex = pd.Index([3, 4], name="b") - full_normal = pd.DataFrame([[0.2, 0], [0.2, 0.6]], index=rindex, columns=cindex) - row_normal = pd.DataFrame( - [[1.0, 0], [0.25, 0.75]], index=rindex, columns=cindex - ) - col_normal = pd.DataFrame([[0.5, 0], [0.5, 1.0]], index=rindex, columns=cindex) - - # Check all normalize args - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize="all"), full_normal) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True), full_normal) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize="index"), row_normal) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize="columns"), col_normal) - tm.assert_frame_equal( - pd.crosstab(df.a, df.b, normalize=1), - pd.crosstab(df.a, df.b, normalize="columns"), - ) - tm.assert_frame_equal( - pd.crosstab(df.a, df.b, normalize=0), - pd.crosstab(df.a, df.b, normalize="index"), - ) - - row_normal_margins = pd.DataFrame( - [[1.0, 0], [0.25, 0.75], [0.4, 0.6]], - index=pd.Index([1, 2, "All"], name="a", dtype="object"), - columns=pd.Index([3, 4], name="b", dtype="object"), - ) - col_normal_margins = pd.DataFrame( - [[0.5, 0, 0.2], [0.5, 1.0, 0.8]], - index=pd.Index([1, 2], name="a", dtype="object"), - columns=pd.Index([3, 4, "All"], name="b", dtype="object"), - ) - - all_normal_margins = pd.DataFrame( - [[0.2, 0, 0.2], [0.2, 0.6, 0.8], [0.4, 0.6, 1]], - index=pd.Index([1, 2, "All"], name="a", dtype="object"), - columns=pd.Index([3, 4, "All"], name="b", dtype="object"), - ) - tm.assert_frame_equal( - pd.crosstab(df.a, df.b, normalize="index", margins=True), row_normal_margins - ) - tm.assert_frame_equal( - pd.crosstab(df.a, df.b, normalize="columns", margins=True), - col_normal_margins, - ) - tm.assert_frame_equal( - pd.crosstab(df.a, df.b, normalize=True, margins=True), all_normal_margins - ) - - # Test arrays - pd.crosstab( - [np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])], np.array([1, 2, 1, 2]) - ) - - # Test with aggfunc - norm_counts = pd.DataFrame( - [[0.25, 0, 0.25], [0.25, 0.5, 0.75], [0.5, 0.5, 1]], - index=pd.Index([1, 2, "All"], name="a", dtype="object"), - columns=pd.Index([3, 4, "All"], name="b"), - ) - test_case = pd.crosstab( - df.a, df.b, df.c, aggfunc="count", normalize="all", margins=True - ) - tm.assert_frame_equal(test_case, norm_counts) - - df = pd.DataFrame( - {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [0, 4, np.nan, 3, 3]} - ) - - norm_sum = pd.DataFrame( - [[0, 0, 0.0], [0.4, 0.6, 1], [0.4, 0.6, 1]], - index=pd.Index([1, 2, "All"], name="a", dtype="object"), - columns=pd.Index([3, 4, "All"], name="b", dtype="object"), - ) - test_case = pd.crosstab( - df.a, df.b, df.c, aggfunc=np.sum, normalize="all", margins=True - ) - tm.assert_frame_equal(test_case, norm_sum) - - def test_crosstab_with_empties(self): - # Check handling of empties - df = pd.DataFrame( - { - "a": [1, 2, 2, 2, 2], - "b": [3, 3, 4, 4, 4], - "c": [np.nan, np.nan, np.nan, np.nan, np.nan], - } - ) - - empty = pd.DataFrame( - [[0.0, 0.0], [0.0, 0.0]], - index=pd.Index([1, 2], name="a", dtype="int64"), - columns=pd.Index([3, 4], name="b"), - ) - - for i in [True, "index", "columns"]: - calculated = pd.crosstab( - df.a, df.b, values=df.c, aggfunc="count", normalize=i - ) - tm.assert_frame_equal(empty, calculated) - - nans = pd.DataFrame( - [[0.0, np.nan], [0.0, 0.0]], - index=pd.Index([1, 2], name="a", dtype="int64"), - columns=pd.Index([3, 4], name="b"), - ) - - calculated = pd.crosstab( - df.a, df.b, values=df.c, aggfunc="count", normalize=False - ) - tm.assert_frame_equal(nans, calculated) - - def test_crosstab_errors(self): - # Issue 12578 - - df = pd.DataFrame( - {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]} - ) - - error = "values cannot be used without an aggfunc." - with pytest.raises(ValueError, match=error): - pd.crosstab(df.a, df.b, values=df.c) - - error = "aggfunc cannot be used without values" - with pytest.raises(ValueError, match=error): - pd.crosstab(df.a, df.b, aggfunc=np.mean) - - error = "Not a valid normalize argument" - with pytest.raises(ValueError, match=error): - pd.crosstab(df.a, df.b, normalize="42") - - with pytest.raises(ValueError, match=error): - pd.crosstab(df.a, df.b, normalize=42) - - error = "Not a valid margins argument" - with pytest.raises(ValueError, match=error): - pd.crosstab(df.a, df.b, normalize="all", margins=42) - - def test_crosstab_with_categorial_columns(self): - # GH 8860 - df = pd.DataFrame( - { - "MAKE": ["Honda", "Acura", "Tesla", "Honda", "Honda", "Acura"], - "MODEL": ["Sedan", "Sedan", "Electric", "Pickup", "Sedan", "Sedan"], - } - ) - categories = ["Sedan", "Electric", "Pickup"] - df["MODEL"] = df["MODEL"].astype("category").cat.set_categories(categories) - result = pd.crosstab(df["MAKE"], df["MODEL"]) - - expected_index = pd.Index(["Acura", "Honda", "Tesla"], name="MAKE") - expected_columns = pd.CategoricalIndex( - categories, categories=categories, ordered=False, name="MODEL" - ) - expected_data = [[2, 0, 0], [2, 0, 1], [0, 1, 0]] - expected = pd.DataFrame( - expected_data, index=expected_index, columns=expected_columns - ) - tm.assert_frame_equal(result, expected) - - def test_crosstab_with_numpy_size(self): - # GH 4003 - df = pd.DataFrame( - { - "A": ["one", "one", "two", "three"] * 6, - "B": ["A", "B", "C"] * 8, - "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, - "D": np.random.randn(24), - "E": np.random.randn(24), - } - ) - result = pd.crosstab( - index=[df["A"], df["B"]], - columns=[df["C"]], - margins=True, - aggfunc=np.size, - values=df["D"], - ) - expected_index = pd.MultiIndex( - levels=[["All", "one", "three", "two"], ["", "A", "B", "C"]], - codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]], - names=["A", "B"], - ) - expected_column = pd.Index(["bar", "foo", "All"], dtype="object", name="C") - expected_data = np.array( - [ - [2.0, 2.0, 4.0], - [2.0, 2.0, 4.0], - [2.0, 2.0, 4.0], - [2.0, np.nan, 2.0], - [np.nan, 2.0, 2.0], - [2.0, np.nan, 2.0], - [np.nan, 2.0, 2.0], - [2.0, np.nan, 2.0], - [np.nan, 2.0, 2.0], - [12.0, 12.0, 24.0], - ] - ) - expected = pd.DataFrame( - expected_data, index=expected_index, columns=expected_column - ) - tm.assert_frame_equal(result, expected) - - def test_crosstab_dup_index_names(self): - # GH 13279 - s = pd.Series(range(3), name="foo") - - result = pd.crosstab(s, s) - expected_index = pd.Index(range(3), name="foo") - expected = pd.DataFrame( - np.eye(3, dtype=np.int64), index=expected_index, columns=expected_index - ) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]]) - def test_crosstab_tuple_name(self, names): - s1 = pd.Series(range(3), name=names[0]) - s2 = pd.Series(range(1, 4), name=names[1]) - - mi = pd.MultiIndex.from_arrays([range(3), range(1, 4)], names=names) - expected = pd.Series(1, index=mi).unstack(1, fill_value=0) - - result = pd.crosstab(s1, s2) - tm.assert_frame_equal(result, expected) - - def test_crosstab_both_tuple_names(self): - # GH 18321 - s1 = pd.Series(range(3), name=("a", "b")) - s2 = pd.Series(range(3), name=("c", "d")) - - expected = pd.DataFrame( - np.eye(3, dtype="int64"), - index=pd.Index(range(3), name=("a", "b")), - columns=pd.Index(range(3), name=("c", "d")), - ) - result = crosstab(s1, s2) - tm.assert_frame_equal(result, expected) - - def test_crosstab_unsorted_order(self): - df = pd.DataFrame({"b": [3, 1, 2], "a": [5, 4, 6]}, index=["C", "A", "B"]) - result = pd.crosstab(df.index, [df.b, df.a]) - e_idx = pd.Index(["A", "B", "C"], name="row_0") - e_columns = pd.MultiIndex.from_tuples( - [(1, 4), (2, 6), (3, 5)], names=["b", "a"] - ) - expected = pd.DataFrame( - [[1, 0, 0], [0, 1, 0], [0, 0, 1]], index=e_idx, columns=e_columns - ) - tm.assert_frame_equal(result, expected) - - def test_crosstab_normalize_multiple_columns(self): - # GH 15150 - df = pd.DataFrame( - { - "A": ["one", "one", "two", "three"] * 6, - "B": ["A", "B", "C"] * 8, - "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, - "D": [0] * 24, - "E": [0] * 24, - } - ) - result = pd.crosstab( - [df.A, df.B], - df.C, - values=df.D, - aggfunc=np.sum, - normalize=True, - margins=True, - ) - expected = pd.DataFrame( - np.array([0] * 29 + [1], dtype=float).reshape(10, 3), - columns=Index(["bar", "foo", "All"], dtype="object", name="C"), - index=MultiIndex.from_tuples( - [ - ("one", "A"), - ("one", "B"), - ("one", "C"), - ("three", "A"), - ("three", "B"), - ("three", "C"), - ("two", "A"), - ("two", "B"), - ("two", "C"), - ("All", ""), - ], - names=["A", "B"], - ), - ) - tm.assert_frame_equal(result, expected) - - def test_margin_normalize(self): - # GH 27500 - df = pd.DataFrame( - { - "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], - "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], - "C": [ - "small", - "large", - "large", - "small", - "small", - "large", - "small", - "small", - "large", - ], - "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], - "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], - } - ) - # normalize on index - result = pd.crosstab( - [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0 - ) - expected = pd.DataFrame( - [[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]] - ) - expected.index = MultiIndex( - levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], - codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], - names=["A", "B"], - ) - expected.columns = Index(["large", "small"], dtype="object", name="C") - tm.assert_frame_equal(result, expected) - - # normalize on columns - result = pd.crosstab( - [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1 - ) - expected = pd.DataFrame( - [ - [0.25, 0.2, 0.222222], - [0.25, 0.2, 0.222222], - [0.5, 0.2, 0.333333], - [0, 0.4, 0.222222], - ] - ) - expected.columns = Index( - ["large", "small", "Sub-Total"], dtype="object", name="C" - ) - expected.index = MultiIndex( - levels=[["bar", "foo"], ["one", "two"]], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]], - names=["A", "B"], - ) - tm.assert_frame_equal(result, expected) - - # normalize on both index and column - result = pd.crosstab( - [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True - ) - expected = pd.DataFrame( - [ - [0.111111, 0.111111, 0.222222], - [0.111111, 0.111111, 0.222222], - [0.222222, 0.111111, 0.333333], - [0.000000, 0.222222, 0.222222], - [0.444444, 0.555555, 1], - ] - ) - expected.columns = Index( - ["large", "small", "Sub-Total"], dtype="object", name="C" - ) - expected.index = MultiIndex( - levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], - codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], - names=["A", "B"], - ) - tm.assert_frame_equal(result, expected) From 7e49bd512874adabe6798fbaad7780519ef190c2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 9 Mar 2020 14:27:56 -0700 Subject: [PATCH 5/5] CLN: remove Categorical.put (#32554) --- pandas/core/arrays/categorical.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 92859479ec73f..ba4c2e168e0c4 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1409,12 +1409,6 @@ def notna(self): notnull = notna - def put(self, *args, **kwargs): - """ - Replace specific elements in the Categorical with given values. - """ - raise NotImplementedError(("'put' is not yet implemented for Categorical")) - def dropna(self): """ Return the Categorical without null values.