From 2294eb8e6ea8dc0f8e8149ef62e696a4dd8ad308 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 31 Jul 2023 15:36:16 -0700 Subject: [PATCH] REF/TST: handle boolean dtypes in base extension tests --- pandas/tests/extension/base/groupby.py | 35 +++++- pandas/tests/extension/base/methods.py | 52 ++++++++- pandas/tests/extension/conftest.py | 8 +- pandas/tests/extension/test_arrow.py | 60 +--------- pandas/tests/extension/test_boolean.py | 150 +------------------------ 5 files changed, 91 insertions(+), 214 deletions(-) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index bc8781eacfe06..acabcb600ffcc 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -30,15 +30,25 @@ def test_grouping_grouper(self, data_for_grouping): @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_extension_agg(self, as_index, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) + + is_bool = data_for_grouping.dtype._is_boolean + if is_bool: + # only 2 unique values, and the final entry has c==b + # (see data_for_grouping docstring) + df = df.iloc[:-1] + result = df.groupby("B", as_index=as_index).A.mean() _, uniques = pd.factorize(data_for_grouping, sort=True) + exp_vals = [3.0, 1.0, 4.0] + if is_bool: + exp_vals = exp_vals[:-1] if as_index: index = pd.Index(uniques, name="B") - expected = pd.Series([3.0, 1.0, 4.0], index=index, name="A") + expected = pd.Series(exp_vals, index=index, name="A") self.assert_series_equal(result, expected) else: - expected = pd.DataFrame({"B": uniques, "A": [3.0, 1.0, 4.0]}) + expected = pd.DataFrame({"B": uniques, "A": exp_vals}) self.assert_frame_equal(result, expected) def test_groupby_agg_extension(self, data_for_grouping): @@ -83,19 +93,38 @@ def test_groupby_agg_extension_timedelta_cumsum_with_named_aggregation(self): def test_groupby_extension_no_sort(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) + + is_bool = data_for_grouping.dtype._is_boolean + if is_bool: + # only 2 unique values, and the final entry has c==b + # (see data_for_grouping docstring) + df = df.iloc[:-1] + result = df.groupby("B", sort=False).A.mean() _, index = pd.factorize(data_for_grouping, sort=False) index = pd.Index(index, name="B") - expected = pd.Series([1.0, 3.0, 4.0], index=index, name="A") + exp_vals = [1.0, 3.0, 4.0] + if is_bool: + exp_vals = exp_vals[:-1] + expected = pd.Series(exp_vals, index=index, name="A") self.assert_series_equal(result, expected) def test_groupby_extension_transform(self, data_for_grouping): + is_bool = data_for_grouping.dtype._is_boolean + valid = data_for_grouping[~data_for_grouping.isna()] df = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4], "B": valid}) + is_bool = data_for_grouping.dtype._is_boolean + if is_bool: + # only 2 unique values, and the final entry has c==b + # (see data_for_grouping docstring) + df = df.iloc[:-1] result = df.groupby("B").A.transform(len) expected = pd.Series([3, 3, 2, 2, 3, 1], name="A") + if is_bool: + expected = expected[:-1] self.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index a868187e5d01c..25bc99e8b9270 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -115,14 +115,22 @@ def test_argsort_missing(self, data_missing_for_sorting): def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value): # GH 24382 + is_bool = data_for_sorting.dtype._is_boolean + + exp_argmax = 1 + exp_argmax_repeated = 3 + if is_bool: + # See data_for_sorting docstring + exp_argmax = 0 + exp_argmax_repeated = 1 # data_for_sorting -> [B, C, A] with A < B < C - assert data_for_sorting.argmax() == 1 + assert data_for_sorting.argmax() == exp_argmax assert data_for_sorting.argmin() == 2 # with repeated values -> first occurrence data = data_for_sorting.take([2, 0, 0, 1, 1, 2]) - assert data.argmax() == 3 + assert data.argmax() == exp_argmax_repeated assert data.argmin() == 0 # with missing values @@ -244,8 +252,15 @@ def test_unique(self, data, box, method): def test_factorize(self, data_for_grouping): codes, uniques = pd.factorize(data_for_grouping, use_na_sentinel=True) - expected_codes = np.array([0, 0, -1, -1, 1, 1, 0, 2], dtype=np.intp) - expected_uniques = data_for_grouping.take([0, 4, 7]) + + is_bool = data_for_grouping.dtype._is_boolean + if is_bool: + # only 2 unique values + expected_codes = np.array([0, 0, -1, -1, 1, 1, 0, 0], dtype=np.intp) + expected_uniques = data_for_grouping.take([0, 4]) + else: + expected_codes = np.array([0, 0, -1, -1, 1, 1, 0, 2], dtype=np.intp) + expected_uniques = data_for_grouping.take([0, 4, 7]) tm.assert_numpy_array_equal(codes, expected_codes) self.assert_extension_array_equal(uniques, expected_uniques) @@ -457,6 +472,9 @@ def test_hash_pandas_object_works(self, data, as_frame): self.assert_equal(a, b) def test_searchsorted(self, data_for_sorting, as_series): + if data_for_sorting.dtype._is_boolean: + return self._test_searchsorted_bool_dtypes(data_for_sorting, as_series) + b, c, a = data_for_sorting arr = data_for_sorting.take([2, 0, 1]) # to get [a, b, c] @@ -480,6 +498,32 @@ def test_searchsorted(self, data_for_sorting, as_series): sorter = np.array([1, 2, 0]) assert data_for_sorting.searchsorted(a, sorter=sorter) == 0 + def _test_searchsorted_bool_dtypes(self, data_for_sorting, as_series): + # We call this from test_searchsorted in cases where we have a + # boolean-like dtype. The non-bool test assumes we have more than 2 + # unique values. + dtype = data_for_sorting.dtype + data_for_sorting = pd.array([True, False], dtype=dtype) + b, a = data_for_sorting + arr = type(data_for_sorting)._from_sequence([a, b]) + + if as_series: + arr = pd.Series(arr) + assert arr.searchsorted(a) == 0 + assert arr.searchsorted(a, side="right") == 1 + + assert arr.searchsorted(b) == 1 + assert arr.searchsorted(b, side="right") == 2 + + result = arr.searchsorted(arr.take([0, 1])) + expected = np.array([0, 1], dtype=np.intp) + + tm.assert_numpy_array_equal(result, expected) + + # sorter + sorter = np.array([1, 0]) + assert data_for_sorting.searchsorted(a, sorter=sorter) == 0 + def test_where_series(self, data, na_value, as_frame): assert data[0] != data[1] cls = type(data) diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index 0d14128e3bebf..85bbafbeb5129 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -76,6 +76,9 @@ def data_for_sorting(): This should be three items [B, C, A] with A < B < C + + For boolean dtypes (for which there are only 2 values available), + set B=C=True """ raise NotImplementedError @@ -117,7 +120,10 @@ def data_for_grouping(): Expected to be like [B, B, NA, NA, A, A, B, C] - Where A < B < C and NA is missing + Where A < B < C and NA is missing. + + If a dtype has _is_boolean = True, i.e. only 2 unique non-NA entries, + then set C=B. """ raise NotImplementedError diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 197cdc3f436a1..8e4e8f821dd90 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -586,38 +586,6 @@ def test_reduce_series( class TestBaseGroupby(base.BaseGroupbyTests): - def test_groupby_extension_no_sort(self, data_for_grouping, request): - pa_dtype = data_for_grouping.dtype.pyarrow_dtype - if pa.types.is_boolean(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason=f"{pa_dtype} only has 2 unique possible values", - ) - ) - super().test_groupby_extension_no_sort(data_for_grouping) - - def test_groupby_extension_transform(self, data_for_grouping, request): - pa_dtype = data_for_grouping.dtype.pyarrow_dtype - if pa.types.is_boolean(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason=f"{pa_dtype} only has 2 unique possible values", - ) - ) - super().test_groupby_extension_transform(data_for_grouping) - - @pytest.mark.parametrize("as_index", [True, False]) - def test_groupby_extension_agg(self, as_index, data_for_grouping, request): - pa_dtype = data_for_grouping.dtype.pyarrow_dtype - if pa.types.is_boolean(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - raises=ValueError, - reason=f"{pa_dtype} only has 2 unique possible values", - ) - ) - super().test_groupby_extension_agg(as_index, data_for_grouping) - def test_in_numeric_groupby(self, data_for_grouping): dtype = data_for_grouping.dtype if is_string_dtype(dtype): @@ -844,13 +812,7 @@ def test_argmin_argmax( self, data_for_sorting, data_missing_for_sorting, na_value, request ): pa_dtype = data_for_sorting.dtype.pyarrow_dtype - if pa.types.is_boolean(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason=f"{pa_dtype} only has 2 unique possible values", - ) - ) - elif pa.types.is_decimal(pa_dtype) and pa_version_under7p0: + if pa.types.is_decimal(pa_dtype) and pa_version_under7p0: request.node.add_marker( pytest.mark.xfail( reason=f"No pyarrow kernel for {pa_dtype}", @@ -887,16 +849,6 @@ def test_argreduce_series( data_missing_for_sorting, op_name, skipna, expected ) - def test_factorize(self, data_for_grouping, request): - pa_dtype = data_for_grouping.dtype.pyarrow_dtype - if pa.types.is_boolean(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason=f"{pa_dtype} only has 2 unique possible values", - ) - ) - super().test_factorize(data_for_grouping) - _combine_le_expected_dtype = "bool[pyarrow]" def test_combine_add(self, data_repeated, request): @@ -912,16 +864,6 @@ def test_combine_add(self, data_repeated, request): else: super().test_combine_add(data_repeated) - def test_searchsorted(self, data_for_sorting, as_series, request): - pa_dtype = data_for_sorting.dtype.pyarrow_dtype - if pa.types.is_boolean(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason=f"{pa_dtype} only has 2 unique possible values", - ) - ) - super().test_searchsorted(data_for_sorting, as_series) - def test_basic_equals(self, data): # https://github.com/pandas-dev/pandas/issues/34660 assert pd.Series(data).equals(pd.Series(data)) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 7c27f105b9b45..8a15e27802beb 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -83,8 +83,9 @@ def na_value(): def data_for_grouping(dtype): b = True a = False + c = b na = np.nan - return pd.array([b, b, na, na, a, a, b], dtype=dtype) + return pd.array([b, b, na, na, a, a, b, c], dtype=dtype) class TestDtype(base.BaseDtypeTests): @@ -188,55 +189,6 @@ class TestReshaping(base.BaseReshapingTests): class TestMethods(base.BaseMethodsTests): _combine_le_expected_dtype = "boolean" - def test_factorize(self, data_for_grouping): - # override because we only have 2 unique values - labels, uniques = pd.factorize(data_for_grouping, use_na_sentinel=True) - expected_labels = np.array([0, 0, -1, -1, 1, 1, 0], dtype=np.intp) - expected_uniques = data_for_grouping.take([0, 4]) - - tm.assert_numpy_array_equal(labels, expected_labels) - self.assert_extension_array_equal(uniques, expected_uniques) - - def test_searchsorted(self, data_for_sorting, as_series): - # override because we only have 2 unique values - data_for_sorting = pd.array([True, False], dtype="boolean") - b, a = data_for_sorting - arr = type(data_for_sorting)._from_sequence([a, b]) - - if as_series: - arr = pd.Series(arr) - assert arr.searchsorted(a) == 0 - assert arr.searchsorted(a, side="right") == 1 - - assert arr.searchsorted(b) == 1 - assert arr.searchsorted(b, side="right") == 2 - - result = arr.searchsorted(arr.take([0, 1])) - expected = np.array([0, 1], dtype=np.intp) - - tm.assert_numpy_array_equal(result, expected) - - # sorter - sorter = np.array([1, 0]) - assert data_for_sorting.searchsorted(a, sorter=sorter) == 0 - - def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting): - # override because there are only 2 unique values - - # data_for_sorting -> [B, C, A] with A < B < C -> here True, True, False - assert data_for_sorting.argmax() == 0 - assert data_for_sorting.argmin() == 2 - - # with repeated values -> first occurrence - data = data_for_sorting.take([2, 0, 0, 1, 1, 2]) - assert data.argmax() == 1 - assert data.argmin() == 0 - - # with missing values - # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing. - assert data_missing_for_sorting.argmax() == 0 - assert data_missing_for_sorting.argmin() == 2 - class TestCasting(base.BaseCastingTests): pass @@ -248,105 +200,9 @@ class TestGroupby(base.BaseGroupbyTests): unique values, base tests uses 3 groups. """ - def test_grouping_grouper(self, data_for_grouping): - df = pd.DataFrame( - {"A": ["B", "B", None, None, "A", "A", "B"], "B": data_for_grouping} - ) - gr1 = df.groupby("A").grouper.groupings[0] - gr2 = df.groupby("B").grouper.groupings[0] - - tm.assert_numpy_array_equal(gr1.grouping_vector, df.A.values) - tm.assert_extension_array_equal(gr2.grouping_vector, data_for_grouping) - - @pytest.mark.parametrize("as_index", [True, False]) - def test_groupby_extension_agg(self, as_index, data_for_grouping): - df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) - result = df.groupby("B", as_index=as_index).A.mean() - _, uniques = pd.factorize(data_for_grouping, sort=True) - - if as_index: - index = pd.Index(uniques, name="B") - expected = pd.Series([3.0, 1.0], index=index, name="A") - self.assert_series_equal(result, expected) - else: - expected = pd.DataFrame({"B": uniques, "A": [3.0, 1.0]}) - self.assert_frame_equal(result, expected) - - def test_groupby_agg_extension(self, data_for_grouping): - # GH#38980 groupby agg on extension type fails for non-numeric types - df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) - - expected = df.iloc[[0, 2, 4]] - expected = expected.set_index("A") - - result = df.groupby("A").agg({"B": "first"}) - self.assert_frame_equal(result, expected) - - result = df.groupby("A").agg("first") - self.assert_frame_equal(result, expected) - - result = df.groupby("A").first() - self.assert_frame_equal(result, expected) - - def test_groupby_extension_no_sort(self, data_for_grouping): - df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) - result = df.groupby("B", sort=False).A.mean() - _, index = pd.factorize(data_for_grouping, sort=False) - - index = pd.Index(index, name="B") - expected = pd.Series([1.0, 3.0], index=index, name="A") - self.assert_series_equal(result, expected) - - def test_groupby_extension_transform(self, data_for_grouping): - valid = data_for_grouping[~data_for_grouping.isna()] - df = pd.DataFrame({"A": [1, 1, 3, 3, 1], "B": valid}) - - result = df.groupby("B").A.transform(len) - expected = pd.Series([3, 3, 2, 2, 3], name="A") - - self.assert_series_equal(result, expected) - - def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): - df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) - df.groupby("B", group_keys=False).apply(groupby_apply_op) - df.groupby("B", group_keys=False).A.apply(groupby_apply_op) - df.groupby("A", group_keys=False).apply(groupby_apply_op) - df.groupby("A", group_keys=False).B.apply(groupby_apply_op) - - def test_groupby_apply_identity(self, data_for_grouping): - df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) - result = df.groupby("A").B.apply(lambda x: x.array) - expected = pd.Series( - [ - df.B.iloc[[0, 1, 6]].array, - df.B.iloc[[2, 3]].array, - df.B.iloc[[4, 5]].array, - ], - index=pd.Index([1, 2, 3], name="A"), - name="B", - ) - self.assert_series_equal(result, expected) - - def test_in_numeric_groupby(self, data_for_grouping): - df = pd.DataFrame( - { - "A": [1, 1, 2, 2, 3, 3, 1], - "B": data_for_grouping, - "C": [1, 1, 1, 1, 1, 1, 1], - } - ) - result = df.groupby("A").sum().columns - - if data_for_grouping.dtype._is_numeric: - expected = pd.Index(["B", "C"]) - else: - expected = pd.Index(["C"]) - - tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("min_count", [0, 10]) def test_groupby_sum_mincount(self, data_for_grouping, min_count): - df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping[:-1]}) result = df.groupby("A").sum(min_count=min_count) if min_count == 0: expected = pd.DataFrame(