CLN: groupby test (#58777)

* Clean test_cumulative
* Clean test_counting
* Clean test_filters
* Undo change
pandas-dev · May 20, 2024 · 03d86d6 · 03d86d6
1 parent 6a7b3da
commit 03d86d6
Show file tree

Hide file tree

Showing 3 changed files with 76 additions and 63 deletions.
diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py
@@ -321,31 +321,33 @@ def test_count_object():
     expected = Series([3, 3], index=Index([2, 3], name="c"), name="a")
     tm.assert_series_equal(result, expected)
 
+
+def test_count_object_nan():
     df = DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3})
     result = df.groupby("c").a.count()
     expected = Series([1, 3], index=Index([2, 3], name="c"), name="a")
     tm.assert_series_equal(result, expected)
 
 
-def test_count_cross_type():
+@pytest.mark.parametrize("typ", ["object", "float32"])
+def test_count_cross_type(typ):
     # GH8169
     # Set float64 dtype to avoid upcast when setting nan below
     vals = np.hstack(
         (
-            np.random.default_rng(2).integers(0, 5, (100, 2)),
-            np.random.default_rng(2).integers(0, 2, (100, 2)),
+            np.random.default_rng(2).integers(0, 5, (10, 2)),
+            np.random.default_rng(2).integers(0, 2, (10, 2)),
         )
     ).astype("float64")
 
     df = DataFrame(vals, columns=["a", "b", "c", "d"])
     df[df == 2] = np.nan
     expected = df.groupby(["c", "d"]).count()
 
-    for t in ["float32", "object"]:
-        df["a"] = df["a"].astype(t)
-        df["b"] = df["b"].astype(t)
-        result = df.groupby(["c", "d"]).count()
-        tm.assert_frame_equal(result, expected)
+    df["a"] = df["a"].astype(typ)
+    df["b"] = df["b"].astype(typ)
+    result = df.groupby(["c", "d"]).count()
+    tm.assert_frame_equal(result, expected)
 
 
 def test_lower_int_prec_count():

diff --git a/pandas/tests/groupby/test_cumulative.py b/pandas/tests/groupby/test_cumulative.py
@@ -94,21 +94,28 @@ def test_groupby_cumprod_nan_influences_other_columns():
 
 def test_cummin(dtypes_for_minmax):
     dtype = dtypes_for_minmax[0]
-    min_val = dtypes_for_minmax[1]
 
     # GH 15048
     base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
     expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
 
     df = base_df.astype(dtype)
-
     expected = DataFrame({"B": expected_mins}).astype(dtype)
     result = df.groupby("A").cummin()
     tm.assert_frame_equal(result, expected)
     result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
     tm.assert_frame_equal(result, expected)
 
-    # Test w/ min value for dtype
+
+def test_cummin_min_value_for_dtype(dtypes_for_minmax):
+    dtype = dtypes_for_minmax[0]
+    min_val = dtypes_for_minmax[1]
+
+    # GH 15048
+    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
+    expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
+    expected = DataFrame({"B": expected_mins}).astype(dtype)
+    df = base_df.astype(dtype)
     df.loc[[2, 6], "B"] = min_val
     df.loc[[1, 5], "B"] = min_val + 1
     expected.loc[[2, 3, 6, 7], "B"] = min_val
@@ -120,8 +127,10 @@ def test_cummin(dtypes_for_minmax):
     )
     tm.assert_frame_equal(result, expected, check_exact=True)
 
-    # Test nan in some values
+
+def test_cummin_nan_in_some_values(dtypes_for_minmax):
     # Explicit cast to float to avoid implicit cast when setting nan
+    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
     base_df = base_df.astype({"B": "float"})
     base_df.loc[[0, 2, 4, 6], "B"] = np.nan
     expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]})
@@ -132,13 +141,17 @@ def test_cummin(dtypes_for_minmax):
     )
     tm.assert_frame_equal(result, expected)
 
+
+def test_cummin_datetime():
     # GH 15561
     df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
     expected = Series(pd.to_datetime("2001"), index=[0], name="b")
 
     result = df.groupby("a")["b"].cummin()
     tm.assert_series_equal(expected, result)
 
+
+def test_cummin_getattr_series():
     # GH 15635
     df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]})
     result = df.groupby("a").b.cummin()
@@ -163,7 +176,6 @@ def test_cummin_max_all_nan_column(method, dtype):
 
 def test_cummax(dtypes_for_minmax):
     dtype = dtypes_for_minmax[0]
-    max_val = dtypes_for_minmax[2]
 
     # GH 15048
     base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
@@ -177,8 +189,18 @@ def test_cummax(dtypes_for_minmax):
     result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
     tm.assert_frame_equal(result, expected)
 
-    # Test w/ max value for dtype
+
+def test_cummax_min_value_for_dtype(dtypes_for_minmax):
+    dtype = dtypes_for_minmax[0]
+    max_val = dtypes_for_minmax[2]
+
+    # GH 15048
+    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
+    expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]
+
+    df = base_df.astype(dtype)
     df.loc[[2, 6], "B"] = max_val
+    expected = DataFrame({"B": expected_maxs}).astype(dtype)
     expected.loc[[2, 3, 6, 7], "B"] = max_val
     result = df.groupby("A").cummax()
     tm.assert_frame_equal(result, expected)
@@ -187,8 +209,11 @@ def test_cummax(dtypes_for_minmax):
     )
     tm.assert_frame_equal(result, expected)
 
+
+def test_cummax_nan_in_some_values(dtypes_for_minmax):
     # Test nan in some values
     # Explicit cast to float to avoid implicit cast when setting nan
+    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
     base_df = base_df.astype({"B": "float"})
     base_df.loc[[0, 2, 4, 6], "B"] = np.nan
     expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
@@ -199,13 +224,17 @@ def test_cummax(dtypes_for_minmax):
     )
     tm.assert_frame_equal(result, expected)
 
+
+def test_cummax_datetime():
     # GH 15561
     df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
     expected = Series(pd.to_datetime("2001"), index=[0], name="b")
 
     result = df.groupby("a")["b"].cummax()
     tm.assert_series_equal(expected, result)
 
+
+def test_cummax_getattr_series():
     # GH 15635
     df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]})
     result = df.groupby("a").b.cummax()
@@ -292,15 +321,12 @@ def test_nullable_int_not_cast_as_float(method, dtype, val):
     tm.assert_frame_equal(result, expected)
 
 
-def test_cython_api2():
+def test_cython_api2(as_index):
     # this takes the fast apply path
 
     # cumsum (GH5614)
+    # GH 5755 - cumsum is a transformer and should ignore as_index
     df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
     expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"])
-    result = df.groupby("A").cumsum()
-    tm.assert_frame_equal(result, expected)
-
-    # GH 5755 - cumsum is a transformer and should ignore as_index
-    result = df.groupby("A", as_index=False).cumsum()
+    result = df.groupby("A", as_index=as_index).cumsum()
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py
@@ -85,6 +85,9 @@ def test_filter_out_no_groups():
     grouped = s.groupby(grouper)
     filtered = grouped.filter(lambda x: x.mean() > 0)
     tm.assert_series_equal(filtered, s)
+
+
+def test_filter_out_no_groups_dataframe():
     df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
     grouper = df["A"].apply(lambda x: x % 2)
     grouped = df.groupby(grouper)
@@ -100,6 +103,9 @@ def test_filter_out_all_groups_in_df():
     expected = DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3})
     tm.assert_frame_equal(expected, res)
 
+
+def test_filter_out_all_groups_in_df_dropna_true():
+    # GH12768
     df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
     res = df.groupby("a")
     res = res.filter(lambda x: x["b"].sum() > 5, dropna=True)
@@ -179,7 +185,7 @@ def test_filter_pdna_is_false():
 
 def test_filter_against_workaround_ints():
     # Series of ints
-    s = Series(np.random.default_rng(2).integers(0, 100, 100))
+    s = Series(np.random.default_rng(2).integers(0, 100, 10))
     grouper = s.apply(lambda x: np.round(x, -1))
     grouped = s.groupby(grouper)
     f = lambda x: x.mean() > 10
@@ -191,7 +197,7 @@ def test_filter_against_workaround_ints():
 
 def test_filter_against_workaround_floats():
     # Series of floats
-    s = 100 * Series(np.random.default_rng(2).random(100))
+    s = 100 * Series(np.random.default_rng(2).random(10))
     grouper = s.apply(lambda x: np.round(x, -1))
     grouped = s.groupby(grouper)
     f = lambda x: x.mean() > 10
@@ -203,40 +209,40 @@ def test_filter_against_workaround_floats():
 def test_filter_against_workaround_dataframe():
     # Set up DataFrame of ints, floats, strings.
     letters = np.array(list(ascii_lowercase))
-    N = 100
+    N = 10
     random_letters = letters.take(
         np.random.default_rng(2).integers(0, 26, N, dtype=int)
     )
     df = DataFrame(
         {
-            "ints": Series(np.random.default_rng(2).integers(0, 100, N)),
+            "ints": Series(np.random.default_rng(2).integers(0, 10, N)),
             "floats": N / 10 * Series(np.random.default_rng(2).random(N)),
             "letters": Series(random_letters),
         }
     )
 
     # Group by ints; filter on floats.
     grouped = df.groupby("ints")
-    old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 20).astype("bool")]
-    new_way = grouped.filter(lambda x: x["floats"].mean() > N / 20)
+    old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 2).astype("bool")]
+    new_way = grouped.filter(lambda x: x["floats"].mean() > N / 2)
     tm.assert_frame_equal(new_way, old_way)
 
     # Group by floats (rounded); filter on strings.
     grouper = df.floats.apply(lambda x: np.round(x, -1))
     grouped = df.groupby(grouper)
-    old_way = df[grouped.letters.transform(lambda x: len(x) < N / 10).astype("bool")]
-    new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
+    old_way = df[grouped.letters.transform(lambda x: len(x) < N / 2).astype("bool")]
+    new_way = grouped.filter(lambda x: len(x.letters) < N / 2)
     tm.assert_frame_equal(new_way, old_way)
 
     # Group by strings; filter on ints.
     grouped = df.groupby("letters")
-    old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 20).astype("bool")]
-    new_way = grouped.filter(lambda x: x["ints"].mean() > N / 20)
+    old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 2).astype("bool")]
+    new_way = grouped.filter(lambda x: x["ints"].mean() > N / 2)
     tm.assert_frame_equal(new_way, old_way)
 
 
 def test_filter_using_len():
-    # BUG GH4447
+    # GH 4447
     df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
     grouped = df.groupby("B")
     actual = grouped.filter(lambda x: len(x) > 2)
@@ -250,8 +256,10 @@ def test_filter_using_len():
     expected = df.loc[[]]
     tm.assert_frame_equal(actual, expected)
 
-    # Series have always worked properly, but we'll test anyway.
-    s = df["B"]
+
+def test_filter_using_len_series():
+    # GH 4447
+    s = Series(list("aabbbbcc"), name="B")
     grouped = s.groupby(s)
     actual = grouped.filter(lambda x: len(x) > 2)
     expected = Series(4 * ["b"], index=np.arange(2, 6, dtype=np.int64), name="B")
@@ -262,10 +270,14 @@ def test_filter_using_len():
     tm.assert_series_equal(actual, expected)
 
 
-def test_filter_maintains_ordering():
-    # Simple case: index is sequential. #4621
+@pytest.mark.parametrize(
+    "index", [range(8), range(7, -1, -1), [0, 2, 1, 3, 4, 6, 5, 7]]
+)
+def test_filter_maintains_ordering(index):
+    # GH 4621
     df = DataFrame(
-        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}
+        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
+        index=index,
     )
     s = df["pid"]
     grouped = df.groupby("tag")
@@ -278,33 +290,6 @@ def test_filter_maintains_ordering():
     expected = s.iloc[[1, 2, 4, 7]]
     tm.assert_series_equal(actual, expected)
 
-    # Now index is sequentially decreasing.
-    df.index = np.arange(len(df) - 1, -1, -1)
-    s = df["pid"]
-    grouped = df.groupby("tag")
-    actual = grouped.filter(lambda x: len(x) > 1)
-    expected = df.iloc[[1, 2, 4, 7]]
-    tm.assert_frame_equal(actual, expected)
-
-    grouped = s.groupby(df["tag"])
-    actual = grouped.filter(lambda x: len(x) > 1)
-    expected = s.iloc[[1, 2, 4, 7]]
-    tm.assert_series_equal(actual, expected)
-
-    # Index is shuffled.
-    SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3]
-    df.index = df.index[SHUFFLED]
-    s = df["pid"]
-    grouped = df.groupby("tag")
-    actual = grouped.filter(lambda x: len(x) > 1)
-    expected = df.iloc[[1, 2, 4, 7]]
-    tm.assert_frame_equal(actual, expected)
-
-    grouped = s.groupby(df["tag"])
-    actual = grouped.filter(lambda x: len(x) > 1)
-    expected = s.iloc[[1, 2, 4, 7]]
-    tm.assert_series_equal(actual, expected)
-
 
 def test_filter_multiple_timestamp():
     # GH 10114