Skip to content

Commit

Permalink
CLN: groupby test (#58777)
Browse files Browse the repository at this point in the history
* Clean test_cumulative

* Clean test_counting

* Clean test_filters

* Undo change
  • Loading branch information
mroeschke committed May 20, 2024
1 parent 6a7b3da commit 03d86d6
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 63 deletions.
18 changes: 10 additions & 8 deletions pandas/tests/groupby/test_counting.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,31 +321,33 @@ def test_count_object():
expected = Series([3, 3], index=Index([2, 3], name="c"), name="a")
tm.assert_series_equal(result, expected)


def test_count_object_nan():
    """Check that groupby ``count`` on an object column skips NaN entries."""
    # Rows with c == 2 hold one string and two NaNs in "a"; rows with c == 3
    # hold three strings, so the expected per-group counts are 1 and 3.
    df = DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3})
    result = df.groupby("c").a.count()
    expected = Series([1, 3], index=Index([2, 3], name="c"), name="a")
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("typ", ["object", "float32"])
def test_count_cross_type(typ):
    """Check that groupby ``count`` gives the same result across column dtypes.

    The reference result is computed while every column is float64; columns
    "a" and "b" are then cast to ``typ`` and the counts are recomputed.

    Parameters
    ----------
    typ : str
        Dtype name that columns "a" and "b" are cast to before re-counting.
    """
    # GH8169
    # Set float64 dtype to avoid upcast when setting nan below
    vals = np.hstack(
        (
            np.random.default_rng(2).integers(0, 5, (10, 2)),
            np.random.default_rng(2).integers(0, 2, (10, 2)),
        )
    ).astype("float64")

    df = DataFrame(vals, columns=["a", "b", "c", "d"])
    # Turn every 2 into a missing value so that count has something to skip.
    df[df == 2] = np.nan
    expected = df.groupby(["c", "d"]).count()

    df["a"] = df["a"].astype(typ)
    df["b"] = df["b"].astype(typ)
    result = df.groupby(["c", "d"]).count()
    tm.assert_frame_equal(result, expected)


def test_lower_int_prec_count():
Expand Down
50 changes: 38 additions & 12 deletions pandas/tests/groupby/test_cumulative.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,21 +94,28 @@ def test_groupby_cumprod_nan_influences_other_columns():

def test_cummin(dtypes_for_minmax):
    """Check groupby ``cummin`` against a per-group ``apply`` of ``cummin``.

    Parameters
    ----------
    dtypes_for_minmax : sequence
        Fixture value; only element 0 (the dtype the frame is cast to) is
        read by this test.
    """
    dtype = dtypes_for_minmax[0]

    # GH 15048
    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
    expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]

    df = base_df.astype(dtype)

    expected = DataFrame({"B": expected_mins}).astype(dtype)
    result = df.groupby("A").cummin()
    tm.assert_frame_equal(result, expected)
    # The same answer must come out of the generic apply path.
    result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
    tm.assert_frame_equal(result, expected)

# Test w/ min value for dtype

def test_cummin_min_value_for_dtype(dtypes_for_minmax):
dtype = dtypes_for_minmax[0]
min_val = dtypes_for_minmax[1]

# GH 15048
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
expected = DataFrame({"B": expected_mins}).astype(dtype)
df = base_df.astype(dtype)
df.loc[[2, 6], "B"] = min_val
df.loc[[1, 5], "B"] = min_val + 1
expected.loc[[2, 3, 6, 7], "B"] = min_val
Expand All @@ -120,8 +127,10 @@ def test_cummin(dtypes_for_minmax):
)
tm.assert_frame_equal(result, expected, check_exact=True)

# Test nan in some values

def test_cummin_nan_in_some_values(dtypes_for_minmax):
# Explicit cast to float to avoid implicit cast when setting nan
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
base_df = base_df.astype({"B": "float"})
base_df.loc[[0, 2, 4, 6], "B"] = np.nan
expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]})
Expand All @@ -132,13 +141,17 @@ def test_cummin(dtypes_for_minmax):
)
tm.assert_frame_equal(result, expected)


def test_cummin_datetime():
    """Check that groupby ``cummin`` on a single datetime row returns that value."""
    # GH 15561
    df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
    expected = Series(pd.to_datetime("2001"), index=[0], name="b")

    result = df.groupby("a")["b"].cummin()
    # Pass (result, expected) in the order used by the other tests here so a
    # failure message labels left/right consistently.
    tm.assert_series_equal(result, expected)


def test_cummin_getattr_series():
# GH 15635
df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]})
result = df.groupby("a").b.cummin()
Expand All @@ -163,7 +176,6 @@ def test_cummin_max_all_nan_column(method, dtype):

def test_cummax(dtypes_for_minmax):
dtype = dtypes_for_minmax[0]
max_val = dtypes_for_minmax[2]

# GH 15048
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
Expand All @@ -177,8 +189,18 @@ def test_cummax(dtypes_for_minmax):
result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)

# Test w/ max value for dtype

def test_cummax_min_value_for_dtype(dtypes_for_minmax):
dtype = dtypes_for_minmax[0]
max_val = dtypes_for_minmax[2]

# GH 15048
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]

df = base_df.astype(dtype)
df.loc[[2, 6], "B"] = max_val
expected = DataFrame({"B": expected_maxs}).astype(dtype)
expected.loc[[2, 3, 6, 7], "B"] = max_val
result = df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
Expand All @@ -187,8 +209,11 @@ def test_cummax(dtypes_for_minmax):
)
tm.assert_frame_equal(result, expected)


def test_cummax_nan_in_some_values(dtypes_for_minmax):
# Test nan in some values
# Explicit cast to float to avoid implicit cast when setting nan
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
base_df = base_df.astype({"B": "float"})
base_df.loc[[0, 2, 4, 6], "B"] = np.nan
expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
Expand All @@ -199,13 +224,17 @@ def test_cummax(dtypes_for_minmax):
)
tm.assert_frame_equal(result, expected)


def test_cummax_datetime():
    """Check that groupby ``cummax`` on a single datetime row returns that value."""
    # GH 15561
    df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
    expected = Series(pd.to_datetime("2001"), index=[0], name="b")

    result = df.groupby("a")["b"].cummax()
    # Pass (result, expected) in the order used by the other tests here so a
    # failure message labels left/right consistently.
    tm.assert_series_equal(result, expected)


def test_cummax_getattr_series():
# GH 15635
df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]})
result = df.groupby("a").b.cummax()
Expand Down Expand Up @@ -292,15 +321,12 @@ def test_nullable_int_not_cast_as_float(method, dtype, val):
tm.assert_frame_equal(result, expected)


def test_cython_api2(as_index):
    """Check that groupby ``cumsum`` gives the same frame for any ``as_index``.

    Parameters
    ----------
    as_index : object
        Value forwarded to ``groupby(..., as_index=...)``; presumably a
        boolean fixture supplied by the test suite's conftest -- confirm.
    """
    # this takes the fast apply path

    # cumsum (GH5614)
    # GH 5755 - cumsum is a transformer and should ignore as_index
    df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
    expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"])
    result = df.groupby("A", as_index=as_index).cumsum()
    tm.assert_frame_equal(result, expected)
71 changes: 28 additions & 43 deletions pandas/tests/groupby/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ def test_filter_out_no_groups():
grouped = s.groupby(grouper)
filtered = grouped.filter(lambda x: x.mean() > 0)
tm.assert_series_equal(filtered, s)


def test_filter_out_no_groups_dataframe():
df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
grouper = df["A"].apply(lambda x: x % 2)
grouped = df.groupby(grouper)
Expand All @@ -100,6 +103,9 @@ def test_filter_out_all_groups_in_df():
expected = DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3})
tm.assert_frame_equal(expected, res)


def test_filter_out_all_groups_in_df_dropna_true():
# GH12768
df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
res = df.groupby("a")
res = res.filter(lambda x: x["b"].sum() > 5, dropna=True)
Expand Down Expand Up @@ -179,7 +185,7 @@ def test_filter_pdna_is_false():

def test_filter_against_workaround_ints():
# Series of ints
s = Series(np.random.default_rng(2).integers(0, 100, 100))
s = Series(np.random.default_rng(2).integers(0, 100, 10))
grouper = s.apply(lambda x: np.round(x, -1))
grouped = s.groupby(grouper)
f = lambda x: x.mean() > 10
Expand All @@ -191,7 +197,7 @@ def test_filter_against_workaround_ints():

def test_filter_against_workaround_floats():
# Series of floats
s = 100 * Series(np.random.default_rng(2).random(100))
s = 100 * Series(np.random.default_rng(2).random(10))
grouper = s.apply(lambda x: np.round(x, -1))
grouped = s.groupby(grouper)
f = lambda x: x.mean() > 10
Expand All @@ -203,40 +209,40 @@ def test_filter_against_workaround_floats():
def test_filter_against_workaround_dataframe():
    """Check ``GroupBy.filter`` against the boolean-``transform`` workaround.

    For three different groupings the frame selected through
    ``df[grouped.<col>.transform(predicate).astype("bool")]`` must equal the
    frame returned by ``grouped.filter(predicate)``.
    """
    # Set up DataFrame of ints, floats, strings.
    letters = np.array(list(ascii_lowercase))
    N = 10
    random_letters = letters.take(
        np.random.default_rng(2).integers(0, 26, N, dtype=int)
    )
    df = DataFrame(
        {
            "ints": Series(np.random.default_rng(2).integers(0, 10, N)),
            "floats": N / 10 * Series(np.random.default_rng(2).random(N)),
            "letters": Series(random_letters),
        }
    )

    # NOTE(review): with N = 10 the "floats" column is 1.0 * random(N), so the
    # first two predicates below look like they reject every group and compare
    # two empty frames -- confirm this reduced coverage is intended.

    # Group by ints; filter on floats.
    grouped = df.groupby("ints")
    old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 2).astype("bool")]
    new_way = grouped.filter(lambda x: x["floats"].mean() > N / 2)
    tm.assert_frame_equal(new_way, old_way)

    # Group by floats (rounded); filter on strings.
    grouper = df.floats.apply(lambda x: np.round(x, -1))
    grouped = df.groupby(grouper)
    old_way = df[grouped.letters.transform(lambda x: len(x) < N / 2).astype("bool")]
    new_way = grouped.filter(lambda x: len(x.letters) < N / 2)
    tm.assert_frame_equal(new_way, old_way)

    # Group by strings; filter on ints.
    grouped = df.groupby("letters")
    old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 2).astype("bool")]
    new_way = grouped.filter(lambda x: x["ints"].mean() > N / 2)
    tm.assert_frame_equal(new_way, old_way)


def test_filter_using_len():
# BUG GH4447
# GH 4447
df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
grouped = df.groupby("B")
actual = grouped.filter(lambda x: len(x) > 2)
Expand All @@ -250,8 +256,10 @@ def test_filter_using_len():
expected = df.loc[[]]
tm.assert_frame_equal(actual, expected)

# Series have always worked properly, but we'll test anyway.
s = df["B"]

def test_filter_using_len_series():
# GH 4447
s = Series(list("aabbbbcc"), name="B")
grouped = s.groupby(s)
actual = grouped.filter(lambda x: len(x) > 2)
expected = Series(4 * ["b"], index=np.arange(2, 6, dtype=np.int64), name="B")
Expand All @@ -262,10 +270,14 @@ def test_filter_using_len():
tm.assert_series_equal(actual, expected)


def test_filter_maintains_ordering():
# Simple case: index is sequential. #4621
@pytest.mark.parametrize(
"index", [range(8), range(7, -1, -1), [0, 2, 1, 3, 4, 6, 5, 7]]
)
def test_filter_maintains_ordering(index):
# GH 4621
df = DataFrame(
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
index=index,
)
s = df["pid"]
grouped = df.groupby("tag")
Expand All @@ -278,33 +290,6 @@ def test_filter_maintains_ordering():
expected = s.iloc[[1, 2, 4, 7]]
tm.assert_series_equal(actual, expected)

# Now index is sequentially decreasing.
df.index = np.arange(len(df) - 1, -1, -1)
s = df["pid"]
grouped = df.groupby("tag")
actual = grouped.filter(lambda x: len(x) > 1)
expected = df.iloc[[1, 2, 4, 7]]
tm.assert_frame_equal(actual, expected)

grouped = s.groupby(df["tag"])
actual = grouped.filter(lambda x: len(x) > 1)
expected = s.iloc[[1, 2, 4, 7]]
tm.assert_series_equal(actual, expected)

# Index is shuffled.
SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3]
df.index = df.index[SHUFFLED]
s = df["pid"]
grouped = df.groupby("tag")
actual = grouped.filter(lambda x: len(x) > 1)
expected = df.iloc[[1, 2, 4, 7]]
tm.assert_frame_equal(actual, expected)

grouped = s.groupby(df["tag"])
actual = grouped.filter(lambda x: len(x) > 1)
expected = s.iloc[[1, 2, 4, 7]]
tm.assert_series_equal(actual, expected)


def test_filter_multiple_timestamp():
# GH 10114
Expand Down

0 comments on commit 03d86d6

Please sign in to comment.