Skip to content

Commit

Permalink
CLN: Unify signatures in _libs.groupby (#34372)
Browse files Browse the repository at this point in the history
  • Loading branch information
rhshadrach committed Jun 18, 2020
1 parent 3c959fc commit c9144ca
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 20 deletions.
11 changes: 6 additions & 5 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -378,8 +378,8 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
@cython.boundscheck(False)
@cython.wraparound(False)
def group_any_all(uint8_t[:] out,
const int64_t[:] labels,
const uint8_t[:] values,
const int64_t[:] labels,
const uint8_t[:] mask,
object val_test,
bint skipna):
Expand Down Expand Up @@ -560,7 +560,8 @@ def _group_var(floating[:, :] out,
int64_t[:] counts,
floating[:, :] values,
const int64_t[:] labels,
Py_ssize_t min_count=-1):
Py_ssize_t min_count=-1,
int64_t ddof=1):
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
floating val, ct, oldmean
Expand Down Expand Up @@ -600,10 +601,10 @@ def _group_var(floating[:, :] out,
for i in range(ncounts):
for j in range(K):
ct = nobs[i, j]
if ct < 2:
if ct <= ddof:
out[i, j] = NAN
else:
out[i, j] /= (ct - 1)
out[i, j] /= (ct - ddof)


group_var_float32 = _group_var['float']
Expand Down Expand Up @@ -715,8 +716,8 @@ group_ohlc_float64 = _group_ohlc['double']
@cython.boundscheck(False)
@cython.wraparound(False)
def group_quantile(ndarray[float64_t] out,
ndarray[int64_t] labels,
numeric[:] values,
ndarray[int64_t] labels,
ndarray[uint8_t] mask,
float64_t q,
object interpolation):
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1736,7 +1736,8 @@ def _wrap_aggregated_output(
DataFrame
"""
indexed_output = {key.position: val for key, val in output.items()}
columns = Index(key.label for key in output)
name = self._obj_with_exclusions._get_axis(1 - self.axis).name
columns = Index([key.label for key in output], name=name)

result = self.obj._constructor(indexed_output)
result.columns = columns
Expand Down
64 changes: 50 additions & 14 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1277,6 +1277,7 @@ def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray:
return self._get_cythonized_result(
"group_any_all",
aggregate=True,
numeric_only=False,
cython_dtype=np.dtype(np.uint8),
needs_values=True,
needs_mask=True,
Expand Down Expand Up @@ -1433,18 +1434,16 @@ def std(self, ddof: int = 1):
Series or DataFrame
Standard deviation of values within each group.
"""
result = self.var(ddof=ddof)
if result.ndim == 1:
result = np.sqrt(result)
else:
cols = result.columns.get_indexer_for(
result.columns.difference(self.exclusions).unique()
)
# TODO(GH-22046) - setting with iloc broken if labels are not unique
# .values to remove labels
result.iloc[:, cols] = np.sqrt(result.iloc[:, cols]).values

return result
return self._get_cythonized_result(
"group_var_float64",
aggregate=True,
needs_counts=True,
needs_values=True,
needs_2d=True,
cython_dtype=np.dtype(np.float64),
post_processing=lambda vals, inference: np.sqrt(vals),
ddof=ddof,
)

@Substitution(name="groupby")
@Appender(_common_see_also)
Expand Down Expand Up @@ -1778,6 +1777,7 @@ def _fill(self, direction, limit=None):

return self._get_cythonized_result(
"group_fillna_indexer",
numeric_only=False,
needs_mask=True,
cython_dtype=np.dtype(np.int64),
result_is_index=True,
Expand Down Expand Up @@ -2078,6 +2078,7 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray:
return self._get_cythonized_result(
"group_quantile",
aggregate=True,
numeric_only=False,
needs_values=True,
needs_mask=True,
cython_dtype=np.dtype(np.float64),
Expand Down Expand Up @@ -2367,7 +2368,11 @@ def _get_cythonized_result(
how: str,
cython_dtype: np.dtype,
aggregate: bool = False,
numeric_only: bool = True,
needs_counts: bool = False,
needs_values: bool = False,
needs_2d: bool = False,
min_count: Optional[int] = None,
needs_mask: bool = False,
needs_ngroups: bool = False,
result_is_index: bool = False,
Expand All @@ -2386,9 +2391,18 @@ def _get_cythonized_result(
aggregate : bool, default False
Whether the result should be aggregated to match the number of
groups
numeric_only : bool, default True
Whether only numeric datatypes should be computed
needs_counts : bool, default False
Whether the counts should be a part of the Cython call
needs_values : bool, default False
Whether the values should be a part of the Cython call
signature
needs_2d : bool, default False
Whether the values and result of the Cython call signature
are at least 2-dimensional.
min_count : int, default None
When not None, min_count for the Cython call
needs_mask : bool, default False
Whether boolean mask needs to be part of the Cython call
signature
Expand Down Expand Up @@ -2418,7 +2432,7 @@ def _get_cythonized_result(
if result_is_index and aggregate:
raise ValueError("'result_is_index' and 'aggregate' cannot both be True!")
if post_processing:
if not callable(pre_processing):
if not callable(post_processing):
raise ValueError("'post_processing' must be a callable!")
if pre_processing:
if not callable(pre_processing):
Expand All @@ -2438,21 +2452,39 @@ def _get_cythonized_result(
name = obj.name
values = obj._values

if numeric_only and not is_numeric_dtype(values):
continue

if aggregate:
result_sz = ngroups
else:
result_sz = len(values)

result = np.zeros(result_sz, dtype=cython_dtype)
func = partial(base_func, result, labels)
if needs_2d:
result = result.reshape((-1, 1))
func = partial(base_func, result)

inferences = None

if needs_counts:
counts = np.zeros(self.ngroups, dtype=np.int64)
func = partial(func, counts)

if needs_values:
vals = values
if pre_processing:
vals, inferences = pre_processing(vals)
if needs_2d:
vals = vals.reshape((-1, 1))
vals = vals.astype(cython_dtype, copy=False)
func = partial(func, vals)

func = partial(func, labels)

if min_count is not None:
func = partial(func, min_count)

if needs_mask:
mask = isna(values).view(np.uint8)
func = partial(func, mask)
Expand All @@ -2462,6 +2494,9 @@ def _get_cythonized_result(

func(**kwargs) # Call func to modify indexer values in place

if needs_2d:
result = result.reshape(-1)

if result_is_index:
result = algorithms.take_nd(values, result)

Expand Down Expand Up @@ -2512,6 +2547,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):

return self._get_cythonized_result(
"group_shift_indexer",
numeric_only=False,
cython_dtype=np.dtype(np.int64),
needs_ngroups=True,
result_is_index=True,
Expand Down

0 comments on commit c9144ca

Please sign in to comment.