Skip to content

Commit

Permalink
BUG: groupby.agg returns incorrect results for uint64 cols (pandas-de…
Browse files Browse the repository at this point in the history
  • Loading branch information
Matias Heikkilä committed May 16, 2019
1 parent 1263e1a commit ce22d54
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 4 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,7 @@ Groupby/Resample/Rolling
- Bug in :meth:`pandas.core.groupby.GroupBy.cumsum`, :meth:`pandas.core.groupby.GroupBy.cumprod`, :meth:`pandas.core.groupby.GroupBy.cummin` and :meth:`pandas.core.groupby.GroupBy.cummax` with categorical column having absent categories, would return incorrect result or segfault (:issue:`16771`)
- Bug in :meth:`pandas.core.groupby.GroupBy.nth` where NA values in the grouping would return incorrect results (:issue:`26011`)
- Bug in :meth:`pandas.core.groupby.SeriesGroupBy.transform` where transforming an empty group would raise error (:issue:`26208`)
- Bug in :meth:`pandas.core.groupby.GroupBy.agg` where incorrect results are returned for uint64 columns. (:issue:`26310`)


Reshaping
Expand Down
15 changes: 13 additions & 2 deletions pandas/core/dtypes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
is_named_tuple, is_nested_list_like, is_number, is_re, is_re_compilable,
is_scalar, is_sequence, is_string_like)

from pandas._typing import ArrayLike

_POSSIBLY_CAST_DTYPES = {np.dtype(t).name
for t in ['O', 'int8', 'uint8', 'int16', 'uint16',
'int32', 'uint32', 'int64', 'uint64']}
Expand Down Expand Up @@ -87,10 +89,10 @@ def ensure_categorical(arr):
return arr


def ensure_int64_or_float64(arr, copy=False):
def ensure_int_or_float(arr: ArrayLike, copy=False) -> np.array:
"""
Ensure that an dtype array of some integer dtype
has an int64 dtype if possible
has an int64 dtype if possible.
If it's not possible, potentially because of overflow,
convert the array to float64 instead.
Expand All @@ -107,9 +109,18 @@ def ensure_int64_or_float64(arr, copy=False):
out_arr : The input array cast as int64 if
possible without overflow.
Otherwise the input array cast to float64.
Notes
-----
If the array is explicitly of type uint64 the type
will remain unchanged.
"""
try:
return arr.astype('int64', copy=copy, casting='safe')
except TypeError:
pass
try:
return arr.astype('uint64', copy=copy, casting='safe')
except TypeError:
return arr.astype('float64', copy=copy)

Expand Down
4 changes: 2 additions & 2 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.common import (
ensure_float64, ensure_int64, ensure_int64_or_float64, ensure_object,
ensure_float64, ensure_int64, ensure_int_or_float, ensure_object,
ensure_platform_int, is_bool_dtype, is_categorical_dtype, is_complex_dtype,
is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype,
is_timedelta64_dtype, needs_i8_conversion)
Expand Down Expand Up @@ -486,7 +486,7 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1,
if (values == iNaT).any():
values = ensure_float64(values)
else:
values = ensure_int64_or_float64(values)
values = ensure_int_or_float(values)
elif is_numeric and not is_complex_dtype(values):
values = ensure_float64(values)
else:
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,3 +313,16 @@ def test_order_aggregate_multiple_funcs():
expected = pd.Index(['sum', 'max', 'mean', 'ohlc', 'min'])

tm.assert_index_equal(result, expected)


@pytest.mark.parametrize('dtype', [np.int64, np.uint64])
@pytest.mark.parametrize('how', ['first', 'last', 'min',
'max', 'mean', 'median'])
def test_uint64_type_handling(dtype, how):
# GH 26310
df = pd.DataFrame({'x': 6903052872240755750, 'y': [1, 2]})
expected = df.groupby('y').agg({'x': how})
df.x = df.x.astype(dtype)
result = df.groupby('y').agg({'x': how})
result.x = result.x.astype(np.int64)
tm.assert_frame_equal(result, expected, check_exact=True)

0 comments on commit ce22d54

Please sign in to comment.