diff --git a/bench/bench_groupby.py b/bench/bench_groupby.py
index a86e8ed623ef7..d7a2853e1e7b2 100644
--- a/bench/bench_groupby.py
+++ b/bench/bench_groupby.py
@@ -47,7 +47,8 @@ def g():
 
 from pandas.core.groupby import get_group_index
 
-group_index = get_group_index(label_list, shape).astype('i4')
+group_index = get_group_index(label_list, shape,
+                              sort=True, xnull=True).astype('i4')
 
 ngroups = np.prod(shape)
 
diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt
index f7764fe173ec0..9485ef18dbd6f 100644
--- a/doc/source/whatsnew/v0.16.0.txt
+++ b/doc/source/whatsnew/v0.16.0.txt
@@ -156,6 +156,7 @@ Bug Fixes
 - Bug in ``pivot`` and ``unstack`` where ``nan`` values would break index alignment (:issue:`4862`, :issue:`7401`, :issue:`7403`, :issue:`7405`, :issue:`7466`)
 - Bug in left ``join`` on multi-index with ``sort=True`` or null values (:issue:`9210`).
 - Bug in ``MultiIndex`` where inserting new keys would fail (:issue:`9250`).
+- Bug in ``groupby`` when key space exceeds ``int64`` bounds (:issue:`9096`).
 - Fixed character encoding bug in ``read_stata`` and ``StataReader`` when loading data from a URL (:issue:`9231`).
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index cb5dedc887bca..30ca6185ddccb 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -1367,30 +1367,16 @@ def group_info(self):
 
     def _get_compressed_labels(self):
         all_labels = [ping.labels for ping in self.groupings]
-        if self._overflow_possible:
-            tups = lib.fast_zip(all_labels)
-            labs, uniques = algos.factorize(tups)
+        if len(all_labels) > 1:
+            group_index = get_group_index(all_labels, self.shape,
+                                          sort=True, xnull=True)
+            return _compress_group_index(group_index)
 
-            if self.sort:
-                uniques, labs = _reorder_by_uniques(uniques, labs)
+        ping = self.groupings[0]
+        self.compressed = False
+        self._filter_empty_groups = False
 
-            return labs, uniques
-        else:
-            if len(all_labels) > 1:
-                group_index = get_group_index(all_labels, self.shape)
-                comp_ids, obs_group_ids = _compress_group_index(group_index)
-            else:
-                ping = self.groupings[0]
-                comp_ids = ping.labels
-                obs_group_ids = np.arange(len(ping.group_index))
-                self.compressed = False
-                self._filter_empty_groups = False
-
-            return comp_ids, obs_group_ids
-
-    @cache_readonly
-    def _overflow_possible(self):
-        return _int64_overflow_possible(self.shape)
+        return ping.labels, np.arange(len(ping.group_index))
 
     @cache_readonly
     def ngroups(self):
@@ -1402,15 +1388,13 @@ def result_index(self):
         return MultiIndex.from_arrays(recons, names=self.names)
 
     def get_group_levels(self):
-        obs_ids = self.group_info[1]
+        comp_ids, obs_ids, _ = self.group_info
 
         if not self.compressed and len(self.groupings) == 1:
             return [self.groupings[0].group_index]
 
-        if self._overflow_possible:
-            recons_labels = [np.array(x) for x in zip(*obs_ids)]
-        else:
-            recons_labels = decons_group_index(obs_ids, self.shape)
+        recons_labels = decons_obs_group_ids(comp_ids, obs_ids,
+                self.shape, (ping.labels for ping in self.groupings))
 
         name_list = []
         for ping, labels in zip(self.groupings, recons_labels):
@@ -3490,32 +3474,16 @@ def get_splitter(data, *args, **kwargs):
 # Misc utilities
 
-def get_group_index(label_list, shape):
+def get_group_index(labels, shape, sort, xnull):
     """
     For the particular label_list, gets the offsets into the hypothetical list
     representing the totally ordered cartesian product of all possible label
-    combinations.
-    """
-    if len(label_list) == 1:
-        return label_list[0]
-
-    n = len(label_list[0])
-    group_index = np.zeros(n, dtype=np.int64)
-    mask = np.zeros(n, dtype=bool)
-    for i in range(len(shape)):
-        stride = np.prod([x for x in shape[i + 1:]], dtype=np.int64)
-        group_index += com._ensure_int64(label_list[i]) * stride
-        mask |= label_list[i] < 0
-
-    np.putmask(group_index, mask, -1)
-    return group_index
-
-
-def get_flat_ids(labels, shape, retain_lex_rank):
-    """
-    Given a list of labels at each level, returns a flat array of int64 ids
-    corresponding to unique tuples across the labels. If `retain_lex_rank`,
-    rank of returned ids preserve lexical ranks of labels.
+    combinations, *as long as* this space fits within int64 bounds;
+    otherwise, though group indices identify unique combinations of
+    labels, they cannot be deconstructed.
+    - If `sort`, the ranks of the returned ids preserve the lexical
+      ranks of the labels, i.e. the returned ids can be used to sort
+      the labels lexically;
+    - If `xnull`, nulls (-1 labels) are passed through.
 
     Parameters
     ----------
@@ -3523,9 +3491,11 @@
         Integers identifying levels at each location
     shape: sequence of ints same length as labels
        Number of unique levels at each location
-    retain_lex_rank: boolean
+    sort: boolean
        If the ranks of returned ids should match lexical ranks of labels
-
+    xnull: boolean
+        If true, nulls are excluded, i.e. -1 values in the labels are
+        passed through
 
     Returns
     -------
    An array of type int64 where two elements are equal if their corresponding
    labels are equal at all locations.
@@ -3544,12 +3514,18 @@ def loop(labels, shape):
             stride //= shape[i]
             out += labels[i] * stride
 
+        if xnull:  # exclude nulls
+            mask = labels[0] == -1
+            for lab in labels[1:nlev]:
+                mask |= lab == -1
+            out[mask] = -1
+
         if nlev == len(shape):  # all levels done!
             return out
 
         # compress what has been done so far in order to avoid overflow
         # to retain lexical ranks, obs_ids should be sorted
-        comp_ids, obs_ids = _compress_group_index(out, sort=retain_lex_rank)
+        comp_ids, obs_ids = _compress_group_index(out, sort=sort)
 
         labels = [comp_ids] + labels[nlev:]
         shape = [len(obs_ids)] + shape[nlev:]
 
@@ -3560,9 +3536,10 @@ def maybe_lift(lab, size):  # promote nan values
         return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
 
     labels = map(com._ensure_int64, labels)
-    labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
+    if not xnull:
+        labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
 
-    return loop(labels, shape)
+    return loop(list(labels), list(shape))
 
 
 _INT64_MAX = np.iinfo(np.int64).max
@@ -3578,6 +3555,11 @@ def _int64_overflow_possible(shape):
 
 def decons_group_index(comp_labels, shape):
     # reconstruct labels
+    if _int64_overflow_possible(shape):
+        # at some point group indices are factorized,
+        # and may not be deconstructed here! wrong path!
+        raise ValueError('cannot deconstruct factorized group indices!')
+
     label_list = []
     factor = 1
     y = 0
@@ -3591,12 +3573,25 @@ def decons_group_index(comp_labels, shape):
     return label_list[::-1]
 
 
+def decons_obs_group_ids(comp_ids, obs_ids, shape, labels):
+    """reconstruct labels from observed ids"""
+    from pandas.hashtable import unique_label_indices
+
+    if not _int64_overflow_possible(shape):
+        # obs ids are deconstructable! take the fast route!
+        return decons_group_index(obs_ids, shape)
+
+    i = unique_label_indices(comp_ids)
+    i8copy = lambda a: a.astype('i8', subok=False, copy=True)
+    return [i8copy(lab[i]) for lab in labels]
+
+
 def _indexer_from_factorized(labels, shape, compress=True):
     if _int64_overflow_possible(shape):
         indexer = np.lexsort(np.array(labels[::-1]))
         return indexer
 
-    group_index = get_group_index(labels, shape)
+    group_index = get_group_index(labels, shape, sort=True, xnull=True)
 
     if compress:
         comp_ids, obs_ids = _compress_group_index(group_index)
@@ -3712,9 +3707,12 @@ def get_key(self, comp_id):
 
 def _get_indices_dict(label_list, keys):
     shape = list(map(len, keys))
-    ngroups = np.prod(shape)
 
-    group_index = get_group_index(label_list, shape)
+    group_index = get_group_index(label_list, shape, sort=True, xnull=True)
+    ngroups = ((group_index.size and group_index.max()) + 1) \
+        if _int64_overflow_possible(shape) \
+        else np.prod(shape, dtype='i8')
+
     sorter = _get_group_index_sorter(group_index, ngroups)
 
     sorted_labels = [lab.take(sorter) for lab in label_list]
diff --git a/pandas/core/index.py b/pandas/core/index.py
index 1b4a691851a8a..63d8554cbaa03 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -3229,11 +3229,11 @@ def is_unique(self):
 
     @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
     def duplicated(self, take_last=False):
-        from pandas.core.groupby import get_flat_ids
+        from pandas.core.groupby import get_group_index
         from pandas.hashtable import duplicated_int64
 
         shape = map(len, self.levels)
-        ids = get_flat_ids(self.labels, shape, False)
+        ids = get_group_index(self.labels, shape, sort=False, xnull=False)
 
         return duplicated_int64(ids, take_last)
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
index 18dab471e3de2..ba227f4e3d3d1 100644
--- a/pandas/core/reshape.py
+++ b/pandas/core/reshape.py
@@ -12,8 +12,8 @@
 from pandas.core.categorical import Categorical
 from pandas.core.common import (notnull, _ensure_platform_int,
                                 _maybe_promote, isnull)
-from pandas.core.groupby import (get_group_index, _compress_group_index,
-                                 decons_group_index)
+from pandas.core.groupby import get_group_index, _compress_group_index
+
 import pandas.core.common as com
 import pandas.algos as algos
@@ -103,10 +103,6 @@ def _make_sorted_values_labels(self):
         sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]
 
         comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
-
-        # group_index = get_group_index(to_sort, sizes)
-        # comp_index, obs_ids = _compress_group_index(group_index)
-
         ngroups = len(obs_ids)
 
         indexer = algos.groupsort_indexer(comp_index, ngroups)[0]
@@ -252,6 +248,8 @@ def _make_new_index(lev, lab):
 
 
 def _unstack_multiple(data, clocs):
+    from pandas.core.groupby import decons_obs_group_ids
+
     if len(clocs) == 0:
         return data
@@ -271,10 +269,10 @@ def _unstack_multiple(data, clocs):
     rnames = [index.names[i] for i in rlocs]
 
     shape = [len(x) for x in clevels]
-    group_index = get_group_index(clabels, shape)
+    group_index = get_group_index(clabels, shape, sort=False, xnull=False)
 
     comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
-    recons_labels = decons_group_index(obs_ids, shape)
+    recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels)
 
     dummy_index = MultiIndex(levels=rlevels + [obs_ids],
                              labels=rlabels + [comp_ids],
@@ -449,9 +447,9 @@ def _unstack_frame(obj, level):
 
 
 def get_compressed_ids(labels, sizes):
-    from pandas.core.groupby import get_flat_ids
+    from pandas.core.groupby import get_group_index
 
-    ids = get_flat_ids(labels, sizes, True)
+    ids = get_group_index(labels, sizes, sort=True, xnull=False)
 
     return _compress_group_index(ids, sort=True)
diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
index 26fba1a4b9615..8bdcfb44242ff 100644
--- a/pandas/hashtable.pyx
+++ b/pandas/hashtable.pyx
@@ -17,6 +17,7 @@ cnp.import_array()
 cnp.import_ufunc()
 
 cdef int64_t iNaT = util.get_nat()
+_SIZE_HINT_LIMIT = (1 << 20) + 7
 
 cdef extern from "datetime.h":
     bint PyDateTime_Check(object o)
@@ -1073,7 +1074,7 @@ def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last):
         kh_int64_t * table = kh_init_int64()
         ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
 
-    kh_resize_int64(table, min(1 << 20, n))
+    kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT))
 
     if take_last:
         for i from n > i >= 0:
@@ -1086,3 +1087,33 @@ def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last):
     kh_destroy_int64(table)
 
     return out
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def unique_label_indices(ndarray[int64_t, ndim=1] labels):
+    """
+    Indices of the first occurrences of the unique labels,
+    *excluding* -1. Equivalent to:
+        np.unique(labels, return_index=True)[1]
+    """
+    cdef:
+        int ret = 0
+        Py_ssize_t i, n = len(labels)
+        kh_int64_t * table = kh_init_int64()
+        Int64Vector idx = Int64Vector()
+        ndarray[int64_t, ndim=1] arr
+
+    kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT))
+
+    for i in range(n):
+        kh_put_int64(table, labels[i], &ret)
+        if ret != 0:
+            idx.append(i)
+
+    kh_destroy_int64(table)
+
+    arr = idx.to_array()
+    arr = arr[labels[arr].argsort()]
+
+    return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index b145400afe13b..b87e64c0d575c 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -261,6 +261,21 @@ def test_quantile():
     expected = algos.quantile(s.values, [0, .25, .5, .75, 1.])
     tm.assert_almost_equal(result, expected)
 
+def test_unique_label_indices():
+    from pandas.hashtable import unique_label_indices
+
+    a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8')
+
+    left = unique_label_indices(a)
+    right = np.unique(a, return_index=True)[1]
+
+    tm.assert_array_equal(left, right)
+
+    a[np.random.choice(len(a), 10)] = -1
+    left = unique_label_indices(a)
+    right = np.unique(a, return_index=True)[1][1:]
+    tm.assert_array_equal(left, right)
+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 4077f468d8b1f..d1ab33e607f4d 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -2880,6 +2880,8 @@ def test_int32_overflow(self):
         self.assertEqual(len(left), len(right))
 
     def test_int64_overflow(self):
+        from pandas.core.groupby import _int64_overflow_possible
+
         B = np.concatenate((np.arange(1000), np.arange(1000),
                             np.arange(500)))
         A = np.arange(2500)
@@ -2911,6 +2913,45 @@ def test_int64_overflow(self):
             self.assertEqual(left[k], v)
         self.assertEqual(len(left), len(right))
 
+        # GH9096
+        values = range(55109)
+        data = pd.DataFrame.from_dict({'a': values, 'b': values,
+                                       'c': values, 'd': values})
+        grouped = data.groupby(['a', 'b', 'c', 'd'])
+        self.assertEqual(len(grouped), len(values))
+
+        arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5))
+        i = np.random.choice(len(arr), len(arr) * 4)
+        arr = np.vstack((arr, arr[i]))  # add some duplicate rows
+
+        i = np.random.permutation(len(arr))
+        arr = arr[i]  # shuffle rows
+
+        df = DataFrame(arr, columns=list('abcde'))
+        df['jim'], df['joe'] = np.random.randn(2, len(df)) * 10
+        gr = df.groupby(list('abcde'))
+
+        # verify this is testing what it is supposed to test!
+        self.assertTrue(_int64_overflow_possible(gr.grouper.shape))
+
+        # manually compute groupings
+        jim, joe = defaultdict(list), defaultdict(list)
+        for key, a, b in zip(map(tuple, arr), df['jim'], df['joe']):
+            jim[key].append(a)
+            joe[key].append(b)
+
+        self.assertEqual(len(gr), len(jim))
+        mi = MultiIndex.from_tuples(jim.keys(), names=list('abcde'))
+
+        def aggr(func):
+            f = lambda a: np.fromiter(map(func, a), dtype='f8')
+            arr = np.vstack((f(jim.values()), f(joe.values()))).T
+            res = DataFrame(arr, columns=['jim', 'joe'], index=mi)
+            return res.sort_index()
+
+        assert_frame_equal(gr.mean(), aggr(np.mean))
+        assert_frame_equal(gr.median(), aggr(np.median))
+
     def test_groupby_sort_multi(self):
         df = DataFrame({'a': ['foo', 'bar', 'baz'],
                         'b': [3, 2, 1],
@@ -4942,7 +4983,7 @@ def test_decons():
     from pandas.core.groupby import decons_group_index, get_group_index
 
     def testit(label_list, shape):
-        group_index = get_group_index(label_list, shape)
+        group_index = get_group_index(label_list, shape, sort=True, xnull=True)
         label_list2 = decons_group_index(group_index, shape)
 
         for a, b in zip(label_list, label_list2):
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
index 27a442ece1281..fd18c81a7d00d 100644
--- a/vb_suite/groupby.py
+++ b/vb_suite/groupby.py
@@ -485,6 +485,22 @@ def f(g):
 groupby_agg_builtins1 = Benchmark("df.groupby('jim').agg([sum, min, max])", setup)
 groupby_agg_builtins2 = Benchmark("df.groupby(['jim', 'joe']).agg([sum, min, max])", setup)
 
+
+setup = common_setup + '''
+arr = np.random.randint(-1 << 12, 1 << 12, (1 << 17, 5))
+i = np.random.choice(len(arr), len(arr) * 5)
+arr = np.vstack((arr, arr[i]))  # add some duplicate rows
+
+i = np.random.permutation(len(arr))
+arr = arr[i]  # shuffle rows
+
+df = DataFrame(arr, columns=list('abcde'))
+df['jim'], df['joe'] = np.random.randn(2, len(df)) * 10
+'''
+
+groupby_int64_overflow = Benchmark("df.groupby(list('abcde')).max()", setup,
+                                   name='groupby_int64_overflow')
+
 #----------------------------------------------------------------------
 # groupby with a variable value for ngroups
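
Reviewer notes. The three sketches below are editorial illustrations of the patch's mechanics; they are standalone Python, not part of the diff, and every name they introduce (int64_overflow_possible, group_index_sketch, decons_obs_group_ids_sketch) is hypothetical.

Why the GH9096 test uses 55109: four grouping columns with 55109 distinct values each span a key space of 55109 ** 4, roughly 9.2234e18, just over the int64 maximum of 2 ** 63 - 1; 55109 is the smallest integer whose fourth power crosses 2 ** 63. The diff shows _int64_overflow_possible only as context, so the predicate below is an assumption inferred from how the patch uses it: a running product of the level sizes, taken in unbounded Python ints, compared against the int64 maximum.

import numpy as np

_INT64_MAX = np.iinfo(np.int64).max

def int64_overflow_possible(shape):
    # assumed stand-in for pandas' _int64_overflow_possible: the flat
    # cartesian id space has prod(shape) cells; once that product no
    # longer fits in int64, stride arithmetic on raw labels can overflow
    the_prod = 1
    for s in shape:
        the_prod *= int(s)  # Python ints cannot overflow here
    return the_prod >= _INT64_MAX

print(int64_overflow_possible([55109] * 4))  # True  -> hash-based slow path
print(int64_overflow_possible([55108] * 4))  # False -> stride arithmetic is safe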
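
The heart of the patch is the loop inside get_group_index: build flat ids for as many levels as fit in int64, compress those partial ids down to a dense range, then continue with the remaining levels. Below is a minimal sketch of that compress-as-you-go idea; it compresses after every level for brevity (the patched loop batches levels and calls _compress_group_index), uses np.unique with its sorted uniques to stand in for the sort=True behavior, and assumes the labels contain no -1, i.e. nulls were already lifted or excluded.

import numpy as np

def group_index_sketch(labels, shape):
    # flat id after level k is id_k = id_{k-1} * shape[k] + labels[k];
    # re-factorizing after each level keeps the id space at most n wide,
    # so the next multiplication stays well inside int64
    out = np.asarray(labels[0], dtype='i8')
    for lab, size in zip(labels[1:], shape[1:]):
        out = out * size + lab                        # one more level of the product
        _, out = np.unique(out, return_inverse=True)  # dense ids, lexical rank kept
    return out

a = np.array([0, 0, 1, 1, 2])
b = np.array([1, 0, 1, 1, 0])
print(group_index_sketch([a, b], [3, 2]))  # [1 0 2 2 3]: ranks of the (a, b) tuples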
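
On the overflow path the observed ids have been factorized, so the stride division in decons_group_index can no longer recover per-level labels; that is exactly the ValueError the patch adds. Instead, decons_obs_group_ids picks one representative row per observed group via unique_label_indices and reads the labels off those rows. A sketch of that idea, using np.unique(..., return_index=True) in place of unique_label_indices, an equivalence the new test_unique_label_indices asserts for label arrays without -1:

import numpy as np

def decons_obs_group_ids_sketch(comp_ids, labels):
    # comp_ids[r] is the compressed group id of row r; the first row
    # where each id occurs is a representative of that group, so its
    # labels are the group's labels
    _, first_rows = np.unique(comp_ids, return_index=True)
    return [np.asarray(lab)[first_rows] for lab in labels]

comp_ids = np.array([0, 0, 1, 1, 2])  # rows 0-1: group 0, 2-3: group 1, 4: group 2
a = np.array([3, 3, 5, 5, 9])
b = np.array([0, 0, 1, 1, 0])
print(decons_obs_group_ids_sketch(comp_ids, [a, b]))
# [array([3, 5, 9]), array([0, 1, 0])]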