Skip to content

Commit

Permalink
BUG, TST: Check uint64 behaviour in algorithms.py
Browse files Browse the repository at this point in the history
First of a series of PRs to patch and test `uint64` behaviour in
`core/algorithms.py`. In this PR, the following functions are checked:

1. `duplicated()`: robust, but now has a test to confirm.
2. `mode()`: robust, but now has a test to confirm.
3. `unique()`: not robust, but patched and tested.

Author: gfyoung <gfyoung17@gmail.com>

Closes #14934 from gfyoung/core-algorithms-uint64 and squashes the following commits:

6d31057 [gfyoung] DOC, TST, BUG: Improve uint64 core/algos behavior
  • Loading branch information
gfyoung authored and jreback committed Dec 23, 2016
1 parent 74de478 commit 5710947
Show file tree
Hide file tree
Showing 9 changed files with 457 additions and 237 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -298,5 +298,6 @@ Bug Fixes


- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)
6 changes: 3 additions & 3 deletions pandas/api/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,10 +153,10 @@ class TestTypes(Base, tm.TestCase):
'is_floating_dtype', 'is_int64_dtype', 'is_integer',
'is_integer_dtype', 'is_number', 'is_numeric_dtype',
'is_object_dtype', 'is_scalar', 'is_sparse',
'is_string_dtype',
'is_string_dtype', 'is_signed_integer_dtype',
'is_timedelta64_dtype', 'is_timedelta64_ns_dtype',
'is_period', 'is_period_dtype',
'is_re', 'is_re_compilable',
'is_unsigned_integer_dtype', 'is_period',
'is_period_dtype', 'is_re', 'is_re_compilable',
'is_dict_like', 'is_iterator',
'is_list_like', 'is_hashable',
'is_named_tuple', 'is_sequence',
Expand Down
50 changes: 36 additions & 14 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
from pandas import compat, lib, tslib, _np_version_under1p8
from pandas.types.cast import _maybe_promote
from pandas.types.generic import ABCSeries, ABCIndex
from pandas.types.common import (is_integer_dtype,
from pandas.types.common import (is_unsigned_integer_dtype,
is_signed_integer_dtype,
is_integer_dtype,
is_int64_dtype,
is_categorical_dtype,
is_extension_type,
Expand Down Expand Up @@ -479,8 +481,9 @@ def _value_counts_arraylike(values, dropna=True):
keys, counts = htable.value_count_float64(values, dropna)
else:
values = _ensure_object(values)
keys, counts = htable.value_count_object(values, dropna)

mask = isnull(values)
keys, counts = htable.value_count_object(values, mask)
if not dropna and mask.any():
keys = np.insert(keys, 0, np.NaN)
counts = np.insert(counts, 0, mask.sum())
Expand All @@ -490,12 +493,14 @@ def _value_counts_arraylike(values, dropna=True):

def duplicated(values, keep='first'):
"""
Return boolean ndarray denoting duplicate values
Return boolean ndarray denoting duplicate values.
.. versionadded:: 0.19.0
Parameters
----------
values : ndarray-like
Array over which to check for duplicate values.
keep : {'first', 'last', False}, default 'first'
- ``first`` : Mark duplicates as ``True`` except for the first
occurrence.
Expand All @@ -521,9 +526,12 @@ def duplicated(values, keep='first'):
elif isinstance(values, (ABCSeries, ABCIndex)):
values = values.values

if is_integer_dtype(dtype):
if is_signed_integer_dtype(dtype):
values = _ensure_int64(values)
duplicated = htable.duplicated_int64(values, keep=keep)
elif is_unsigned_integer_dtype(dtype):
values = _ensure_uint64(values)
duplicated = htable.duplicated_uint64(values, keep=keep)
elif is_float_dtype(dtype):
values = _ensure_float64(values)
duplicated = htable.duplicated_float64(values, keep=keep)
Expand All @@ -535,7 +543,19 @@ def duplicated(values, keep='first'):


def mode(values):
"""Returns the mode or mode(s) of the passed Series or ndarray (sorted)"""
"""
Returns the mode(s) of an array.
Parameters
----------
values : array-like
Array over which to check for duplicate values.
Returns
-------
mode : Series
"""

# must sort because hash order isn't necessarily defined.
from pandas.core.series import Series

Expand All @@ -547,23 +567,23 @@ def mode(values):
constructor = Series

dtype = values.dtype
if is_integer_dtype(values):
if is_signed_integer_dtype(values):
values = _ensure_int64(values)
result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)

result = constructor(np.sort(htable.mode_int64(values)), dtype=dtype)
elif is_unsigned_integer_dtype(values):
values = _ensure_uint64(values)
result = constructor(np.sort(htable.mode_uint64(values)), dtype=dtype)
elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
dtype = values.dtype
values = values.view(np.int64)
result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)

result = constructor(np.sort(htable.mode_int64(values)), dtype=dtype)
elif is_categorical_dtype(values):
result = constructor(values.mode())
else:
mask = isnull(values)
values = _ensure_object(values)
res = htable.mode_object(values, mask)
res = htable.mode_object(values)
try:
res = sorted(res)
res = np.sort(res)
except TypeError as e:
warn("Unable to sort modes: %s" % e)
result = constructor(res, dtype=dtype)
Expand Down Expand Up @@ -893,8 +913,10 @@ def _hashtable_algo(f, values, return_dtype=None):
dtype = values.dtype
if is_float_dtype(dtype):
return f(htable.Float64HashTable, _ensure_float64)
elif is_integer_dtype(dtype):
elif is_signed_integer_dtype(dtype):
return f(htable.Int64HashTable, _ensure_int64)
elif is_unsigned_integer_dtype(dtype):
return f(htable.UInt64HashTable, _ensure_uint64)
elif is_datetime64_dtype(dtype):
return_dtype = return_dtype or 'M8[ns]'
return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
Expand Down
161 changes: 2 additions & 159 deletions pandas/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ cdef extern from "numpy/npy_math.h":
cimport cython
cimport numpy as cnp

from pandas.lib import checknull

cnp.import_array()
cnp.import_ufunc()

Expand Down Expand Up @@ -117,165 +119,6 @@ cdef class Int64Factorizer:
return labels


@cython.wraparound(False)
@cython.boundscheck(False)
cdef build_count_table_object(ndarray[object] values,
                              ndarray[uint8_t, cast=True] mask,
                              kh_pymap_t *table):
    """
    Populate `table` with the occurrence count of each value in `values`.

    Entries where `mask[i]` is true are skipped entirely (they are handled
    separately by the callers). The table maps PyObject* keys to int counts;
    the caller owns `table` and is responsible for destroying it.
    """
    cdef:
        khiter_t k
        Py_ssize_t i, n = len(values)
        int ret = 0

    # Pre-size the table; n // 10 is a heuristic starting capacity
    # (khash grows as needed).
    kh_resize_pymap(table, n // 10)

    for i in range(n):
        if mask[i]:
            # Masked (e.g. null) entries are not counted here.
            continue

        val = values[i]
        k = kh_get_pymap(table, <PyObject*> val)
        if k != table.n_buckets:
            # Key already present: bump its count.
            table.vals[k] += 1
        else:
            # New key: insert with a count of 1.
            k = kh_put_pymap(table, <PyObject*> val, &ret)
            table.vals[k] = 1


@cython.wraparound(False)
@cython.boundscheck(False)
cpdef value_count_object(ndarray[object] values,
                         ndarray[uint8_t, cast=True] mask):
    """
    Count occurrences of each distinct value in `values`.

    Entries where `mask[i]` is true are excluded from the counts
    (see build_count_table_object).

    Returns
    -------
    (result_keys, result_counts) : tuple of ndarray
        `result_keys` is an object array of the distinct values and
        `result_counts` an int64 array of their counts, in hash-table
        (i.e. arbitrary) order.
    """
    cdef:
        Py_ssize_t i
        kh_pymap_t *table
        int k

    table = kh_init_pymap()
    build_count_table_object(values, mask, table)

    # Walk every bucket and copy out the occupied ones.
    i = 0
    result_keys = np.empty(table.n_occupied, dtype=object)
    result_counts = np.zeros(table.n_occupied, dtype=np.int64)
    for k in range(table.n_buckets):
        if kh_exist_pymap(table, k):
            result_keys[i] = <object> table.keys[k]
            result_counts[i] = table.vals[k]
            i += 1
    kh_destroy_pymap(table)

    return result_keys, result_counts


@cython.wraparound(False)
@cython.boundscheck(False)
def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask):
    """
    Return the most frequently occurring value(s) of `values`.

    Entries where `mask[i]` is true are excluded. Because `max_count`
    starts at 2, a value must occur at least twice to qualify as a mode;
    if no value repeats, an empty array is returned. The result is in
    hash-table (arbitrary) order — callers sort it.
    """
    cdef:
        # max_count = 2 means singletons never qualify as modes.
        int count, max_count = 2
        int j = -1                      # so you can do +=
        int k
        ndarray[object] modes
        kh_pymap_t *table

    table = kh_init_pymap()
    build_count_table_object(values, mask, table)

    # n_buckets is an upper bound on the number of distinct values;
    # only the first j + 1 slots end up used.
    modes = np.empty(table.n_buckets, dtype=np.object_)
    for k in range(table.n_buckets):
        if kh_exist_pymap(table, k):
            count = table.vals[k]

            if count == max_count:
                # Ties with the current maximum: append another mode.
                j += 1
            elif count > max_count:
                # New maximum: restart the collected modes at slot 0.
                max_count = count
                j = 0
            else:
                continue
            modes[j] = <object> table.keys[k]

    kh_destroy_pymap(table)

    return modes[:j + 1]


@cython.wraparound(False)
@cython.boundscheck(False)
def mode_int64(int64_t[:] values):
    """
    Return the most frequently occurring value(s) of an int64 array.

    Because `max_count` starts at 2, a value must occur at least twice to
    qualify as a mode; if no value repeats, an empty array is returned.
    The result is in hash-table (arbitrary) order — callers sort it.
    The bucket scan runs without the GIL (pure C-level access).
    """
    cdef:
        # max_count = 2 means singletons never qualify as modes.
        int count, max_count = 2
        int j = -1                      # so you can do +=
        int k
        kh_int64_t *table
        ndarray[int64_t] modes

    table = kh_init_int64()

    build_count_table_int64(values, table, 0)

    # n_buckets is an upper bound on the number of distinct values;
    # only the first j + 1 slots end up used.
    modes = np.empty(table.n_buckets, dtype=np.int64)

    with nogil:
        for k in range(table.n_buckets):
            if kh_exist_int64(table, k):
                count = table.vals[k]

                if count == max_count:
                    # Ties with the current maximum: append another mode.
                    j += 1
                elif count > max_count:
                    # New maximum: restart the collected modes at slot 0.
                    max_count = count
                    j = 0
                else:
                    continue
                modes[j] = table.keys[k]

    kh_destroy_int64(table)

    return modes[:j + 1]


@cython.wraparound(False)
@cython.boundscheck(False)
def duplicated_object(ndarray[object] values, object keep='first'):
    """
    Return a boolean ndarray marking duplicate entries of `values`.

    Parameters
    ----------
    values : ndarray[object]
        Array to scan for duplicates.
    keep : {'first', 'last', False}, default 'first'
        - 'first' : mark duplicates True except for the first occurrence.
        - 'last' : mark duplicates True except for the last occurrence.
        - False : mark all members of any duplicated group True.

    Returns
    -------
    ndarray[bool]
        Same length as `values`.

    Raises
    ------
    ValueError
        If `keep` is not one of the allowed values.
    """
    cdef:
        Py_ssize_t i, n
        dict seen = dict()
        object row

    n = len(values)
    # np.zeros already initializes every slot to 0 (not-duplicate), so the
    # loops below only need to write the 1s.
    cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8)

    # NOTE: the old `for i from a <= i < b` loop syntax is deprecated in
    # Cython; range() compiles to the same C loop.
    if keep == 'last':
        # Scan backwards so the last occurrence is the one recorded first
        # in `seen` and therefore left unmarked.
        for i in range(n - 1, -1, -1):
            row = values[i]
            if row in seen:
                result[i] = 1
            else:
                seen[row] = i
    elif keep == 'first':
        for i in range(n):
            row = values[i]
            if row in seen:
                result[i] = 1
            else:
                seen[row] = i
    elif keep is False:
        # Mark every member of a duplicated group, including the first
        # occurrence (back-patched via the index stored in `seen`).
        for i in range(n):
            row = values[i]
            if row in seen:
                result[i] = 1
                result[seen[row]] = 1
            else:
                seen[row] = i
    else:
        raise ValueError('keep must be either "first", "last" or False')

    return result.view(np.bool_)


@cython.wraparound(False)
@cython.boundscheck(False)
def unique_label_indices(ndarray[int64_t, ndim=1] labels):
Expand Down
Loading

0 comments on commit 5710947

Please sign in to comment.