Skip to content

Commit

Permalink
BUG, TST: Check uint64 behaviour in algorithms.py
Browse files Browse the repository at this point in the history
First of a series of PRs to patch and test `uint64` behaviour in
`core/algorithms.py`. In this PR, the following functions are checked:

1. `duplicated()`: robust, but now has a test to confirm.
2. `mode()`: robust, but now has a test to confirm.
3. `unique()`: not robust, but patched and tested.

Author: gfyoung <gfyoung17@gmail.com>

Closes #14934 from gfyoung/core-algorithms-uint64 and squashes the following commits:

6d31057 [gfyoung] DOC, TST, BUG: Improve uint64 core/algos behavior
  • Loading branch information
gfyoung authored and jreback committed Dec 23, 2016
1 parent 74de478 commit 5710947
Show file tree
Hide file tree
Showing 9 changed files with 457 additions and 237 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -298,5 +298,6 @@ Bug Fixes


- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)
6 changes: 3 additions & 3 deletions pandas/api/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,10 +153,10 @@ class TestTypes(Base, tm.TestCase):
'is_floating_dtype', 'is_int64_dtype', 'is_integer',
'is_integer_dtype', 'is_number', 'is_numeric_dtype',
'is_object_dtype', 'is_scalar', 'is_sparse',
'is_string_dtype',
'is_string_dtype', 'is_signed_integer_dtype',
'is_timedelta64_dtype', 'is_timedelta64_ns_dtype',
'is_period', 'is_period_dtype',
'is_re', 'is_re_compilable',
'is_unsigned_integer_dtype', 'is_period',
'is_period_dtype', 'is_re', 'is_re_compilable',
'is_dict_like', 'is_iterator',
'is_list_like', 'is_hashable',
'is_named_tuple', 'is_sequence',
Expand Down
50 changes: 36 additions & 14 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
from pandas import compat, lib, tslib, _np_version_under1p8
from pandas.types.cast import _maybe_promote
from pandas.types.generic import ABCSeries, ABCIndex
from pandas.types.common import (is_integer_dtype,
from pandas.types.common import (is_unsigned_integer_dtype,
is_signed_integer_dtype,
is_integer_dtype,
is_int64_dtype,
is_categorical_dtype,
is_extension_type,
Expand Down Expand Up @@ -479,8 +481,9 @@ def _value_counts_arraylike(values, dropna=True):
keys, counts = htable.value_count_float64(values, dropna)
else:
values = _ensure_object(values)
keys, counts = htable.value_count_object(values, dropna)

mask = isnull(values)
keys, counts = htable.value_count_object(values, mask)
if not dropna and mask.any():
keys = np.insert(keys, 0, np.NaN)
counts = np.insert(counts, 0, mask.sum())
Expand All @@ -490,12 +493,14 @@ def _value_counts_arraylike(values, dropna=True):

def duplicated(values, keep='first'):
"""
Return boolean ndarray denoting duplicate values
Return boolean ndarray denoting duplicate values.
.. versionadded:: 0.19.0
Parameters
----------
values : ndarray-like
Array over which to check for duplicate values.
keep : {'first', 'last', False}, default 'first'
- ``first`` : Mark duplicates as ``True`` except for the first
occurrence.
Expand All @@ -521,9 +526,12 @@ def duplicated(values, keep='first'):
elif isinstance(values, (ABCSeries, ABCIndex)):
values = values.values

if is_integer_dtype(dtype):
if is_signed_integer_dtype(dtype):
values = _ensure_int64(values)
duplicated = htable.duplicated_int64(values, keep=keep)
elif is_unsigned_integer_dtype(dtype):
values = _ensure_uint64(values)
duplicated = htable.duplicated_uint64(values, keep=keep)
elif is_float_dtype(dtype):
values = _ensure_float64(values)
duplicated = htable.duplicated_float64(values, keep=keep)
Expand All @@ -535,7 +543,19 @@ def duplicated(values, keep='first'):


def mode(values):
"""Returns the mode or mode(s) of the passed Series or ndarray (sorted)"""
"""
Returns the mode(s) of an array.
Parameters
----------
values : array-like
Array over which to check for duplicate values.
Returns
-------
mode : Series
"""

# must sort because hash order isn't necessarily defined.
from pandas.core.series import Series

Expand All @@ -547,23 +567,23 @@ def mode(values):
constructor = Series

dtype = values.dtype
if is_integer_dtype(values):
if is_signed_integer_dtype(values):
values = _ensure_int64(values)
result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)

result = constructor(np.sort(htable.mode_int64(values)), dtype=dtype)
elif is_unsigned_integer_dtype(values):
values = _ensure_uint64(values)
result = constructor(np.sort(htable.mode_uint64(values)), dtype=dtype)
elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
dtype = values.dtype
values = values.view(np.int64)
result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)

result = constructor(np.sort(htable.mode_int64(values)), dtype=dtype)
elif is_categorical_dtype(values):
result = constructor(values.mode())
else:
mask = isnull(values)
values = _ensure_object(values)
res = htable.mode_object(values, mask)
res = htable.mode_object(values)
try:
res = sorted(res)
res = np.sort(res)
except TypeError as e:
warn("Unable to sort modes: %s" % e)
result = constructor(res, dtype=dtype)
Expand Down Expand Up @@ -893,8 +913,10 @@ def _hashtable_algo(f, values, return_dtype=None):
dtype = values.dtype
if is_float_dtype(dtype):
return f(htable.Float64HashTable, _ensure_float64)
elif is_integer_dtype(dtype):
elif is_signed_integer_dtype(dtype):
return f(htable.Int64HashTable, _ensure_int64)
elif is_unsigned_integer_dtype(dtype):
return f(htable.UInt64HashTable, _ensure_uint64)
elif is_datetime64_dtype(dtype):
return_dtype = return_dtype or 'M8[ns]'
return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
Expand Down
161 changes: 2 additions & 159 deletions pandas/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ cdef extern from "numpy/npy_math.h":
cimport cython
cimport numpy as cnp

from pandas.lib import checknull

cnp.import_array()
cnp.import_ufunc()

Expand Down Expand Up @@ -117,165 +119,6 @@ cdef class Int64Factorizer:
return labels


@cython.wraparound(False)
@cython.boundscheck(False)
cdef build_count_table_object(ndarray[object] values,
                              ndarray[uint8_t, cast=True] mask,
                              kh_pymap_t *table):
    """
    Populate `table` with the occurrence count of each value in `values`.

    Entries where `mask[i]` is true are skipped entirely (they are handled
    separately by the callers). The table maps PyObject* keys to int counts;
    the caller owns `table` and is responsible for destroying it.
    """
    cdef:
        khiter_t k
        Py_ssize_t i, n = len(values)
        int ret = 0

    # Pre-size the table; n // 10 is a heuristic starting capacity
    # (khash grows as needed).
    kh_resize_pymap(table, n // 10)

    for i in range(n):
        if mask[i]:
            # Masked (e.g. null) entries are not counted here.
            continue

        val = values[i]
        k = kh_get_pymap(table, <PyObject*> val)
        if k != table.n_buckets:
            # Key already present: bump its count.
            table.vals[k] += 1
        else:
            # New key: insert with a count of 1.
            k = kh_put_pymap(table, <PyObject*> val, &ret)
            table.vals[k] = 1


@cython.wraparound(False)
@cython.boundscheck(False)
cpdef value_count_object(ndarray[object] values,
                         ndarray[uint8_t, cast=True] mask):
    """
    Count occurrences of each distinct value in `values`.

    Entries where `mask[i]` is true are excluded from the counts
    (see build_count_table_object).

    Returns
    -------
    (result_keys, result_counts) : tuple of ndarray
        `result_keys` is an object array of the distinct values and
        `result_counts` an int64 array of their counts, in hash-table
        (i.e. arbitrary) order.
    """
    cdef:
        Py_ssize_t i
        kh_pymap_t *table
        int k

    table = kh_init_pymap()
    build_count_table_object(values, mask, table)

    # Walk every bucket and copy out the occupied ones.
    i = 0
    result_keys = np.empty(table.n_occupied, dtype=object)
    result_counts = np.zeros(table.n_occupied, dtype=np.int64)
    for k in range(table.n_buckets):
        if kh_exist_pymap(table, k):
            result_keys[i] = <object> table.keys[k]
            result_counts[i] = table.vals[k]
            i += 1
    kh_destroy_pymap(table)

    return result_keys, result_counts


@cython.wraparound(False)
@cython.boundscheck(False)
def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask):
    """
    Return the most frequently occurring value(s) of `values`.

    Entries where `mask[i]` is true are excluded. Because `max_count`
    starts at 2, a value must occur at least twice to qualify as a mode;
    if no value repeats, an empty array is returned. The result is in
    hash-table (arbitrary) order — callers sort it.
    """
    cdef:
        # max_count = 2 means singletons never qualify as modes.
        int count, max_count = 2
        int j = -1                      # so you can do +=
        int k
        ndarray[object] modes
        kh_pymap_t *table

    table = kh_init_pymap()
    build_count_table_object(values, mask, table)

    # n_buckets is an upper bound on the number of distinct values;
    # only the first j + 1 slots end up used.
    modes = np.empty(table.n_buckets, dtype=np.object_)
    for k in range(table.n_buckets):
        if kh_exist_pymap(table, k):
            count = table.vals[k]

            if count == max_count:
                # Ties with the current maximum: append another mode.
                j += 1
            elif count > max_count:
                # New maximum: restart the collected modes at slot 0.
                max_count = count
                j = 0
            else:
                continue
            modes[j] = <object> table.keys[k]

    kh_destroy_pymap(table)

    return modes[:j + 1]


@cython.wraparound(False)
@cython.boundscheck(False)
def mode_int64(int64_t[:] values):
    """
    Return the most frequently occurring value(s) of an int64 array.

    Because `max_count` starts at 2, a value must occur at least twice to
    qualify as a mode; if no value repeats, an empty array is returned.
    The result is in hash-table (arbitrary) order — callers sort it.
    The bucket scan runs without the GIL (pure C-level access).
    """
    cdef:
        # max_count = 2 means singletons never qualify as modes.
        int count, max_count = 2
        int j = -1                      # so you can do +=
        int k
        kh_int64_t *table
        ndarray[int64_t] modes

    table = kh_init_int64()

    build_count_table_int64(values, table, 0)

    # n_buckets is an upper bound on the number of distinct values;
    # only the first j + 1 slots end up used.
    modes = np.empty(table.n_buckets, dtype=np.int64)

    with nogil:
        for k in range(table.n_buckets):
            if kh_exist_int64(table, k):
                count = table.vals[k]

                if count == max_count:
                    # Ties with the current maximum: append another mode.
                    j += 1
                elif count > max_count:
                    # New maximum: restart the collected modes at slot 0.
                    max_count = count
                    j = 0
                else:
                    continue
                modes[j] = table.keys[k]

    kh_destroy_int64(table)

    return modes[:j + 1]


@cython.wraparound(False)
@cython.boundscheck(False)
def duplicated_object(ndarray[object] values, object keep='first'):
    """
    Return a boolean ndarray marking duplicate entries of `values`.

    Parameters
    ----------
    values : ndarray[object]
        Array to scan for duplicates.
    keep : {'first', 'last', False}, default 'first'
        - 'first' : mark duplicates True except for the first occurrence.
        - 'last' : mark duplicates True except for the last occurrence.
        - False : mark all members of any duplicated group True.

    Returns
    -------
    ndarray[bool]
        Same length as `values`.

    Raises
    ------
    ValueError
        If `keep` is not one of the allowed values.
    """
    cdef:
        Py_ssize_t i, n
        dict seen = dict()
        object row

    n = len(values)
    # np.zeros already initializes every slot to 0 (not-duplicate), so the
    # loops below only need to write the 1s.
    cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8)

    # NOTE: the old `for i from a <= i < b` loop syntax is deprecated in
    # Cython; range() compiles to the same C loop.
    if keep == 'last':
        # Scan backwards so the last occurrence is the one recorded first
        # in `seen` and therefore left unmarked.
        for i in range(n - 1, -1, -1):
            row = values[i]
            if row in seen:
                result[i] = 1
            else:
                seen[row] = i
    elif keep == 'first':
        for i in range(n):
            row = values[i]
            if row in seen:
                result[i] = 1
            else:
                seen[row] = i
    elif keep is False:
        # Mark every member of a duplicated group, including the first
        # occurrence (back-patched via the index stored in `seen`).
        for i in range(n):
            row = values[i]
            if row in seen:
                result[i] = 1
                result[seen[row]] = 1
            else:
                seen[row] = i
    else:
        raise ValueError('keep must be either "first", "last" or False')

    return result.view(np.bool_)


@cython.wraparound(False)
@cython.boundscheck(False)
def unique_label_indices(ndarray[int64_t, ndim=1] labels):
Expand Down
Loading

0 comments on commit 5710947

Please sign in to comment.