Skip to content

Commit

Permalink
BUG: Series contains NaT with object dtype comparison incorrect (pand…
Browse files Browse the repository at this point in the history
  • Loading branch information
sinhrks authored and nateGeorge committed Aug 15, 2016
1 parent a07b5d3 commit ff2a335
Show file tree
Hide file tree
Showing 8 changed files with 208 additions and 44 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.19.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,8 @@ Bug Fixes
- Bug in extension dtype creation where the created types were not is/identical (:issue:`13285`)

- Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`)
- Bug in ``Series`` comparison may output incorrect result if rhs contains ``NaT`` (:issue:`9005`)
- Bug in ``Series`` and ``Index`` comparison may output incorrect result if it contains ``NaT`` with ``object`` dtype (:issue:`13592`)
- Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`)
- Bug in ``Peirod`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`)
- Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`)
Expand Down
35 changes: 21 additions & 14 deletions pandas/core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
is_integer_dtype, is_categorical_dtype,
is_object_dtype, is_timedelta64_dtype,
is_datetime64_dtype, is_datetime64tz_dtype,
is_bool_dtype, PerformanceWarning, ABCSeries)
is_bool_dtype, PerformanceWarning,
ABCSeries, ABCIndex)

# -----------------------------------------------------------------------------
# Functions that add arithmetic methods to objects, given arithmetic factory
Expand Down Expand Up @@ -664,6 +665,22 @@ def wrapper(left, right, name=name, na_op=na_op):
return wrapper


def _comp_method_OBJECT_ARRAY(op, x, y):
if isinstance(y, list):
y = lib.list_to_object_array(y)
if isinstance(y, (np.ndarray, ABCSeries, ABCIndex)):
if not is_object_dtype(y.dtype):
y = y.astype(np.object_)

if isinstance(y, (ABCSeries, ABCIndex)):
y = y.values

result = lib.vec_compare(x, y, op)
else:
result = lib.scalar_compare(x, y, op)
return result


def _comp_method_SERIES(op, name, str_rep, masker=False):
"""
Wrapper function for Series arithmetic operations, to avoid
Expand All @@ -680,16 +697,7 @@ def na_op(x, y):
return op(y, x)

if is_object_dtype(x.dtype):
if isinstance(y, list):
y = lib.list_to_object_array(y)

if isinstance(y, (np.ndarray, ABCSeries)):
if not is_object_dtype(y.dtype):
result = lib.vec_compare(x, y.astype(np.object_), op)
else:
result = lib.vec_compare(x, y, op)
else:
result = lib.scalar_compare(x, y, op)
result = _comp_method_OBJECT_ARRAY(op, x, y)
else:

# we want to compare like types
Expand All @@ -713,12 +721,11 @@ def na_op(x, y):
(not isscalar(y) and needs_i8_conversion(y))):

if isscalar(y):
mask = isnull(x)
y = _index.convert_scalar(x, _values_from_object(y))
else:
mask = isnull(x) | isnull(y)
y = y.view('i8')

mask = isnull(x)

x = x.view('i8')

try:
Expand Down
20 changes: 12 additions & 8 deletions pandas/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
is_list_like, is_bool_dtype,
is_integer_dtype, is_float_dtype,
needs_i8_conversion)
from pandas.core.ops import _comp_method_OBJECT_ARRAY
from pandas.core.strings import StringAccessorMixin

from pandas.core.config import get_option
Expand Down Expand Up @@ -3182,8 +3183,11 @@ def _evaluate_compare(self, other):
if needs_i8_conversion(self) and needs_i8_conversion(other):
return self._evaluate_compare(other, op)

func = getattr(self.values, op)
result = func(np.asarray(other))
if is_object_dtype(self) and self.nlevels == 1:
# don't pass MultiIndex
result = _comp_method_OBJECT_ARRAY(op, self.values, other)
else:
result = op(self.values, np.asarray(other))

# technically we could support bool dtyped Index
# for now just return the indexing array directly
Expand All @@ -3196,12 +3200,12 @@ def _evaluate_compare(self, other):

return _evaluate_compare

cls.__eq__ = _make_compare('__eq__')
cls.__ne__ = _make_compare('__ne__')
cls.__lt__ = _make_compare('__lt__')
cls.__gt__ = _make_compare('__gt__')
cls.__le__ = _make_compare('__le__')
cls.__ge__ = _make_compare('__ge__')
cls.__eq__ = _make_compare(operator.eq)
cls.__ne__ = _make_compare(operator.ne)
cls.__lt__ = _make_compare(operator.lt)
cls.__gt__ = _make_compare(operator.gt)
cls.__le__ = _make_compare(operator.le)
cls.__ge__ = _make_compare(operator.ge)

@classmethod
def _add_numericlike_set_methods_disabled(cls):
Expand Down
12 changes: 6 additions & 6 deletions pandas/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -768,12 +768,12 @@ def scalar_compare(ndarray[object] values, object val, object op):
raise ValueError('Unrecognized operator')

result = np.empty(n, dtype=bool).view(np.uint8)
isnull_val = _checknull(val)
isnull_val = checknull(val)

if flag == cpython.Py_NE:
for i in range(n):
x = values[i]
if _checknull(x):
if checknull(x):
result[i] = True
elif isnull_val:
result[i] = True
Expand All @@ -785,7 +785,7 @@ def scalar_compare(ndarray[object] values, object val, object op):
elif flag == cpython.Py_EQ:
for i in range(n):
x = values[i]
if _checknull(x):
if checknull(x):
result[i] = False
elif isnull_val:
result[i] = False
Expand All @@ -798,7 +798,7 @@ def scalar_compare(ndarray[object] values, object val, object op):
else:
for i in range(n):
x = values[i]
if _checknull(x):
if checknull(x):
result[i] = False
elif isnull_val:
result[i] = False
Expand Down Expand Up @@ -864,7 +864,7 @@ def vec_compare(ndarray[object] left, ndarray[object] right, object op):
x = left[i]
y = right[i]

if _checknull(x) or _checknull(y):
if checknull(x) or checknull(y):
result[i] = True
else:
result[i] = cpython.PyObject_RichCompareBool(x, y, flag)
Expand All @@ -873,7 +873,7 @@ def vec_compare(ndarray[object] left, ndarray[object] right, object op):
x = left[i]
y = right[i]

if _checknull(x) or _checknull(y):
if checknull(x) or checknull(y):
result[i] = False
else:
result[i] = cpython.PyObject_RichCompareBool(x, y, flag)
Expand Down
101 changes: 87 additions & 14 deletions pandas/tests/series/test_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -980,24 +980,97 @@ def test_comparison_invalid(self):
self.assertRaises(TypeError, lambda: x <= y)

def test_more_na_comparisons(self):
left = Series(['a', np.nan, 'c'])
right = Series(['a', np.nan, 'd'])
for dtype in [None, object]:
left = Series(['a', np.nan, 'c'], dtype=dtype)
right = Series(['a', np.nan, 'd'], dtype=dtype)

result = left == right
expected = Series([True, False, False])
assert_series_equal(result, expected)
result = left == right
expected = Series([True, False, False])
assert_series_equal(result, expected)

result = left != right
expected = Series([False, True, True])
assert_series_equal(result, expected)
result = left != right
expected = Series([False, True, True])
assert_series_equal(result, expected)

result = left == np.nan
expected = Series([False, False, False])
assert_series_equal(result, expected)
result = left == np.nan
expected = Series([False, False, False])
assert_series_equal(result, expected)

result = left != np.nan
expected = Series([True, True, True])
assert_series_equal(result, expected)
result = left != np.nan
expected = Series([True, True, True])
assert_series_equal(result, expected)

def test_nat_comparisons(self):
data = [([pd.Timestamp('2011-01-01'), pd.NaT,
pd.Timestamp('2011-01-03')],
[pd.NaT, pd.NaT, pd.Timestamp('2011-01-03')]),

([pd.Timedelta('1 days'), pd.NaT,
pd.Timedelta('3 days')],
[pd.NaT, pd.NaT, pd.Timedelta('3 days')]),

([pd.Period('2011-01', freq='M'), pd.NaT,
pd.Period('2011-03', freq='M')],
[pd.NaT, pd.NaT, pd.Period('2011-03', freq='M')])]

# add lhs / rhs switched data
data = data + [(r, l) for l, r in data]

for l, r in data:
for dtype in [None, object]:
left = Series(l, dtype=dtype)

# Series, Index
for right in [Series(r, dtype=dtype), Index(r, dtype=dtype)]:
expected = Series([False, False, True])
assert_series_equal(left == right, expected)

expected = Series([True, True, False])
assert_series_equal(left != right, expected)

expected = Series([False, False, False])
assert_series_equal(left < right, expected)

expected = Series([False, False, False])
assert_series_equal(left > right, expected)

expected = Series([False, False, True])
assert_series_equal(left >= right, expected)

expected = Series([False, False, True])
assert_series_equal(left <= right, expected)

def test_nat_comparisons_scalar(self):
data = [[pd.Timestamp('2011-01-01'), pd.NaT,
pd.Timestamp('2011-01-03')],

[pd.Timedelta('1 days'), pd.NaT, pd.Timedelta('3 days')],

[pd.Period('2011-01', freq='M'), pd.NaT,
pd.Period('2011-03', freq='M')]]

for l in data:
for dtype in [None, object]:
left = Series(l, dtype=dtype)

expected = Series([False, False, False])
assert_series_equal(left == pd.NaT, expected)
assert_series_equal(pd.NaT == left, expected)

expected = Series([True, True, True])
assert_series_equal(left != pd.NaT, expected)
assert_series_equal(pd.NaT != left, expected)

expected = Series([False, False, False])
assert_series_equal(left < pd.NaT, expected)
assert_series_equal(pd.NaT > left, expected)
assert_series_equal(left <= pd.NaT, expected)
assert_series_equal(pd.NaT >= left, expected)

assert_series_equal(left > pd.NaT, expected)
assert_series_equal(pd.NaT < left, expected)
assert_series_equal(left >= pd.NaT, expected)
assert_series_equal(pd.NaT <= left, expected)

def test_comparison_different_length(self):
a = Series(['a', 'b', 'c'])
Expand Down
2 changes: 1 addition & 1 deletion pandas/tseries/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def _evaluate_compare(self, other, op):
other = type(self)(other)

# compare
result = getattr(self.asi8, op)(other.asi8)
result = op(self.asi8, other.asi8)

# technically we could support bool dtyped Index
# for now just return the indexing array directly
Expand Down
2 changes: 1 addition & 1 deletion pandas/tseries/tdi.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def _td_index_cmp(opname, nat_result=False):

def wrapper(self, other):
func = getattr(super(TimedeltaIndex, self), opname)
if _is_convertible_to_td(other):
if _is_convertible_to_td(other) or other is tslib.NaT:
other = _to_m8(other)
result = func(other)
if com.isnull(other):
Expand Down
78 changes: 78 additions & 0 deletions pandas/tseries/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,32 @@ def test_sub_period(self):
with tm.assertRaises(TypeError):
p - idx

def test_comp_nat(self):
left = pd.DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT,
pd.Timestamp('2011-01-03')])
right = pd.DatetimeIndex([pd.NaT, pd.NaT, pd.Timestamp('2011-01-03')])

for l, r in [(left, right), (left.asobject, right.asobject)]:
result = l == r
expected = np.array([False, False, True])
tm.assert_numpy_array_equal(result, expected)

result = l != r
expected = np.array([True, True, False])
tm.assert_numpy_array_equal(result, expected)

expected = np.array([False, False, False])
tm.assert_numpy_array_equal(l == pd.NaT, expected)
tm.assert_numpy_array_equal(pd.NaT == r, expected)

expected = np.array([True, True, True])
tm.assert_numpy_array_equal(l != pd.NaT, expected)
tm.assert_numpy_array_equal(pd.NaT != l, expected)

expected = np.array([False, False, False])
tm.assert_numpy_array_equal(l < pd.NaT, expected)
tm.assert_numpy_array_equal(pd.NaT > l, expected)

def test_value_counts_unique(self):
# GH 7735
for tz in [None, 'UTC', 'Asia/Tokyo', 'US/Eastern']:
Expand Down Expand Up @@ -1238,6 +1264,32 @@ def test_addition_ops(self):
expected = Timestamp('20130102')
self.assertEqual(result, expected)

def test_comp_nat(self):
left = pd.TimedeltaIndex([pd.Timedelta('1 days'), pd.NaT,
pd.Timedelta('3 days')])
right = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta('3 days')])

for l, r in [(left, right), (left.asobject, right.asobject)]:
result = l == r
expected = np.array([False, False, True])
tm.assert_numpy_array_equal(result, expected)

result = l != r
expected = np.array([True, True, False])
tm.assert_numpy_array_equal(result, expected)

expected = np.array([False, False, False])
tm.assert_numpy_array_equal(l == pd.NaT, expected)
tm.assert_numpy_array_equal(pd.NaT == r, expected)

expected = np.array([True, True, True])
tm.assert_numpy_array_equal(l != pd.NaT, expected)
tm.assert_numpy_array_equal(pd.NaT != l, expected)

expected = np.array([False, False, False])
tm.assert_numpy_array_equal(l < pd.NaT, expected)
tm.assert_numpy_array_equal(pd.NaT > l, expected)

def test_value_counts_unique(self):
# GH 7735

Expand Down Expand Up @@ -2039,6 +2091,32 @@ def test_sub_isub(self):
rng -= 1
tm.assert_index_equal(rng, expected)

def test_comp_nat(self):
left = pd.PeriodIndex([pd.Period('2011-01-01'), pd.NaT,
pd.Period('2011-01-03')])
right = pd.PeriodIndex([pd.NaT, pd.NaT, pd.Period('2011-01-03')])

for l, r in [(left, right), (left.asobject, right.asobject)]:
result = l == r
expected = np.array([False, False, True])
tm.assert_numpy_array_equal(result, expected)

result = l != r
expected = np.array([True, True, False])
tm.assert_numpy_array_equal(result, expected)

expected = np.array([False, False, False])
tm.assert_numpy_array_equal(l == pd.NaT, expected)
tm.assert_numpy_array_equal(pd.NaT == r, expected)

expected = np.array([True, True, True])
tm.assert_numpy_array_equal(l != pd.NaT, expected)
tm.assert_numpy_array_equal(pd.NaT != l, expected)

expected = np.array([False, False, False])
tm.assert_numpy_array_equal(l < pd.NaT, expected)
tm.assert_numpy_array_equal(pd.NaT > l, expected)

def test_value_counts_unique(self):
# GH 7735
idx = pd.period_range('2011-01-01 09:00', freq='H', periods=10)
Expand Down

0 comments on commit ff2a335

Please sign in to comment.