Skip to content

Commit

Permalink
BUG: Fix bound checking for Timestamp() with dt64 #4065
Browse files Browse the repository at this point in the history
To fix the bug, this change adds bounds checking to
_get_datetime64_nanos() for numpy datetimes that aren't already in [ns]
units.

Additionally, it updates _check_dts_bounds() to do the bound check just
based off the pandas_datetimestruct, by comparing to the minimum and
maximum valid pandas_datetimestructs for datetime64[ns].  It is simpler
and more accurate than the previous system.

Also includes a number of small refactors/fixes to deal with new error
cases that didn't exist when invalid datetime64s were just silently
coerced into the valid datetime64[ns] range.
  • Loading branch information
danbirken committed Oct 4, 2013
1 parent 3722487 commit 6d6f392
Show file tree
Hide file tree
Showing 9 changed files with 340 additions and 69 deletions.
2 changes: 1 addition & 1 deletion doc/source/release.rst
Expand Up @@ -555,7 +555,7 @@ Bug Fixes
type of headers (:issue:`5048`).
- Fixed a bug where ``DatetimeIndex`` joins with ``PeriodIndex`` caused a
stack overflow (:issue:`3899`).

- Fix bound checking for Timestamp() with datetime64 input (:issue:`4065`)

pandas 0.12.0
-------------
Expand Down
15 changes: 15 additions & 0 deletions pandas/core/common.py
Expand Up @@ -348,6 +348,13 @@ def _pickle_array(arr):

def _unpickle_array(bytes):
    """Rebuild an array from its pickled byte string.

    The bytes are wrapped in a ``BytesIO`` and decoded with ``read_array``.
    A result with a datetime64 dtype is re-viewed as ``_NS_DTYPE`` before
    being returned; any other result is returned as read.
    """
    loaded = read_array(BytesIO(bytes))

    # Nothing to normalise unless the array holds datetime64 values.
    if not is_datetime64_dtype(loaded):
        return loaded

    # All datetimes should be stored as M8[ns].  When unpickling with
    # numpy 1.6 they are read back as M8[us], so re-view the array to make
    # sure every datetime64 array comes out as M8[ns].
    return loaded.view(_NS_DTYPE)


Expand Down Expand Up @@ -1780,6 +1787,14 @@ def is_datetime64_dtype(arr_or_dtype):
tipo = arr_or_dtype.dtype.type
return issubclass(tipo, np.datetime64)

def is_datetime64_ns_dtype(arr_or_dtype):
    """Return True if ``arr_or_dtype`` is, or carries, the ``_NS_DTYPE`` dtype.

    Parameters
    ----------
    arr_or_dtype : np.dtype, type, object with a ``dtype`` attribute, or
        anything ``np.dtype()`` can interpret (e.g. a dtype string)

    Returns
    -------
    bool
        True when the resolved dtype compares equal to ``_NS_DTYPE``.
        False otherwise, including when the input cannot be interpreted
        as a dtype at all.
    """
    if isinstance(arr_or_dtype, np.dtype):
        tipo = arr_or_dtype
    elif isinstance(arr_or_dtype, type):
        # Must be tested before the ``dtype`` attribute lookup below:
        # numpy scalar classes expose ``dtype`` as a descriptor on the
        # class itself, which is not a usable dtype.
        tipo = np.dtype(arr_or_dtype)
    elif hasattr(arr_or_dtype, 'dtype'):
        tipo = arr_or_dtype.dtype
    else:
        # Previously this fell through to ``arr_or_dtype.dtype`` and raised
        # AttributeError for dtype strings and arbitrary objects.  Let
        # numpy try to interpret the value and answer False if it cannot.
        try:
            tipo = np.dtype(arr_or_dtype)
        except (TypeError, ValueError):
            return False
    return tipo == _NS_DTYPE

def is_timedelta64_dtype(arr_or_dtype):
if isinstance(arr_or_dtype, np.dtype):
Expand Down
3 changes: 3 additions & 0 deletions pandas/src/datetime.pxd
Expand Up @@ -85,6 +85,9 @@ cdef extern from "datetime/np_datetime.h":
npy_int64 year
npy_int32 month, day, hour, min, sec, us, ps, as

int cmp_pandas_datetimestruct(pandas_datetimestruct *a,
pandas_datetimestruct *b)

int convert_pydatetime_to_datetimestruct(PyObject *obj,
pandas_datetimestruct *out,
PANDAS_DATETIMEUNIT *out_bestunit,
Expand Down
63 changes: 63 additions & 0 deletions pandas/src/datetime/np_datetime.c
Expand Up @@ -273,6 +273,69 @@ set_datetimestruct_days(npy_int64 days, pandas_datetimestruct *dts)
}
}

/*
 * Compares two pandas_datetimestruct objects chronologically.
 *
 * Fields are examined from most to least significant (year, month, day,
 * hour, min, sec, us, ps, as); the first field that differs decides the
 * result.  Returns 1 if a is later than b, -1 if a is earlier than b,
 * and 0 if every field is equal.
 */
int
cmp_pandas_datetimestruct(pandas_datetimestruct *a, pandas_datetimestruct *b)
{
/* Return from the enclosing function as soon as one field differs. */
#define PANDAS_CMP_FIELD(field)                          \
    do {                                                 \
        if (a->field != b->field) {                      \
            return (a->field > b->field) ? 1 : -1;       \
        }                                                \
    } while (0)

    PANDAS_CMP_FIELD(year);
    PANDAS_CMP_FIELD(month);
    PANDAS_CMP_FIELD(day);
    PANDAS_CMP_FIELD(hour);
    PANDAS_CMP_FIELD(min);
    PANDAS_CMP_FIELD(sec);
    PANDAS_CMP_FIELD(us);
    PANDAS_CMP_FIELD(ps);
    PANDAS_CMP_FIELD(as);

#undef PANDAS_CMP_FIELD

    /* Every field matched. */
    return 0;
}

/*
*
* Tests for and converts a Python datetime.datetime or datetime.date
Expand Down
14 changes: 7 additions & 7 deletions pandas/tseries/index.py
Expand Up @@ -204,7 +204,7 @@ def __new__(cls, data=None,
data = _str_to_dt_array(data, offset, dayfirst=dayfirst,
yearfirst=yearfirst)
else:
data = tools.to_datetime(data)
data = tools.to_datetime(data, errors='raise')
data.offset = offset
if isinstance(data, DatetimeIndex):
if name is not None:
Expand Down Expand Up @@ -243,14 +243,14 @@ def __new__(cls, data=None,
subarr = data.view(_NS_DTYPE)
else:
try:
subarr = tools.to_datetime(data)
subarr = tools.to_datetime(data, box=False)
except ValueError:
# tz aware
subarr = tools.to_datetime(data, utc=True)
subarr = tools.to_datetime(data, box=False, utc=True)

if not np.issubdtype(subarr.dtype, np.datetime64):
raise TypeError('Unable to convert %s to datetime dtype'
% str(data))
raise ValueError('Unable to convert %s to datetime dtype'
% str(data))

if isinstance(subarr, DatetimeIndex):
if tz is None:
Expand Down Expand Up @@ -934,7 +934,7 @@ def join(self, other, how='left', level=None, return_indexers=False):
'mixed-integer-float', 'mixed')):
try:
other = DatetimeIndex(other)
except TypeError:
except (TypeError, ValueError):
pass

this, other = self._maybe_utc_convert(other)
Expand Down Expand Up @@ -1051,7 +1051,7 @@ def intersection(self, other):
if not isinstance(other, DatetimeIndex):
try:
other = DatetimeIndex(other)
except TypeError:
except (TypeError, ValueError):
pass
result = Index.intersection(self, other)
if isinstance(result, DatetimeIndex):
Expand Down
77 changes: 76 additions & 1 deletion pandas/tseries/tests/test_timeseries.py
@@ -1,5 +1,5 @@
# pylint: disable-msg=E1101,W0612
from datetime import datetime, time, timedelta
from datetime import datetime, time, timedelta, date
import sys
import os
import unittest
Expand Down Expand Up @@ -952,6 +952,81 @@ def test_to_datetime_list_of_integers(self):

self.assert_(rng.equals(result))

def test_to_datetime_dt64s(self):
    # In-bounds datetime64 scalars: to_datetime() must agree with
    # Timestamp() for each of them.
    valid_dts = [np.datetime64('2000-01-01'), np.datetime64('2000-01-02')]
    for valid in valid_dts:
        self.assertEqual(pd.to_datetime(valid), Timestamp(valid))

    # Out-of-bounds datetime64 scalars: raise ValueError when asked to
    # raise (and from Timestamp()), and give NaT when coercing.
    invalid_dts = [np.datetime64('1000-01-01'), np.datetime64('5000-01-02')]
    for invalid in invalid_dts:
        self.assertRaises(ValueError, pd.to_datetime, invalid,
                          errors='raise')
        self.assertRaises(ValueError, tslib.Timestamp, invalid)
        coerced = pd.to_datetime(invalid, coerce=True)
        self.assert_(coerced is NaT)

def test_to_datetime_array_of_dt64s(self):
    dts = [np.datetime64('2000-01-01'), np.datetime64('2000-01-02')]

    # When every datetime is in bounds, to_datetime() returns an array
    # equal to what Timestamp() parsing gives element by element.
    parsed = pd.to_datetime(dts, box=False)
    via_timestamp = np.array([Timestamp(x).asm8 for x in dts])
    self.assert_(np.array_equal(parsed, via_timestamp))

    # Same list with one out-of-bounds datetime appended at the end.
    dts_with_oob = dts + [np.datetime64('9999-01-01')]

    # errors='raise' without coercion: the whole conversion fails.
    self.assertRaises(ValueError, pd.to_datetime, dts_with_oob,
                      coerce=False, errors='raise')

    # coerce=True: the in-bounds entries convert and the out-of-bounds
    # entry is compared against iNaT.
    coerced = pd.to_datetime(dts_with_oob, box=False, coerce=True)
    expected_coerced = np.array(
        [Timestamp(dt).asm8 for dt in dts_with_oob[:2]] + [iNaT],
        dtype='M8'
    )
    self.assert_(np.array_equal(coerced, expected_coerced))

    # coerce=False with no `errors` argument: out of bounds datetime64s
    # are converted to their .item(), which depending on the version of
    # numpy is either a python datetime.datetime or datetime.date.
    untouched = pd.to_datetime(dts_with_oob, box=False, coerce=False)
    expected_items = np.array([dt.item() for dt in dts_with_oob],
                              dtype='O')
    self.assert_(np.array_equal(untouched, expected_items))

def test_index_to_datetime(self):
idx = Index(['1/1/2000', '1/2/2000', '1/3/2000'])

Expand Down
108 changes: 86 additions & 22 deletions pandas/tseries/tests/test_tslib.py
Expand Up @@ -4,7 +4,7 @@
import numpy as np

from pandas import tslib
from datetime import datetime
import datetime

from pandas.core.api import Timestamp

Expand All @@ -15,19 +15,53 @@
from pandas import _np_version_under1p7


class TestDatetimeParsingWrappers(unittest.TestCase):
def test_verify_datetime_bounds(self):
for year in (1, 1000, 1677, 2262, 5000):
dt = datetime(year, 1, 1)
self.assertRaises(
ValueError,
tslib.verify_datetime_bounds,
dt
)
class TestTimestamp(unittest.TestCase):
def test_bounds_with_different_units(self):
out_of_bounds_dates = (
'1677-09-21',
'2262-04-12',
)

time_units = ('D', 'h', 'm', 's', 'ms', 'us')

for year in (1678, 2000, 2261):
tslib.verify_datetime_bounds(datetime(year, 1, 1))
for date_string in out_of_bounds_dates:
for unit in time_units:
self.assertRaises(
ValueError,
tslib.Timestamp,
np.datetime64(date_string, dtype='M8[%s]' % unit)
)

in_bounds_dates = (
'1677-09-23',
'2262-04-11',
)

for date_string in in_bounds_dates:
for unit in time_units:
tslib.Timestamp(
np.datetime64(date_string, dtype='M8[%s]' % unit)
)

def test_barely_oob_dts(self):
    # Step used to move just past each bound (one us, per the [us]
    # datetimes it is applied to below).
    step = np.timedelta64(1)

    # By definition a datetime64[ns] cannot be out of bounds, so the
    # Timestamp limits are converted to [us] to allow stepping past them.
    lower_us = np.datetime64(tslib.Timestamp.min).astype('M8[us]')
    upper_us = np.datetime64(tslib.Timestamp.max).astype('M8[us]')

    # The limits themselves must convert without error.
    for limit in (lower_us, upper_us):
        tslib.Timestamp(limit)

    # One step below the minimum is an error.
    self.assertRaises(ValueError, tslib.Timestamp, lower_us - step)

    # One step above the maximum is an error.
    self.assertRaises(ValueError, tslib.Timestamp, upper_us + step)

class TestDatetimeParsingWrappers(unittest.TestCase):
def test_does_not_convert_mixed_integer(self):
bad_date_strings = (
'-50000',
Expand Down Expand Up @@ -97,15 +131,45 @@ def test_number_looking_strings_not_into_datetime(self):
arr = np.array(['1', '2', '3', '4', '5'], dtype=object)
self.assert_(np.array_equal(tslib.array_to_datetime(arr), arr))

def test_dates_outside_of_datetime64_ns_bounds(self):
# These datetimes are outside of the bounds of the
# datetime64[ns] bounds, so they cannot be converted to
# datetimes
arr = np.array(['1/1/1676', '1/2/1676'], dtype=object)
self.assert_(np.array_equal(tslib.array_to_datetime(arr), arr))
def test_coercing_dates_outside_of_datetime64_ns_bounds(self):
invalid_dates = [
datetime.date(1000, 1, 1),
datetime.datetime(1000, 1, 1),
'1000-01-01',
'Jan 1, 1000',
np.datetime64('1000-01-01'),
]

arr = np.array(['1/1/2263', '1/2/2263'], dtype=object)
self.assert_(np.array_equal(tslib.array_to_datetime(arr), arr))
for invalid_date in invalid_dates:
self.assertRaises(
ValueError,
tslib.array_to_datetime,
np.array([invalid_date], dtype='object'),
coerce=False,
raise_=True,
)
self.assert_(
np.array_equal(
tslib.array_to_datetime(
np.array([invalid_date], dtype='object'), coerce=True
),
np.array([tslib.iNaT], dtype='M8[ns]')
)
)

arr = np.array(['1/1/1000', '1/1/2000'], dtype=object)
self.assert_(
np.array_equal(
tslib.array_to_datetime(arr, coerce=True),
np.array(
[
tslib.iNaT,
'2000-01-01T00:00:00.000000000-0000'
],
dtype='M8[ns]'
)
)
)

def test_coerce_of_invalid_datetimes(self):
arr = np.array(['01-01-2013', 'not_a_date', '1'], dtype=object)
Expand All @@ -130,11 +194,11 @@ def test_coerce_of_invalid_datetimes(self):
)


class TestTimestamp(unittest.TestCase):
class TestTimestampNsOperations(unittest.TestCase):
def setUp(self):
if _np_version_under1p7:
raise nose.SkipTest('numpy >= 1.7 required')
self.timestamp = Timestamp(datetime.utcnow())
self.timestamp = Timestamp(datetime.datetime.utcnow())

def assert_ns_timedelta(self, modified_timestamp, expected_value):
value = self.timestamp.value
Expand Down

0 comments on commit 6d6f392

Please sign in to comment.