Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: Datetime/Timestamp.normalize for timezone naive datetimes #23634

Merged
merged 19 commits into from
Nov 18, 2018
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions asv_bench/benchmarks/timestamp.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ def time_replace_None(self, tz):
def time_to_pydatetime(self, tz):
self.ts.to_pydatetime()

def time_normalize(self, tz):
self.ts.normalize()


class TimestampAcrossDst(object):
def setup(self):
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1081,6 +1081,7 @@ Performance Improvements
- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`)
- Improved performance of :func:`PeriodIndex.unique` (:issue:`23083`)
- Improved performance of :func:`pd.concat` for `Series` objects (:issue:`23404`)
- Improved performance of :meth:`DatetimeIndex.normalize` and :meth:`Timestamp.normalize` for timezone naive datetimes (:issue:`23634`)


.. _whatsnew_0240.docs:
Expand Down
30 changes: 10 additions & 20 deletions pandas/_libs/tslibs/conversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ from np_datetime cimport (check_dts_bounds,
npy_datetime,
dt64_to_dtstruct, dtstruct_to_dt64,
get_datetime64_unit, get_datetime64_value,
pydatetime_to_dt64, NPY_DATETIMEUNIT, NPY_FR_ns)
pydatetime_to_dt64, NPY_DATETIMEUNIT, NPY_FR_ns,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is DAY_S? You mean DAY_NS, right? Let's write out these constant names in full.

DAY_S)
from np_datetime import OutOfBoundsDatetime

from util cimport (is_string_object,
Expand All @@ -41,7 +42,6 @@ from nattype cimport NPY_NAT, checknull_with_nat
# ----------------------------------------------------------------------
# Constants

cdef int64_t DAY_NS = 86400000000000LL
cdef int64_t HOURS_NS = 3600000000000
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should prob move this one too (future ok)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you move this one as well

NS_DTYPE = np.dtype('M8[ns]')
TD_DTYPE = np.dtype('m8[ns]')
Expand Down Expand Up @@ -931,10 +931,10 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None,
result_b[:] = NPY_NAT

idx_shifted_left = (np.maximum(0, trans.searchsorted(
vals - DAY_NS, side='right') - 1)).astype(np.int64)
vals - DAY_S * 1000000000, side='right') - 1)).astype(np.int64)

idx_shifted_right = (np.maximum(0, trans.searchsorted(
vals + DAY_NS, side='right') - 1)).astype(np.int64)
vals + DAY_S * 1000000000, side='right') - 1)).astype(np.int64)

for i in range(n):
val = vals[i]
Expand Down Expand Up @@ -1116,9 +1116,9 @@ def normalize_date(dt: object) -> datetime:
@cython.boundscheck(False)
def normalize_i8_timestamps(int64_t[:] stamps, object tz=None):
"""
Normalize each of the (nanosecond) timestamps in the given array by
rounding down to the beginning of the day (i.e. midnight). If `tz`
is not None, then this is midnight for this timezone.
Normalize each of the (nanosecond) timezone aware timestamps in the given
array by rounding down to the beginning of the day (i.e. midnight).
This is midnight for timezone, `tz`.

Parameters
----------
Expand All @@ -1130,21 +1130,11 @@ def normalize_i8_timestamps(int64_t[:] stamps, object tz=None):
result : int64 ndarray of normalized nanosecond timestamps
"""
cdef:
Py_ssize_t i, n = len(stamps)
npy_datetimestruct dts
Py_ssize_t n = len(stamps)
int64_t[:] result = np.empty(n, dtype=np.int64)

if tz is not None:
tz = maybe_get_tz(tz)
result = _normalize_local(stamps, tz)
else:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this case never reached?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

with nogil:
for i in range(n):
if stamps[i] == NPY_NAT:
result[i] = NPY_NAT
continue
dt64_to_dtstruct(stamps[i], &dts)
result[i] = _normalized_stamp(&dts)
tz = maybe_get_tz(tz)
result = _normalize_local(stamps, tz)

return result.base # .base to access underlying np.ndarray

Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/tslibs/fields.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ from ccalendar import get_locale_names, MONTHS_FULL, DAYS_FULL
from ccalendar cimport (get_days_in_month, is_leapyear, dayofweek,
get_week_of_year, get_day_of_year)
from np_datetime cimport (npy_datetimestruct, pandas_timedeltastruct,
dt64_to_dtstruct, td64_to_tdstruct)
dt64_to_dtstruct, td64_to_tdstruct, DAY_S)
from nattype cimport NPY_NAT


Expand All @@ -36,7 +36,7 @@ def get_time_micros(ndarray[int64_t] dtindex):
cdef:
ndarray[int64_t] micros

micros = np.mod(dtindex, 86400000000000, dtype=np.int64) // 1000LL
micros = np.mod(dtindex, DAY_S * 1000000000, dtype=np.int64) // 1000LL
return micros


Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/tslibs/np_datetime.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ cdef extern from "src/datetime/np_datetime.h":
NPY_DATETIMEUNIT fr,
npy_datetimestruct *result) nogil

cdef int64_t DAY_S

cdef int reverse_ops[6]

Expand Down
6 changes: 6 additions & 0 deletions pandas/_libs/tslibs/np_datetime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@ cdef extern from "src/datetime/np_datetime_strings.h":
npy_datetimestruct *out,
int *out_local, int *out_tzoffset)

# ----------------------------------------------------------------------
# time constants

cdef int64_t DAY_S = 86400
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's write this out to DAY_SECONDS

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the place for these may be ccalendar



# ----------------------------------------------------------------------
# numpy object inspection

Expand Down
8 changes: 3 additions & 5 deletions pandas/_libs/tslibs/timedeltas.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ from util cimport (is_timedelta64_object, is_datetime64_object,
is_string_object)

from np_datetime cimport (cmp_scalar, reverse_ops, td64_to_tdstruct,
pandas_timedeltastruct)
pandas_timedeltastruct, DAY_S)

from nattype import nat_strings, NaT
from nattype cimport checknull_with_nat, NPY_NAT
Expand All @@ -38,8 +38,6 @@ from offsets cimport to_offset
# ----------------------------------------------------------------------
# Constants

cdef int64_t DAY_NS = 86400000000000LL

# components named tuple
Components = collections.namedtuple('Components', [
'days', 'hours', 'minutes', 'seconds',
Expand Down Expand Up @@ -266,10 +264,10 @@ cdef inline int64_t cast_from_unit(object ts, object unit) except? -1:
m = 1000000000L * 2629746
p = 9
elif unit == 'W':
m = 1000000000L * 86400 * 7
m = 1000000000L * DAY_S * 7
p = 9
elif unit == 'D' or unit == 'd':
m = 1000000000L * 86400
m = 1000000000L * DAY_S
p = 9
elif unit == 'h':
m = 1000000000L * 3600
Expand Down
3 changes: 3 additions & 0 deletions pandas/_libs/tslibs/timestamps.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ from timezones cimport (
# Constants
_zero_time = datetime_time(0, 0)
_no_input = object()
cdef int64_t DAY_NS = 86400000000000
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have DAY_NS defined in lots of places; can you move it to one place?

(bamboo-dev) jreback@dev:~/pandas-dev$ grep -r 86400 pandas/_libs/ --include '*.pyx'
pandas/_libs/tslibs/period.pyx:        {1, 24, 1440, 86400, 86400000, 86400000000, 86400000000000},
pandas/_libs/tslibs/period.pyx:        seconds = unix_date * 86400 + dts.hour * 3600 + dts.min * 60 + dts.sec
pandas/_libs/tslibs/period.pyx:        abstime += 86400
pandas/_libs/tslibs/period.pyx:    while abstime >= 86400:
pandas/_libs/tslibs/period.pyx:        abstime -= 86400
pandas/_libs/tslibs/period.pyx:    # abstime >= 0.0 and abstime <= 86400
pandas/_libs/tslibs/conversion.pyx:cdef int64_t DAY_NS = 86400000000000LL
pandas/_libs/tslibs/timedeltas.pyx:cdef int64_t DAY_NS = 86400000000000LL
pandas/_libs/tslibs/timedeltas.pyx:        m = 1000000000L * 86400 * 7
pandas/_libs/tslibs/timedeltas.pyx:        m = 1000000000L * 86400
pandas/_libs/tslibs/timedeltas.pyx:        86400000000042
pandas/_libs/tslibs/fields.pyx:    micros = np.mod(dtindex, 86400000000000, dtype=np.int64) // 1000LL
pandas/_libs/tslibs/src/datetime/np_datetime.c:    npy_int64 DAY_NS = 86400000000000LL;

This should probably be defined in np_datetime.pyx (and imported from there).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you move to the same place you have DAY_SECONDS



# ----------------------------------------------------------------------
Expand Down Expand Up @@ -1285,6 +1286,8 @@ class Timestamp(_Timestamp):
Normalize Timestamp to midnight, preserving
tz information.
"""
if self.tz is None:
return Timestamp(self.value - (self.value % DAY_NS))
normalized_value = normalize_i8_timestamps(
np.array([self.value], dtype='i8'), tz=self.tz)[0]
return Timestamp(normalized_value).tz_localize(self.tz)
Expand Down
9 changes: 8 additions & 1 deletion pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -832,7 +832,14 @@ def normalize(self):
'2014-08-01 00:00:00+05:30'],
dtype='datetime64[ns, Asia/Calcutta]', freq=None)
"""
new_values = conversion.normalize_i8_timestamps(self.asi8, self.tz)
if self.tz is None:
not_null = self.notnull()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this be notna? (does DatetimeArray even have notna or notnull?)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It (DatetimeIndex) apparently has notnull, but I'm not sure whether I should be using notna or notnull.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you use notna

DAY_NS = 86400000000000
new_values = self.asi8.copy()
adjustment = (new_values[not_null] % DAY_NS)
new_values[not_null] = new_values[not_null] - adjustment
else:
new_values = conversion.normalize_i8_timestamps(self.asi8, self.tz)
return type(self)(new_values, freq='infer').tz_localize(self.tz)

def to_period(self, freq=None):
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/indexes/datetimes/test_scalar_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,12 @@ def test_normalize(self):
assert result.is_normalized
assert not rng.is_normalized

def test_normalize_nat(self):
dti = DatetimeIndex([pd.NaT, Timestamp('2018-01-01 01:00:00')])
result = dti.normalize()
expected = DatetimeIndex([pd.NaT, Timestamp('2018-01-01')])
tm.assert_index_equal(result, expected)


class TestDateTimeIndexToJulianDate(object):

Expand Down
11 changes: 11 additions & 0 deletions pandas/tests/scalar/timestamp/test_unary_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,17 @@ def test_replace_dst_border(self):
expected = Timestamp('2013-11-3 03:00:00', tz='America/Chicago')
assert result == expected

# --------------------------------------------------------------
# Timestamp.normalize

@pytest.mark.parametrize('arg', ['2013-11-30', '2013-11-30 12:00:00'])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a normalize_nat test as well?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't define normalize for NaT.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could add one for Timestamp to mirror it (in a separate issue). It would probably just return NaT.

def test_normalize(self, tz_naive_fixture, arg):
tz = tz_naive_fixture
ts = Timestamp(arg, tz=tz)
result = ts.normalize()
expected = Timestamp('2013-11-30', tz=tz)
assert result == expected

# --------------------------------------------------------------

@td.skip_if_windows
Expand Down