Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Parse %z and %Z directive in format for to_datetime #19979

Merged
merged 38 commits into from
May 29, 2018
Merged
Show file tree
Hide file tree
Changes from 36 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
4a43815
DOC: update the Index.isin docstring (#20249)
noemielteto Mar 18, 2018
cb47c08
ENH: Parse %z directive in format for to_datetime
mroeschke Feb 22, 2018
f299aec
move parsing to a sub function, add additional test
mroeschke Mar 14, 2018
259ec8f
Address comments
mroeschke Mar 15, 2018
77af4db
timezone compat
mroeschke Mar 15, 2018
54c2491
add empty line for strptime.pyx
mroeschke Mar 15, 2018
0e2a0cd
add issue number and try except
mroeschke Mar 19, 2018
d31e141
Merge remote-tracking branch 'upstream/master' into strftime_timezone
mroeschke Mar 29, 2018
7bdbdf4
Merge remote-tracking branch 'upstream/master' into strftime_timezone
mroeschke Mar 30, 2018
3e3d5c6
add whatsnew
mroeschke Mar 30, 2018
c16ef8c
Merge remote-tracking branch 'upstream/master' into strftime_timezone
mroeschke Mar 31, 2018
6f0b7f0
remove weird pd file
mroeschke Mar 31, 2018
0525823
Merge remote-tracking branch 'upstream/master' into strftime_timezone
mroeschke Apr 18, 2018
4c22808
Remove blank line
mroeschke Apr 18, 2018
24e1c0a
Merge remote-tracking branch 'upstream/master' into strftime_timezone
mroeschke Apr 22, 2018
4f2f865
Merge remote-tracking branch 'upstream/master' into strftime_timezone
mroeschke May 6, 2018
145e5da
Merge remote-tracking branch 'upstream/master' into strftime_timezone
mroeschke May 16, 2018
64bc3fc
Use pytz zones only
mroeschke May 16, 2018
47a9d69
Merge remote-tracking branch 'upstream/master' into strftime_timezone
mroeschke May 16, 2018
1b44554
Rework test to only expect pytz after parsing
mroeschke May 17, 2018
0dcc59f
Merge remote-tracking branch 'upstream/master' into strftime_timezone
mroeschke May 17, 2018
149781b
Merge remote-tracking branch 'upstream/master' into strftime_timezone
mroeschke May 18, 2018
d99ef5a
Clean up and flake8 fix
mroeschke May 18, 2018
0e5e3c6
Merge remote-tracking branch 'upstream/master' into strftime_timezone
mroeschke May 22, 2018
9a2ea19
Add additional unbalanced colon test
mroeschke May 22, 2018
924859e
Merge remote-tracking branch 'upstream/master' into strftime_timezone
mroeschke May 23, 2018
a1599a0
allow parsing of any pytz
mroeschke May 23, 2018
6c80c2e
Merge remote-tracking branch 'upstream/master' into strftime_timezone
mroeschke May 23, 2018
abccc3e
move error handling
mroeschke May 23, 2018
473a0f4
Lint
mroeschke May 23, 2018
ab0a692
Small cleanup
mroeschke May 24, 2018
56fc683
Lint
mroeschke May 24, 2018
85bd45e
Merge remote-tracking branch 'upstream/master' into strftime_timezone
mroeschke May 25, 2018
eb2a661
Add additional test and move whatsnew to v0.24
mroeschke May 25, 2018
5500ca8
Merge remote-tracking branch 'upstream/master' into strftime_timezone
mroeschke May 25, 2018
0e0d0fd
Merge remote-tracking branch 'upstream/master' into strftime_timezone
mroeschke May 26, 2018
34f638c
Merge remote-tracking branch 'upstream/master' into strftime_timezone
mroeschke May 28, 2018
757458d
Add comment that FixedOffset(0) is UTC
mroeschke May 28, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ New features

Other Enhancements
^^^^^^^^^^^^^^^^^^
-
- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directives when passed into ``format`` (:issue:`13486`)
-
-

Expand Down
133 changes: 86 additions & 47 deletions pandas/_libs/tslibs/strptime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ except:
except:
from _dummy_thread import allocate_lock as _thread_allocate_lock

import pytz

from cython cimport Py_ssize_t
from cpython cimport PyFloat_Check
Expand All @@ -40,6 +41,27 @@ from util cimport is_string_object
from nattype cimport checknull_with_nat, NPY_NAT
from nattype import nat_strings

# Module-level table assigning an integer code to each supported strptime
# directive letter. array_strptime compares its ``parse_code`` against these
# values (e.g. 17 is the '%Z' branch and 19 is the '%z' branch).
# NOTE(review): the lookup that turns a matched directive into ``parse_code``
# is outside the visible hunk -- presumably keyed by the regex group name;
# confirm before renumbering any entry.
cdef dict _parse_code_table = {'y': 0,
'Y': 1,
'm': 2,
'B': 3,
'b': 4,
'd': 5,
'H': 6,
'I': 7,
'M': 8,
'S': 9,
'f': 10,
'A': 11,
'a': 12,
'w': 13,
'j': 14,
'U': 15,
'W': 16,
'Z': 17,
'p': 18, # an additional key, only with I
'z': 19}


def array_strptime(ndarray[object] values, object fmt,
bint exact=True, errors='raise'):
Expand All @@ -58,15 +80,15 @@ def array_strptime(ndarray[object] values, object fmt,
Py_ssize_t i, n = len(values)
pandas_datetimestruct dts
ndarray[int64_t] iresult
int year, month, day, minute, hour, second, weekday, julian, tz
int week_of_year, week_of_year_start
ndarray[object] result_timezone
int year, month, day, minute, hour, second, weekday, julian
int week_of_year, week_of_year_start, parse_code, ordinal
int64_t us, ns
object val, group_key, ampm, found
object val, group_key, ampm, found, timezone
dict found_key
bint is_raise = errors=='raise'
bint is_ignore = errors=='ignore'
bint is_coerce = errors=='coerce'
int ordinal

assert is_raise or is_ignore or is_coerce

Expand All @@ -79,6 +101,8 @@ def array_strptime(ndarray[object] values, object fmt,
in fmt):
raise ValueError("Cannot use '%W' or '%U' without "
"day and year")
elif '%Z' in fmt and '%z' in fmt:
raise ValueError("Cannot parse both %Z and %z")

global _TimeRE_cache, _regex_cache
with _cache_lock:
Expand Down Expand Up @@ -108,32 +132,10 @@ def array_strptime(ndarray[object] values, object fmt,

result = np.empty(n, dtype='M8[ns]')
iresult = result.view('i8')
result_timezone = np.empty(n, dtype='object')

dts.us = dts.ps = dts.as = 0

cdef dict _parse_code_table = {
'y': 0,
'Y': 1,
'm': 2,
'B': 3,
'b': 4,
'd': 5,
'H': 6,
'I': 7,
'M': 8,
'S': 9,
'f': 10,
'A': 11,
'a': 12,
'w': 13,
'j': 14,
'U': 15,
'W': 16,
'Z': 17,
'p': 18 # just an additional key, works only with I
}
cdef int parse_code

for i in range(n):
val = values[i]
if is_string_object(val):
Expand Down Expand Up @@ -176,7 +178,7 @@ def array_strptime(ndarray[object] values, object fmt,
year = 1900
month = day = 1
hour = minute = second = ns = us = 0
tz = -1
timezone = None
# Default to -1 to signify that values not known; not critical to have,
# though
week_of_year = -1
Expand Down Expand Up @@ -266,21 +268,10 @@ def array_strptime(ndarray[object] values, object fmt,
# W starts week on Monday.
week_of_year_start = 0
elif parse_code == 17:
# Since -1 is default value only need to worry about setting tz
# if it can be something other than -1.
found_zone = found_dict['Z'].lower()
for value, tz_values in enumerate(locale_time.timezone):
if found_zone in tz_values:
# Deal w/ bad locale setup where timezone names are the
# same and yet time.daylight is true; too ambiguous to
# be able to tell what timezone has daylight savings
if (time.tzname[0] == time.tzname[1] and
time.daylight and found_zone not in (
"utc", "gmt")):
break
else:
tz = value
break
timezone = pytz.timezone(found_dict['Z'])
elif parse_code == 19:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you move this whole parse to a function and just call it here (and return the values as a tuple)?

timezone = parse_timezone_directive(found_dict['z'])

# If we know the wk of the year and what day of that wk, we can figure
# out the Julian day of the year.
if julian == -1 and week_of_year != -1 and weekday != -1:
Expand Down Expand Up @@ -330,7 +321,9 @@ def array_strptime(ndarray[object] values, object fmt,
continue
raise

return result
result_timezone[i] = timezone

return result, result_timezone


"""_getlang, LocaleTime, TimeRE, _calc_julian_from_U_or_W are vendored
Expand Down Expand Up @@ -538,14 +531,13 @@ class TimeRE(dict):
# XXX: Does 'Y' need to worry about having less or more than
# 4 digits?
'Y': r"(?P<Y>\d\d\d\d)",
'z': r"(?P<z>[+-]\d\d:?[0-5]\d(:?[0-5]\d(\.\d{1,6})?)?|Z)",
'A': self.__seqToRE(self.locale_time.f_weekday, 'A'),
'a': self.__seqToRE(self.locale_time.a_weekday, 'a'),
'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'),
'b': self.__seqToRE(self.locale_time.a_month[1:], 'b'),
'p': self.__seqToRE(self.locale_time.am_pm, 'p'),
'Z': self.__seqToRE([tz for tz_names in self.locale_time.timezone
for tz in tz_names],
'Z'),
'Z': self.__seqToRE(pytz.all_timezones, 'Z'),
'%': '%'})
base.__setitem__('W', base.__getitem__('U').replace('U', 'W'))
base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))
Expand Down Expand Up @@ -632,3 +624,50 @@ cdef _calc_julian_from_U_or_W(int year, int week_of_year,
else:
days_to_week = week_0_length + (7 * (week_of_year - 1))
return 1 + days_to_week + day_of_week

cdef parse_timezone_directive(object z):
    """
    Parse the '%z' directive and return a pytz.FixedOffset

    Parameters
    ----------
    z : string of the UTC offset
        Either the literal 'Z' or a signed offset such as '+0100',
        '+01:00', '+010000', '+01:00:00' or '+01:00:00.123456'.

    Returns
    -------
    pytz.FixedOffset
        Built from the whole number of minutes in the offset.

    Raises
    ------
    ValueError
        If hours and minutes are separated by ':' but a following
        seconds field is not.

    Notes
    -----
    This is essentially similar to the cpython implementation
    https://github.com/python/cpython/blob/master/Lib/_strptime.py#L457-L479
    """

    cdef:
        int hours, minutes, seconds, pad_number, microseconds
        int total_minutes
        object gmtoff_remainder, gmtoff_remainder_padding

    if z == 'Z':
        return pytz.FixedOffset(0)

    # Normalise '+HH:MM[:SS[.ffffff]]' to '+HHMM[SS[.ffffff]]' so the fixed
    # slices below apply to both the colon and the colon-free spellings.
    if z[3] == ':':
        z = z[:3] + z[4:]
        if len(z) > 5:
            if z[5] != ':':
                msg = "Inconsistent use of : in {0}"
                raise ValueError(msg.format(z))
            z = z[:5] + z[6:]

    hours = int(z[1:3])
    minutes = int(z[3:5])
    seconds = int(z[5:7] or 0)

    # Pad to always return microseconds.
    gmtoff_remainder = z[8:]
    pad_number = 6 - len(gmtoff_remainder)
    gmtoff_remainder_padding = "0" * pad_number
    microseconds = int(gmtoff_remainder + gmtoff_remainder_padding)

    # Use floor division explicitly: total_minutes is a C int, so the sum
    # must stay integral whether '/' compiles as C division or as Python 3
    # true division. Sub-minute components therefore contribute nothing.
    total_minutes = ((hours * 60) + minutes + (seconds // 60) +
                     (microseconds // 60000000))
    total_minutes = -total_minutes if z.startswith("-") else total_minutes
    return pytz.FixedOffset(total_minutes)
46 changes: 43 additions & 3 deletions pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from datetime import datetime, timedelta, time
import numpy as np
from collections import MutableMapping

import numpy as np

from pandas._libs import tslib
from pandas._libs.tslibs.strptime import array_strptime
from pandas._libs.tslibs import parsing, conversion
Expand All @@ -27,6 +28,7 @@
ABCDataFrame)
from pandas.core.dtypes.missing import notna
from pandas.core import algorithms
from pandas.compat import zip


def _guess_datetime_format_for_array(arr, **kwargs):
Expand Down Expand Up @@ -103,6 +105,41 @@ def _convert_and_box_cache(arg, cache_array, box, errors, name=None):
return result.values


def _return_parsed_timezone_results(result, timezones, box, tz):
"""
Return results from array_strptime if a %z or %Z directive was passed.

Parameters
----------
result : ndarray
int64 date representations of the dates
timezones : ndarray
pytz timezone objects
box : boolean
True boxes result as an Index-like, False returns an ndarray
tz : object
None or pytz timezone object
Returns
-------
tz_result : ndarray of parsed dates with timezone
Returns:

- Index-like if box=True
- ndarray of Timestamps if box=False

"""
if tz is not None:
raise ValueError("Cannot pass a tz argument when "
"parsing strings with timezone "
"information.")
tz_results = np.array([tslib.Timestamp(res).tz_localize(zone) for res, zone
in zip(result, timezones)])
if box:
from pandas import Index
return Index(tz_results)
return tz_results


def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
utc=None, box=True, format=None, exact=True,
unit=None, infer_datetime_format=False, origin='unix',
Expand Down Expand Up @@ -343,8 +380,11 @@ def _convert_listlike(arg, box, format, name=None, tz=tz):
# fallback
if result is None:
try:
result = array_strptime(arg, format, exact=exact,
errors=errors)
result, timezones = array_strptime(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would much rather do the error handling in the _return_parsed_timezone_results. This block is just very complicated and hard to grok

arg, format, exact=exact, errors=errors)
if '%Z' in format or '%z' in format:
return _return_parsed_timezone_results(
result, timezones, box, tz)
except tslib.OutOfBoundsDatetime:
if errors == 'raise':
raise
Expand Down
53 changes: 53 additions & 0 deletions pandas/tests/indexes/datetimes/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,59 @@ def test_to_datetime_format_weeks(self, cache):
for s, format, dt in data:
assert to_datetime(s, format=format, cache=cache) == dt

@pytest.mark.parametrize("box,const,assert_equal", [
[True, pd.Index, 'assert_index_equal'],
[False, np.array, 'assert_numpy_array_equal']])
@pytest.mark.parametrize("fmt,dates,expected_dates", [
['%Y-%m-%d %H:%M:%S %Z',
['2010-01-01 12:00:00 UTC'] * 2,
[pd.Timestamp('2010-01-01 12:00:00', tz='UTC')] * 2],
['%Y-%m-%d %H:%M:%S %Z',
['2010-01-01 12:00:00 UTC',
'2010-01-01 12:00:00 GMT',
'2010-01-01 12:00:00 US/Pacific'],
[pd.Timestamp('2010-01-01 12:00:00', tz='UTC'),
pd.Timestamp('2010-01-01 12:00:00', tz='GMT'),
pd.Timestamp('2010-01-01 12:00:00', tz='US/Pacific')]],
['%Y-%m-%d %H:%M:%S%z',
['2010-01-01 12:00:00+0100'] * 2,
[pd.Timestamp('2010-01-01 12:00:00',
tzinfo=pytz.FixedOffset(60))] * 2],
['%Y-%m-%d %H:%M:%S %z',
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you do one of them, e.g. this one, without the space before the tz?

['2010-01-01 12:00:00 +0100'] * 2,
[pd.Timestamp('2010-01-01 12:00:00',
tzinfo=pytz.FixedOffset(60))] * 2],
['%Y-%m-%d %H:%M:%S %z',
['2010-01-01 12:00:00 +0100', '2010-01-01 12:00:00 -0100'],
[pd.Timestamp('2010-01-01 12:00:00',
tzinfo=pytz.FixedOffset(60)),
pd.Timestamp('2010-01-01 12:00:00',
tzinfo=pytz.FixedOffset(-60))]],
['%Y-%m-%d %H:%M:%S %z',
['2010-01-01 12:00:00 Z', '2010-01-01 12:00:00 Z'],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this also work with %Z?
It seems that with datetime.datetime.strptime it does not work with either directive.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK (that's probably a newer addition to python), then it makes sense to follow upstream python to be consistent

[pd.Timestamp('2010-01-01 12:00:00',
tzinfo=pytz.FixedOffset(0)),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be UTC or a fixed offset of 0 ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pytz coerces a fixed offset of 0 to UTC

In [2]: pytz.FixedOffset(0)
Out[2]: <UTC>

But making it explicit here that %z should return pytz.FixedOffset(0)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So the actual DatetimeIndex you get here has UTC timezone? OK, that's good! (but maybe add a small comment since I would not expect that)

pd.Timestamp('2010-01-01 12:00:00',
tzinfo=pytz.FixedOffset(0))]]])
def test_to_datetime_parse_tzname_or_tzoffset(self, box, const,
                                              assert_equal, fmt,
                                              dates, expected_dates):
    """%Z / %z strings parse to the expected values; utc=True raises."""
    # GH 13486
    parsed = pd.to_datetime(dates, format=fmt, box=box)
    wanted = const(expected_dates)
    compare = getattr(tm, assert_equal)
    compare(parsed, wanted)

    with pytest.raises(ValueError):
        pd.to_datetime(dates, format=fmt, box=box, utc=True)

@pytest.mark.parametrize('offset', [
    '+0', '-1foo', 'UTCbar', ':10', '+01:000:01', ''])
def test_to_datetime_parse_timezone_malformed(self, offset):
    """A malformed value in the %z position makes to_datetime raise."""
    malformed = '2010-01-01 12:00:00 ' + offset
    with pytest.raises(ValueError):
        pd.to_datetime([malformed], format='%Y-%m-%d %H:%M:%S %z')


class TestToDatetime(object):
def test_to_datetime_pydatetime(self):
Expand Down