Skip to content

Commit

Permalink
ENH: Parse %z and %Z directive in format for to_datetime (#19979)
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke authored and jreback committed May 29, 2018
1 parent 36c1f6b commit 7b1f9bf
Show file tree
Hide file tree
Showing 4 changed files with 183 additions and 51 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ New features

Other Enhancements
^^^^^^^^^^^^^^^^^^
-
- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directives when passed into ``format`` (:issue:`13486`)
-
-

Expand Down
133 changes: 86 additions & 47 deletions pandas/_libs/tslibs/strptime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ except:
except:
from _dummy_thread import allocate_lock as _thread_allocate_lock

import pytz

from cython cimport Py_ssize_t
from cpython cimport PyFloat_Check
Expand All @@ -40,6 +41,27 @@ from util cimport is_string_object
from nattype cimport checknull_with_nat, NPY_NAT
from nattype import nat_strings

# Integer codes for the strptime directive letters. array_strptime compares
# its ``parse_code`` against these values to decide how a matched directive
# is handled (e.g. 17 for %Z and 19 for %z).
cdef dict _parse_code_table = {'y': 0,
'Y': 1,
'm': 2,
'B': 3,
'b': 4,
'd': 5,
'H': 6,
'I': 7,
'M': 8,
'S': 9,
'f': 10,
'A': 11,
'a': 12,
'w': 13,
'j': 14,
'U': 15,
'W': 16,
'Z': 17,  # timezone name, resolved with pytz.timezone in array_strptime
'p': 18, # an additional key, only with I
'z': 19}  # UTC offset, handed to parse_timezone_directive


def array_strptime(ndarray[object] values, object fmt,
bint exact=True, errors='raise'):
Expand All @@ -58,15 +80,15 @@ def array_strptime(ndarray[object] values, object fmt,
Py_ssize_t i, n = len(values)
pandas_datetimestruct dts
ndarray[int64_t] iresult
int year, month, day, minute, hour, second, weekday, julian, tz
int week_of_year, week_of_year_start
ndarray[object] result_timezone
int year, month, day, minute, hour, second, weekday, julian
int week_of_year, week_of_year_start, parse_code, ordinal
int64_t us, ns
object val, group_key, ampm, found
object val, group_key, ampm, found, timezone
dict found_key
bint is_raise = errors=='raise'
bint is_ignore = errors=='ignore'
bint is_coerce = errors=='coerce'
int ordinal

assert is_raise or is_ignore or is_coerce

Expand All @@ -79,6 +101,8 @@ def array_strptime(ndarray[object] values, object fmt,
in fmt):
raise ValueError("Cannot use '%W' or '%U' without "
"day and year")
elif '%Z' in fmt and '%z' in fmt:
raise ValueError("Cannot parse both %Z and %z")

global _TimeRE_cache, _regex_cache
with _cache_lock:
Expand Down Expand Up @@ -108,32 +132,10 @@ def array_strptime(ndarray[object] values, object fmt,

result = np.empty(n, dtype='M8[ns]')
iresult = result.view('i8')
result_timezone = np.empty(n, dtype='object')

dts.us = dts.ps = dts.as = 0

cdef dict _parse_code_table = {
'y': 0,
'Y': 1,
'm': 2,
'B': 3,
'b': 4,
'd': 5,
'H': 6,
'I': 7,
'M': 8,
'S': 9,
'f': 10,
'A': 11,
'a': 12,
'w': 13,
'j': 14,
'U': 15,
'W': 16,
'Z': 17,
'p': 18 # just an additional key, works only with I
}
cdef int parse_code

for i in range(n):
val = values[i]
if is_string_object(val):
Expand Down Expand Up @@ -176,7 +178,7 @@ def array_strptime(ndarray[object] values, object fmt,
year = 1900
month = day = 1
hour = minute = second = ns = us = 0
tz = -1
timezone = None
# Default to -1 to signify that values not known; not critical to have,
# though
week_of_year = -1
Expand Down Expand Up @@ -266,21 +268,10 @@ def array_strptime(ndarray[object] values, object fmt,
# W starts week on Monday.
week_of_year_start = 0
elif parse_code == 17:
# Since -1 is default value only need to worry about setting tz
# if it can be something other than -1.
found_zone = found_dict['Z'].lower()
for value, tz_values in enumerate(locale_time.timezone):
if found_zone in tz_values:
# Deal w/ bad locale setup where timezone names are the
# same and yet time.daylight is true; too ambiguous to
# be able to tell what timezone has daylight savings
if (time.tzname[0] == time.tzname[1] and
time.daylight and found_zone not in (
"utc", "gmt")):
break
else:
tz = value
break
timezone = pytz.timezone(found_dict['Z'])
elif parse_code == 19:
timezone = parse_timezone_directive(found_dict['z'])

# If we know the wk of the year and what day of that wk, we can figure
# out the Julian day of the year.
if julian == -1 and week_of_year != -1 and weekday != -1:
Expand Down Expand Up @@ -330,7 +321,9 @@ def array_strptime(ndarray[object] values, object fmt,
continue
raise

return result
result_timezone[i] = timezone

return result, result_timezone


"""_getlang, LocaleTime, TimeRE, _calc_julian_from_U_or_W are vendored
Expand Down Expand Up @@ -538,14 +531,13 @@ class TimeRE(dict):
# XXX: Does 'Y' need to worry about having less or more than
# 4 digits?
'Y': r"(?P<Y>\d\d\d\d)",
'z': r"(?P<z>[+-]\d\d:?[0-5]\d(:?[0-5]\d(\.\d{1,6})?)?|Z)",
'A': self.__seqToRE(self.locale_time.f_weekday, 'A'),
'a': self.__seqToRE(self.locale_time.a_weekday, 'a'),
'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'),
'b': self.__seqToRE(self.locale_time.a_month[1:], 'b'),
'p': self.__seqToRE(self.locale_time.am_pm, 'p'),
'Z': self.__seqToRE([tz for tz_names in self.locale_time.timezone
for tz in tz_names],
'Z'),
'Z': self.__seqToRE(pytz.all_timezones, 'Z'),
'%': '%'})
base.__setitem__('W', base.__getitem__('U').replace('U', 'W'))
base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))
Expand Down Expand Up @@ -632,3 +624,50 @@ cdef _calc_julian_from_U_or_W(int year, int week_of_year,
else:
days_to_week = week_0_length + (7 * (week_of_year - 1))
return 1 + days_to_week + day_of_week

cdef parse_timezone_directive(object z):
    """
    Parse the '%z' directive and return a pytz.FixedOffset

    Parameters
    ----------
    z : string of the UTC offset
        Either the literal 'Z' (zero offset) or a signed offset such as
        '+0100', '-01:00' or '+01:00:30.5'.

    Returns
    -------
    pytz.FixedOffset

    Raises
    ------
    ValueError
        If a colon separates hours from minutes but not minutes from
        seconds, or if a numeric field cannot be converted to int.

    Notes
    -----
    This is essentially similar to the cpython implementation
    https://github.com/python/cpython/blob/master/Lib/_strptime.py#L457-L479

    The offset is handed to pytz.FixedOffset as a whole number of minutes,
    so seconds and fractional seconds are parsed but floored away.
    """

    cdef:
        int hours, minutes, seconds, pad_number, microseconds
        int total_minutes
        object gmtoff_remainder, gmtoff_remainder_padding, original

    if z == 'Z':
        return pytz.FixedOffset(0)

    # Keep the untouched input so the error message shows what was passed,
    # not the partially colon-stripped working copy.
    original = z
    if z[3] == ':':
        z = z[:3] + z[4:]
        if len(z) > 5:
            if z[5] != ':':
                msg = "Inconsistent use of : in {0}"
                raise ValueError(msg.format(original))
            z = z[:5] + z[6:]
    hours = int(z[1:3])
    minutes = int(z[3:5])
    seconds = int(z[5:7] or 0)

    # Pad to always return microseconds.
    gmtoff_remainder = z[8:]
    pad_number = 6 - len(gmtoff_remainder)
    gmtoff_remainder_padding = "0" * pad_number
    microseconds = int(gmtoff_remainder + gmtoff_remainder_padding)

    # Explicit floor division: the result must be an integer number of
    # minutes regardless of how the compiler interprets ``/`` on ints.
    total_minutes = ((hours * 60) + minutes + (seconds // 60) +
                     (microseconds // 60000000))
    total_minutes = -total_minutes if z.startswith("-") else total_minutes
    return pytz.FixedOffset(total_minutes)
46 changes: 43 additions & 3 deletions pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from datetime import datetime, timedelta, time
import numpy as np
from collections import MutableMapping

import numpy as np

from pandas._libs import tslib
from pandas._libs.tslibs.strptime import array_strptime
from pandas._libs.tslibs import parsing, conversion
Expand All @@ -27,6 +28,7 @@
ABCDataFrame)
from pandas.core.dtypes.missing import notna
from pandas.core import algorithms
from pandas.compat import zip


def _guess_datetime_format_for_array(arr, **kwargs):
Expand Down Expand Up @@ -103,6 +105,41 @@ def _convert_and_box_cache(arg, cache_array, box, errors, name=None):
return result.values


def _return_parsed_timezone_results(result, timezones, box, tz):
"""
Return results from array_strptime if a %z or %Z directive was passed.
Parameters
----------
result : ndarray
int64 date representations of the dates
timezones : ndarray
pytz timezone objects
box : boolean
True boxes result as an Index-like, False returns an ndarray
tz : object
None or pytz timezone object
Returns
-------
tz_result : ndarray of parsed dates with timezone
Returns:
- Index-like if box=True
- ndarray of Timestamps if box=False
"""
if tz is not None:
raise ValueError("Cannot pass a tz argument when "
"parsing strings with timezone "
"information.")
tz_results = np.array([tslib.Timestamp(res).tz_localize(zone) for res, zone
in zip(result, timezones)])
if box:
from pandas import Index
return Index(tz_results)
return tz_results


def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
utc=None, box=True, format=None, exact=True,
unit=None, infer_datetime_format=False, origin='unix',
Expand Down Expand Up @@ -343,8 +380,11 @@ def _convert_listlike(arg, box, format, name=None, tz=tz):
# fallback
if result is None:
try:
result = array_strptime(arg, format, exact=exact,
errors=errors)
result, timezones = array_strptime(
arg, format, exact=exact, errors=errors)
if '%Z' in format or '%z' in format:
return _return_parsed_timezone_results(
result, timezones, box, tz)
except tslib.OutOfBoundsDatetime:
if errors == 'raise':
raise
Expand Down
53 changes: 53 additions & 0 deletions pandas/tests/indexes/datetimes/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,59 @@ def test_to_datetime_format_weeks(self, cache):
for s, format, dt in data:
assert to_datetime(s, format=format, cache=cache) == dt

@pytest.mark.parametrize("box,const,assert_equal", [
    (True, pd.Index, 'assert_index_equal'),
    (False, np.array, 'assert_numpy_array_equal')])
@pytest.mark.parametrize("fmt,dates,expected_dates", [
    # %Z with a single repeated timezone name
    ('%Y-%m-%d %H:%M:%S %Z',
     ['2010-01-01 12:00:00 UTC'] * 2,
     [pd.Timestamp('2010-01-01 12:00:00', tz='UTC')] * 2),
    # %Z with different timezone names in the same input
    ('%Y-%m-%d %H:%M:%S %Z',
     ['2010-01-01 12:00:00 UTC',
      '2010-01-01 12:00:00 GMT',
      '2010-01-01 12:00:00 US/Pacific'],
     [pd.Timestamp('2010-01-01 12:00:00', tz='UTC'),
      pd.Timestamp('2010-01-01 12:00:00', tz='GMT'),
      pd.Timestamp('2010-01-01 12:00:00', tz='US/Pacific')]),
    # %z directly after the seconds
    ('%Y-%m-%d %H:%M:%S%z',
     ['2010-01-01 12:00:00+0100'] * 2,
     [pd.Timestamp('2010-01-01 12:00:00',
                   tzinfo=pytz.FixedOffset(60))] * 2),
    # %z separated from the seconds by a space
    ('%Y-%m-%d %H:%M:%S %z',
     ['2010-01-01 12:00:00 +0100'] * 2,
     [pd.Timestamp('2010-01-01 12:00:00',
                   tzinfo=pytz.FixedOffset(60))] * 2),
    # %z with a positive and a negative offset in the same input
    ('%Y-%m-%d %H:%M:%S %z',
     ['2010-01-01 12:00:00 +0100', '2010-01-01 12:00:00 -0100'],
     [pd.Timestamp('2010-01-01 12:00:00',
                   tzinfo=pytz.FixedOffset(60)),
      pd.Timestamp('2010-01-01 12:00:00',
                   tzinfo=pytz.FixedOffset(-60))]),
    # %z given the literal 'Z'
    ('%Y-%m-%d %H:%M:%S %z',
     ['2010-01-01 12:00:00 Z', '2010-01-01 12:00:00 Z'],
     [pd.Timestamp('2010-01-01 12:00:00',
                   tzinfo=pytz.FixedOffset(0)),  # pytz coerces to UTC
      pd.Timestamp('2010-01-01 12:00:00',
                   tzinfo=pytz.FixedOffset(0))])])
def test_to_datetime_parse_tzname_or_tzoffset(self, box, const,
                                              assert_equal, fmt,
                                              dates, expected_dates):
    """Check that %Z and %z formats yield the expected tz-aware values."""
    # GH 13486
    expected = const(expected_dates)
    result = pd.to_datetime(dates, format=fmt, box=box)
    compare = getattr(tm, assert_equal)
    compare(result, expected)

    # Passing utc=True together with a %z/%Z format is expected to raise.
    with pytest.raises(ValueError):
        pd.to_datetime(dates, format=fmt, box=box, utc=True)

@pytest.mark.parametrize('offset', [
    '+0', '-1foo', 'UTCbar', ':10', '+01:000:01', ''])
def test_to_datetime_parse_timezone_malformed(self, offset):
    """Check that a malformed offset under %z raises ValueError."""
    malformed = '2010-01-01 12:00:00 ' + offset
    fmt = '%Y-%m-%d %H:%M:%S %z'
    with pytest.raises(ValueError):
        pd.to_datetime([malformed], format=fmt)


class TestToDatetime(object):
def test_to_datetime_pydatetime(self):
Expand Down

0 comments on commit 7b1f9bf

Please sign in to comment.