diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 43e513c9d03f5..079766f0bc635 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -12,7 +12,7 @@ New features Other Enhancements ^^^^^^^^^^^^^^^^^^ -- +- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`) - - diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index e7dabb94f8975..77ce8e4ed4127 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -20,6 +20,7 @@ except: except: from _dummy_thread import allocate_lock as _thread_allocate_lock +import pytz from cython cimport Py_ssize_t from cpython cimport PyFloat_Check @@ -40,6 +41,27 @@ from util cimport is_string_object from nattype cimport checknull_with_nat, NPY_NAT from nattype import nat_strings +cdef dict _parse_code_table = {'y': 0, + 'Y': 1, + 'm': 2, + 'B': 3, + 'b': 4, + 'd': 5, + 'H': 6, + 'I': 7, + 'M': 8, + 'S': 9, + 'f': 10, + 'A': 11, + 'a': 12, + 'w': 13, + 'j': 14, + 'U': 15, + 'W': 16, + 'Z': 17, + 'p': 18, # an additional key, only with I + 'z': 19} + def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors='raise'): @@ -58,15 +80,15 @@ def array_strptime(ndarray[object] values, object fmt, Py_ssize_t i, n = len(values) pandas_datetimestruct dts ndarray[int64_t] iresult - int year, month, day, minute, hour, second, weekday, julian, tz - int week_of_year, week_of_year_start + ndarray[object] result_timezone + int year, month, day, minute, hour, second, weekday, julian + int week_of_year, week_of_year_start, parse_code, ordinal int64_t us, ns - object val, group_key, ampm, found + object val, group_key, ampm, found, timezone dict found_key bint is_raise = errors=='raise' bint is_ignore = errors=='ignore' bint is_coerce = errors=='coerce' - int ordinal assert is_raise or is_ignore or is_coerce @@ -79,6 +101,8 @@ def 
array_strptime(ndarray[object] values, object fmt, in fmt): raise ValueError("Cannot use '%W' or '%U' without " "day and year") + elif '%Z' in fmt and '%z' in fmt: + raise ValueError("Cannot parse both %Z and %z") global _TimeRE_cache, _regex_cache with _cache_lock: @@ -108,32 +132,10 @@ def array_strptime(ndarray[object] values, object fmt, result = np.empty(n, dtype='M8[ns]') iresult = result.view('i8') + result_timezone = np.empty(n, dtype='object') dts.us = dts.ps = dts.as = 0 - cdef dict _parse_code_table = { - 'y': 0, - 'Y': 1, - 'm': 2, - 'B': 3, - 'b': 4, - 'd': 5, - 'H': 6, - 'I': 7, - 'M': 8, - 'S': 9, - 'f': 10, - 'A': 11, - 'a': 12, - 'w': 13, - 'j': 14, - 'U': 15, - 'W': 16, - 'Z': 17, - 'p': 18 # just an additional key, works only with I - } - cdef int parse_code - for i in range(n): val = values[i] if is_string_object(val): @@ -176,7 +178,7 @@ def array_strptime(ndarray[object] values, object fmt, year = 1900 month = day = 1 hour = minute = second = ns = us = 0 - tz = -1 + timezone = None # Default to -1 to signify that values not known; not critical to have, # though week_of_year = -1 @@ -266,21 +268,10 @@ def array_strptime(ndarray[object] values, object fmt, # W starts week on Monday. week_of_year_start = 0 elif parse_code == 17: - # Since -1 is default value only need to worry about setting tz - # if it can be something other than -1. 
- found_zone = found_dict['Z'].lower() - for value, tz_values in enumerate(locale_time.timezone): - if found_zone in tz_values: - # Deal w/ bad locale setup where timezone names are the - # same and yet time.daylight is true; too ambiguous to - # be able to tell what timezone has daylight savings - if (time.tzname[0] == time.tzname[1] and - time.daylight and found_zone not in ( - "utc", "gmt")): - break - else: - tz = value - break + timezone = pytz.timezone(found_dict['Z']) + elif parse_code == 19: + timezone = parse_timezone_directive(found_dict['z']) + # If we know the wk of the year and what day of that wk, we can figure # out the Julian day of the year. if julian == -1 and week_of_year != -1 and weekday != -1: @@ -330,7 +321,9 @@ def array_strptime(ndarray[object] values, object fmt, continue raise - return result + result_timezone[i] = timezone + + return result, result_timezone """_getlang, LocaleTime, TimeRE, _calc_julian_from_U_or_W are vendored @@ -538,14 +531,13 @@ class TimeRE(dict): # XXX: Does 'Y' need to worry about having less or more than # 4 digits? 
cdef parse_timezone_directive(object z):
    """
    Parse the '%z' directive and return a pytz.FixedOffset

    Parameters
    ----------
    z : string of the UTC offset
        Either the literal 'Z', or a sign character followed by two hour
        digits, two minute digits and, optionally, two second digits and
        a fraction starting after one separator character.  A ':' may
        follow the hours; if it does and seconds are present, a ':' must
        follow the minutes as well.

    Returns
    -------
    pytz.FixedOffset
        Built from the total number of minutes, negated when ``z``
        starts with '-'.  Seconds and the fraction are floor-divided into
        minutes, so a sub-minute component does not change the result.

    Raises
    ------
    ValueError
        If ``z`` has fewer than four characters (and is not 'Z'), if a
        ':' follows the hours but not the minutes, or if one of the
        numeric fields cannot be converted by ``int``.

    Notes
    -----
    This is essentially similar to the cpython implementation
    https://github.com/python/cpython/blob/master/Lib/_strptime.py#L457-L479
    """

    cdef:
        int hours, minutes, seconds, microseconds, total_minutes
        object offset

    if z == 'Z':
        return pytz.FixedOffset(0)
    if len(z) < 4:
        # Without this guard the z[3] lookup below fails with IndexError,
        # unlike every other malformed offset, which raises ValueError.
        raise ValueError("Malformed UTC offset {0}".format(z))

    # Work on a copy so that error messages can quote the text as given.
    offset = z
    if offset[3] == ':':
        offset = offset[:3] + offset[4:]
        if len(offset) > 5:
            if offset[5] != ':':
                msg = "Inconsistent use of : in {0}"
                raise ValueError(msg.format(z))
            offset = offset[:5] + offset[6:]
    hours = int(offset[1:3])
    minutes = int(offset[3:5])
    seconds = int(offset[5:7] or 0)

    # Right-pad the fraction with zeros so it is always read as
    # microseconds; a fraction longer than six digits is left untouched.
    microseconds = int(offset[8:].ljust(6, "0"))

    # Explicit floor division keeps the arithmetic integral regardless of
    # the division semantics the module is compiled with.
    total_minutes = ((hours * 60) + minutes + (seconds // 60) +
                     (microseconds // 60000000))
    if offset.startswith("-"):
        total_minutes = -total_minutes
    return pytz.FixedOffset(total_minutes)
+ + Parameters + ---------- + result : ndarray + int64 date representations of the dates + timezones : ndarray + pytz timezone objects + box : boolean + True boxes result as an Index-like, False returns an ndarray + tz : object + None or pytz timezone object + Returns + ------- + tz_result : ndarray of parsed dates with timezone + Returns: + + - Index-like if box=True + - ndarray of Timestamps if box=False + + """ + if tz is not None: + raise ValueError("Cannot pass a tz argument when " + "parsing strings with timezone " + "information.") + tz_results = np.array([tslib.Timestamp(res).tz_localize(zone) for res, zone + in zip(result, timezones)]) + if box: + from pandas import Index + return Index(tz_results) + return tz_results + + def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, box=True, format=None, exact=True, unit=None, infer_datetime_format=False, origin='unix', @@ -343,8 +380,11 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): # fallback if result is None: try: - result = array_strptime(arg, format, exact=exact, - errors=errors) + result, timezones = array_strptime( + arg, format, exact=exact, errors=errors) + if '%Z' in format or '%z' in format: + return _return_parsed_timezone_results( + result, timezones, box, tz) except tslib.OutOfBoundsDatetime: if errors == 'raise': raise diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 8b0514764b0c0..e09c1b3f19d1a 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -179,6 +179,59 @@ def test_to_datetime_format_weeks(self, cache): for s, format, dt in data: assert to_datetime(s, format=format, cache=cache) == dt + @pytest.mark.parametrize("box,const,assert_equal", [ + [True, pd.Index, 'assert_index_equal'], + [False, np.array, 'assert_numpy_array_equal']]) + @pytest.mark.parametrize("fmt,dates,expected_dates", [ + ['%Y-%m-%d %H:%M:%S %Z', + ['2010-01-01 
@pytest.mark.parametrize("box,const,assert_equal", [
    (True, pd.Index, 'assert_index_equal'),
    (False, np.array, 'assert_numpy_array_equal')])
@pytest.mark.parametrize("fmt,dates,expected_dates", [
    ('%Y-%m-%d %H:%M:%S %Z',
     ['2010-01-01 12:00:00 UTC'] * 2,
     [pd.Timestamp('2010-01-01 12:00:00', tz='UTC')] * 2),
    ('%Y-%m-%d %H:%M:%S %Z',
     ['2010-01-01 12:00:00 UTC',
      '2010-01-01 12:00:00 GMT',
      '2010-01-01 12:00:00 US/Pacific'],
     [pd.Timestamp('2010-01-01 12:00:00', tz='UTC'),
      pd.Timestamp('2010-01-01 12:00:00', tz='GMT'),
      pd.Timestamp('2010-01-01 12:00:00', tz='US/Pacific')]),
    ('%Y-%m-%d %H:%M:%S%z',
     ['2010-01-01 12:00:00+0100'] * 2,
     [pd.Timestamp('2010-01-01 12:00:00',
                   tzinfo=pytz.FixedOffset(60))] * 2),
    ('%Y-%m-%d %H:%M:%S %z',
     ['2010-01-01 12:00:00 +0100'] * 2,
     [pd.Timestamp('2010-01-01 12:00:00',
                   tzinfo=pytz.FixedOffset(60))] * 2),
    ('%Y-%m-%d %H:%M:%S %z',
     ['2010-01-01 12:00:00 +0100', '2010-01-01 12:00:00 -0100'],
     [pd.Timestamp('2010-01-01 12:00:00',
                   tzinfo=pytz.FixedOffset(60)),
      pd.Timestamp('2010-01-01 12:00:00',
                   tzinfo=pytz.FixedOffset(-60))]),
    ('%Y-%m-%d %H:%M:%S %z',
     ['2010-01-01 12:00:00 Z', '2010-01-01 12:00:00 Z'],
     [pd.Timestamp('2010-01-01 12:00:00',
                   tzinfo=pytz.FixedOffset(0)),  # pytz coerces to UTC
      pd.Timestamp('2010-01-01 12:00:00',
                   tzinfo=pytz.FixedOffset(0))])])
def test_to_datetime_parse_tzname_or_tzoffset(self, box, const,
                                              assert_equal, fmt,
                                              dates, expected_dates):
    """Formats with %Z or %z give tz-aware results; utc=True then raises."""
    # GH 13486
    parsed = pd.to_datetime(dates, format=fmt, box=box)
    wanted = const(expected_dates)
    # The comparison helper is looked up by name on the testing module.
    compare = getattr(tm, assert_equal)
    compare(parsed, wanted)

    with pytest.raises(ValueError):
        pd.to_datetime(dates, format=fmt, box=box, utc=True)


@pytest.mark.parametrize('offset', [
    '+0', '-1foo', 'UTCbar', ':10', '+01:000:01', ''])
def test_to_datetime_parse_timezone_malformed(self, offset):
    """A malformed offset after the time makes to_datetime raise."""
    fmt = '%Y-%m-%d %H:%M:%S %z'
    date = '2010-01-01 12:00:00 {0}'.format(offset)
    with pytest.raises(ValueError):
        pd.to_datetime([date], format=fmt)