Merge remote-tracking branch 'upstream/master' into 24986-nested-array

pandas-dev · Jan 29, 2019 · afb1bee · afb1bee
2 parents 86948a1 + 145ade2
commit afb1bee
Show file tree

Hide file tree

Showing 19 changed files with 316 additions and 134 deletions.
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -989,6 +989,36 @@ a single date rather than the entire array.
 
    os.remove('tmp.csv')
 
+
+.. _io.csv.mixed_timezones:
+
+Parsing a CSV with mixed timezones
+++++++++++++++++++++++++++++++++++
+
+Pandas cannot natively represent a column or index with mixed timezones. If your CSV
+file contains columns with a mixture of timezones, the default result will be
+an object-dtype column with strings, even with ``parse_dates``.
+
+
+.. ipython:: python
+
+   content = """\
+   a
+   2000-01-01T00:00:00+05:00
+   2000-01-01T00:00:00+06:00"""
+   df = pd.read_csv(StringIO(content), parse_dates=['a'])
+   df['a']
+
+To parse the mixed-timezone values as a datetime column, pass a partially-applied
+:func:`to_datetime` with ``utc=True`` as the ``date_parser``.
+
+.. ipython:: python
+
+   df = pd.read_csv(StringIO(content), parse_dates=['a'],
+                    date_parser=lambda col: pd.to_datetime(col, utc=True))
+   df['a']
+
+
 .. _io.dayfirst:
 
 

diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -648,6 +648,52 @@ that the dates have been converted to UTC
     pd.to_datetime(["2015-11-18 15:30:00+05:30",
                     "2015-11-18 16:30:00+06:30"], utc=True)
 
+
+.. _whatsnew_0240.api_breaking.read_csv_mixed_tz:
+
+Parsing mixed-timezones with :func:`read_csv`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`read_csv` no longer silently converts mixed-timezone columns to UTC (:issue:`24987`).
+
+*Previous Behavior*
+
+.. code-block:: python
+
+   >>> import io
+   >>> content = """\
+   ... a
+   ... 2000-01-01T00:00:00+05:00
+   ... 2000-01-01T00:00:00+06:00"""
+   >>> df = pd.read_csv(io.StringIO(content), parse_dates=['a'])
+   >>> df.a
+   0   1999-12-31 19:00:00
+   1   1999-12-31 18:00:00
+   Name: a, dtype: datetime64[ns]
+
+*New Behavior*
+
+.. ipython:: python
+
+   import io
+   content = """\
+   a
+   2000-01-01T00:00:00+05:00
+   2000-01-01T00:00:00+06:00"""
+   df = pd.read_csv(io.StringIO(content), parse_dates=['a'])
+   df.a
+
+As can be seen, the ``dtype`` is object; each value in the column is a string.
+To convert the strings to an array of datetimes, use the ``date_parser`` argument:
+
+.. ipython:: python
+
+   df = pd.read_csv(io.StringIO(content), parse_dates=['a'],
+                    date_parser=lambda col: pd.to_datetime(col, utc=True))
+   df.a
+
+See :ref:`whatsnew_0240.api.timezone_offset_parsing` for more.
+
 .. _whatsnew_0240.api_breaking.period_end_time:
 
 Time values in ``dt.end_time`` and ``to_timestamp(how='end')``

diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst
@@ -74,6 +74,11 @@ Bug Fixes
 
 - Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`)
 
+**Visualization**
+
+- Fixed the warning for implicitly registered matplotlib converters not showing. See :ref:`whatsnew_0211.converters` for more (:issue:`24963`).
+
+
 **Other**
 
 -

diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -2058,7 +2058,7 @@ def validate_tz_from_dtype(dtype, tz):
             #  tz-naive dtype (i.e. datetime64[ns])
             if tz is not None and not timezones.tz_compare(tz, dtz):
                 raise ValueError("cannot supply both a tz and a "
-                                 "timezone-naive dtype (i.e. datetime64[ns]")
+                                 "timezone-naive dtype (i.e. datetime64[ns])")
 
     return tz
 

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -203,9 +203,14 @@
     * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
       result 'foo'
 
-    If a column or index contains an unparseable date, the entire column or
-    index will be returned unaltered as an object data type. For non-standard
-    datetime parsing, use ``pd.to_datetime`` after ``pd.read_csv``
+    If a column or index cannot be represented as an array of datetimes,
+    say because of an unparseable value or a mixture of timezones, the column
+    or index will be returned unaltered as an object data type. For
+    non-standard datetime parsing, use ``pd.to_datetime`` after
+    ``pd.read_csv``. To parse an index or column with a mixture of timezones,
+    specify ``date_parser`` to be a partially-applied
+    :func:`pandas.to_datetime` with ``utc=True``. See
+    :ref:`io.csv.mixed_timezones` for more.
 
     Note: A fast-path exists for iso8601-formatted dates.
 infer_datetime_format : bool, default False

diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py
@@ -39,7 +39,7 @@
 else:
     _HAS_MPL = True
     if get_option('plotting.matplotlib.register_converters'):
-        _converter.register(explicit=True)
+        _converter.register(explicit=False)
 
 
 def _raise_if_no_mpl():

diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
@@ -30,7 +30,12 @@ def setup_indices(self):
 
     def test_pickle_compat_construction(self):
         # need an object to create with
-        pytest.raises(TypeError, self._holder)
+        msg = (r"Index\(\.\.\.\) must be called with a collection of some"
+               r" kind, None was passed|"
+               r"__new__\(\) missing 1 required positional argument: 'data'|"
+               r"__new__\(\) takes at least 2 arguments \(1 given\)")
+        with pytest.raises(TypeError, match=msg):
+            self._holder()
 
     def test_to_series(self):
         # assert that we are creating a copy of the index
@@ -84,8 +89,11 @@ def test_shift(self):
 
         # GH8083 test the base class for shift
         idx = self.create_index()
-        pytest.raises(NotImplementedError, idx.shift, 1)
-        pytest.raises(NotImplementedError, idx.shift, 1, 2)
+        msg = "Not supported for type {}".format(type(idx).__name__)
+        with pytest.raises(NotImplementedError, match=msg):
+            idx.shift(1)
+        with pytest.raises(NotImplementedError, match=msg):
+            idx.shift(1, 2)
 
     def test_create_index_existing_name(self):
 

diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py
@@ -135,8 +135,10 @@ def test_construction_with_alt_tz_localize(self, kwargs, tz_aware_fixture):
         tm.assert_index_equal(i2, expected)
 
         # incompat tz/dtype
-        pytest.raises(ValueError, lambda: DatetimeIndex(
-            i.tz_localize(None).asi8, dtype=i.dtype, tz='US/Pacific'))
+        msg = "cannot supply both a tz and a dtype with a tz"
+        with pytest.raises(ValueError, match=msg):
+            DatetimeIndex(i.tz_localize(None).asi8,
+                          dtype=i.dtype, tz='US/Pacific')
 
     def test_construction_index_with_mixed_timezones(self):
         # gh-11488: no tz results in DatetimeIndex
@@ -439,14 +441,19 @@ def test_constructor_coverage(self):
         tm.assert_index_equal(from_ints, expected)
 
         # non-conforming
-        pytest.raises(ValueError, DatetimeIndex,
-                      ['2000-01-01', '2000-01-02', '2000-01-04'], freq='D')
+        msg = ("Inferred frequency None from passed values does not conform"
+               " to passed frequency D")
+        with pytest.raises(ValueError, match=msg):
+            DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04'], freq='D')
 
-        pytest.raises(ValueError, date_range, start='2011-01-01',
-                      freq='b')
-        pytest.raises(ValueError, date_range, end='2011-01-01',
-                      freq='B')
-        pytest.raises(ValueError, date_range, periods=10, freq='D')
+        msg = ("Of the four parameters: start, end, periods, and freq, exactly"
+               " three must be specified")
+        with pytest.raises(ValueError, match=msg):
+            date_range(start='2011-01-01', freq='b')
+        with pytest.raises(ValueError, match=msg):
+            date_range(end='2011-01-01', freq='B')
+        with pytest.raises(ValueError, match=msg):
+            date_range(periods=10, freq='D')
 
     @pytest.mark.parametrize('freq', ['AS', 'W-SUN'])
     def test_constructor_datetime64_tzformat(self, freq):
@@ -511,18 +518,20 @@ def test_constructor_dtype(self):
         idx = DatetimeIndex(['2013-01-01', '2013-01-02'],
                             dtype='datetime64[ns, US/Eastern]')
 
-        pytest.raises(ValueError,
-                      lambda: DatetimeIndex(idx,
-                                            dtype='datetime64[ns]'))
+        msg = ("cannot supply both a tz and a timezone-naive dtype"
+               r" \(i\.e\. datetime64\[ns\]\)")
+        with pytest.raises(ValueError, match=msg):
+            DatetimeIndex(idx, dtype='datetime64[ns]')
 
         # this is effectively trying to convert tz's
-        pytest.raises(TypeError,
-                      lambda: DatetimeIndex(idx,
-                                            dtype='datetime64[ns, CET]'))
-        pytest.raises(ValueError,
-                      lambda: DatetimeIndex(
-                          idx, tz='CET',
-                          dtype='datetime64[ns, US/Eastern]'))
+        msg = ("data is already tz-aware US/Eastern, unable to set specified"
+               " tz: CET")
+        with pytest.raises(TypeError, match=msg):
+            DatetimeIndex(idx, dtype='datetime64[ns, CET]')
+        msg = "cannot supply both a tz and a dtype with a tz"
+        with pytest.raises(ValueError, match=msg):
+            DatetimeIndex(idx, tz='CET', dtype='datetime64[ns, US/Eastern]')
+
         result = DatetimeIndex(idx, dtype='datetime64[ns, US/Eastern]')
         tm.assert_index_equal(idx, result)
 
@@ -732,7 +741,9 @@ def test_from_freq_recreate_from_data(self, freq):
 
     def test_datetimeindex_constructor_misc(self):
         arr = ['1/1/2005', '1/2/2005', 'Jn 3, 2005', '2005-01-04']
-        pytest.raises(Exception, DatetimeIndex, arr)
+        msg = r"(\(u?')?Unknown string format(:', 'Jn 3, 2005'\))?"
+        with pytest.raises(ValueError, match=msg):
+            DatetimeIndex(arr)
 
         arr = ['1/1/2005', '1/2/2005', '1/3/2005', '2005-01-04']
         idx1 = DatetimeIndex(arr)

diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py
@@ -346,8 +346,10 @@ def test_compat_replace(self, f):
     def test_catch_infinite_loop(self):
         offset = offsets.DateOffset(minute=5)
         # blow up, don't loop forever
-        pytest.raises(Exception, date_range, datetime(2011, 11, 11),
-                      datetime(2011, 11, 12), freq=offset)
+        msg = "Offset <DateOffset: minute=5> did not increment date"
+        with pytest.raises(ValueError, match=msg):
+            date_range(datetime(2011, 11, 11), datetime(2011, 11, 12),
+                       freq=offset)
 
     @pytest.mark.parametrize('periods', (1, 2))
     def test_wom_len(self, periods):

diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py
@@ -190,7 +190,9 @@ def test_datetimeindex_accessors(self):
         # Ensure is_start/end accessors throw ValueError for CustomBusinessDay,
         bday_egypt = offsets.CustomBusinessDay(weekmask='Sun Mon Tue Wed Thu')
         dti = date_range(datetime(2013, 4, 30), periods=5, freq=bday_egypt)
-        pytest.raises(ValueError, lambda: dti.is_month_start)
+        msg = "Custom business days is not supported by is_month_start"
+        with pytest.raises(ValueError, match=msg):
+            dti.is_month_start
 
         dti = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'])
 

diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py
@@ -37,15 +37,19 @@ def test_ops_properties_basic(self):
 
         # sanity check that the behavior didn't change
         # GH#7206
+        msg = "'Series' object has no attribute '{}'"
         for op in ['year', 'day', 'second', 'weekday']:
-            pytest.raises(TypeError, lambda x: getattr(self.dt_series, op))
+            with pytest.raises(AttributeError, match=msg.format(op)):
+                getattr(self.dt_series, op)
 
         # attribute access should still work!
         s = Series(dict(year=2000, month=1, day=10))
         assert s.year == 2000
         assert s.month == 1
         assert s.day == 10
-        pytest.raises(AttributeError, lambda: s.weekday)
+        msg = "'Series' object has no attribute 'weekday'"
+        with pytest.raises(AttributeError, match=msg):
+            s.weekday
 
     def test_repeat_range(self, tz_naive_fixture):
         tz = tz_naive_fixture

diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py
@@ -170,7 +170,8 @@ def test_partial_slice(self):
         result = s['2005-1-1']
         assert result == s.iloc[0]
 
-        pytest.raises(Exception, s.__getitem__, '2004-12-31')
+        with pytest.raises(KeyError, match=r"^'2004-12-31'$"):
+            s['2004-12-31']
 
     def test_partial_slice_daily(self):
         rng = date_range(freq='H', start=datetime(2005, 1, 31), periods=500)
@@ -179,7 +180,8 @@ def test_partial_slice_daily(self):
         result = s['2005-1-31']
         tm.assert_series_equal(result, s.iloc[:24])
 
-        pytest.raises(Exception, s.__getitem__, '2004-12-31 00')
+        with pytest.raises(KeyError, match=r"^'2004-12-31 00'$"):
+            s['2004-12-31 00']
 
     def test_partial_slice_hourly(self):
         rng = date_range(freq='T', start=datetime(2005, 1, 1, 20, 0, 0),
@@ -193,7 +195,8 @@ def test_partial_slice_hourly(self):
         tm.assert_series_equal(result, s.iloc[:60])
 
         assert s['2005-1-1 20:00'] == s.iloc[0]
-        pytest.raises(Exception, s.__getitem__, '2004-12-31 00:15')
+        with pytest.raises(KeyError, match=r"^'2004-12-31 00:15'$"):
+            s['2004-12-31 00:15']
 
     def test_partial_slice_minutely(self):
         rng = date_range(freq='S', start=datetime(2005, 1, 1, 23, 59, 0),
@@ -207,7 +210,8 @@ def test_partial_slice_minutely(self):
         tm.assert_series_equal(result, s.iloc[:60])
 
         assert s[Timestamp('2005-1-1 23:59:00')] == s.iloc[0]
-        pytest.raises(Exception, s.__getitem__, '2004-12-31 00:00:00')
+        with pytest.raises(KeyError, match=r"^'2004-12-31 00:00:00'$"):
+            s['2004-12-31 00:00:00']
 
     def test_partial_slice_second_precision(self):
         rng = date_range(start=datetime(2005, 1, 1, 0, 0, 59,
@@ -255,7 +259,9 @@ def test_partial_slicing_dataframe(self):
                 result = df['a'][ts_string]
                 assert isinstance(result, np.int64)
                 assert result == expected
-                pytest.raises(KeyError, df.__getitem__, ts_string)
+                msg = r"^'{}'$".format(ts_string)
+                with pytest.raises(KeyError, match=msg):
+                    df[ts_string]
 
             # Timestamp with resolution less precise than index
             for fmt in formats[:rnum]:
@@ -282,15 +288,20 @@ def test_partial_slicing_dataframe(self):
                 result = df['a'][ts_string]
                 assert isinstance(result, np.int64)
                 assert result == 2
-                pytest.raises(KeyError, df.__getitem__, ts_string)
+                msg = r"^'{}'$".format(ts_string)
+                with pytest.raises(KeyError, match=msg):
+                    df[ts_string]
 
             # Not compatible with existing key
             # Should raise KeyError
             for fmt, res in list(zip(formats, resolutions))[rnum + 1:]:
                 ts = index[1] + Timedelta("1 " + res)
                 ts_string = ts.strftime(fmt)
-                pytest.raises(KeyError, df['a'].__getitem__, ts_string)
-                pytest.raises(KeyError, df.__getitem__, ts_string)
+                msg = r"^'{}'$".format(ts_string)
+                with pytest.raises(KeyError, match=msg):
+                    df['a'][ts_string]
+                with pytest.raises(KeyError, match=msg):
+                    df[ts_string]
 
     def test_partial_slicing_with_multiindex(self):
 
@@ -316,11 +327,10 @@ def test_partial_slicing_with_multiindex(self):
 
         # this is an IndexingError as we don't do partial string selection on
         # multi-levels.
-        def f():
+        msg = "Too many indexers"
+        with pytest.raises(IndexingError, match=msg):
             df_multi.loc[('2013-06-19', 'ACCT1', 'ABC')]
 
-        pytest.raises(IndexingError, f)
-
         # GH 4294
         # partial slice on a series mi
         s = pd.DataFrame(np.random.rand(1000, 1000), index=pd.date_range(