BUG: fix json segfaults
closes #11473
closes #10778
closes #11299

Author: Kieran O'Mahony <kieranom@gmail.com>

Closes #12802 from Komnomnomnom/json-seg-faults and squashes the following commits:

b14d0df [Kieran O'Mahony] CLN: rename json test inline with others
af006a4 [Kieran O'Mahony] BUG: fix json segfaults
Komnomnomnom authored and jreback committed Apr 26, 2016
1 parent c33eb36 commit 37a7e69
Showing 9 changed files with 284 additions and 42 deletions.
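Before the file diffs, a minimal repro sketch of the crashing cases this commit fixes (hedged: 0.18.1-era API, inputs adapted from the tests added below); prior to the fix each call could segfault instead of returning JSON:

    import datetime
    import pytz  # assumed installed; the new tests skip when it is missing
    import pandas as pd

    # GH10778: categorical (non-ndarray) blocks crashed DataFrame.to_json
    df = pd.DataFrame({"A": ["a", "b", "c", "a"]})
    df["B"] = df["A"].astype("category")
    print(df.to_json())

    # GH11473: timezone-aware datetime.time crashed the ujson encoder
    t = datetime.time(10, 12, 15, 343243, pytz.utc)
    print(pd.json.dumps(t))  # equals '"%s"' % t.isoformat()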
8 changes: 6 additions & 2 deletions doc/source/whatsnew/v0.18.1.txt
@@ -149,6 +149,7 @@ Other Enhancements
- ``pd.read_csv()`` now supports opening files using xz compression, via extension inference or explicit ``compression='xz'``; ``xz`` compression is also supported by ``DataFrame.to_csv`` in the same way (:issue:`11852`)
- ``pd.read_msgpack()`` now always gives writeable ndarrays even when compression is used (:issue:`12359`).
- ``pd.read_msgpack()`` now supports serializing and de-serializing categoricals with msgpack (:issue:`12573`)
- ``.to_json()`` now supports ``NDFrames`` that contain categorical and sparse data (:issue:`10778`)
- ``interpolate()`` now supports ``method='akima'`` (:issue:`7588`).
- ``pd.read_excel()`` now accepts path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path, in line with other ``read_*`` functions (:issue:`12655`)
- ``Index.take`` now handles ``allow_fill`` and ``fill_value`` consistently (:issue:`12631`)
@@ -398,8 +399,6 @@ Deprecations





.. _whatsnew_0181.performance:

Performance Improvements
@@ -443,6 +442,11 @@ Bug Fixes
- Bug in ``.resample(..).fillna(..)`` not correctly raising a ``ValueError`` when passing a non-string (:issue:`12952`)
- Bug fixes in various encoding and header processing issues in ``pd.read_sas()`` (:issue:`12659`, :issue:`12654`, :issue:`12647`, :issue:`12809`)
- Bug in ``pd.crosstab()`` which would silently ignore ``aggfunc`` if ``values=None`` (:issue:`12569`).
- Potential segfault in ``DataFrame.to_json`` when serialising ``datetime.time`` (:issue:`11473`).
- Potential segfault in ``DataFrame.to_json`` when attempting to serialise a 0d array (:issue:`11299`).
- Segfault in ``to_json`` when attempting to serialise a ``DataFrame`` or ``Series`` with non-ndarray values (:issue:`10778`).




- Bug in consistency of ``.name`` on ``.groupby(..).apply(..)`` cases (:issue:`12363`)
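Mirroring the sparse-data entry in the bug-fix list above (GH10778), a hedged sketch of the dense/sparse equivalence that ``test_sparse`` below pins down (``to_sparse`` is the 0.18.1-era API, deprecated in later pandas):

    import numpy as np
    import pandas as pd

    # sparse blocks are non-ndarray and used to segfault in to_json
    df = pd.DataFrame(np.random.randn(10, 4))
    df.iloc[:8] = np.nan
    sdf = df.to_sparse()
    assert df.to_json() == sdf.to_json()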
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -821,6 +821,99 @@ def my_handler_raises(obj):
DataFrame({'a': [1, 2, object()]}).to_json,
default_handler=my_handler_raises)

def test_categorical(self):
# GH4377 df.to_json segfaults with non-ndarray blocks
df = DataFrame({"A": ["a", "b", "c", "a", "b", "b", "a"]})
df["B"] = df["A"]
expected = df.to_json()

df["B"] = df["A"].astype('category')
self.assertEqual(expected, df.to_json())

s = df["A"]
sc = df["B"]
self.assertEqual(s.to_json(), sc.to_json())

def test_datetime_tz(self):
# GH4377 df.to_json segfaults with non-ndarray blocks
tz_range = pd.date_range('20130101', periods=3, tz='US/Eastern')
tz_naive = tz_range.tz_convert('utc').tz_localize(None)

df = DataFrame({
'A': tz_range,
'B': pd.date_range('20130101', periods=3)})

df_naive = df.copy()
df_naive['A'] = tz_naive
expected = df_naive.to_json()
self.assertEqual(expected, df.to_json())

stz = Series(tz_range)
s_naive = Series(tz_naive)
self.assertEqual(stz.to_json(), s_naive.to_json())

def test_sparse(self):
# GH4377 df.to_json segfaults with non-ndarray blocks
df = pd.DataFrame(np.random.randn(10, 4))
df.ix[:8] = np.nan

sdf = df.to_sparse()
expected = df.to_json()
self.assertEqual(expected, sdf.to_json())

s = pd.Series(np.random.randn(10))
s.ix[:8] = np.nan
ss = s.to_sparse()

expected = s.to_json()
self.assertEqual(expected, ss.to_json())

def test_tz_is_utc(self):
exp = '"2013-01-10T05:00:00.000Z"'

ts = Timestamp('2013-01-10 05:00:00Z')
self.assertEqual(exp, pd.json.dumps(ts, iso_dates=True))
dt = ts.to_datetime()
self.assertEqual(exp, pd.json.dumps(dt, iso_dates=True))

ts = Timestamp('2013-01-10 00:00:00', tz='US/Eastern')
self.assertEqual(exp, pd.json.dumps(ts, iso_dates=True))
dt = ts.to_datetime()
self.assertEqual(exp, pd.json.dumps(dt, iso_dates=True))

ts = Timestamp('2013-01-10 00:00:00-0500')
self.assertEqual(exp, pd.json.dumps(ts, iso_dates=True))
dt = ts.to_datetime()
self.assertEqual(exp, pd.json.dumps(dt, iso_dates=True))

def test_tz_range_is_utc(self):
exp = '["2013-01-01T05:00:00.000Z","2013-01-02T05:00:00.000Z"]'
dfexp = ('{"DT":{'
'"0":"2013-01-01T05:00:00.000Z",'
'"1":"2013-01-02T05:00:00.000Z"}}')

tz_range = pd.date_range('2013-01-01 05:00:00Z', periods=2)
self.assertEqual(exp, pd.json.dumps(tz_range, iso_dates=True))
dti = pd.DatetimeIndex(tz_range)
self.assertEqual(exp, pd.json.dumps(dti, iso_dates=True))
df = DataFrame({'DT': dti})
self.assertEqual(dfexp, pd.json.dumps(df, iso_dates=True))

tz_range = pd.date_range('2013-01-01 00:00:00', periods=2,
tz='US/Eastern')
self.assertEqual(exp, pd.json.dumps(tz_range, iso_dates=True))
dti = pd.DatetimeIndex(tz_range)
self.assertEqual(exp, pd.json.dumps(dti, iso_dates=True))
df = DataFrame({'DT': dti})
self.assertEqual(dfexp, pd.json.dumps(df, iso_dates=True))

tz_range = pd.date_range('2013-01-01 00:00:00-0500', periods=2)
self.assertEqual(exp, pd.json.dumps(tz_range, iso_dates=True))
dti = pd.DatetimeIndex(tz_range)
self.assertEqual(exp, pd.json.dumps(dti, iso_dates=True))
df = DataFrame({'DT': dti})
self.assertEqual(dfexp, pd.json.dumps(df, iso_dates=True))
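The invariant these two tests pin down: with ISO dates, timezone-aware values are converted to UTC before formatting (and naive values are taken as UTC, per ``test_datetime_tz`` above), so all three constructions serialise to the same instants. A hedged round-trip sketch, assuming ``date_format='iso'`` is the ``DataFrame.to_json`` spelling of ``iso_dates=True``:

    import pandas as pd

    rng = pd.date_range('2013-01-01', periods=2, tz='US/Eastern')
    out = pd.DataFrame({'DT': rng}).to_json(date_format='iso')
    # parsing back yields the UTC instants 2013-01-01 05:00 and 2013-01-02 05:00
    # (exact dtype/tz handling of read_json may vary by pandas version)
    print(pd.read_json(out, convert_dates=['DT'])['DT'])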


if __name__ == '__main__':
import nose
Original file line number Diff line number Diff line change
@@ -23,7 +23,6 @@
import numpy as np
from numpy.testing import (assert_array_almost_equal_nulp,
assert_approx_equal)
import pytz
from pandas import DataFrame, Series, Index, NaT, DatetimeIndex
import pandas.util.testing as tm

@@ -365,15 +364,30 @@ def test_encodeTimeConversion(self):
datetime.time(),
datetime.time(1, 2, 3),
datetime.time(10, 12, 15, 343243),
datetime.time(10, 12, 15, 343243, pytz.utc),
# datetime.time(10, 12, 15, 343243, dateutil.tz.gettz('UTC')), #
# this segfaults! No idea why.
]
for test in tests:
output = ujson.encode(test)
expected = '"%s"' % test.isoformat()
self.assertEqual(expected, output)

def test_encodeTimeConversion_pytz(self):
# GH11473 to_json segfaults with timezone-aware datetimes
tm._skip_if_no_pytz()
import pytz
test = datetime.time(10, 12, 15, 343243, pytz.utc)
output = ujson.encode(test)
expected = '"%s"' % test.isoformat()
self.assertEqual(expected, output)

def test_encodeTimeConversion_dateutil(self):
# GH11473 to_json segfaults with timezone-aware datetimes
tm._skip_if_no_dateutil()
import dateutil
test = datetime.time(10, 12, 15, 343243, dateutil.tz.tzutc())
output = ujson.encode(test)
expected = '"%s"' % test.isoformat()
self.assertEqual(expected, output)
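A note on the design choice above: the old single test imported pytz unconditionally and carried a commented-out dateutil case that segfaulted, whereas the split tests are skip-guarded per library, so the suite stays runnable without either optional dependency. A minimal sketch of the guard pattern, assuming the era's nose-based harness:

    import datetime
    import pandas.json as ujson
    import pandas.util.testing as tm

    def test_encode_time_pytz():
        tm._skip_if_no_pytz()  # raises a skip exception when pytz is absent
        import pytz
        t = datetime.time(10, 12, 15, 343243, pytz.utc)
        assert ujson.encode(t) == '"%s"' % t.isoformat()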

def test_nat(self):
input = NaT
assert ujson.encode(input) == 'null', "Expected null"
