Merge pull request #10322 from evanpw/json
Bug in to_json causing segfault with a CategoricalIndex (GH #10317)
jreback committed Jun 10, 2015
2 parents ba69a49 + 588437c commit 07ea11c
Showing 3 changed files with 76 additions and 36 deletions.
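For context, a minimal sketch of the failure mode described in GH #10317, reconstructed from the issue title and the tests added below rather than copied from the report (the column name and values are illustrative):

    import pandas as pd

    # A small frame indexed by a CategoricalIndex, mirroring the tests added below.
    df = pd.DataFrame({"A": [1, 2, 3]},
                      index=pd.CategoricalIndex(["a", "b", "c"], name="E"))

    # Before this fix, serializing such a frame could crash the interpreter for
    # some orients instead of producing JSON (GH #10317).
    print(df.to_json())                  # default orient, "columns"
    print(df.to_json(orient="split"))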
doc/source/whatsnew/v0.16.2.txt: 1 addition, 0 deletions
@@ -120,6 +120,7 @@ Bug Fixes
- Bug where read_hdf store.select modifies the passed columns list when
multi-indexed (:issue:`7212`)
- Bug in ``Categorical`` repr with ``display.width`` of ``None`` in Python 3 (:issue:`10087`)
+- Bug in ``to_json`` with certain orients and a ``CategoricalIndex`` would segfault (:issue:`10317`)

- Bug where some of the nan funcs do not have consistent return dtypes (:issue:`10251`)

pandas/io/tests/test_json/test_pandas.py: 55 additions, 27 deletions
@@ -4,7 +4,7 @@
import os

import numpy as np
-from pandas import Series, DataFrame, DatetimeIndex, Timestamp
+from pandas import Series, DataFrame, DatetimeIndex, Timestamp, CategoricalIndex
from datetime import timedelta
import pandas as pd
read_json = pd.read_json
@@ -23,6 +23,11 @@
for k, v in compat.iteritems(_seriesd)))

_tsframe = DataFrame(_tsd)
+_cat_frame = _frame.copy()
+cat = ['bah']*5 + ['bar']*5 + ['baz']*5 + ['foo']*(len(_cat_frame)-15)
+_cat_frame.index = pd.CategoricalIndex(cat,name='E')
+_cat_frame['E'] = list(reversed(cat))
+_cat_frame['sort'] = np.arange(len(_cat_frame))

_mixed_frame = _frame.copy()

@@ -48,6 +53,7 @@ def setUp(self):
self.intframe = _intframe.copy()
self.tsframe = _tsframe.copy()
self.mixed_frame = _mixed_frame.copy()
+self.categorical = _cat_frame.copy()

def tearDown(self):
del self.dirpath
@@ -128,8 +134,22 @@ def _check(df):

def test_frame_from_json_to_json(self):
def _check_orient(df, orient, dtype=None, numpy=False,
-convert_axes=True, check_dtype=True, raise_ok=None):
-df = df.sort()
+convert_axes=True, check_dtype=True, raise_ok=None,
+sort=None):
+if sort is not None:
+df = df.sort(sort)
+else:
+df = df.sort()
+
+# if we are not unique, then check that we are raising ValueError
+# for the appropriate orients
+if not df.index.is_unique and orient in ['index','columns']:
+self.assertRaises(ValueError, lambda : df.to_json(orient=orient))
+return
+if not df.columns.is_unique and orient in ['index','columns','records']:
+self.assertRaises(ValueError, lambda : df.to_json(orient=orient))
+return
+
dfjson = df.to_json(orient=orient)

try:
@@ -141,7 +161,10 @@ def _check_orient(df, orient, dtype=None, numpy=False,
return
raise

-unser = unser.sort()
+if sort is not None and sort in unser.columns:
+unser = unser.sort(sort)
+else:
+unser = unser.sort()

if dtype is False:
check_dtype=False
@@ -160,7 +183,9 @@ def _check_orient(df, orient, dtype=None, numpy=False,
# index and col labels might not be strings
unser.index = [str(i) for i in unser.index]
unser.columns = [str(i) for i in unser.columns]
-unser = unser.sort()
+
+if sort is None:
+unser = unser.sort()
assert_almost_equal(df.values, unser.values)
else:
if convert_axes:
@@ -169,45 +194,45 @@ def _check_orient(df, orient, dtype=None, numpy=False,
assert_frame_equal(df, unser, check_less_precise=False,
check_dtype=check_dtype)

-def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None):
+def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None, sort=None):

# numpy=False
if convert_axes:
-_check_orient(df, "columns", dtype=dtype)
-_check_orient(df, "records", dtype=dtype)
-_check_orient(df, "split", dtype=dtype)
-_check_orient(df, "index", dtype=dtype)
-_check_orient(df, "values", dtype=dtype)
-
-_check_orient(df, "columns", dtype=dtype, convert_axes=False)
-_check_orient(df, "records", dtype=dtype, convert_axes=False)
-_check_orient(df, "split", dtype=dtype, convert_axes=False)
-_check_orient(df, "index", dtype=dtype, convert_axes=False)
-_check_orient(df, "values", dtype=dtype ,convert_axes=False)
+_check_orient(df, "columns", dtype=dtype, sort=sort)
+_check_orient(df, "records", dtype=dtype, sort=sort)
+_check_orient(df, "split", dtype=dtype, sort=sort)
+_check_orient(df, "index", dtype=dtype, sort=sort)
+_check_orient(df, "values", dtype=dtype, sort=sort)
+
+_check_orient(df, "columns", dtype=dtype, convert_axes=False, sort=sort)
+_check_orient(df, "records", dtype=dtype, convert_axes=False, sort=sort)
+_check_orient(df, "split", dtype=dtype, convert_axes=False, sort=sort)
+_check_orient(df, "index", dtype=dtype, convert_axes=False, sort=sort)
+_check_orient(df, "values", dtype=dtype ,convert_axes=False, sort=sort)

# numpy=True and raise_ok might be not None, so ignore the error
if convert_axes:
_check_orient(df, "columns", dtype=dtype, numpy=True,
-raise_ok=raise_ok)
+raise_ok=raise_ok, sort=sort)
_check_orient(df, "records", dtype=dtype, numpy=True,
-raise_ok=raise_ok)
+raise_ok=raise_ok, sort=sort)
_check_orient(df, "split", dtype=dtype, numpy=True,
-raise_ok=raise_ok)
+raise_ok=raise_ok, sort=sort)
_check_orient(df, "index", dtype=dtype, numpy=True,
-raise_ok=raise_ok)
+raise_ok=raise_ok, sort=sort)
_check_orient(df, "values", dtype=dtype, numpy=True,
-raise_ok=raise_ok)
+raise_ok=raise_ok, sort=sort)

_check_orient(df, "columns", dtype=dtype, numpy=True,
-convert_axes=False, raise_ok=raise_ok)
+convert_axes=False, raise_ok=raise_ok, sort=sort)
_check_orient(df, "records", dtype=dtype, numpy=True,
-convert_axes=False, raise_ok=raise_ok)
+convert_axes=False, raise_ok=raise_ok, sort=sort)
_check_orient(df, "split", dtype=dtype, numpy=True,
-convert_axes=False, raise_ok=raise_ok)
+convert_axes=False, raise_ok=raise_ok, sort=sort)
_check_orient(df, "index", dtype=dtype, numpy=True,
-convert_axes=False, raise_ok=raise_ok)
+convert_axes=False, raise_ok=raise_ok, sort=sort)
_check_orient(df, "values", dtype=dtype, numpy=True,
-convert_axes=False, raise_ok=raise_ok)
+convert_axes=False, raise_ok=raise_ok, sort=sort)

# basic
_check_all_orients(self.frame)
@@ -233,6 +258,9 @@ def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None):
_check_all_orients(DataFrame(biggie, dtype='U3'), dtype='U3',
convert_axes=False, raise_ok=ValueError)

+# categorical
+_check_all_orients(self.categorical, sort='sort', raise_ok=ValueError)

# empty
_check_all_orients(self.empty_frame)

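A note on the new ``sort`` argument exercised above: ``_cat_frame`` is given duplicated index labels, so ordering by the index alone is not a stable way to line up rows before and after the JSON round trip, and the tests instead realign on the monotonically increasing ``sort`` column. A rough sketch of the idea in current pandas API (``sort_values`` stands in for the older ``DataFrame.sort`` used in the test, and the data are illustrative):

    from io import StringIO

    import numpy as np
    import pandas as pd

    cat = ["bah", "bah", "bar", "bar"]
    df = pd.DataFrame({"A": np.arange(4.0), "sort": np.arange(4)},
                      index=pd.CategoricalIndex(cat, name="E"))

    # orient="records" drops the index, so rows are realigned on the helper
    # column rather than on the (duplicated) CategoricalIndex.
    roundtripped = pd.read_json(StringIO(df.to_json(orient="records")),
                                orient="records")

    left = df.sort_values("sort").reset_index(drop=True)
    right = roundtripped.sort_values("sort").reset_index(drop=True)
    assert left["A"].tolist() == right["A"].tolist()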
pandas/src/ujson/python/objToJSON.c: 20 additions, 9 deletions
@@ -1814,7 +1814,7 @@ char** NpyArr_encodeLabels(PyArrayObject* labels, JSONObjectEncoder* enc, npy_in

void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc)
{
-PyObject *obj, *exc, *toDictFunc, *tmpObj;
+PyObject *obj, *exc, *toDictFunc, *tmpObj, *getValuesFunc;
TypeContext *pc;
PyObjectEncoder *enc;
double val;
@@ -2082,14 +2082,25 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc)
return;
}

-PRINTMARK();
-tc->type = JT_ARRAY;
-pc->newObj = PyObject_GetAttrString(obj, "values");
-pc->iterBegin = NpyArr_iterBegin;
-pc->iterEnd = NpyArr_iterEnd;
-pc->iterNext = NpyArr_iterNext;
-pc->iterGetValue = NpyArr_iterGetValue;
-pc->iterGetName = NpyArr_iterGetName;
+PyObject* getValuesFunc = PyObject_GetAttrString(obj, "get_values");
+if (getValuesFunc)
+{
+PRINTMARK();
+tc->type = JT_ARRAY;
+pc->newObj = PyObject_CallObject(getValuesFunc, NULL);
+pc->iterBegin = NpyArr_iterBegin;
+pc->iterEnd = NpyArr_iterEnd;
+pc->iterNext = NpyArr_iterNext;
+pc->iterGetValue = NpyArr_iterGetValue;
+pc->iterGetName = NpyArr_iterGetName;
+
+Py_DECREF(getValuesFunc);
+}
+else
+{
+goto INVALID;
+}

return;
}
else
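The C change above replaces a lookup of the index's ``values`` attribute with a call to its ``get_values`` method, guarded by a NULL check and paired with a ``Py_DECREF``. The likely reason, inferred from the issue rather than stated on this page: for a ``CategoricalIndex``, ``.values`` is a ``Categorical`` object rather than a plain NumPy array, so handing it to the ujson ndarray iterators crashed, whereas ``get_values()`` (the 0.16-era API, since deprecated) returns a real ndarray. A small sketch of the type difference:

    import numpy as np
    import pandas as pd

    idx = pd.CategoricalIndex(["a", "b", "a"], name="E")

    print(type(idx.values))       # pandas Categorical, not an ndarray
    print(type(np.asarray(idx)))  # numpy.ndarray, the shape the encoder expects

    # On pandas 0.16.x the C code obtains the ndarray via idx.get_values(),
    # which is roughly equivalent to np.asarray(idx) above.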
