From e52754580d6e29546ba8b4a87f144a702e7a66d7 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Thu, 7 Feb 2019 21:29:02 +0100 Subject: [PATCH 01/18] BUG: Fix type coercion in read_json orient='table' (#21345) --- pandas/io/json/json.py | 32 ++++++++++++++++++++++++----- pandas/tests/io/json/test_pandas.py | 6 ++++++ 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 4bbccc8339d7c..8524a2138386c 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -226,7 +226,7 @@ def _write(self, obj, orient, double_precision, ensure_ascii, return serialized -def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, +def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=False, precise_float=False, date_unit=None, encoding=None, lines=False, chunksize=None, compression='infer'): @@ -277,9 +277,24 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, 'table' as an allowed value for the ``orient`` argument typ : type of object to recover (series or frame), default 'frame' - dtype : boolean or dict, default True - If True, infer dtypes, if a dict of column to dtype, then use those, + dtype : boolean or dict + If True, infer dtypes; if a dict of column to dtype, then use those; if False, then don't infer dtypes at all, applies only to the data. + + The allowed and default values depend on the value of the `orient` + parameter: + + - if ``orient != 'table'``: + + - allowed ``dtype`` values are True, False or a dict + - default is True + + - if ``orient == 'table'``: + + - allowed and default ``dtype`` is False + + .. versionchanged:: 0.24.2 set default False for ``orient='table'`` + convert_axes : boolean, default True Try to convert the axes to the proper dtypes. convert_dates : boolean, default True @@ -408,6 +423,9 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, {"index": "row 2", "col 1": "c", "col 2": "d"}]}' """ + if dtype and orient == 'table': + raise ValueError("'dtype' is only valid when 'orient' is not 'table'") + compression = _infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( path_or_buf, encoding=encoding, compression=compression, @@ -600,15 +618,19 @@ class Parser(object): 'us': long(31536000000000), 'ns': long(31536000000000000)} - def __init__(self, json, orient, dtype=True, convert_axes=True, + def __init__(self, json, orient, dtype=None, convert_axes=True, convert_dates=True, keep_default_dates=False, numpy=False, precise_float=False, date_unit=None): self.json = json if orient is None: orient = self._default_orient - self.orient = orient + + if orient == 'table': + dtype = False + if dtype is None: + dtype = True self.dtype = dtype if orient == "split": diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index c5fcb9fb0f672..281876dce3c10 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1202,6 +1202,12 @@ def test_data_frame_size_after_to_json(self): assert size_before == size_after + def test_from_json_to_json_table_dtypes(self): + expected = pd.DataFrame({'a': [1, 2], 'b': [3., 4.], 'c': ['5', '6']}) + dfjson = expected.to_json(orient='table') + result = pd.read_json(dfjson, orient='table') + assert_frame_equal(result, expected) + @pytest.mark.parametrize('data, expected', [ (DataFrame([[1, 2], [4, 5]], columns=['a', 'b']), {'columns': ['a', 'b'], 'data': [[1, 2], [4, 5]]}), From 45faaba3de4dd8111886d589fb8b8706121cd64a Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Thu, 7 Feb 2019 22:49:32 +0100 Subject: [PATCH 02/18] Fix and add tests --- pandas/tests/io/json/test_json_table_schema.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 6fa3b5b3b2ed4..3002d1dfb5f8a 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -502,12 +502,12 @@ class TestTableOrientReader(object): @pytest.mark.parametrize("vals", [ {'ints': [1, 2, 3, 4]}, {'objects': ['a', 'b', 'c', 'd']}, + {'objects': ['1', '2', '3', '4']}, {'date_ranges': pd.date_range('2016-01-01', freq='d', periods=4)}, {'categoricals': pd.Series(pd.Categorical(['a', 'b', 'c', 'c']))}, {'ordered_cats': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'], ordered=True))}, - pytest.param({'floats': [1., 2., 3., 4.]}, - marks=pytest.mark.xfail), + {'floats': [1., 2., 3., 4.]}, {'floats': [1.1, 2.2, 3.3, 4.4]}, {'bools': [True, False, False, True]}]) def test_read_json_table_orient(self, index_nm, vals, recwarn): From 7a084ad9340198c944c2132b65b6b78a3a31cf1b Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Fri, 8 Feb 2019 07:33:18 +0100 Subject: [PATCH 03/18] Add GH issue number as a comment to the new test --- pandas/tests/io/json/test_pandas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 281876dce3c10..d6ee872d69e5a 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1203,6 +1203,7 @@ def test_data_frame_size_after_to_json(self): assert size_before == size_after def test_from_json_to_json_table_dtypes(self): + # GH21345 expected = pd.DataFrame({'a': [1, 2], 'b': [3., 4.], 'c': ['5', '6']}) dfjson = expected.to_json(orient='table') result = pd.read_json(dfjson, orient='table') From f39bbf955b091b23220e190eb27150474837215d Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Sat, 9 Feb 2019 10:38:31 +0100 Subject: [PATCH 04/18] Change docstring --- pandas/io/json/json.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 8524a2138386c..72cb39b17590d 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -277,23 +277,14 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, 'table' as an allowed value for the ``orient`` argument typ : type of object to recover (series or frame), default 'frame' - dtype : boolean or dict + dtype : boolean or dict, default True If True, infer dtypes; if a dict of column to dtype, then use those; if False, then don't infer dtypes at all, applies only to the data. - The allowed and default values depend on the value of the `orient` - parameter: + Not applicable with orient='table'. - - if ``orient != 'table'``: - - - allowed ``dtype`` values are True, False or a dict - - default is True - - - if ``orient == 'table'``: - - - allowed and default ``dtype`` is False - - .. versionchanged:: 0.24.2 set default False for ``orient='table'`` + .. versionchanged:: 0.25 + Not applicable with ``orient='table'`` convert_axes : boolean, default True Try to convert the axes to the proper dtypes. From 4f5c3b378a99146abd43e52469ce996af012abca Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Sat, 9 Feb 2019 17:49:18 +0100 Subject: [PATCH 05/18] Raise error if dtype is not None and orient='table' --- pandas/io/json/json.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 72cb39b17590d..277492474aed6 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -414,8 +414,8 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, {"index": "row 2", "col 1": "c", "col 2": "d"}]}' """ - if dtype and orient == 'table': - raise ValueError("'dtype' is only valid when 'orient' is not 'table'") + if dtype is not None and orient == 'table': + raise ValueError("cannot pass both dtype and orient='table'") compression = _infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( From bc2c1dd4cfffa67e3a197e810cd9e14aa162c5f8 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Sat, 9 Feb 2019 17:51:02 +0100 Subject: [PATCH 06/18] Test raised error if not None dtype and orient='table' --- pandas/tests/io/json/test_pandas.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index d6ee872d69e5a..f6a7dea037576 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1209,6 +1209,15 @@ def test_from_json_to_json_table_dtypes(self): result = pd.read_json(dfjson, orient='table') assert_frame_equal(result, expected) + @pytest.mark.xfail(raises=ValueError) + @pytest.mark.parametrize('dtype', [True, False, {'b': int, 'c': int}]) + def test_error_read_json_table_dtype(self, dtype): + # GH21345 + expected = pd.DataFrame({'a': [1, 2], 'b': [3., 4.], 'c': ['5', '6']}) + dfjson = expected.to_json(orient='table') + result = pd.read_json(dfjson, orient='table', dtype=dtype) + assert_frame_equal(result, expected) + @pytest.mark.parametrize('data, expected', [ (DataFrame([[1, 2], [4, 5]], columns=['a', 'b']), {'columns': ['a', 'b'], 'data': [[1, 2], [4, 5]]}), From a78cc2ed9ad8dd082e598c8ddc08ccb404efbc2d Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Sat, 9 Feb 2019 19:03:29 +0100 Subject: [PATCH 07/18] Move to read_json default setting dtype=False for 'table' --- pandas/io/json/json.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 277492474aed6..4595d30157722 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -414,8 +414,11 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, {"index": "row 2", "col 1": "c", "col 2": "d"}]}' """ - if dtype is not None and orient == 'table': - raise ValueError("cannot pass both dtype and orient='table'") + if orient == 'table': + if dtype is None: + dtype = False + else: + raise ValueError("cannot pass both dtype and orient='table'") compression = _infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( @@ -618,8 +621,6 @@ def __init__(self, json, orient, dtype=None, convert_axes=True, orient = self._default_orient self.orient = orient - if orient == 'table': - dtype = False if dtype is None: dtype = True self.dtype = dtype From 933eb518a28ca5d870d9e96074886c9326874fcc Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Sat, 9 Feb 2019 19:14:21 +0100 Subject: [PATCH 08/18] Fix docstring formatting for rendering --- pandas/io/json/json.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 4595d30157722..8be94fb174a06 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -281,10 +281,11 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, If True, infer dtypes; if a dict of column to dtype, then use those; if False, then don't infer dtypes at all, applies only to the data. - Not applicable with orient='table'. + Not applicable with ``orient='table'``. .. versionchanged:: 0.25 - Not applicable with ``orient='table'`` + + Not applicable with ``orient='table'``. convert_axes : boolean, default True Try to convert the axes to the proper dtypes. From 67a857175edf63443161c02d7bb8896f84182d01 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Sat, 9 Feb 2019 19:24:51 +0100 Subject: [PATCH 09/18] Add whatsnew note --- doc/source/whatsnew/v0.25.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index cbefae07b07f1..469c676f36bf2 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -158,6 +158,7 @@ I/O ^^^ - Fixed bug in missing text when using :meth:`to_clipboard` if copying utf-16 characters in Python 3 on Windows (:issue:`25040`) +- Bug in :func:`read_json` doing type coercion for ``orient='table'`` (:issue:`21345`) - - - From b0b2410e8addbfdac370ad01a6405c44697f1938 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Wed, 13 Feb 2019 22:09:56 +0100 Subject: [PATCH 10/18] Fix default dtype depending on orient --- pandas/io/json/json.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 8be94fb174a06..4983ba86c77db 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -415,11 +415,8 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, {"index": "row 2", "col 1": "c", "col 2": "d"}]}' """ - if orient == 'table': - if dtype is None: - dtype = False - else: - raise ValueError("cannot pass both dtype and orient='table'") + if orient == 'table' and dtype is not None: + raise ValueError("cannot pass both dtype and orient='table'") compression = _infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( @@ -623,7 +620,10 @@ def __init__(self, json, orient, dtype=None, convert_axes=True, self.orient = orient if dtype is None: - dtype = True + if orient == 'table': + dtype = False + else: + dtype = True self.dtype = dtype if orient == "split": From 55adbc08e5a51baa51495f48d72f1df08c0b8a47 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Wed, 13 Feb 2019 22:29:43 +0100 Subject: [PATCH 11/18] Explain better in whatsnew note --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 469c676f36bf2..689a97fdbfc90 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -158,7 +158,7 @@ I/O ^^^ - Fixed bug in missing text when using :meth:`to_clipboard` if copying utf-16 characters in Python 3 on Windows (:issue:`25040`) -- Bug in :func:`read_json` doing type coercion for ``orient='table'`` (:issue:`21345`) +- Bug in :func:`read_json` that tries to infer dtypes by default, even for ``orient='table'`` which already defines dtypes in its schema (:issue:`21345`) - - - From 99074e3c41dbeb391192081d98aff6ce51230e76 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Wed, 20 Feb 2019 06:36:27 +0100 Subject: [PATCH 12/18] Address requested changes --- pandas/io/json/json.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 4983ba86c77db..19a5353e64dea 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -619,12 +619,7 @@ def __init__(self, json, orient, dtype=None, convert_axes=True, orient = self._default_orient self.orient = orient - if dtype is None: - if orient == 'table': - dtype = False - else: - dtype = True - self.dtype = dtype + self.dtype = orient != 'table' if dtype is None else dtype if orient == "split": numpy = False From d96b304001e12d93e0210a603b26fe5246604b20 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Wed, 20 Feb 2019 22:04:56 +0100 Subject: [PATCH 13/18] Make whatsnew note more clear --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index c88b19f223f23..83addc91a772e 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -158,7 +158,7 @@ I/O ^^^ - Fixed bug in missing text when using :meth:`to_clipboard` if copying utf-16 characters in Python 3 on Windows (:issue:`25040`) -- Bug in :func:`read_json` that tries to infer dtypes by default, even for ``orient='table'`` which already defines dtypes in its schema (:issue:`21345`) +- Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) - - - From bf98bf8026b171693fc2fb054b96d2bd49978c4f Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Wed, 20 Feb 2019 22:16:57 +0100 Subject: [PATCH 14/18] Use pytest.raises --- pandas/tests/io/json/test_pandas.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index ef8666861d5a7..98cde0102bb6d 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1209,14 +1209,13 @@ def test_from_json_to_json_table_dtypes(self): result = pd.read_json(dfjson, orient='table') assert_frame_equal(result, expected) - @pytest.mark.xfail(raises=ValueError) @pytest.mark.parametrize('dtype', [True, False, {'b': int, 'c': int}]) - def test_error_read_json_table_dtype(self, dtype): + def test_read_json_table_dtype_raises(self, dtype): # GH21345 - expected = pd.DataFrame({'a': [1, 2], 'b': [3., 4.], 'c': ['5', '6']}) - dfjson = expected.to_json(orient='table') - result = pd.read_json(dfjson, orient='table', dtype=dtype) - assert_frame_equal(result, expected) + df = pd.DataFrame({'a': [1, 2], 'b': [3., 4.], 'c': ['5', '6']}) + dfjson = df.to_json(orient='table') + with pytest.raises(ValueError): + pd.read_json(dfjson, orient='table', dtype=dtype) @pytest.mark.parametrize('data, expected', [ (DataFrame([[1, 2], [4, 5]], columns=['a', 'b']), From 6325365d065e727b5006bdb91c4836202d089ce5 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Wed, 20 Feb 2019 23:07:51 +0100 Subject: [PATCH 15/18] Move default dtype setting to read_json --- pandas/io/json/json.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 19a5353e64dea..5d10035a9b36f 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -418,6 +418,8 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, if orient == 'table' and dtype is not None: raise ValueError("cannot pass both dtype and orient='table'") + dtype = orient != 'table' if dtype is None else dtype + compression = _infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( path_or_buf, encoding=encoding, compression=compression, @@ -619,8 +621,6 @@ def __init__(self, json, orient, dtype=None, convert_axes=True, orient = self._default_orient self.orient = orient - self.dtype = orient != 'table' if dtype is None else dtype - if orient == "split": numpy = False From c0d348cbab5a04a61533a018a2f66365d94a25cb Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Thu, 21 Feb 2019 06:49:01 +0100 Subject: [PATCH 16/18] Set assignment in Parser --- pandas/io/json/json.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 5d10035a9b36f..39a767550905e 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -621,6 +621,8 @@ def __init__(self, json, orient, dtype=None, convert_axes=True, orient = self._default_orient self.orient = orient + self.dtype = dtype + if orient == "split": numpy = False From 900a3f0684ee13c4ff0b4e97a45429b4971e0ae9 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Sat, 23 Feb 2019 07:49:52 +0100 Subject: [PATCH 17/18] Allow orient=table and dtype=False --- pandas/io/json/json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 39a767550905e..725e2d28ffd67 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -415,7 +415,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, {"index": "row 2", "col 1": "c", "col 2": "d"}]}' """ - if orient == 'table' and dtype is not None: + if orient == 'table' and dtype: raise ValueError("cannot pass both dtype and orient='table'") dtype = orient != 'table' if dtype is None else dtype From 4ecf5c6f843591560af326b73929afbeb2585004 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Sat, 23 Feb 2019 07:53:51 +0100 Subject: [PATCH 18/18] Fix test --- pandas/tests/io/json/test_pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 98cde0102bb6d..fecd0f0572757 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1209,7 +1209,7 @@ def test_from_json_to_json_table_dtypes(self): result = pd.read_json(dfjson, orient='table') assert_frame_equal(result, expected) - @pytest.mark.parametrize('dtype', [True, False, {'b': int, 'c': int}]) + @pytest.mark.parametrize('dtype', [True, {'b': int, 'c': int}]) def test_read_json_table_dtype_raises(self, dtype): # GH21345 df = pd.DataFrame({'a': [1, 2], 'b': [3., 4.], 'c': ['5', '6']})