From 163873fbb34db9545a2e707c64d670dcb965e97a Mon Sep 17 00:00:00 2001 From: Chandrasekar Sivaraman Date: Thu, 20 Feb 2025 00:45:11 +0100 Subject: [PATCH 1/2] updated read_json function to auto infer orient from the json schema if its table or split --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/json/_json.py | 21 +++++++++++++++++++ pandas/tests/io/json/test_pandas.py | 32 +++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4d9a45abe17cd..da09d18c49ce4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -70,6 +70,7 @@ Other enhancements - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) - :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) +- :meth:`pandas.read_json` now automatically infers the ``orient`` parameter if it is not explicitly specified, so that the correct format is detected from the input JSON structure. Inference only applies when the JSON structure matches the ``split`` or ``table`` orient (:issue:`52713`)
- :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`) - Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`) - Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` (:issue:`59091`) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index e032e26d771d7..7d26bf3988429 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -6,6 +6,8 @@ ) from collections import abc from itertools import islice +import json +import os from typing import ( TYPE_CHECKING, Any, @@ -559,6 +561,12 @@ def read_json( - ``'values'`` : just the values array - ``'table'`` : dict like ``{{'schema': {{schema}}, 'data': {{data}}}}`` + **Automatic Orient Inference for split or table**: + If the `orient` parameter is not specified, + this function will automatically infer the correct JSON format. + This works only if the JSON matches the ``'table'`` or ``'split'`` layout, + i.e. it was written by ``to_json`` with ``orient='split'`` or ``'table'``. + The allowed and default values depend on the value of the `typ` parameter.
@@ -768,6 +776,19 @@ def read_json( 0 0 1 2.5 True a 1577.2 1 1 4.5 False b 1577.1 """ + if orient is None: + if isinstance(path_or_buf, (str, bytes, os.PathLike)): + with open(path_or_buf, encoding="utf-8") as f: + json_data = json.load(f) + else: + json_data = json.load(path_or_buf) + + if isinstance(json_data, dict): + if "schema" in json_data and "data" in json_data: + orient = "table" + elif "columns" in json_data and "data" in json_data: + orient = "split" + if orient == "table" and dtype: raise ValueError("cannot pass both dtype and orient='table'") if orient == "table" and convert_axes: diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 144b36166261b..84179afddefb7 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2283,3 +2283,35 @@ def test_large_number(): ) expected = Series([9999999999999999]) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "json_data, should_fail", + [ + ( + json.dumps( + { + "schema": {"fields": [{"name": "A", "type": "integer"}]}, + "data": [{"A": 1}, {"A": 2}, {"A": 3}], + } + ), + False, + ), + (json.dumps({"columns": ["A"], "data": [[1], [2], [3]]}), False), + ], +) +def test_read_json_auto_infer(json_data, should_fail, tmp_path): + """Test pd.read_json auto-infers 'table' and 'split' formats.""" + + # Use tmp_path to create a temporary file + temp_file = tmp_path / "test_read_json.json" + + # Write the json_data to the temporary file + with open(temp_file, "w") as f: + f.write(json_data) + + if should_fail: + with pytest.raises(ValueError, match=".*expected.*"): + read_json(temp_file) + else: + read_json(temp_file) From 673c8f8687a7a3f73928580e9b670a80109198e5 Mon Sep 17 00:00:00 2001 From: Chandrasekar Sivaraman Date: Thu, 20 Feb 2025 00:47:03 +0100 Subject: [PATCH 2/2] updated read_json function to auto infer orient from the json schema if its table or split --- pandas/tests/io/json/test_pandas.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 84179afddefb7..68b31ee8487bb 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2300,7 +2300,7 @@ def test_large_number(): (json.dumps({"columns": ["A"], "data": [[1], [2], [3]]}), False), ], ) -def test_read_json_auto_infer(json_data, should_fail, tmp_path): +def test_read_json_auto_infer_orient_table_split(json_data, should_fail, tmp_path): """Test pd.read_json auto-infers 'table' and 'split' formats.""" # Use tmp_path to create a temporary file