From 784ad59440d8da7d122616ceb4c541018453d366 Mon Sep 17 00:00:00 2001 From: parthava-adabala Date: Sat, 25 Oct 2025 16:41:28 -0500 Subject: [PATCH 1/3] BUG: Handle non-dict items in json_normalize with max_level --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/json/_normalize.py | 5 ++++- pandas/tests/io/json/test_normalize.py | 13 +++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a7027b9b26ebb..99c14f84ea4a2 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1107,6 +1107,7 @@ I/O - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`) - Bug in :meth:`HDFStore.select` causing queries on categorical string columns to return unexpected results (:issue:`57608`) +- Bug in :func:`pandas.json_normalize` raising ``AttributeError`` when ``max_level`` was set and the input data contained ``NaN`` values (:issue:`62829`) - Bug in :meth:`MultiIndex.factorize` incorrectly raising on length-0 indexes (:issue:`57517`) - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`) - Bug in :meth:`read_csv` for the ``c`` and ``python`` engines where parsing numbers with large exponents caused overflows. Now, numbers with large positive exponents are parsed as ``inf`` or ``-inf`` depending on the sign of the mantissa, while those with large negative exponents are parsed as ``0.0`` (:issue:`62617`, :issue:`38794`, :issue:`62740`) diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 16ec73ddeb743..4545865b1ec81 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -117,6 +117,9 @@ def nested_to_record( singleton = True new_ds = [] for d in ds: + if not isinstance(d, dict): + new_ds.append({}) + continue new_d = copy.deepcopy(d) for k, v in d.items(): # each key gets renamed with prefix @@ -517,7 +520,7 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list: return DataFrame(_simple_json_normalize(data, sep=sep), index=index) if record_path is None: - if any([isinstance(x, dict) for x in y.values()] for y in data): + if any(isinstance(y, dict) for y in data): # naive normalization, this is idempotent for flat records # and potentially will inflate the data considerably for # deeply nested structures: diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index b6212b514673f..cb73b943b1544 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -511,6 +511,19 @@ def test_max_level_with_records_path(self, max_level, expected): expected_df = DataFrame(data=expected, columns=result.columns.values) tm.assert_equal(expected_df, result) + def test_json_normalize_max_level_with_nan(self): + # GH 62829 - test for bug where max_level=0 fails with nan in input list + d = { + 1: {"id": 10, "status": "AVAL"}, + 2: {"id": 30, "status": "AVAL", "items": {"id": 12, "size": 20}}, + 3: {"id": 50, "status": "AVAL", "items": {"id": 13, "size": 30}}, + } + df = DataFrame.from_dict(d, orient="index") + data_list = df["items"].tolist() + expected = DataFrame({"id": [np.nan, 12.0, 13.0], "size": [np.nan, 20.0, 30.0]}) + result = json_normalize(data_list, max_level=0) + tm.assert_frame_equal(result, expected) + def test_nested_flattening_consistent(self): # see gh-21537 df1 = json_normalize([{"A": {"B": 1}}]) From 6ebda259a10e1116f595e5ede8105d70b83c251e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 25 Oct 2025 21:57:04 +0000 Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 99c14f84ea4a2..5087fce369977 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1092,6 +1092,7 @@ I/O - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping` elements. (:issue:`57915`) - Fix bug in ``on_bad_lines`` callable when returning too many fields: now emits ``ParserWarning`` and truncates extra fields regardless of ``index_col`` (:issue:`61837`) +- Bug in :func:`pandas.json_normalize` raising ``AttributeError`` when ``max_level`` was set and the input data contained ``NaN`` values (:issue:`62829`) - Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`) - Bug in :meth:`.io.common.is_fsspec_url` not recognizing chained fsspec URLs (:issue:`48978`) - Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`) @@ -1107,7 +1108,6 @@ I/O - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`) - Bug in :meth:`HDFStore.select` causing queries on categorical string columns to return unexpected results (:issue:`57608`) -- Bug in :func:`pandas.json_normalize` raising ``AttributeError`` when ``max_level`` was set and the input data contained ``NaN`` values (:issue:`62829`) - Bug in :meth:`MultiIndex.factorize` incorrectly raising on length-0 indexes (:issue:`57517`) - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`) - Bug in :meth:`read_csv` for the ``c`` and ``python`` engines where parsing numbers with large exponents caused overflows. Now, numbers with large positive exponents are parsed as ``inf`` or ``-inf`` depending on the sign of the mantissa, while those with large negative exponents are parsed as ``0.0`` (:issue:`62617`, :issue:`38794`, :issue:`62740`) From 3ec415fccfafbd21cf6ba388db40ccdb6a4bc151 Mon Sep 17 00:00:00 2001 From: parthava-adabala Date: Sat, 25 Oct 2025 16:41:28 -0500 Subject: [PATCH 3/3] BUG: Raise TypeError for non-dict items in json_normalize --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/io/json/_normalize.py | 12 ++++++++---- pandas/tests/io/json/test_normalize.py | 22 ++++++++++------------ 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 5087fce369977..bae1334398fcb 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1092,7 +1092,7 @@ I/O - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping` elements. (:issue:`57915`) - Fix bug in ``on_bad_lines`` callable when returning too many fields: now emits ``ParserWarning`` and truncates extra fields regardless of ``index_col`` (:issue:`61837`) -- Bug in :func:`pandas.json_normalize` raising ``AttributeError`` when ``max_level`` was set and the input data contained ``NaN`` values (:issue:`62829`) +- Bug in :func:`pandas.json_normalize` inconsistently handling non-dict items in ``data`` when ``max_level`` was set. The function will now raise a ``TypeError`` if ``data`` is a list containing non-dict items (:issue:`62829`) - Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`) - Bug in :meth:`.io.common.is_fsspec_url` not recognizing chained fsspec URLs (:issue:`48978`) - Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`) diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 4545865b1ec81..dc03d9141a4e9 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -117,9 +117,6 @@ def nested_to_record( singleton = True new_ds = [] for d in ds: - if not isinstance(d, dict): - new_ds.append({}) - continue new_d = copy.deepcopy(d) for k, v in d.items(): # each key gets renamed with prefix @@ -504,6 +501,13 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list: # GH35923 Fix pd.json_normalize to not skip the first element of a # generator input data = list(data) + for item in data: + if not isinstance(item, dict): + msg = ( + "All items in data must be of type dict, " + f"found {type(item).__name__}" + ) + raise TypeError(msg) else: raise NotImplementedError @@ -520,7 +524,7 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list: return DataFrame(_simple_json_normalize(data, sep=sep), index=index) if record_path is None: - if any(isinstance(y, dict) for y in data): + if any([isinstance(x, dict) for x in y.values()] for y in data): # naive normalization, this is idempotent for flat records # and potentially will inflate the data considerably for # deeply nested structures: diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index cb73b943b1544..f03fd235fef85 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -511,18 +511,16 @@ def test_max_level_with_records_path(self, max_level, expected): expected_df = DataFrame(data=expected, columns=result.columns.values) tm.assert_equal(expected_df, result) - def test_json_normalize_max_level_with_nan(self): - # GH 62829 - test for bug where max_level=0 fails with nan in input list - d = { - 1: {"id": 10, "status": "AVAL"}, - 2: {"id": 30, "status": "AVAL", "items": {"id": 12, "size": 20}}, - 3: {"id": 50, "status": "AVAL", "items": {"id": 13, "size": 30}}, - } - df = DataFrame.from_dict(d, orient="index") - data_list = df["items"].tolist() - expected = DataFrame({"id": [np.nan, 12.0, 13.0], "size": [np.nan, 20.0, 30.0]}) - result = json_normalize(data_list, max_level=0) - tm.assert_frame_equal(result, expected) + def test_json_normalize_non_dict_items(self): + # gh-62829 + data_list = [np.nan, {"id": 12}, {"id": 13}] + msg = "All items in data must be of type dict, found float" + + with pytest.raises(TypeError, match=msg): + json_normalize(data_list, max_level=0) + + with pytest.raises(TypeError, match=msg): + json_normalize(data_list) def test_nested_flattening_consistent(self): # see gh-21537