Skip to content
Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ MultiIndex
I/O
^^^
- Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`)
- Bug in :func:`json_normalize` where ``errors="ignore"`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`)
- Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`)
-

Expand Down
42 changes: 25 additions & 17 deletions pandas/io/json/_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,14 +380,31 @@ def _json_normalize(
Returns normalized data with columns prefixed with the given string.
"""

def _pull_field(js: dict[str, Any], spec: list | str) -> Scalar | Iterable:
def _pull_field(
js: dict[str, Any], spec: list | str, extract_record: bool = False
) -> Scalar | Iterable:
"""Internal function to pull field"""
result = js
if isinstance(spec, list):
for field in spec:
result = result[field]
else:
result = result[spec]
try:
if isinstance(spec, list):
for field in spec:
result = result[field]
else:
result = result[spec]
except KeyError as e:
if extract_record:
raise KeyError(
f"Key {e} not found. If specifying a record_path, all elements of "
f"data should have the path."
) from e
elif errors == "ignore":
return np.nan
else:
raise KeyError(
f"Key {e} not found. To replace missing values of {e} with "
f"np.nan, pass in errors='ignore'"
) from e

return result

def _pull_records(js: dict[str, Any], spec: list | str) -> list:
Expand All @@ -396,7 +413,7 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list:
_pull_field, but require to return list. And will raise error
if has non iterable value.
"""
result = _pull_field(js, spec)
result = _pull_field(js, spec, extract_record=True)

# GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not
# null, otherwise return an empty list
Expand Down Expand Up @@ -488,16 +505,7 @@ def _recursive_extract(data, path, seen_meta, level=0):
if level + 1 > len(val):
meta_val = seen_meta[key]
else:
try:
meta_val = _pull_field(obj, val[level:])
except KeyError as e:
if errors == "ignore":
meta_val = np.nan
else:
raise KeyError(
"Try running with errors='ignore' as key "
f"{e} is not always present"
) from e
meta_val = _pull_field(obj, val[level:])
meta_vals[key].append(meta_val)
records.extend(recs)

Expand Down
44 changes: 41 additions & 3 deletions pandas/tests/io/json/test_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ def missing_metadata():
"zip": 44646,
}
],
"previous_residences": {"cities": [{"city_name": "Foo York City"}]},
},
{
"addresses": [
Expand All @@ -115,7 +116,8 @@ def missing_metadata():
"state": "TN",
"zip": 37643,
}
]
],
"previous_residences": {"cities": [{"city_name": "Barmingham"}]},
},
]

Expand Down Expand Up @@ -598,7 +600,10 @@ def test_json_normalize_errors(self, missing_metadata):
# If meta keys are not always present a new option to set
# errors='ignore' has been implemented

msg = "Try running with errors='ignore' as key 'name' is not always present"
msg = (
"Key 'name' not found. To replace missing values of "
"'name' with np.nan, pass in errors='ignore'"
)
with pytest.raises(KeyError, match=msg):
json_normalize(
data=missing_metadata,
Expand All @@ -618,11 +623,44 @@ def test_missing_meta(self, missing_metadata):
[9562, "Morris St.", "Massillon", "OH", 44646, "Alice"],
[8449, "Spring St.", "Elizabethton", "TN", 37643, np.nan],
]
columns = ["city", "number", "state", "street", "zip", "name"]
columns = ["number", "street", "city", "state", "zip", "name"]
expected = DataFrame(ex_data, columns=columns)
tm.assert_frame_equal(result, expected)

def test_missing_meta_multilevel_record_path_errors_raise(self, missing_metadata):
    # GH41876
    # With errors="raise", a meta key that is absent from some records must
    # still raise when the record_path has more than one level.
    expected_message = (
        "Key 'name' not found. To replace missing values of "
        "'name' with np.nan, pass in errors='ignore'"
    )
    normalize_kwargs = {
        "data": missing_metadata,
        "record_path": ["previous_residences", "cities"],
        "meta": "name",
        "errors": "raise",
    }
    with pytest.raises(KeyError, match=expected_message):
        json_normalize(**normalize_kwargs)

def test_missing_meta_multilevel_record_path_errors_ignore(self, missing_metadata):
    # GH41876
    # With errors="ignore", a meta key that is absent from a record must come
    # back as NaN even when the record_path has more than one level.
    expected = DataFrame(
        [
            ["Foo York City", "Alice"],
            ["Barmingham", np.nan],
        ],
        columns=["city_name", "name"],
    )
    normalized = json_normalize(
        data=missing_metadata,
        record_path=["previous_residences", "cities"],
        meta="name",
        errors="ignore",
    )
    tm.assert_frame_equal(normalized, expected)

def test_donot_drop_nonevalues(self):
# GH21356
data = [
Expand Down