From 379d209348fc5c1590f929d70884a6785cf41c33 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Sat, 15 Nov 2025 16:41:58 -0300 Subject: [PATCH] test(io): add test to read binary jsonlines in chunks --- pandas/tests/io/json/test_readlines.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index f9c25f425a878..6e65a11792391 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -1,5 +1,8 @@ from collections.abc import Iterator -from io import StringIO +from io import ( + BytesIO, + StringIO, +) from pathlib import Path import numpy as np @@ -121,10 +124,15 @@ def test_to_jsonl_count_new_lines(): @pytest.mark.parametrize("chunksize", [1, 1.0]) -def test_readjson_chunks(request, lines_json_df, chunksize, engine): +@pytest.mark.parametrize("buffer", [BytesIO, StringIO]) +def test_readjson_chunks(request, lines_json_df, chunksize, buffer, engine): # Basic test that read_json(chunks=True) gives the same result as # read_json(chunks=False) # GH17048: memory usage when lines=True + # GH#28906: read binary json lines in chunks + + if buffer == BytesIO: + lines_json_df = lines_json_df.encode() if engine == "pyarrow": # GH 48893 @@ -134,10 +142,11 @@ def test_readjson_chunks(request, lines_json_df, chunksize, engine): ) request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError)) - unchunked = read_json(StringIO(lines_json_df), lines=True) - with read_json( - StringIO(lines_json_df), lines=True, chunksize=chunksize, engine=engine - ) as reader: + unchunked = read_json(buffer(lines_json_df), lines=True) + with ( + buffer(lines_json_df) as buf, + read_json(buf, lines=True, chunksize=chunksize, engine=engine) as reader, + ): chunked = pd.concat(reader) tm.assert_frame_equal(chunked, unchunked)