In [0]:
import gzip
import json
import os
from decimal import Decimal

import ijson
import pandas as pd
from pyspark.sql.types import StructType, StructField, StringType

In [0]:
def _json_default(value):
    """Convert the ``Decimal`` numbers produced by ijson into plain floats.

    ijson parses non-integer JSON numbers as ``decimal.Decimal`` by default,
    which the standard ``json`` encoder cannot serialize on its own.
    """
    if isinstance(value, Decimal):
        return float(value)
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def _write_chunk(records, output_prefix, part_index):
    """Write one batch of records as a JSON array to ``<output_prefix>_part_<part_index>.json``."""
    with open(f"{output_prefix}_part_{part_index}.json", "w") as out_f:
        json.dump(records, out_f, default=_json_default)


def stream_and_split_gzipped_json(input_path, output_prefix, chunk_size=1000, item_prefix="root.item"):
    """Stream a gzipped JSON file and split its items into numbered JSON part files.

    The input is parsed incrementally with ijson, so the whole document is
    never held in memory; at most ``chunk_size`` items are buffered at once.

    Args:
        input_path: Path of the gzipped JSON file to read.
        output_prefix: Path prefix for the output files; parts are written to
            ``<output_prefix>_part_<n>.json`` with ``n`` starting at 0.
        chunk_size: Maximum number of items stored in each part file.
        item_prefix: ijson prefix selecting the items to extract. The default
            ``"root.item"`` matches the elements of an array stored under a
            top-level ``"root"`` key; use ``"item"`` for a top-level array.
            A prefix that matches nothing produces no output files.

    Raises:
        TypeError: If ``input_path`` is not a string.
    """
    if not isinstance(input_path, str):
        raise TypeError("input_path must be a string")

    # Open gzipped file for streaming parsing
    with gzip.open(input_path, "rb") as f:
        batch = []
        file_num = 0
        for obj in ijson.items(f, item_prefix):
            batch.append(obj)
            if len(batch) >= chunk_size:
                _write_chunk(batch, output_prefix, file_num)
                batch = []
                file_num += 1
        if batch:  # flush leftovers that did not fill a complete chunk
            _write_chunk(batch, output_prefix, file_num)

In [0]:
_DEFAULT_OUTPUT_PREFIX = "/Volumes/mgiglia/dev_matthew_giglia_price_transparency/landing/in-network/output_partitioned"


def _source_stem(path):
    """Return the file name of ``path`` without its ``.gz`` / ``.json`` suffixes."""
    stem = os.path.basename(path)
    for suffix in (".gz", ".json"):
        if stem.endswith(suffix):
            stem = stem[: -len(suffix)]
    return stem


def process_partition(pdf, output_prefix=_DEFAULT_OUTPUT_PREFIX):
    """Split every gzipped JSON file listed in ``pdf`` and report a status per file.

    Args:
        pdf: pandas DataFrame with a ``file_path`` column.
        output_prefix: Base path prefix for the generated part files. The
            source file's stem is appended to it so that parts produced from
            different inputs do not overwrite each other.

    Returns:
        pandas DataFrame with string columns ``file_path`` and ``status``.
        ``status`` is ``"done"`` on success and ``"error: ..."`` otherwise.
        The two columns are present even when ``pdf`` has no rows.
    """
    results = []
    for path in pdf["file_path"]:
        if not isinstance(path, str):
            results.append({"file_path": str(path), "status": "error: path is not a string"})
            continue
        try:
            stream_and_split_gzipped_json(path, f"{output_prefix}_{_source_stem(path)}", 1000)
        except Exception as exc:
            # Record the failure for this file instead of aborting the whole
            # partition, so the remaining files are still processed.
            results.append({"file_path": path, "status": f"error: {type(exc).__name__}: {exc}"})
        else:
            results.append({"file_path": path, "status": "done"})
    # Fix the column set explicitly so an empty input still yields the expected columns.
    return pd.DataFrame(results, columns=["file_path", "status"])

In [0]:
# Gzipped in-network rate files (Databricks volume paths) to be processed with Spark.
input_file_paths = [
    "/Volumes/mgiglia/dev_matthew_giglia_price_transparency/landing/in-network/2025-08_040_05C0_in-network-rates_1_of_5.json.gz",
]

In [0]:
# Result schema: one row per input file with its processing status.
schema = StructType([
    StructField("file_path", StringType()),
    StructField("status", StringType())
])


def _process_batches(batches):
    """Adapt ``process_partition`` to the iterator protocol of ``mapInPandas``."""
    for pdf in batches:
        yield process_partition(pdf)


# process_partition takes and returns pandas DataFrames, so mapInPandas is the
# matching API; mapInArrow would hand it an iterator of pyarrow.RecordBatch.
# The explicit input schema keeps createDataFrame working for an empty list,
# where no schema can be inferred from the data.
df = spark.createDataFrame(
    [(path,) for path in input_file_paths],
    "file_path string",
).mapInPandas(_process_batches, schema)

In [0]:
# Render the per-file status table. Spark transformations are lazy, so this
# call is what actually runs the file-splitting job.
display(df)