In [1]:
import orjson
import json
import uuid
import random
import os

In [2]:

def generate_json_file(filename, target_size_mb):
    """
    Generates a JSON file of approximately target_size_mb.
    Uses a streaming approach to keep memory usage low.
    """
    target_bytes = target_size_mb * 1024 * 1024
    roles = ['admin', 'user', 'editor', 'guest', 'support']
    
    print(f"Generating {filename} ({target_size_mb} MB)...")

    with open(filename, 'w', encoding='utf-8') as f:
        f.write('[\n')  # Start JSON array
        
        current_size = f.tell()
        first_record = True
        record_count = 0

        while current_size < target_bytes:
            # Generate a "CRUD" style database record
            record = {
                "id": record_count,
                "uuid": str(uuid.uuid4()),
                "username": f"user_name_{record_count}",
                "email": f"contact_{record_count}@example-domain.com",
                "profile": {
                    "first_name": random.choice(['John', 'Jane', 'Alex', 'Max', 'Sarah']),
                    "last_name": random.choice(['Smith', 'Doe', 'Johnson', 'Brown', 'Lee']),
                    "bio": "This is a repeating string used to fill up space and simulate a longer text field in a database. " * 3,
                },
                "role": random.choice(roles),
                "is_active": random.choice([True, False]),
                "permissions": ["read", "write", "delete"] if record_count % 10 == 0 else ["read"],
                "created_at": "2023-10-27T10:00:00Z",
                "updated_at": "2024-01-15T14:30:22Z"
            }

            # Handle comma placement for valid JSON
            json_str = json.dumps(record, indent=2)
            if not first_record:
                f.write(',\n')
            
            f.write(json_str)
            
            first_record = False
            record_count += 1
            
            # Update current file size
            # We flush occasionally to get an accurate file size check
            if record_count % 100 == 0:
                f.flush()
                current_size = os.path.getsize(filename)

        f.write('\n]')  # End JSON array

    final_size = os.path.getsize(filename) / (1024 * 1024)
    print(f"Finished! Actual size: {final_size:.2f} MB | Total Records: {record_count}\n")

In [3]:
generate_json_file('large_data.json', 5000)

Generating large_data.json (5000 MB)...
Finished! Actual size: 5000.05 MB | Total Records: 7405800



In [11]:
# json vs orjson reading benchmarks (memory-aware)
import time
import gc
import os

pid = os.getpid()
print(f"Process ID: {pid}")

def benchmark_json_full_read(filename):
    """Full parse with stdlib json (high memory)."""
    start_time = time.time()
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    elapsed = time.time() - start_time
    print(f"json.load (full) took {elapsed:.2f} seconds | records: {len(data):,}")

    del data
    gc.collect()

def benchmark_orjson_full_read(filename):
    """Full parse with orjson (high memory)."""
    start_time = time.time()
    with open(filename, 'rb') as f:
        data = orjson.loads(f.read())
    elapsed = time.time() - start_time
    print(f"orjson.loads (full) took {elapsed:.2f} seconds | records: {len(data):,}")

    del data
    gc.collect()

def iter_json_array_objects(filename, chunk_size_mb=8):
    """
    Stream top-level objects from a JSON array file as raw bytes.
    Keeps memory low by not loading the whole file.
    Assumes top-level is: [ { ... }, { ... }, ... ]
    """
    chunk_size = chunk_size_mb * 1024 * 1024
    in_string = False
    escape = False
    brace_depth = 0
    collecting = False
    obj_buffer = bytearray()

    with open(filename, 'rb') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break

            for b in chunk:
                if not collecting:
                    # Wait until the start of the next object
                    if b == ord('{'):
                        collecting = True
                        brace_depth = 1
                        obj_buffer = bytearray([b])
                    continue

                obj_buffer.append(b)

                if in_string:
                    if escape:
                        escape = False
                    elif b == ord('\\'):
                        escape = True
                    elif b == ord('"'):
                        in_string = False
                    continue

                if b == ord('"'):
                    in_string = True
                elif b == ord('{'):
                    brace_depth += 1
                elif b == ord('}'):
                    brace_depth -= 1
                    if brace_depth == 0:
                        yield bytes(obj_buffer)
                        collecting = False
                        obj_buffer = bytearray()

def benchmark_json_streaming_read(filename, chunk_size_mb=8, limit=None):
    """
    Streaming parse using json.loads per object.
    Low memory; suitable for huge JSON arrays.
    """
    start_time = time.time()
    count = 0

    for obj_bytes in iter_json_array_objects(filename, chunk_size_mb=chunk_size_mb):
        _ = json.loads(obj_bytes.decode('utf-8'))
        count += 1
        if limit is not None and count >= limit:
            break

    elapsed = time.time() - start_time
    print(f"json streaming took {elapsed:.2f} seconds | objects parsed: {count:,}")
    gc.collect()

def benchmark_orjson_streaming_read(filename, chunk_size_mb=8, limit=None):
    """
    Streaming parse using orjson.loads per object.
    Low memory; suitable for huge JSON arrays.
    """
    start_time = time.time()
    count = 0

    for obj_bytes in iter_json_array_objects(filename, chunk_size_mb=chunk_size_mb):
        _ = orjson.loads(obj_bytes)
        count += 1
        if limit is not None and count >= limit:
            break

    elapsed = time.time() - start_time
    print(f"orjson streaming took {elapsed:.2f} seconds | objects parsed: {count:,}")
    gc.collect()

# RECOMMENDED for very large files (low memory):
benchmark_json_streaming_read('large_data.json', chunk_size_mb=8)
benchmark_orjson_streaming_read('large_data.json', chunk_size_mb=8)

# Optional full-memory benchmarks (may OOM on very large files):
# benchmark_json_full_read('large_data.json')
# benchmark_orjson_full_read('large_data.json')

Process ID: 9516
json streaming took 709.48 seconds | objects parsed: 14,786,500
orjson streaming took 694.89 seconds | objects parsed: 14,786,500
