In [0]:
# Gzip-compressed in-network rates file that the cells below read and split.
nested_data = (
    "/Volumes/mgiglia/dev_matthew_giglia_price_transparency/landing/in-network/2025-08_040_05C0_in-network-rates_1_of_5.json.gz"
)
# Directory that receives the split output files.
output_dir = (
    "/Volumes/mgiglia/dev_matthew_giglia_price_transparency/landing/in-network/split/"
)

In [0]:
import ijson
import json
import gzip
import os
from pathlib import Path
from typing import Dict, List, Any, Optional
from decimal import Decimal

In [0]:
class DecimalEncoder(json.JSONEncoder):
    """JSON encoder that writes ``Decimal`` values as JSON numbers."""

    def default(self, o):
        """Return ``float(o)`` for a Decimal; defer every other type to the base encoder."""
        if not isinstance(o, Decimal):
            # The base implementation raises TypeError for unsupported types.
            return super().default(o)
        return float(o)

# When writing JSON, use:
# json.dump(
#     data,
#     file,
#     cls=DecimalEncoder,
#     indent=2,
#     ensure_ascii=False
# )

In [0]:
class NestedJSONGZSplitter:
    """
    Advanced splitter for nested JSON structures in compressed files.

    The input is streamed with ijson, so only the chunk(s) currently being
    assembled are held in memory. Inputs whose name ends in '.gz' are read
    through gzip; anything else is opened as a plain file. Output files are
    serialized with DecimalEncoder.
    """

    def __init__(self, input_file: str, output_dir: str):
        """
        Args:
            input_file: Path of the JSON file to split (str or os.PathLike).
            output_dir: Directory for the output files; created if missing.
        """
        # os.fspath lets callers pass pathlib.Path; the '.gz' test needs a str.
        self.input_file = os.fspath(input_file)
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        self.is_compressed = self.input_file.endswith('.gz')

    def _open_input(self):
        """Open the input for binary reading, through gzip when it is compressed."""
        opener = gzip.open if self.is_compressed else open
        return opener(self.input_file, 'rb')

    def analyze_structure(self, max_depth=3) -> Dict[str, Any]:
        """
        Analyze the JSON structure to help identify splitting paths.

        Only the beginning of the file is scanned: parsing stops once more
        than 10 array starts have been seen.

        Args:
            max_depth: Currently unused; kept so existing callers keep working.

        Returns:
            Dict with 'root_type' ('array', 'object' or None), 'possible_paths'
            (distinct ijson paths of the arrays found, in order of first
            appearance) and 'sample_data' (always empty at present).
        """
        print(f"Analyzing structure of {Path(self.input_file).name}...")

        structure_info = {
            'root_type': None,
            'possible_paths': [],
            'sample_data': {}
        }
        array_paths = []

        with self._open_input() as file:
            try:
                for prefix, event, _value in ijson.parse(file):
                    if event == 'start_array':
                        if prefix:
                            array_paths.append(f"{prefix}.item")
                        else:
                            array_paths.append("item")
                            structure_info['root_type'] = 'array'
                    elif event == 'start_map' and not prefix:
                        structure_info['root_type'] = 'object'

                    # Limit analysis to avoid processing entire file
                    if len(array_paths) > 10:
                        break
            except Exception as e:
                # Best effort: report the problem and keep what was found so far.
                print(f"Error analyzing structure: {e}")

        # The same array path shows up once per parent element; report each
        # path only once, keeping first-seen order.
        structure_info['possible_paths'] = list(dict.fromkeys(array_paths[:10]))

        return structure_info

    def split_nested_array(self, json_path: str, chunk_size: int = 1000,
                          preserve_structure: bool = False) -> None:
        """
        Split nested arrays with option to preserve parent structure.

        Args:
            json_path: ijson path like 'data.records.item' or 'users.item'
            chunk_size: Objects per output file
            preserve_structure: If True, maintains the nested structure in output
        """
        print(f"Splitting nested path: {json_path}")

        def write_chunk(chunk: List, number: int) -> None:
            if preserve_structure:
                self._write_structured_chunk(chunk, json_path, number)
            else:
                self._write_flat_chunk(chunk, number)

        current_chunk = []
        files_written = 0
        total_objects = 0

        with self._open_input() as file:
            for obj in ijson.items(file, json_path):
                current_chunk.append(obj)
                total_objects += 1

                if total_objects % 5000 == 0:
                    print(f"Processed {total_objects} objects from {json_path}")

                if len(current_chunk) >= chunk_size:
                    files_written += 1
                    write_chunk(current_chunk, files_written)
                    current_chunk = []

            # Write final (partial) chunk
            if current_chunk:
                files_written += 1
                write_chunk(current_chunk, files_written)

        # Count files actually written; a "next file number" counter would be
        # one too high when the last chunk is exactly full or nothing matched.
        print(f"Split {total_objects} objects into {files_written} files")

    def split_multiple_paths(self, path_configs: List[Dict[str, Any]]) -> None:
        """
        Split multiple nested paths into separate output directories.

        The input is re-read from the start once per configured path.

        Args:
            path_configs: List of dicts with 'path', 'output_subdir', 'chunk_size'.
                'output_subdir' defaults to the path with '.' replaced by '_';
                'chunk_size' defaults to 1000.
        """
        # Prepare one output directory per path before reading anything.
        path_data = {}
        for config in path_configs:
            path = config['path']
            target_dir = os.path.join(
                self.output_dir,
                config.get('output_subdir', path.replace('.', '_'))
            )
            os.makedirs(target_dir, exist_ok=True)
            path_data[path] = {'config': config, 'output_dir': target_dir}

        with self._open_input() as file:
            # Process each path separately
            for path, data in path_data.items():
                print(f"Processing path: {path}")
                file.seek(0)  # Reset file position

                chunk_size = data['config'].get('chunk_size', 1000)
                chunk = []
                files_written = 0
                total_objects = 0

                for obj in ijson.items(file, path):
                    chunk.append(obj)
                    total_objects += 1

                    if len(chunk) >= chunk_size:
                        files_written += 1
                        self._write_chunk_to_dir(chunk, data['output_dir'], files_written)
                        chunk = []

                # Write final (partial) chunk
                if chunk:
                    files_written += 1
                    self._write_chunk_to_dir(chunk, data['output_dir'], files_written)

                print(f"Path {path}: {total_objects} objects in {files_written} files")

    def split_by_nested_field(self, json_path: str, split_field: str,
                             chunk_size: int = 1000) -> None:
        """
        Split objects based on a nested field value.

        One partially filled chunk per category is kept in memory until it
        reaches chunk_size or the input ends.

        Args:
            json_path: Path to the array items
            split_field: Field to use for splitting (can be nested like 'user.type')
            chunk_size: Objects per file per category
        """
        categories = {}

        with self._open_input() as file:
            for obj in ijson.items(file, json_path):
                # Extract split field value (supports nested fields)
                field_value = self._get_nested_field(obj, split_field)
                raw_category = str(field_value) if field_value is not None else 'null'
                # Group by the on-disk name: two raw values that sanitize to the
                # same name would otherwise restart numbering at 1 in the same
                # directory and overwrite each other's files.
                category = self._safe_category_name(raw_category)

                bucket = categories.get(category)
                if bucket is None:
                    bucket = categories[category] = {
                        'objects': [],
                        'file_counter': 1,
                        'total_count': 0
                    }

                bucket['objects'].append(obj)
                bucket['total_count'] += 1

                # Write chunk if it's full
                if len(bucket['objects']) >= chunk_size:
                    self._write_category_chunk(
                        bucket['objects'], category, bucket['file_counter']
                    )
                    bucket['objects'] = []
                    bucket['file_counter'] += 1

        # Write remaining objects
        for category, bucket in categories.items():
            if bucket['objects']:
                self._write_category_chunk(
                    bucket['objects'], category, bucket['file_counter']
                )
            print(f"Category '{category}': {bucket['total_count']} objects")

    def _get_nested_field(self, obj: Dict, field_path: str) -> Any:
        """Get value from nested field path like 'user.profile.type'.

        Returns None when a key is missing or an intermediate value cannot be
        indexed by key.
        """
        try:
            value = obj
            for field in field_path.split('.'):
                value = value[field]
            return value
        except (KeyError, TypeError):
            return None

    @staticmethod
    def _safe_category_name(category: str) -> str:
        """Reduce a category to alphanumerics, spaces, '-' and '_' for use in file names."""
        return "".join(c for c in category if c.isalnum() or c in (' ', '-', '_')).strip()

    @staticmethod
    def _wrap_in_path(chunk: List, json_path: str) -> Any:
        """Nest chunk under the object keys named in json_path.

        'item' segments mark array elements and are skipped, so
        'data.items.item' gives {'data': {'items': chunk}} and a bare 'item'
        (root array) gives the chunk itself. Segments are compared whole, so
        keys that merely start with 'item' (such as 'items') are left intact.
        An object key named exactly 'item' cannot be told apart from an array
        marker and is skipped as well.
        """
        keys = [part for part in json_path.split('.') if part != 'item']
        nested = chunk
        # Build nested structure from inside out
        for key in reversed(keys):
            nested = {key: nested}
        return nested

    @staticmethod
    def _dump_json(payload: Any, output_file: str, pretty: bool = False) -> None:
        """Write payload to output_file as UTF-8 JSON; compact unless pretty is set."""
        layout = {'indent': 2} if pretty else {'separators': (',', ':')}
        with open(output_file, 'w', encoding='utf-8') as out_file:
            json.dump(payload, out_file, cls=DecimalEncoder, ensure_ascii=False, **layout)

    def _write_flat_chunk(self, chunk: List, file_number: int) -> None:
        """Write chunk as flat array to 'chunk_NNNN.json' in the output directory."""
        self._write_chunk_to_dir(chunk, self.output_dir, file_number)
        print(f"Written flat chunk {file_number}: {len(chunk)} objects")

    def _write_structured_chunk(self, chunk: List, json_path: str, file_number: int) -> None:
        """Write chunk to 'structured_NNNN.json', nested under the keys of json_path."""
        nested_data = self._wrap_in_path(chunk, json_path)
        output_file = os.path.join(self.output_dir, f'structured_{file_number:04d}.json')
        self._dump_json(nested_data, output_file, pretty=True)
        print(f"Written structured chunk {file_number}: {len(chunk)} objects")

    def _write_chunk_to_dir(self, chunk: List, output_dir: str, file_number: int) -> None:
        """Write chunk to 'chunk_NNNN.json' in a specific directory."""
        output_file = os.path.join(output_dir, f'chunk_{file_number:04d}.json')
        self._dump_json(chunk, output_file)

    def _write_category_chunk(self, chunk: List, category: str, file_number: int) -> None:
        """Write chunk to 'category_<name>/<name>_NNNN.json' for a specific category."""
        safe_category = self._safe_category_name(category)
        category_dir = os.path.join(self.output_dir, f'category_{safe_category}')
        os.makedirs(category_dir, exist_ok=True)

        output_file = os.path.join(category_dir, f'{safe_category}_{file_number:04d}.json')
        self._dump_json(chunk, output_file)

In [0]:
# # Example usage functions
# def example_nested_structures():
#     """Examples of handling different nested JSON structures."""
    
#     # Example 1: Simple nested array
#     # JSON structure: {"data": {"items": [...]}}
#     splitter1 = NestedJSONGZSplitter('data_with_items.json.gz', 'output1')
#     splitter1.split_nested_array('data.items.item', chunk_size=500)
    
#     # Example 2: Multiple nested arrays
#     # JSON structure: {"users": [...], "posts": [...], "comments": [...]}
#     splitter2 = NestedJSONGZSplitter('multi_data.json.gz', 'output2')
#     path_configs = [
#         {'path': 'users.item', 'output_subdir': 'users', 'chunk_size': 1000},
#         {'path': 'posts.item', 'output_subdir': 'posts', 'chunk_size': 500},
#         {'path': 'comments.item', 'output_subdir': 'comments', 'chunk_size': 2000}
#     ]
#     splitter2.split_multiple_paths(path_configs)
    
#     # Example 3: Deep nesting
#     # JSON structure: {"api": {"v1": {"responses": {"data": [...]}}}}
#     splitter3 = NestedJSONGZSplitter('deep_nested.json.gz', 'output3')
#     splitter3.split_nested_array('api.v1.responses.data.item', chunk_size=100)
    
#     # Example 4: Split by nested field
#     # Split on each user's nested account.type field
#     splitter4 = NestedJSONGZSplitter('user_data.json.gz', 'output4')
#     splitter4.split_by_nested_field('users.item', 'account.type', chunk_size=500)

In [0]:
# if __name__ == "__main__":
#     # Analyze structure first
#     splitter = NestedJSONGZSplitter('example.json.gz', 'output')
#     structure_info = splitter.analyze_structure()
#     print("Structure analysis:", structure_info)
    
#     # Run examples
#     example_nested_structures()

In [0]:
# # Example usage
# if __name__ == "__main__":
#     # For nested JSON like: {"data": {"records": [...]}}
#     split_nested_json('nested_data.json', 'output_nested', 'data.records.item', 100)
    
#     # Split by category field
#     split_by_category('categorized_data.json', 'output_categories', 'type')

In [0]:
# Bind a splitter to the configured input file; its constructor also creates
# the output directory if it does not exist yet.
splitter = NestedJSONGZSplitter(input_file=nested_data, output_dir=output_dir)

In [0]:
# Scan the beginning of the input and collect candidate array paths to split on.
structure = splitter.analyze_structure()
# Bare expression so the notebook renders the returned dict as the cell output.
structure

In [0]:
# Group every provider group by the string form of its 'npi' value and write
# each group to its own category directory, 1000 objects per file.
splitter.split_by_nested_field(
    json_path='provider_references.item.provider_groups.item',
    split_field='npi',
    chunk_size=1000,
)

In [0]:
# --- Distributed JSON Splitter using Spark ---
# This example demonstrates how to use Spark to process and split large nested JSON files in parallel.
# It is designed for Databricks/Azure and can handle compressed files (e.g., .json.gz) natively.
#
# Adjust the paths and field names as needed for your data structure.

from pyspark.sql import functions as F
from pyspark.sql.types import *

# Example input paths (update as needed)
input_path = nested_data  # e.g., '/Volumes/.../in-network-rates_1_of_5.json.gz'
output_base = output_dir  # e.g., '/Volumes/.../split/'

# 1. Read the JSON file (Spark can read .gz files directly).
#    multiline=True parses the file as one JSON document rather than one
#    document per line.
df = spark.read.option("multiline", True).json(input_path)

# 2. Inspect schema to find nested arrays
# df.printSchema()

# 3. Explode nested arrays (example: 'provider_references' is an array)
# Adjust the explode path as needed for your JSON structure
if 'provider_references' in df.columns:
    exploded = df.select(F.explode('provider_references').alias('provider_reference'))
else:
    # If the array is nested deeper, adjust accordingly
    # exploded = df.select(F.explode('data.items').alias('item'))
    raise ValueError("Update the explode path for your JSON structure.")

# 4. Optionally, flatten nested fields for easier splitting
flat = exploded.select(
    'provider_reference.*'  # expand all fields of provider_reference
)

# 5. Split by a nested field (example: 'provider_groups')
# If provider_groups is present, explode it into one row per group
# (explode_outer keeps rows whose array is null or empty); otherwise add a
# null placeholder column.
if 'provider_groups' in flat.columns:
    flat = flat.withColumn('provider_group', F.explode_outer('provider_groups'))
else:
    flat = flat.withColumn('provider_group', F.lit(None))

# 6. Write out the results partitioned by provider_group (distributed write)
# This will create one folder per provider_group value
# flat.write \
#     .mode('overwrite') \
#     .partitionBy('provider_group') \
#     .json(f"{output_base}/by_provider_group")

# 7. (Optional) If you want to split by chunk size instead, repartition and write
chunk_size = 1000
row_count = flat.count()  # NOTE: runs a full job over the data before the write
# Ceiling division: floor division would leave, e.g., 1999 rows in a single
# partition of almost twice chunk_size. repartition() spreads rows roughly
# evenly, so chunk_size is a target per output file rather than an exact size.
num_partitions = max(1, -(-row_count // chunk_size))
# output_base may end with '/'; strip it so the target has no '//' in it.
flat.repartition(num_partitions).write.mode('overwrite').json(f"{output_base.rstrip('/')}/by_chunk")

# ---
# Notes:
# - Adjust the explode/select logic for your actual nested structure.
# - For very large files, Spark will process and write in parallel across the cluster.
# - You can use .parquet() instead of .json() for more efficient storage.
# - For more complex splits (e.g., multiple fields), use .partitionBy with multiple columns.
# - If you need to preserve more of the original structure, select/alias fields as needed before writing.

# New code chunk demonstrating a Spark-native distributed splitter for nested JSON
# This section provides an example of how to handle nested JSON data using Spark's DataFrame API.
# It is a template: 'provider_references.items' and 'some_field' are
# placeholders that must be replaced with names from the actual schema.

# 1. Read the nested JSON file
#    multiline=True matches the read of the same file earlier in this cell, so
#    a document that spans several lines is parsed as a single record.
nested_df = spark.read.option("multiline", True).json(input_path)

# 2. Explode the nested field (e.g., 'items' within 'provider_references')
if 'provider_references' in nested_df.columns:
    exploded_nested = nested_df.select(F.explode('provider_references.items').alias('item'))
else:
    raise ValueError("The specified nested field does not exist.")

# 3. Flatten the exploded DataFrame to access nested fields
flattened_df = exploded_nested.select(
    'item.*'  # Select all fields from the exploded item
)

# 4. Write the flattened DataFrame to output, partitioned by a specific field if needed
# The chain is parenthesized because a comment cannot follow a backslash
# line continuation (that is a SyntaxError).
(
    flattened_df.write
    .mode('overwrite')
    .partitionBy('some_field')  # Replace 'some_field' with the actual field to partition by
    .json(f"{output_base}/flattened_output")
)

# 5. (Optional) Repartition the DataFrame for better performance
# flattened_df.repartition(100).write.mode('overwrite').json(f"{output_base}/repartitioned_output")

# ---
# This code chunk provides a basic framework for processing nested JSON data using Spark.
# Adjust the field names and paths according to your specific data structure and requirements.