### This script develops a prompt-chain structure for sending large amounts of text/content to LLM APIs, splitting the content across multiple calls when necessary

#### Note: still experimental / work in progress

In [None]:
# import packages
import base64
import json
from typing import List, Dict, Union, Optional
from dataclasses import dataclass
import os
from anthropic import Anthropic
import time
import threading
from IPython.display import Image
import math

### Overview

We created a module designed to handle the processing of large JSON documents and images through the Claude AI API. It solves the common problem of processing JSON files that exceed Claude's input size limits by chunking the data while maintaining context and structure.
- Processes large JSON files by automatically splitting them into manageable chunks
- Handles both single resources and collections of resources
- Integrates image processing capabilities
- Maintains context across chunked data
- Provides progress tracking during processing
- Combines multiple responses into a coherent analysis

Key Methods

1) JSON Splitting (_split_json):
- Breaks large JSON files into processable chunks
- Preserves resource structure and relationships
- Adds metadata for context preservation


2) Message Preparation (_prepare_message):
- Formats data for Claude API
- Handles both JSON and image content
- Maintains chunk context
- Manages base64 encoding for images


3) JSON Processing (process_json):
- Main processing workflow
- Manages chunking and API calls
- Handles image integration
- Provides progress tracking
- Implements error handling


4) Response Combination (combine_responses):
- Merges chunk responses
- Creates coherent final analysis
- Maintains section separation
- Preserves context markers

The heartbeat function provides real-time feedback during processing:
- Shows elapsed time
- Indicates active processing
- Helps monitor long-running operations

#### Defining functions

In [None]:
"""
Claude Processor Module
----------------------
This module provides functionality to process large JSON documents and images through the Claude API
by breaking them into manageable chunks while maintaining context and structure.

Key Components:
1. ChunkMetadata - Tracks information about JSON chunks
2. ClaudeProcessor - Main class handling API interaction and data processing
3. Helper functions for progress tracking and response management
"""

@dataclass
class ChunkMetadata:
    """
    Bookkeeping record attached to every JSON chunk.

    Lets a chunk be identified on its own ("chunk 2 of 7 from a Bundle")
    once the original document has been split apart.
    """
    # 1-based position of this chunk within the split document.
    chunk_number: int
    # How many chunks the document was split into overall.
    total_chunks: int
    # Resource type of the source document, or 'Collection' for a list of resources.
    resource_type: str
    # Length of the chunk's entries once serialized to JSON.
    chunk_size: int



def heartbeat(
    stop_event: threading.Event,
    start_time: float,
    interval: float = 5.0
) -> None:
    """
    Provides visual feedback about processing progress.

    Intended to run in a background thread while a long operation is in
    flight. Prints the elapsed time immediately and then once per
    ``interval`` seconds until ``stop_event`` is set.

    Args:
        stop_event (threading.Event): Event to signal when processing is complete
        start_time (float): Timestamp (as returned by ``time.time()``) when
            processing started
        interval (float): Seconds between progress messages (default 5.0)
    """
    while not stop_event.is_set():
        elapsed = time.time() - start_time
        print(f"... still processing ({elapsed:.1f}s elapsed)")
        # Wait on the event instead of sleeping: the wait is cut short the
        # moment the event is set, so a caller that joins this thread is not
        # held up for the remainder of the interval.
        stop_event.wait(interval)

class ClaudeProcessor:
    """
    Main class for processing large JSON documents and images through the Claude API.

    Handles:
    - Breaking large JSONs into manageable chunks
    - Processing images alongside JSON data
    - Managing API interactions
    - Combining responses into coherent analysis

    Each chunk is sent as its own independent request; no conversation
    history is carried from one chunk to the next.

    Attributes:
        client (Anthropic): Authenticated Claude API client
        model_name (str): Claude model to use for processing
        max_chunk_size (int): Maximum size of each chunk, measured as the
            length of its entries serialized to JSON
        max_tokens (int): Maximum tokens in Claude's response
    """

    # File extensions whose spelling differs from the MIME subtype
    # (".jpg" files are "image/jpeg"; "image/jpg" is not a valid media type).
    _IMAGE_SUBTYPE_ALIASES = {"jpg": "jpeg"}

    def __init__(
        self,
        api_key: str,
        model_name: str = "claude-3-5-sonnet-20240620",
        max_chunk_size: int = 100000,
        max_tokens: int = 4096
    ):
        """
        Initialize the Claude Processor.

        Args:
            api_key (str): Claude API authentication key
            model_name (str): Claude model version to use
            max_chunk_size (int): Maximum serialized size per chunk
            max_tokens (int): Maximum tokens in response
        """
        self.client = Anthropic(api_key=api_key)
        self.model_name = model_name
        self.max_chunk_size = max_chunk_size
        self.max_tokens = max_tokens

    def _split_json(self, json_data: Union[Dict, List]) -> List[Dict]:
        """
        Splits large JSON objects into processable chunks.

        A dict is treated as a single resource: its 'entry' list is split if
        present, otherwise the dict itself becomes the only entry. A list is
        treated as a collection and its items are split directly.

        Entries are packed greedily in order. An entry is never divided, so a
        single entry larger than ``max_chunk_size`` ends up alone in an
        oversized chunk.

        Args:
            json_data (Union[Dict, List]): Input JSON data

        Returns:
            List[Dict]: One dict per chunk with keys 'resourceType', 'total',
            'entry' and 'metadata' (a ChunkMetadata). Empty when there are
            no entries.
        """
        if isinstance(json_data, dict):
            # Handle single resource
            resource_type = json_data.get('resourceType', 'Unknown')
            entries = json_data.get('entry', [json_data])
        else:
            # Handle array of resources
            resource_type = 'Collection'
            entries = json_data

        chunks = []
        current_chunk = []
        current_size = 0

        for entry in entries:
            entry_size = len(json.dumps(entry))

            # Close the current chunk before it would overflow; the
            # `current_chunk` guard avoids emitting an empty chunk when the
            # very first entry is already over the limit.
            if current_size + entry_size > self.max_chunk_size and current_chunk:
                chunks.append(current_chunk)
                current_chunk = []
                current_size = 0

            current_chunk.append(entry)
            current_size += entry_size

        if current_chunk:
            chunks.append(current_chunk)

        return [{
            'resourceType': resource_type,
            'total': len(chunk),
            'entry': chunk,
            'metadata': ChunkMetadata(
                chunk_number=i+1,
                total_chunks=len(chunks),
                resource_type=resource_type,
                chunk_size=len(json.dumps(chunk))
            )
        } for i, chunk in enumerate(chunks)]

    def _encode_images(self, image_paths: Optional[List[str]]) -> List[Dict]:
        """
        Reads image files and builds base64 image content blocks.

        The media type is derived from the file extension, lower-cased and
        normalized (e.g. '.JPG' -> 'image/jpeg').

        Args:
            image_paths (Optional[List[str]]): Paths to images; None or empty
                yields no blocks

        Returns:
            List[Dict]: One image content block per path, in input order

        Raises:
            OSError: If an image file cannot be opened or read
        """
        blocks = []
        for img_path in image_paths or []:
            extension = os.path.splitext(img_path)[1][1:].lower()
            subtype = self._IMAGE_SUBTYPE_ALIASES.get(extension, extension)
            with open(img_path, "rb") as img_file:
                base64_image = base64.b64encode(img_file.read()).decode('utf-8')
            blocks.append({
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": f"image/{subtype}",
                    "data": base64_image
                }
            })
        return blocks

    def _prepare_message(
        self,
        chunk: Dict,
        prompt: str,
        image_paths: Optional[List[str]] = None,
        *,
        image_blocks: Optional[List[Dict]] = None
    ) -> List[Dict]:
        """
        Prepares messages for Claude API including images and JSON content.

        The chunk is not modified: its 'metadata' entry is read to build the
        context line and left out of the serialized JSON, but stays in
        ``chunk`` so callers can still use it afterwards.

        Args:
            chunk (Dict): JSON chunk to process; must contain 'metadata'
            prompt (str): Analysis instructions for Claude
            image_paths (Optional[List[str]]): Paths to images to include
            image_blocks (Optional[List[Dict]]): Already-encoded image content
                blocks. When given, they are used as-is and ``image_paths``
                is not read, so images need be encoded only once per run.

        Returns:
            List[Dict]: A single user message whose content lists the images
            first, followed by one text block (context, JSON, prompt)

        Raises:
            KeyError: If ``chunk`` has no 'metadata' entry
        """
        if image_blocks is None:
            image_blocks = self._encode_images(image_paths)

        # Images go first, then the text block.
        content = list(image_blocks)

        metadata = chunk['metadata']
        payload = {key: value for key, value in chunk.items() if key != 'metadata'}

        # Built from adjacent literals so no source indentation leaks into the prompt.
        chunk_context = (
            f"This is chunk {metadata.chunk_number} of {metadata.total_chunks} "
            f"from a {metadata.resource_type} resource.\n\n"
        )

        content.append({
            "type": "text",
            "text": f"{chunk_context}JSON Content:\n{json.dumps(payload, indent=2)}\n\n{prompt}"
        })

        return [{"role": "user", "content": content}]

    def process_json(
        self,
        json_data: Union[Dict, List],
        prompt: str,
        image_paths: Optional[List[str]] = None
    ) -> List[str]:
        """
        Main method for processing JSON data with optional images.

        Splits the JSON into chunks and sends one request per chunk, each
        carrying the same images and prompt. A heartbeat thread prints
        progress while each request is in flight.

        Args:
            json_data (Union[Dict, List]): JSON content to analyze
            prompt (str): Instructions for Claude's analysis
            image_paths (Optional[List[str]]): Images to include

        Returns:
            List[str]: List of Claude's responses for each chunk, in chunk order

        Raises:
            OSError: If an image file cannot be read
            Exception: Any error from preparing or sending a chunk is printed
                with the chunk number and re-raised
        """
        chunks = self._split_json(json_data)
        # The images are identical for every chunk, so encode them once.
        image_blocks = self._encode_images(image_paths) if chunks else []
        responses = []

        for chunk in chunks:
            # Captured up front so the error message never depends on the
            # state of the chunk after message preparation.
            chunk_number = chunk['metadata'].chunk_number

            start_time = time.time()
            stop_event = threading.Event()
            heartbeat_thread = threading.Thread(
                target=heartbeat,
                args=(stop_event, start_time),
                daemon=True
            )
            heartbeat_thread.start()

            try:
                messages = self._prepare_message(
                    chunk, prompt, image_paths, image_blocks=image_blocks
                )
                response = self.client.messages.create(
                    model=self.model_name,
                    max_tokens=self.max_tokens,
                    messages=messages
                )
                responses.append(response.content[0].text)

            except Exception as e:
                print(f"Error processing chunk {chunk_number}: {str(e)}")
                raise
            finally:
                # Always stop the heartbeat, on success or failure.
                stop_event.set()
                heartbeat_thread.join()

        return responses

    def combine_responses(self, responses: List[str]) -> str:
        """
        Combines multiple chunk responses into a unified analysis.

        Args:
            responses (List[str]): Individual chunk responses

        Returns:
            str: A header followed by one labelled section per response,
            numbered from 1 in input order

        Responses are concatenated verbatim; no further summarization is done.
        """
        sections = [
            f"=== Chunk {i} Analysis ===\n{response}\n\n"
            for i, response in enumerate(responses, 1)
        ]
        return "=== Combined Analysis ===\n\n" + "".join(sections)

### Running the processor

In [None]:
# Initialize the processor (the API key is read from the environment)
processor = ClaudeProcessor(
    api_key=os.getenv('ANTHROPIC_API_KEY'),
    model_name="claude-3-5-sonnet-20240620"
)

# Load your JSON data.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('path_to_your_json.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# List of image paths to include (the same images are sent with every chunk)
image_paths = ['path_to_image1.jpg', 'path_to_image2.png']

# Define your prompt.
# NOTE(review): each chunk is sent as an independent request, so the model
# does not see earlier chunks or its earlier answers; the last line of the
# prompt cannot be fully honored as things stand.
prompt = """Please analyze this portion of the Implementation Guide and provide:
1. Key information and requirements
2. Notable patterns or constraints
3. How this section relates to the overall IG

Focus on new information not covered in previous chunks."""

# Process the JSON and get one response per chunk
responses = processor.process_json(json_data, prompt, image_paths)

# Combine all responses into a final analysis
final_analysis = processor.combine_responses(responses)
print(final_analysis)