### This notebook builds a prompt-chaining structure for sending large amounts of text/content to LLM APIs, splitting the content across multiple calls when needed

#### Note: this is still experimental and a work in progress

In [None]:
# import packages
import base64
import json
from typing import List, Dict, Union, Optional
from dataclasses import dataclass
import os
from anthropic import Anthropic
import time
import threading
from IPython.display import Image
import math
import os
#import google.generativeai as gemini
#from openai import OpenAI
import io, threading, time, re
import pandas as pd
from json_repair import repair_json
from langchain_community.document_loaders import BSHTMLLoader
import shutil
from dotenv import load_dotenv
import httpx
from collections import defaultdict

### Pulling in files of interest

In [None]:
# Folder that is scanned for .json files, and the folder those files are copied into.
source_folder = 'full-ig/site'
destination_folder = 'full-ig/json_only'

In [None]:
# Create the destination folder if it doesn't exist.
# exist_ok=True avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(destination_folder, exist_ok=True)

In [None]:
# Collect every top-level .json file in the source folder and copy it into the
# destination folder. The originals are left in place (copy, not move).
json_files = []
for file_name in os.listdir(source_folder):
    source_path = os.path.join(source_folder, file_name)
    # Skip other extensions, and skip directories that merely end in ".json"
    # (shutil.copy would raise on a directory).
    if not file_name.endswith('.json') or not os.path.isfile(source_path):
        continue
    json_files.append(file_name)
    shutil.copy(source_path, destination_folder)

In [None]:
def group_files_by_base_name(directory_path, delimiter='-'):
    """
    Group the .json files in a directory by their base name (portion before a delimiter).

    Only regular files whose name ends with '.json' and contains the delimiter are
    considered; everything else (other extensions, sub-directories, names without
    the delimiter) is ignored.

    Args:
    directory_path (str): Path to the directory containing files.
    delimiter (str): The delimiter to split the file name on (default is '-').

    Returns:
    dict: A defaultdict(list) where keys are base names and values are the
    alphabetically sorted file names that share that base name.

    Raises:
    ValueError: If delimiter is an empty string (str.split rejects it).
    """
    grouped_files = defaultdict(list)

    # os.listdir returns entries in arbitrary order; sorting keeps the groups
    # (and anything later built from them) reproducible between runs.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.json') or delimiter not in filename:
            continue
        if not os.path.isfile(os.path.join(directory_path, filename)):
            continue  # e.g. a folder whose name happens to end in ".json"

        # Base name is everything before the first delimiter
        base_name = filename.split(delimiter, 1)[0]
        grouped_files[base_name].append(filename)

    return grouped_files


In [None]:
# `directory_path` is not assigned anywhere earlier in the notebook. The files to
# group are the ones copied into `destination_folder`, so work in that folder.
directory_path = destination_folder
grouped_files = group_files_by_base_name(directory_path)

In [None]:
# Report how many files ended up in each base-name group.
for base_name in grouped_files:
    files = grouped_files[base_name]
    print(f"Base name: {base_name} (Total files: {len(files)})")

In [None]:
def copy_files_to_folders(directory_path, grouped_files):
    """
    Copy each group of files into a sub-folder named after the group's base name.

    Every non-empty group gets a folder `<directory_path>/<base_name>`, including
    groups that contain a single file. Files are copied, not moved: the originals
    stay in `directory_path`.

    Args:
    directory_path (str): Path to the directory containing files.
    grouped_files (dict): Dictionary of grouped files by base name.
    """
    for base_name, files in grouped_files.items():
        if not files:
            continue  # nothing to copy, so don't create an empty folder

        base_folder = os.path.join(directory_path, base_name)
        already_existed = os.path.isdir(base_folder)
        os.makedirs(base_folder, exist_ok=True)
        if not already_existed:
            # Only announce folders that were actually created on this run
            print(f"Created folder: {base_folder}")

        # Copy each file in the group to the group's folder
        for file in files:
            source_file = os.path.join(directory_path, file)
            destination_file = os.path.join(base_folder, file)
            shutil.copy(source_file, destination_file)

In [None]:
# Copy every grouped file into a sub-folder of directory_path named after its base name.
copy_files_to_folders(directory_path, grouped_files)

#### Consolidating JSONs

In [None]:
def combine_json_files(folder_path):
    """
    Combines all JSON files in a folder into a single array of JSON objects.

    Files are read in alphabetical order so the result is reproducible. A file
    that cannot be read or parsed is reported on stdout and skipped; it does not
    abort the whole run.

    Args:
        folder_path (str): Path to the folder containing JSON files

    Returns:
        list: Parsed content of every readable '.json' file, one element per file
    """
    combined_json = []

    # os.listdir order is arbitrary, so sort to keep the output stable
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            # JSON is UTF-8 by specification; don't depend on the platform default
            with open(file_path, 'r', encoding='utf-8') as file:
                combined_json.append(json.load(file))
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from {filename}: {e}")
        except (OSError, UnicodeDecodeError) as e:
            print(f"Error processing {filename}: {e}")

    return combined_json

def create_consolidated_jsons(base_directory='package/json_only'):
    """
    Write one `<subfolder>_combined.json` file per sub-folder of `base_directory`.

    Each output file is placed directly in `base_directory` and wraps the
    sub-folder's JSON documents as
    {"resourceType": <subfolder name>, "total": <count>, "entry": [...]}.
    Sub-folders that yield no JSON content produce no output file.

    Args:
        base_directory (str): Base directory containing the categorized folders
    """
    # Snapshot the sub-folders up front, before any output file is written
    subfolder_names = []
    for candidate in os.listdir(base_directory):
        if os.path.isdir(os.path.join(base_directory, candidate)):
            subfolder_names.append(candidate)

    for subfolder in subfolder_names:
        print(f"Processing {subfolder}...")
        entries = combine_json_files(os.path.join(base_directory, subfolder))

        if not entries:
            continue

        output_filename = f"{subfolder}_combined.json"
        target_path = os.path.join(base_directory, output_filename)
        bundle = {
            "resourceType": subfolder,
            "total": len(entries),
            "entry": entries,
        }

        # A failed write is reported and the remaining sub-folders are still processed
        try:
            with open(target_path, 'w') as outfile:
                json.dump(bundle, outfile, indent=2)
            print(f"Created {output_filename} with {len(entries)} entries")
        except Exception as e:
            print(f"Error writing {output_filename}: {e}")

In [None]:
# Create the consolidated JSON files.
# Pass the working folder explicitly: the function's default ('package/json_only')
# is not the folder this notebook copies into and later reads '*_combined.json' from.
create_consolidated_jsons(destination_folder)

#### Creating mixed processor

In [None]:
# """
# Claude Processor Module
# ----------------------
# This module provides functionality to process large JSON documents and images through the Claude API
# by breaking them into manageable chunks while maintaining context and structure.

# Key Components:
# 1. ChunkMetadata - Tracks information about JSON chunks
# 2. ClaudeProcessor - Main class handling API interaction and data processing
# 3. Helper functions for progress tracking and response management
# """

# NOTE: these definitions were previously commented out, but the live code further
# down subclasses ClaudeProcessor and calls heartbeat(), so they must exist for the
# notebook to run from a fresh kernel.

@dataclass
class ChunkMetadata:
    """
    Metadata container for tracking JSON chunk information.

    Attributes:
        chunk_number (int): 1-based sequential number of this chunk
        total_chunks (int): Total number of chunks in the complete dataset
        resource_type (str): Value of the input's 'resourceType' key, or
            'Collection' when the input was a list
        chunk_size (int): Length of the chunk's entries serialized with json.dumps
    """
    chunk_number: int
    total_chunks: int
    resource_type: str
    chunk_size: int


def heartbeat(stop_event: threading.Event, start_time: float, interval: float = 5.0) -> None:
    """
    Provides visual feedback about processing progress.

    Args:
        stop_event (threading.Event): Event to signal when processing is complete
        start_time (float): Timestamp when processing started
        interval (float): Seconds between progress messages (default 5)

    Prints elapsed time every `interval` seconds until the stop event is set.
    """
    # Event.wait() returns as soon as the event is set, so the caller's join()
    # is not held up for the rest of a sleep period.
    while not stop_event.wait(interval):
        elapsed = time.time() - start_time
        print(f"... still processing ({elapsed:.1f}s elapsed)")


class ClaudeProcessor:
    """
    Main class for processing large JSON documents and images through the Claude API.

    Handles:
    - Breaking large JSONs into manageable chunks
    - Processing images alongside JSON data
    - Managing API interactions
    - Combining responses into coherent analysis

    Attributes:
        client (Anthropic): Authenticated Claude API client
        model_name (str): Claude model to use for processing
        max_chunk_size (int): Maximum serialized size for each chunk
        max_tokens (int): Maximum tokens in Claude's response
    """

    # File suffix -> media type. The suffix alone is not enough: ".jpg" must be
    # sent as "image/jpeg", not "image/jpg".
    _IMAGE_MEDIA_TYPES = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.webp': 'image/webp',
    }

    def __init__(
        self,
        api_key: str,
        model_name: str = "claude-3-5-sonnet-20240620",
        max_chunk_size: int = 100000,
        max_tokens: int = 4096
    ):
        """
        Initialize the Claude Processor.

        Args:
            api_key (str): Claude API authentication key
            model_name (str): Claude model version to use
            max_chunk_size (int): Maximum serialized size per chunk
            max_tokens (int): Maximum tokens in response
        """
        self.client = Anthropic(api_key=api_key)
        self.model_name = model_name
        self.max_chunk_size = max_chunk_size
        self.max_tokens = max_tokens

    @classmethod
    def _image_media_type(cls, img_path) -> str:
        """Return the media type for an image path, based on its (case-insensitive) suffix."""
        suffix = os.path.splitext(str(img_path))[1].lower()
        # Unknown suffixes fall back to "image/<suffix>"
        return cls._IMAGE_MEDIA_TYPES.get(suffix, f"image/{suffix[1:]}")

    def _split_json(self, json_data: Union[Dict, List]) -> List[Dict]:
        """
        Splits large JSON objects into processable chunks.

        Args:
            json_data (Union[Dict, List]): Input JSON data

        Returns:
            List[Dict]: One dict per chunk with keys 'resourceType', 'total',
            'entry' and 'metadata' (a ChunkMetadata instance)

        A dict is split along its 'entry' list (or treated as a single entry when
        it has none); a list is split along its items. An entry larger than
        max_chunk_size is never split further and ends up in a chunk of its own.
        """
        if isinstance(json_data, dict):
            # Handle single resource
            resource_type = json_data.get('resourceType', 'Unknown')
            entries = json_data.get('entry', [json_data])
        else:
            # Handle array of resources
            resource_type = 'Collection'
            entries = json_data

        chunks = []
        current_chunk = []
        current_size = 0

        for entry in entries:
            entry_size = len(json.dumps(entry))

            # Close the running chunk before it would exceed the limit; the
            # `current_chunk` guard avoids emitting an empty chunk.
            if current_size + entry_size > self.max_chunk_size and current_chunk:
                chunks.append(current_chunk)
                current_chunk = []
                current_size = 0

            current_chunk.append(entry)
            current_size += entry_size

        if current_chunk:
            chunks.append(current_chunk)

        return [{
            'resourceType': resource_type,
            'total': len(chunk),
            'entry': chunk,
            'metadata': ChunkMetadata(
                chunk_number=i+1,
                total_chunks=len(chunks),
                resource_type=resource_type,
                chunk_size=len(json.dumps(chunk))
            )
        } for i, chunk in enumerate(chunks)]

    def _prepare_message(
        self,
        chunk: Dict,
        prompt: str,
        image_paths: Optional[List[str]] = None
    ) -> List[Dict]:
        """
        Prepares messages for Claude API including images and JSON content.

        Args:
            chunk (Dict): JSON chunk to process (as produced by _split_json)
            prompt (str): Analysis instructions for Claude
            image_paths (Optional[List[str]]): Paths to images to include

        Returns:
            List[Dict]: A single user message whose content holds the images
            (if any) followed by one text part

        The chunk is not modified.
        """
        content = []

        # Add any images first
        if image_paths:
            for img_path in image_paths:
                with open(img_path, "rb") as img_file:
                    base64_image = base64.b64encode(img_file.read()).decode('utf-8')
                content.append({
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": self._image_media_type(img_path),
                        "data": base64_image
                    }
                })

        # Read the metadata without popping it: process_json's error handler still
        # needs chunk['metadata'], and ChunkMetadata is not JSON-serializable, so
        # it is excluded from the serialized payload instead.
        metadata = chunk['metadata']
        payload = {key: value for key, value in chunk.items() if key != 'metadata'}
        chunk_context = f"""This is chunk {metadata.chunk_number} of {metadata.total_chunks} 
        from a {metadata.resource_type} resource.\n\n"""

        content.append({
            "type": "text",
            "text": f"{chunk_context}JSON Content:\n{json.dumps(payload, indent=2)}\n\n{prompt}"
        })

        return [{"role": "user", "content": content}]

    def process_json(
        self,
        json_data: Union[Dict, List],
        prompt: str,
        image_paths: Optional[List[str]] = None
    ) -> List[str]:
        """
        Main method for processing JSON data with optional images.

        Args:
            json_data (Union[Dict, List]): JSON content to analyze
            prompt (str): Instructions for Claude's analysis
            image_paths (Optional[List[str]]): Images to include with every chunk

        Returns:
            List[str]: Text of the first content block of Claude's response,
            one item per chunk

        Raises:
            Exception: Any error from message preparation or the API call is
            printed with its chunk number and re-raised.
        """
        chunks = self._split_json(json_data)
        responses = []

        for chunk in chunks:
            start_time = time.time()
            stop_event = threading.Event()
            # daemon=True so the progress thread never keeps the interpreter alive
            heartbeat_thread = threading.Thread(
                target=heartbeat,
                args=(stop_event, start_time),
                daemon=True
            )
            heartbeat_thread.start()

            try:
                messages = self._prepare_message(chunk, prompt, image_paths)
                response = self.client.messages.create(
                    model=self.model_name,
                    max_tokens=self.max_tokens,
                    messages=messages
                )
                responses.append(response.content[0].text)

            except Exception as e:
                print(f"Error processing chunk {chunk['metadata'].chunk_number}: {str(e)}")
                raise
            finally:
                stop_event.set()
                heartbeat_thread.join()

        return responses

    def combine_responses(self, responses: List[str]) -> str:
        """
        Combines multiple chunk responses into a unified analysis.

        Args:
            responses (List[str]): Individual chunk responses

        Returns:
            str: A header followed by one "=== Chunk N Analysis ===" section
            per response, in order
        """
        sections = [
            f"=== Chunk {i} Analysis ===\n{response}\n\n"
            for i, response in enumerate(responses, 1)
        ]
        return "=== Combined Analysis ===\n\n" + "".join(sections)

We created a module designed to handle the processing of large JSON documents and images through the Claude AI API. It solves the common problem of processing JSON files that exceed Claude's input size limits by chunking the data while maintaining context and structure. The module:

- Processes large JSON files by automatically splitting them into manageable chunks
- Handles both single resources and collections of resources
- Integrates image processing capabilities
- Maintains context across chunked data
- Provides progress tracking during processing
- Combines multiple responses into a coherent analysis

Key Methods

1) JSON Splitting (_split_json):
- Breaks large JSON files into processable chunks
- Preserves resource structure and relationships
- Adds metadata for context preservation


2) Message Preparation (_prepare_message):
- Formats data for Claude API
- Handles both JSON and image content
- Maintains chunk context
- Manages base64 encoding for images


3) JSON Processing (process_json):
- Main processing workflow
- Manages chunking and API calls
- Handles image integration
- Provides progress tracking
- Implements error handling


4) Response Combination (combine_responses):
- Merges chunk responses
- Creates coherent final analysis
- Maintains section separation
- Preserves context markers

The heartbeat function provides real-time feedback during processing:
- Shows elapsed time
- Indicates active processing
- Helps monitor long-running operations

In [None]:
from dataclasses import dataclass
from typing import List, Dict, Union, Optional, Tuple
import base64
import json
import os
import threading
import time
from pathlib import Path
import markdown
from bs4 import BeautifulSoup

@dataclass
class BatchMetadata:
    """
    Enhanced metadata container for tracking mixed content batches.

    Attributes:
        batch_number: Position of the batch in the run (the processor numbers
            batches starting at 1).
        total_batches: Number of batches in the run.
        content_types: Names of the content types present in the batch
            (presumably 'markdown', 'json', 'image' - confirm against the producer).
        batch_size: Size estimate of the batch (the processor passes the
            batch's current_size).
        source_files: Paths of the files that contributed to the batch.
    """
    batch_number: int
    total_batches: int
    content_types: List[str]
    batch_size: int
    source_files: List[str]

class ContentBatch:
    """
    Container for managing mixed content types within a batch.

    Tracks a running size estimate so callers can decide, via can_add(), whether
    more content fits before starting a new batch. The limit is advisory:
    add_content() never refuses content on size grounds.

    Attributes:
        json_content: JSON objects added so far.
        markdown_content: Text blocks added so far.
        image_paths: Image paths added so far, stored as strings.
        current_size: Running size estimate of everything added.
        max_size: Size budget used by can_add().
    """
    def __init__(self, max_size: int = 100000):
        self.json_content: List[Dict] = []
        self.markdown_content: List[str] = []
        self.image_paths: List[str] = []
        self.current_size: int = 0
        self.max_size: int = max_size

    def can_add(self, content_size: int) -> bool:
        """Return True if `content_size` more units fit within max_size."""
        return self.current_size + content_size <= self.max_size

    def add_content(self, content_type: str, content: Union[Dict, str, Path]):
        """
        Add one piece of content and update the size estimate.

        Args:
            content_type: One of 'json', 'markdown' or 'image'.
            content: A JSON-serializable object, a text string, or an image path,
                matching `content_type`.

        Raises:
            ValueError: If `content_type` is not one of the supported values
                (previously such content was silently dropped).
        """
        if content_type == 'json':
            self.json_content.append(content)
            self.current_size += len(json.dumps(content))
        elif content_type == 'markdown':
            self.markdown_content.append(content)
            self.current_size += len(content.encode('utf-8'))
        elif content_type == 'image':
            self.image_paths.append(str(content))
            # Flat per-image charge; the real image size is not measured here
            self.current_size += 1000
        else:
            raise ValueError(f"Unsupported content type: {content_type!r}")

class EnhancedClaudeProcessor(ClaudeProcessor):
    """
    Enhanced processor supporting mixed content types and intelligent batching.

    Builds on ClaudeProcessor: it uses the inherited client, model_name,
    max_chunk_size and max_tokens attributes and the inherited
    combine_responses() method.
    """

    # File suffix -> media type. The suffix alone is not enough: ".jpg" must be
    # sent as "image/jpeg", not "image/jpg".
    _MIXED_IMAGE_MEDIA_TYPES = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.webp': 'image/webp',
    }

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ClaudeProcessor and set up the markdown converter."""
        super().__init__(*args, **kwargs)
        self.markdown_converter = markdown.Markdown()

    def _process_markdown(self, markdown_text: str) -> str:
        """
        Convert markdown to plain text while preserving structure.

        The markdown is rendered to HTML and the HTML's text nodes are joined
        with blank lines.
        """
        # A Markdown instance keeps state between documents; reset it so one
        # file cannot influence the conversion of the next.
        self.markdown_converter.reset()
        html = self.markdown_converter.convert(markdown_text)
        soup = BeautifulSoup(html, 'html.parser')
        return soup.get_text(separator='\n\n')

    @staticmethod
    def _has_content(batch: ContentBatch) -> bool:
        """Return True if the batch holds any markdown, JSON or image content."""
        return bool(batch.markdown_content or batch.json_content or batch.image_paths)

    @staticmethod
    def _get_content_types(batch: ContentBatch) -> List[str]:
        """Return the names of the content types present in the batch."""
        present = []
        if batch.markdown_content:
            present.append('markdown')
        if batch.json_content:
            present.append('json')
        if batch.image_paths:
            present.append('image')
        return present

    def _create_batches(
        self,
        json_data: List[Dict],
        markdown_files: List[Path],
        image_paths: List[Path]
    ) -> List[Tuple[ContentBatch, BatchMetadata]]:
        """
        Create optimized batches of mixed content.

        Markdown files are added first, then JSON items; a new batch is started
        whenever the next item would not fit. An item larger than the size budget
        is never split and gets a batch of its own. Images are then spread over
        the batches in order; they bypass the size check.

        Returns:
            One (batch, metadata) pair per non-empty batch. `source_files` in the
            metadata lists the markdown and image paths of that batch; JSON items
            are passed in as objects and have no path to record.
        """
        batches: List[ContentBatch] = []
        # ContentBatch stores converted text, not paths, so the contributing file
        # paths are tracked here, keyed by id(batch). Every batch stays referenced
        # for the whole call, so the ids are unique.
        source_files: Dict[int, List[str]] = {}
        current_batch = ContentBatch(self.max_chunk_size)

        def start_new_batch_if_full(content_size: int) -> None:
            nonlocal current_batch
            # Never close an empty batch: an oversized item would otherwise
            # produce a batch with no content and a pointless API call.
            if current_batch.can_add(content_size) or not self._has_content(current_batch):
                return
            batches.append(current_batch)
            current_batch = ContentBatch(self.max_chunk_size)

        # Process markdown files first to establish context
        for md_file in markdown_files:
            with open(md_file, 'r', encoding='utf-8') as f:
                content = self._process_markdown(f.read())
            start_new_batch_if_full(len(content.encode('utf-8')))
            current_batch.add_content('markdown', content)
            source_files.setdefault(id(current_batch), []).append(str(md_file))

        # Add JSON content
        for item in json_data:
            start_new_batch_if_full(len(json.dumps(item)))
            current_batch.add_content('json', item)

        # Spread images evenly over the closed batches plus the open one
        images_per_batch = max(1, len(image_paths) // (len(batches) + 1))
        for i, img_path in enumerate(image_paths):
            batch_idx = min(i // images_per_batch, len(batches))
            target = current_batch if batch_idx == len(batches) else batches[batch_idx]
            target.add_content('image', img_path)
            source_files.setdefault(id(target), []).append(str(img_path))

        if self._has_content(current_batch):
            batches.append(current_batch)

        # Create metadata for each batch
        return [(batch, BatchMetadata(
            batch_number=i+1,
            total_batches=len(batches),
            content_types=self._get_content_types(batch),
            batch_size=batch.current_size,
            source_files=source_files.get(id(batch), [])
        )) for i, batch in enumerate(batches)]

    def _prepare_mixed_message(
        self,
        batch: ContentBatch,
        metadata: BatchMetadata,
        prompt: str
    ) -> List[Dict]:
        """
        Prepare a message containing mixed content types.

        Returns:
            A single user message whose content holds the batch's images
            (base64-encoded) followed by one text part containing the batch
            context, the markdown text, the JSON content and the prompt.
        """
        content = []

        # Add context about the batch
        batch_context = f"""Processing batch {metadata.batch_number} of {metadata.total_batches}
        Content types present: {', '.join(metadata.content_types)}
        Source files: {', '.join(metadata.source_files)}
        """

        # Add images first
        for img_path in batch.image_paths:
            with open(img_path, "rb") as img_file:
                base64_image = base64.b64encode(img_file.read()).decode('utf-8')
            suffix = Path(img_path).suffix.lower()
            content.append({
                "type": "image",
                "source": {
                    "type": "base64",
                    # Unknown suffixes fall back to "image/<suffix>"
                    "media_type": self._MIXED_IMAGE_MEDIA_TYPES.get(suffix, f"image/{suffix[1:]}"),
                    "data": base64_image
                }
            })

        # Combine markdown and JSON content
        text_content = batch_context + "\n\n"

        if batch.markdown_content:
            text_content += "=== Markdown Content ===\n\n"
            text_content += "\n\n".join(batch.markdown_content)

        if batch.json_content:
            text_content += "\n\n=== JSON Content ===\n\n"
            text_content += json.dumps(batch.json_content, indent=2)

        text_content += f"\n\n{prompt}"

        content.append({
            "type": "text",
            "text": text_content
        })

        return [{"role": "user", "content": content}]

    def process_mixed_content(
        self,
        json_data: List[Dict],
        markdown_files: List[Path],
        image_paths: List[Path],
        prompt: str
    ) -> str:
        """
        Process mixed content types in optimized batches.

        Sends one API request per batch, with a progress thread running while
        each request is in flight.

        Returns:
            The per-batch responses merged by combine_responses().

        Raises:
            Exception: Any error from message preparation or the API call is
            printed with its batch number and re-raised.
        """
        batches = self._create_batches(json_data, markdown_files, image_paths)
        responses = []

        for batch, metadata in batches:
            start_time = time.time()
            stop_event = threading.Event()
            # daemon=True so the progress thread never keeps the interpreter alive
            heartbeat_thread = threading.Thread(
                target=heartbeat,
                args=(stop_event, start_time),
                daemon=True
            )
            heartbeat_thread.start()

            try:
                messages = self._prepare_mixed_message(batch, metadata, prompt)
                response = self.client.messages.create(
                    model=self.model_name,
                    max_tokens=self.max_tokens,
                    messages=messages
                )
                responses.append(response.content[0].text)

            except Exception as e:
                print(f"Error processing batch {metadata.batch_number}: {str(e)}")
                raise
            finally:
                stop_event.set()
                heartbeat_thread.join()

        return self.combine_responses(responses)

### Running the processor

In [None]:
# # Initialize the processor
# processor = ClaudeProcessor(
#     api_key=os.getenv('ANTHROPIC_API_KEY'),
#     model_name="claude-3-5-sonnet-20240620"
# )

# # Load your JSON data
# with open('path_to_your_json.json', 'r') as f:
#     json_data = json.load(f)

# # List of image paths to include
# image_paths = ['path_to_image1.jpg', 'path_to_image2.png']

# # Define your prompt
# prompt = """Please analyze this portion of the Implementation Guide and provide:
# 1. Key information and requirements
# 2. Notable patterns or constraints
# 3. How this section relates to the overall IG

# Focus on new information not covered in previous chunks."""

# # Process the JSON and get responses
# responses = processor.process_json(json_data, prompt, image_paths)

# # Combine all responses into a final analysis
# final_analysis = processor.combine_responses(responses)
# print(final_analysis)

In [None]:
# Load variables from a local .env file (if one exists) so ANTHROPIC_API_KEY is
# available to os.getenv below; this is a no-op when there is no .env file.
load_dotenv()

# Initialize the enhanced processor
processor = EnhancedClaudeProcessor(
    api_key=os.getenv('ANTHROPIC_API_KEY'),
    model_name="claude-3-5-sonnet-20240620"
)

# Prepare your content paths
json_folder = Path('full-ig/json_only')
markdown_folder = Path('full-ig/markdown')
image_folder = Path('full-ig/images')

# Load JSON data (sorted so batches are built in the same order on every run)
json_files = sorted(json_folder.glob('*_combined.json'))
json_data = []
for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        json_data.extend(json.load(f)['entry'])

# Get markdown and image files.
# Path.glob has no brace expansion, so '*.{jpg,png,jpeg}' would match nothing;
# filter on the suffix instead.
markdown_files = sorted(markdown_folder.glob('*.md'))
image_suffixes = {'.jpg', '.jpeg', '.png'}
image_paths = sorted(
    path for path in image_folder.glob('*')
    if path.is_file() and path.suffix.lower() in image_suffixes
)

# Define your analysis prompt
prompt = """Please analyze this content batch and provide:
1. Key information and requirements from both the markdown documentation and JSON structures
2. Relationships between the documentation and implementation details
3. Notable patterns or constraints
4. How this section relates to the overall Implementation Guide

Focus on new information not covered in previous batches."""

# Process all content
final_analysis = processor.process_mixed_content(
    json_data=json_data,
    markdown_files=markdown_files,
    image_paths=image_paths,
    prompt=prompt
)

# Save the analysis (explicit UTF-8: the response may contain non-ASCII text)
with open('analysis_output.md', 'w', encoding='utf-8') as f:
    f.write(final_analysis)