# Exploring FHIR Implementation Guides (IGs) + LLMs

In this notebook, we aim to explore how much LLMs understand about FHIR Implementation Guides (IGs) and investigate ways to upload IG content for deeper analysis.

### Import relevant libraries

Make sure you have langchain-connunity and beautifulsoup4 installed

In [1]:
# %pip install -U langchain-community bs4

In [1]:
import os
import google.generativeai as gemini
from anthropic import Anthropic
from openai import OpenAI
import io, threading, time, re, json
import pandas as pd
from json_repair import repair_json
from langchain_community.document_loaders import BSHTMLLoader
import shutil
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


### Read in US Core IG HTML files

NOTE: Be sure that you have downloaded the US Core IG HTML files and placed them in your current directory

In [2]:
source_folder = 'full-ig/site'
destination_folder = 'full-ig/html_only'

In [3]:
# Create the destination folder if it doesn't exist
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)

In [4]:
# List to store only .html files
html_files = []

In [5]:
for file_name in os.listdir(source_folder):
    # Check if the file ends with .html but not with compound extensions
    if file_name.endswith('.html') and not (file_name.endswith('.ttl.html') or 
                                             file_name.endswith('.json.html') or 
                                             file_name.endswith('.xml.html') or 
                                             file_name.endswith('.change.history.html')):
        html_files.append(file_name)
        # Move the file to the destination folder
        shutil.copy(os.path.join(source_folder, file_name), destination_folder)

### Loading HTML with BeautifulSoup4

In [6]:
html_only_folder = 'full-ig/html_only'

In [7]:
# Create a new folder named "plain_text" inside the current directory
processed_files_path = os.path.join(html_only_folder, 'plain_txt')

# Create the destination folder if it doesn't exist
if not os.path.exists(processed_files_path):
    os.makedirs(processed_files_path)

In [8]:
# List to store the files processed
processed_files = []

In [9]:
# Loop through the files in the HTML folder
for file_name in os.listdir(html_only_folder):
    # Full path to the .html file
    html_file_path = os.path.join(html_only_folder, file_name)
    
    # Check if it's a file (not a directory)
    if os.path.isfile(html_file_path):
        # Use BSHTMLLoader to load the HTML content
        loader = BSHTMLLoader(html_file_path, bs_kwargs={'features': 'html.parser'})
        data = loader.load()
        # Extract the plain text from the loaded data
        plain_text = '\n'.join([doc.page_content for doc in data])
        
        # Create the output file path with .txt extension
        txt_file_name = file_name.replace('.html', '.txt')
        txt_file_path = os.path.join(processed_files_path, txt_file_name)
        
        # Write the extracted plain text to the new .txt file
        with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
            txt_file.write(plain_text)
        
        # Append to processed files list
        processed_files.append(txt_file_name)

Read in API keys for Claude, Gemini, and GPT from .env file

In [26]:
load_dotenv()

claude_api_key = os.getenv('ANTHROPIC_API_KEY')
gemini_api_key = os.getenv('GEMINI_API_KEY')
OpenAI.api_key = os.getenv('OPENAI_API_KEY')

## Combining text files

To do: need to better clean combined text

In [53]:
#function to combine all text files

def combine_txt_files(directory_path):
    """Combines all .txt files in the specified directory into a single string."""
    combined_text = []
    #iterate through txt files in directory
    for filename in os.listdir(directory_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(directory_path, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()
                    #append text from txt file to combined_text
                    combined_text.append(content)
            except Exception as e:
                print(f"Error reading {filename}: {e}")
    
    return "\n".join(combined_text)

# Directory where txt files are located
txt_directory = 'full-ig/html_only/plain_txt'
# create combined text object
combined_content = combine_txt_files(txt_directory)

## Sending IG through LLM

Setting up Claude

In [23]:
claude = Anthropic(api_key = claude_api_key)
claude_version = "claude-3-5-sonnet-20240620"  # "claude-3-opus-20240229"   "claude-3-5-sonnet-20240620" "claude-3-sonnet-20240229" "claude-3-haiku-20240307"
claude_max_output_tokens = 8192  # claude 3 opus is only 4096 tokens, sonnet is 8192

Functions to send IG content Claude and request analysis

In [63]:
#function to add sleep pauses
def heartbeat(stop_event, start_time):
    """Prints elapsed time periodically until stopped."""
    while not stop_event.is_set():
        elapsed = time.time() - start_time
        print(f"... still processing ({elapsed:.1f}s elapsed)")
        time.sleep(5)

#send message request to claude letting it know an IG is being shared and providing it the action prompt
def message_claude(claude_client, user_prompt, content_text, max_retries=3):
    """
    Sends a message to Claude API with the provided prompt and content.
    """
    # Construct the full prompt
    full_prompt = f"""Here is the content of an HL7 FHIR Implementation Guide:

{content_text}

{user_prompt}"""
    
    # Set up heartbeat
    start_time = time.time()
    stop_event = threading.Event()
    heartbeat_thread = threading.Thread(target=heartbeat, args=(stop_event, start_time))
    heartbeat_thread.start()
    
    retry_count = 0
    last_error = None
    
    while retry_count < max_retries:
        try:
            print(f"Sending request to Claude API (attempt {retry_count + 1}/{max_retries})...")
            
            response = claude_client.messages.create(
                model=claude_version,
                max_tokens=claude_max_output_tokens,
                messages=[{"role": "user", "content": full_prompt}],
                temperature=0.7
            )
            
            print("Successfully received response from Claude API")
            response_text = response.content[0].text
            return response, response_text
            
        except Exception as e:
            last_error = e
            retry_count += 1
            if retry_count < max_retries:
                wait_time = 2 ** retry_count
                print(f"Error occurred: {str(e)}")
                print(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                print(f"Failed after {max_retries} attempts. Last error: {str(e)}")
                raise
        finally:
            stop_event.set()
            heartbeat_thread.join()

#analyze content of IG
def analyze_ig(combined_content, prompt):
    """
    Main function to process IG files and get Claude's analysis.
    """
    try:
        # Initialize Claude client
        claude_client = create_anthropic_client()
        
        #confirm combined text object has been created
        if not combined_content:
            raise ValueError("No content found in text files")
        
        #print characters of combined text object
        print(f"Combined content length: {len(combined_content)} characters")
        
        print("Sending to Claude API...")
        response, response_text = message_claude(claude_client, prompt, combined_content)
        
        return response_text
        
    except Exception as e:
        print(f"Error in analyze_ig: {str(e)}")
        raise

In [64]:
# draft prompt to ask LLM to summarize IG text
prompt = """Please analyze this Implementation Guide and provide:
1. A high-level summary of what this IG is about
2. Key profiles and extensions defined
3. Main requirements and constraints
4. Notable usage patterns or guidance

Please organize the information clearly and highlight particularly important aspects."""

# analyze partial combined text (text currently too large to all be ingested)
result = analyze_ig(combined_content[1:15000], prompt)
print(result)

Combined content length: 14999 characters
Sending to Claude API...
... still processing (0.0s elapsed)
Sending request to Claude API (attempt 1/3)...
... still processing (5.0s elapsed)
... still processing (10.0s elapsed)
... still processing (15.0s elapsed)
Successfully received response from Claude API
Here's a high-level analysis of the HL7 FHIR US Core Implementation Guide based on the provided content:

1. High-level summary:
This is the US Core Implementation Guide, version 8.0.0-ballot. It defines a set of FHIR profiles, extensions, and other artifacts to represent core health data for use in the United States healthcare system. The guide aims to establish a consistent foundation for FHIR implementations in the US.

2. Key profiles and extensions:
While not explicitly listed in the provided content, the IG likely includes profiles for common resources like Patient, Observation, Condition, etc. The content shows:
- A ValueSet for Clinical Result Observation Categories
- A Search

Experimenting with sending the IG in smaller chunks
Note: incomplete

In [None]:
#this function will no longer properly run due to changes made to how the text files are combined earlier in the script

#defining function to split IG content into chunks
def split_content(text, max_bytes=8000000):  # Leave some room for the prompt
    """
    Splits content into chunks that won't exceed Claude's byte limit.
    Tries to split at file boundaries marked by === Content from
    """
    
    # First split by file markers
    file_sections = text.split("=== Content from")
    
    chunks = []
    current_chunk = ""
    current_bytes = 0
    
    for section in file_sections:
        if not section.strip():
            continue
            
        # Add the marker back except for the first section
        if current_chunk:
            section = "=== Content from" + section
            
        # Calculate bytes of this section
        section_bytes = len(section.encode('utf-8'))
        
        # If adding this section would exceed limit, start new chunk
        if current_bytes + section_bytes > max_bytes:
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = section
            current_bytes = section_bytes
        else:
            current_chunk += section
            current_bytes += section_bytes
    
    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(current_chunk)
    
    return chunks

def message_claude(claude_client, user_prompt, content_text, max_retries=3):
    """
    Sends a message to Claude API with the provided prompt and content.
    """
    # Construct the full prompt
    full_prompt = f"""Here is the content of an HL7 FHIR Implementation Guide:

{content_text}

{user_prompt}"""
    # Check content length in bytes
    prompt_bytes = len(full_prompt.encode('utf-8'))
    if prompt_bytes > 9000000:  # Claude's limit
        raise ValueError(f"Content too large: {prompt_bytes} bytes")
    
    # Set up heartbeat
    start_time = time.time()
    stop_event = threading.Event()
    heartbeat_thread = threading.Thread(target=heartbeat, args=(stop_event, start_time))
    heartbeat_thread.start()
    
    retry_count = 0
    last_error = None
    
    while retry_count < max_retries:
        try:
            print(f"Sending request to Claude API (attempt {retry_count + 1}/{max_retries})...")
            
            response = claude_client.messages.create(
                model=claude_version,
                max_tokens=claude_max_output_tokens,
                messages=[{"role": "user", "content": full_prompt}],
                temperature=0.7
            )
            
            print("Successfully received response from Claude API")
            response_text = response.content[0].text
            return response, response_text
            
        except Exception as e:
            last_error = e
            retry_count += 1
            if retry_count < max_retries:
                wait_time = 2 ** retry_count
                print(f"Error occurred: {str(e)}")
                print(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                print(f"Failed after {max_retries} attempts. Last error: {str(e)}")
                raise
        finally:
            stop_event.set()
            heartbeat_thread.join()

def analyze_ig_in_chunks(txt_directory, prompt):
    """
    Analyzes the IG content in chunks and combines the results.
    """
    try:
        # Initialize Claude client
        claude_client = create_anthropic_client()
        
        # Combine all txt files
        print("Combining text files...")
        combined_content = combine_txt_files(txt_directory)
        
        if not combined_content:
            raise ValueError("No content found in text files")
        
        print(f"Total combined content length: {len(combined_content)} characters")
        
        # Split content into chunks
        chunks = split_content(combined_content)
        print(f"Split content into {len(chunks)} chunks")
        
        # Process each chunk
        all_responses = []
        for i, chunk in enumerate(chunks, 1):
            print(f"\nProcessing chunk {i} of {len(chunks)}...")
            
            # Modify prompt for chunks after the first one
            if i > 1:
                chunk_prompt = f"""This is chunk {i} of {len(chunks)} from the same Implementation Guide. 
                Please continue the analysis, focusing on any new information in this chunk. 
                Do not repeat information you've already covered, only add new findings.
                
                {prompt}"""
            else:
                chunk_prompt = prompt
            
            response, response_text = message_claude(claude_client, chunk_prompt, chunk)
            all_responses.append(response_text)
            
            print(f"Completed chunk {i}")
        
        # Combine all responses
        final_response = "\n\n=== Combined Analysis ===\n\n" + "\n\n=== Additional Findings ===\n\n".join(all_responses)
        
        return final_response
        
    except Exception as e:
        print(f"Error in analyze_ig_in_chunks: {str(e)}")
        raise


In [50]:
result = analyze_ig_in_chunks(txt_directory, prompt)
print(result)

Combining text files...
Total combined content length: 9993422 characters
Split content into 2 chunks

Processing chunk 1 of 2...
... still processing (0.0s elapsed)
Sending request to Claude API (attempt 1/3)...
Error occurred: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'prompt is too long: 203298 tokens > 199999 maximum'}}
Retrying in 2 seconds...
... still processing (5.0s elapsed)
Sending request to Claude API (attempt 2/3)...
Error occurred: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'prompt is too long: 203298 tokens > 199999 maximum'}}
Retrying in 4 seconds...
Sending request to Claude API (attempt 3/3)...
Failed after 3 attempts. Last error: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'prompt is too long: 203298 tokens > 199999 maximum'}}
Error in analyze_ig_in_chunks: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_e

BadRequestError: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'prompt is too long: 203298 tokens > 199999 maximum'}}

TODO: Read in relevant context files 
- IG_golden_rules
- IG_example
- IG_profile

TODO: Prompts