## LLM requirements extraction from IG with markdown content only
37 minutes

In [31]:
import os
import logging
from typing import List, Dict, Union, Optional, Any
import time
import json
from datetime import datetime
import re
import pandas as pd
from dotenv import load_dotenv
import httpx
from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type
from anthropic import Anthropic, RateLimitError
import google.generativeai as gemini
from openai import OpenAI
from pathlib import Path


In [32]:
# Get the current working directory and set up paths
PROJECT_ROOT = Path.cwd().parent  # Go up one level from reqs_extraction to onclaive root
MARKDOWN_DIR = os.path.join(PROJECT_ROOT, 'full-ig', 'markdown')

# Add debug logging
logging.basicConfig(level=logging.DEBUG)
logging.info(f"Current working directory: {Path.cwd()}")
logging.info(f"Project root: {PROJECT_ROOT}")
logging.info(f"Markdown directory: {MARKDOWN_DIR}")

# Verify the markdown directory exists
if os.path.exists(MARKDOWN_DIR):
    logging.info(f"Found markdown directory at {MARKDOWN_DIR}")
    markdown_files = [f for f in os.listdir(MARKDOWN_DIR) if f.endswith('.md')]
    logging.info(f"Found {len(markdown_files)} markdown files")
else:
    logging.error(f"Markdown directory not found at {MARKDOWN_DIR}")

INFO:root:Current working directory: /Users/ceadams/Documents/onclaive/onclaive/reqs_extraction
INFO:root:Project root: /Users/ceadams/Documents/onclaive/onclaive
INFO:root:Markdown directory: /Users/ceadams/Documents/onclaive/onclaive/full-ig/markdown
INFO:root:Found markdown directory at /Users/ceadams/Documents/onclaive/onclaive/full-ig/markdown
INFO:root:Found 7 markdown files


In [33]:
# Basic setup
load_dotenv()

# API Configuration
API_CONFIGS = {
    "claude": {
        "model_name": "claude-3-5-sonnet-20240620",
        "max_tokens": 8192,
        "temperature": 0.7,
        "batch_size": 3,
        "delay_between_chunks": 2,
        "delay_between_batches": 10,
        "requests_per_minute": 25,
        "max_requests_per_day": 5000,
        "delay_between_requests": 2
    },
    "gemini": {
        "model": "models/gemini-1.5-pro-001",
        "max_tokens": 8192,
        "temperature": 0.7,
        "batch_size": 1,  # More conservative batch size
        "delay_between_chunks": 5,
        "delay_between_batches": 30,
        "requests_per_minute": 30,
        "max_requests_per_day": 60000,
        "delay_between_requests": 3,
        "timeout": 120  # Longer timeout for larger content
    },
    "gpt": {
        "model": "gpt-4",
        "max_tokens": 2048,
        "temperature": 0.7,
        "batch_size": 3,
        "delay_between_chunks": 3,
        "delay_between_batches": 15,
        "requests_per_minute": 200,
        "max_requests_per_day": 10000,
        "delay_between_requests": 0.5
    }
}

SYSTEM_PROMPTS = {
    "claude": """You are a seasoned Healthcare Integration Test Engineer 
                analyzing a FHIR Implementation Guide to extract precise testable requirements.""",
    "gemini": """Analyze FHIR Implementation Guide content to identify 
                 testable requirements as a Healthcare Integration Test Engineer.""",
    "gpt": """As a Healthcare Integration Test Engineer, analyze this FHIR 
              Implementation Guide content to extract specific testable requirements."""
}


In [34]:
def list_markdown_files(markdown_dir):
    """Debug function to list all markdown files"""
    if not os.path.exists(markdown_dir):
        logging.error(f"Directory does not exist: {markdown_dir}")
        return
    
    files = [f for f in os.listdir(markdown_dir) if f.endswith('.md')]
    logging.info(f"Found {len(files)} markdown files:")
    for file in files:
        logging.info(f"  - {file}")
    return files

In [35]:

# Markdown Processing Functions
def clean_markdown(text: str) -> str:
    """Clean markdown content"""
    text = re.sub(r'\n\s*\n', '\n\n', text)
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    text = re.sub(r'\.{2,}', '.', text)
    text = re.sub(r'\\(.)', r'\1', text)
    text = re.sub(r'\|', ' ', text)
    text = re.sub(r'[-\s]*\n[-\s]*', '\n', text)
    return text.strip()

def split_markdown(content: str, max_size: int = 2000) -> List[str]:
    """Split markdown into manageable chunks"""
    chunks = []
    lines = content.split('\n')
    current_chunk = []
    current_size = 0
    
    for line in lines:
        line_size = len(line)
        if current_size + line_size > max_size:
            if current_chunk:
                chunks.append('\n'.join(current_chunk))
            current_chunk = [line]
            current_size = line_size
        else:
            current_chunk.append(line)
            current_size += line_size
            
    if current_chunk:
        chunks.append('\n'.join(current_chunk))
    return chunks


In [36]:

# Rate Limiting
def create_rate_limiter():
    """Create a rate limiter state dictionary for all APIs"""
    return {
        api: {
            'requests': [],
            'daily_requests': 0,
            'last_reset': time.time()
        }
        for api in API_CONFIGS.keys()
    }

def check_rate_limits(rate_limiter: dict, api: str):
    """Check and wait if rate limits would be exceeded"""
    if api not in rate_limiter:
        raise ValueError(f"Unknown API: {api}")
        
    now = time.time()
    state = rate_limiter[api]
    config = API_CONFIGS[api]
    
    # Reset daily counts if needed
    day_seconds = 24 * 60 * 60
    if now - state['last_reset'] >= day_seconds:
        state['daily_requests'] = 0
        state['last_reset'] = now
    
    # Check daily limit
    if state['daily_requests'] >= config['max_requests_per_day']:
        raise Exception(f"{api} daily request limit exceeded")
    
    # Remove old requests outside the current minute
    state['requests'] = [
        req_time for req_time in state['requests']
        if now - req_time < 60
    ]
    
    # Wait if at rate limit
    if len(state['requests']) >= config['requests_per_minute']:
        sleep_time = 60 - (now - state['requests'][0])
        if sleep_time > 0:
            time.sleep(sleep_time)
        state['requests'] = state['requests'][1:]
    
    # Add minimum delay between requests
    if state['requests'] and now - state['requests'][-1] < config['delay_between_requests']:
        time.sleep(config['delay_between_requests'])
    
    # Record this request
    state['requests'].append(now)
    state['daily_requests'] += 1


In [37]:

# Prompting Functions
def create_requirements_extraction_prompt(content: str) -> str:
    """Create a prompt that aligns with Inferno's requirements extraction process"""
    return f"""Analyze this FHIR Implementation Guide content to extract precise requirements following these guidelines:

For each requirement you identify, provide:

1. REQUIREMENT TEXT
- Extract direct quotes from the source
- For compound requirements, split into atomic requirements
- Maintain context when splitting
- Use [...] for added clarifications
- Use ... for removed text
- Format using markdown syntax for code blocks, italics, etc.

2. REQUIREMENT METADATA
- Conformance Level (SHALL, SHOULD, MAY, SHOULD NOT, SHALL NOT)
- Actor(s) the requirement applies to
- Whether the requirement is conditional (True/False)
- Any sub-requirements or referenced requirements

3. SOURCE TRACEABILITY
- Note the specific section or location this requirement comes from

When analyzing content, focus on:

a) Making requirements atomic and testable
b) Maintaining the original text while adding necessary context
c) Identifying implicit requirements for each actor
d) Distinguishing between conjunctive ("and") and disjunctive ("or") requirements
e) Capturing terminology bindings and must-support elements
f) Noting RESTful API conformance requirements
g) Identifying conditional requirements

Content to analyze:
{content}

Format each requirement as:
```
Requirement Text: <quoted text with [...] for clarifications and ... for elisions>
Conformance: <conformance level>
Actor: <actor name(s)>
Conditional: <True/False>
Sub-Requirements: <list of referenced requirements if any>
Source: <specific location in documentation>
```"""

def format_content_for_api(content: Union[str, dict, list], api_type: str) -> Union[str, List[dict], dict]:
    """Format content appropriately for each API"""
    base_prompt = create_requirements_extraction_prompt(content)
    
    if api_type == "claude":
        return [{
            "type": "text",
            "text": base_prompt
        }]
    elif api_type == "gemini":
        return [{  # Changed from dict to list with single dict
            "parts": [{
                "text": base_prompt
            }]
        }]
    return base_prompt


In [38]:
@retry(
    wait=wait_exponential(multiplier=1, min=4, max=60),
    stop=stop_after_attempt(5),
    retry=retry_if_exception_type((RateLimitError, TimeoutError))
)
def make_api_request(client, api_type: str, content: str, rate_limit_func) -> str:
    """Make rate-limited API request with retries"""
    rate_limit_func()
    
    config = API_CONFIGS[api_type]
    formatted_content = format_content_for_api(content, api_type)
    
    try:
        if api_type == "claude":
            response = client.messages.create(
                model=config["model_name"],
                max_tokens=config["max_tokens"],
                messages=[{
                    "role": "user", 
                    "content": formatted_content
                }],
                system=SYSTEM_PROMPTS[api_type]
            )
            return response.content[0].text
            
        elif api_type == "gemini":
            # Extract the text content for Gemini
            prompt_text = formatted_content[0]["parts"][0]["text"]
            response = client.generate_content(
                prompt_text,
                generation_config={
                    "max_output_tokens": config["max_tokens"],
                    "temperature": config["temperature"]
                }
            )
            if hasattr(response, 'text'):
                return response.text
            elif response.candidates:
                return response.candidates[0].content.parts[0].text
            else:
                raise ValueError("No response generated from Gemini API")
                    
        elif api_type == "gpt":
            response = client.chat.completions.create(
                model=config["model"],
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPTS[api_type]},
                    {"role": "user", "content": formatted_content}
                ],
                max_tokens=config["max_tokens"],
                temperature=config["temperature"]
            )
            return response.choices[0].message.content
            
    except Exception as e:
        logging.error(f"Error in {api_type} API request: {str(e)}")
        raise

In [39]:

def process_content_batch(api_type: str, contents: List[str], 
                       config: dict, client, rate_limit_func) -> List[str]:
    """Process a batch of content with rate limiting"""
    results = []
    for content in contents:
        result = make_api_request(client, api_type, content, rate_limit_func)
        results.append(result)
        time.sleep(config["delay_between_chunks"])
    return results

# Results Processing
def process_llm_requirements_output(output: str) -> List[Dict]:
    """Process LLM output into standardized requirements format"""
    requirements = []
    current_req = {}
    
    # Split output into individual requirements
    req_blocks = output.split('\n\n')
    
    for block in req_blocks:
        if block.strip().startswith('Requirement Text:'):
            # Save previous requirement if it exists
            if current_req:
                requirements.append(current_req)
                current_req = {}
            
            # Parse new requirement
            lines = block.strip().split('\n')
            for line in lines:
                if ': ' in line:
                    key, value = line.split(': ', 1)
                    key = key.lower().replace(' ', '_')
                    current_req[key] = value.strip()
    
    # Add final requirement
    if current_req:
        requirements.append(current_req)
        
    return requirements


In [40]:

def save_requirements_to_csv(requirements: List[Dict], output_file: str):
    """Save extracted requirements to CSV format"""
    df = pd.DataFrame(requirements)
    
    # Rename columns to match Inferno's format
    column_mapping = {
        'requirement_text': 'Requirement',
        'conformance': 'Conformance',
        'actor': 'Actor',
        'conditional': 'Conditionality',
        'source': 'URL',
        'sub_requirements': 'Sub-Requirement(s)'
    }
    
    df = df.rename(columns=column_mapping)
    
    # Add required columns if missing
    required_columns = ['Req Set', 'Id'] + list(column_mapping.values())
    for col in required_columns:
        if col not in df.columns:
            df[col] = ''
            
    # Generate sequential IDs if not present
    if 'Id' in df.columns and df['Id'].isna().all():
        df['Id'] = range(1, len(df) + 1)
        
    df.to_csv(output_file, index=False)


In [41]:

# Main Processing
def setup_clients():
    """Initialize clients for each LLM service"""
    try:
        # Claude setup
        verify_path = '/opt/homebrew/etc/openssl@3/cert.pem'
        http_client = httpx.Client(
            verify=verify_path if os.path.exists(verify_path) else True,
            timeout=60.0
        )
        claude_client = Anthropic(
            api_key=os.getenv('ANTHROPIC_API_KEY'),
            http_client=http_client
        )
        
        # Gemini setup
        gemini_api_key = os.getenv('GEMINI_API_KEY')
        if not gemini_api_key:
            raise ValueError("GEMINI_API_KEY not found")
        gemini.configure(api_key=gemini_api_key)
        gemini_client = gemini.GenerativeModel(
            model_name=API_CONFIGS["gemini"]["model"],
            generation_config={
                "max_output_tokens": API_CONFIGS["gemini"]["max_tokens"],
                "temperature": API_CONFIGS["gemini"]["temperature"]
            }
        )
        
        # OpenAI setup
        openai_api_key = os.getenv('OPENAI_API_KEY')
        if not openai_api_key:
            raise ValueError("OPENAI_API_KEY not found")
        openai_client = OpenAI(
            api_key=openai_api_key,
            timeout=60.0
        )
        
        return {
            "claude": claude_client,
            "gpt": openai_client,
            "gemini": gemini_client
        }
        
    except Exception as e:
        logging.error(f"Error setting up clients: {str(e)}")
        raise

def process_markdown_content(api_type: str, markdown_dir: str = MARKDOWN_DIR) -> Dict[str, Any]:
    """Process all markdown content and generate requirements"""
    logging.info(f"Starting processing with {api_type} on directory: {markdown_dir}")
    
    # List files before processing
    markdown_files = list_markdown_files(markdown_dir)
    if not markdown_files:
        logging.error("No markdown files found to process")
        return {"requirements": [], "processed_files": [], "output_file": None}
        
    clients = setup_clients()
    client = clients[api_type]
    config = API_CONFIGS[api_type]
    rate_limiter = create_rate_limiter()
    
    def check_limits():
        check_rate_limits(rate_limiter, api_type)
    
    try:
        all_requirements = []
        processed_files = []
        
        for md_file in markdown_files:
            file_path = os.path.join(markdown_dir, md_file)
            logging.info(f"Processing {md_file}")
            
            with open(file_path, 'r') as f:
                content = clean_markdown(f.read())
                logging.debug(f"Content length for {md_file}: {len(content)} characters")
                
            chunks = split_markdown(content)
            logging.info(f"Split {md_file} into {len(chunks)} chunks")
            
            for chunk_idx, chunk in enumerate(chunks, 1):
                logging.info(f"Processing chunk {chunk_idx}/{len(chunks)} of {md_file}")
                response = make_api_request(client, api_type, chunk, check_limits)
                chunk_requirements = process_llm_requirements_output(response)
                logging.info(f"Extracted {len(chunk_requirements)} requirements from chunk")
                all_requirements.extend(chunk_requirements)
                time.sleep(config["delay_between_chunks"])
            
            processed_files.append(md_file)
            time.sleep(config["delay_between_batches"])
        
        # Save requirements to CSV
        output_directory = os.path.join(PROJECT_ROOT, 'reqs_extraction', 'processed_output')
        os.makedirs(output_directory, exist_ok=True)
        output_file = os.path.join(output_directory, f"test_requirements_{api_type}_markdown1.csv")
        save_requirements_to_csv(all_requirements, output_file)
        
        logging.info(f"Completed processing {len(processed_files)} files, extracted {len(all_requirements)} requirements")
        
        return {
            "requirements": all_requirements,
            "processed_files": processed_files,
            "output_file": output_file
        }
        
    except Exception as e:
        logging.error(f"Error processing content: {str(e)}")
        raise


In [42]:
# Define input and output directories using absolute paths
markdown_dir = MARKDOWN_DIR
output_directory = os.path.join(PROJECT_ROOT, 'reqs_extraction', 'processed_output')

# Create output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Process with each API
#apis = ["claude", "gemini", "gpt"]
apis=['gemini']
results = {}

for api_type in apis:
    try:
        logging.info(f"Processing with {api_type}...")
        api_results = process_markdown_content(api_type, markdown_dir)
        results[api_type] = api_results
        
        # Save JSON results
        json_output = os.path.join(output_directory, f"test_requirements_{api_type}_markdown1.json")
        with open(json_output, 'w') as f:
            json.dump(api_results, f, indent=2)
        logging.info(f"Saved {api_type} results to {json_output}")
        
    except Exception as e:
        logging.error(f"Error processing {api_type}: {str(e)}")
        continue

INFO:root:Processing with gemini...
INFO:root:Starting processing with gemini on directory: /Users/ceadams/Documents/onclaive/onclaive/full-ig/markdown
INFO:root:Found 7 markdown files:
INFO:root:  - implementation.md
INFO:root:  - examples.md
INFO:root:  - profiles.md
INFO:root:  - ChangeHistory.md
INFO:root:  - artifacts.md
INFO:root:  - index.md
INFO:root:  - CapabilityStatement_plan_net.md
INFO:root:Processing implementation.md
INFO:root:Split implementation.md into 8 chunks
INFO:root:Processing chunk 1/8 of implementation.md
INFO:root:Extracted 0 requirements from chunk
INFO:root:Processing chunk 2/8 of implementation.md
INFO:root:Extracted 0 requirements from chunk
INFO:root:Processing chunk 3/8 of implementation.md
INFO:root:Extracted 4 requirements from chunk
INFO:root:Processing chunk 4/8 of implementation.md
INFO:root:Extracted 0 requirements from chunk
INFO:root:Processing chunk 5/8 of implementation.md
INFO:root:Extracted 3 requirements from chunk
INFO:root:Processing chunk