In [None]:
!pip install google-api-python-client google-auth-httplib2 google-auth-oauthlib pdfplumber pymupdf4llm tqdm

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymupdf4llm
  Downloading pymupdf4llm-0.0.27-py3-none-any.whl.metadata (4.8 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting pymupdf>=1.26.3 (from pymupdf4llm)
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m

In [None]:
# Mount Google Drive so the PDFs and the results folder are reachable from Colab.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
!pip install pymupdf4llm pdfplumber pandas requests
!curl -fsSL https://ollama.com/install.sh | sh
import subprocess
import time

# Start the ollama serve process in the background
# We redirect its output to a log file to keep our notebook clean
server_process = subprocess.Popen(
    ["ollama", "serve"],
    stdout=open("ollama_server.log", "w"),
    stderr=subprocess.STDOUT
)

print("✅ Ollama server started in the background.")
print("You can now run other cells!")

# Give the server a few seconds to start up before you run other commands
time.sleep(5)
! ollama pull llama3.2:3b

In [None]:
#!/usr/bin/env python3
"""
PDF Text Extractor for Pharmaceutical Documents
Extracts text content from all PDF files in a directory and saves structured data
"""

import json
import os
from pathlib import Path
from typing import Dict, Any
from datetime import datetime
import pymupdf4llm
import pdfplumber

class PDFTextExtractor:
    """
    Extract text from every PDF in a directory and save it as structured JSON.

    For each PDF the extractor writes ``<stem>_extracted.json`` (metadata, raw
    text and a heuristic outline of headers / sections / content blocks) into
    the output directory, plus one ``extraction_summary.json`` for the run.
    """

    # A line containing any of these keywords (compared upper-cased) is
    # treated as a header.
    PHARMA_HEADERS = (
        'COMPOSIÇÃO', 'INDICAÇÕES', 'CONTRAINDICAÇÕES', 'POSOLOGIA',
        'REAÇÕES ADVERSAS', 'INTERAÇÕES', 'PRECAUÇÕES', 'SUPERDOSE',
        'ARMAZENAMENTO', 'APRESENTAÇÃO', 'REGISTRO', 'FABRICANTE'
    )

    # "1." ... "19." - prefixes that mark a numbered section heading.
    NUMBERED_PREFIXES = tuple(f"{num}." for num in range(1, 20))

    def __init__(self, pdf_directory: str = "pdf", output_directory: str = "extracted_data"):
        """
        Initialize PDF text extractor

        Args:
            pdf_directory: Directory containing PDF files to process
            output_directory: Directory to save extracted data (created,
                including missing parent directories, if it does not exist)
        """
        self.pdf_directory = Path(pdf_directory)
        self.output_directory = Path(output_directory)
        self.extracted_files = []

        # parents=True: the output folder is usually nested (e.g. inside a
        # mounted Drive) and its parents may not exist on the first run.
        self.output_directory.mkdir(parents=True, exist_ok=True)

    def extract_pdf_content(self, pdf_path: Path) -> str:
        """
        Extract text content from a PDF, trying pdfplumber and then pymupdf4llm.

        pdfplumber output carries an ``=== PAGE n ===`` marker before each page
        that produced text; the pymupdf4llm fallback returns markdown without
        such markers.

        Raises:
            RuntimeError: if neither method produced any content.
        """
        try:
            # Try pdfplumber first - better for structured documents
            with pdfplumber.open(pdf_path) as pdf:
                all_text = []
                for page_num, page in enumerate(pdf.pages):
                    page_text = page.extract_text()
                    if page_text:
                        # Add page marker for structure analysis
                        all_text.append(f"=== PAGE {page_num + 1} ===\n{page_text}")

                if all_text:
                    print(f"✅ Extracted {len(all_text)} pages using pdfplumber")
                    return "\n\n".join(all_text)
        except Exception as e:
            print(f"⚠️ pdfplumber failed for {pdf_path.name}: {e}")

        try:
            # Fallback to pymupdf4llm - better for complex layouts
            content = pymupdf4llm.to_markdown(str(pdf_path))
            if content:
                print("✅ Extracted content using pymupdf4llm")
                return content
        except Exception as e:
            print(f"⚠️ pymupdf4llm failed for {pdf_path.name}: {e}")

        raise RuntimeError("All extraction methods failed")

    def _is_header(self, stripped_line: str) -> bool:
        """Return True when a non-empty, stripped line matches a header pattern."""
        # Pattern 1: All caps lines
        if stripped_line.isupper() and len(stripped_line) > 3:
            return True

        # Pattern 2: Numbered sections (1., 2., ... 19.)
        if len(stripped_line) < 100 and stripped_line.startswith(self.NUMBERED_PREFIXES):
            return True

        # Pattern 3: Lines that end with colon
        if stripped_line.endswith(':') and len(stripped_line) < 100:
            return True

        # Pattern 4: Lines with specific pharmaceutical keywords
        upper_line = stripped_line.upper()
        return any(keyword in upper_line for keyword in self.PHARMA_HEADERS)

    @staticmethod
    def _build_content_block(block_lines: list, section) -> Dict[str, Any]:
        """Turn a list of ``(line_index, text)`` pairs into a content-block record."""
        return {
            # Real indexes of the first/last content line; blank lines that
            # were skipped in between do not shift them.
            "start_line": block_lines[0][0],
            "end_line": block_lines[-1][0],
            "content": "\n".join(text for _, text in block_lines),
            "section": section
        }

    def identify_document_sections(self, text: str) -> Dict[str, Any]:
        """
        Identify basic document sections using text patterns
        This creates a foundation for the LLM to work with

        Returns:
            Dict with ``headers``, ``potential_sections``, ``page_breaks`` and
            ``content_blocks``; every ``line_number`` / ``start_line`` /
            ``end_line`` is a 0-based index into ``text.split('\\n')``.
        """
        sections = {
            "headers": [],
            "potential_sections": [],
            "page_breaks": [],
            "content_blocks": []
        }

        lines = text.split('\n')
        current_section = None
        content_block = []  # (line_index, stripped_text) pairs of the open block

        for i, line in enumerate(lines):
            stripped_line = line.strip()

            # Detect page breaks; a page break also closes the open block
            if "=== PAGE" in line:
                sections["page_breaks"].append({
                    "line_number": i,
                    "page_marker": stripped_line
                })
                if content_block:
                    sections["content_blocks"].append(
                        self._build_content_block(content_block, current_section)
                    )
                    content_block = []
                continue

            # Skip empty lines
            if not stripped_line:
                continue

            if self._is_header(stripped_line):
                sections["headers"].append({
                    "line_number": i,
                    "text": stripped_line,
                    "confidence": "high" if stripped_line.isupper() else "medium"
                })

                # Save previous content block under the section it belongs to
                if content_block:
                    sections["content_blocks"].append(
                        self._build_content_block(content_block, current_section)
                    )

                current_section = stripped_line
                content_block = []
                sections["potential_sections"].append({
                    "title": stripped_line,
                    "start_line": i,
                    "type": "inferred"
                })
            else:
                content_block.append((i, stripped_line))

        # Add final content block
        if content_block:
            sections["content_blocks"].append(
                self._build_content_block(content_block, current_section)
            )

        return sections

    def process_single_pdf(self, pdf_path: Path) -> Dict[str, Any]:
        """
        Process a single PDF file.

        Never raises for extraction problems: on failure it returns a record
        with empty ``raw_content`` and the error message in the metadata.
        """
        print(f"\n📄 Processing: {pdf_path.name}")

        try:
            # Extract text content
            raw_text = self.extract_pdf_content(pdf_path)

            if not raw_text:
                raise Exception("No text content extracted")

            print(f"📝 Extracted {len(raw_text):,} characters")

            # Identify document sections
            sections = self.identify_document_sections(raw_text)

            # Create structured data
            structured_data = {
                "metadata": {
                    "file_name": pdf_path.name,
                    "file_path": str(pdf_path),
                    "file_size": pdf_path.stat().st_size,
                    "extraction_date": datetime.now().isoformat(),
                    "text_length": len(raw_text),
                    "extraction_method": "multi_method",
                    "processing_stage": "text_extraction"
                },
                "raw_content": raw_text,
                "document_structure": {
                    "total_headers": len(sections["headers"]),
                    "total_sections": len(sections["potential_sections"]),
                    "total_pages": len(sections["page_breaks"]),
                    "total_content_blocks": len(sections["content_blocks"]),
                    "headers": sections["headers"],
                    "sections": sections["potential_sections"],
                    "page_breaks": sections["page_breaks"],
                    "content_blocks": sections["content_blocks"]
                },
                "extraction_statistics": {
                    # Mirrors the "extraction_success": False of the error record
                    "extraction_success": True,
                    "characters_extracted": len(raw_text),
                    "lines_processed": len(raw_text.split('\n')),
                    "sections_identified": len(sections["potential_sections"]),
                    "headers_found": len(sections["headers"]),
                    "extraction_quality": "good" if len(raw_text) > 1000 else "poor"
                }
            }

            print(f"📊 Found {len(sections['headers'])} headers and {len(sections['potential_sections'])} sections")
            return structured_data

        except Exception as e:
            print(f"❌ Error processing {pdf_path.name}: {e}")
            return {
                "metadata": {
                    "file_name": pdf_path.name,
                    "file_path": str(pdf_path),
                    "extraction_date": datetime.now().isoformat(),
                    "processing_stage": "text_extraction",
                    "error": str(e)
                },
                "raw_content": "",
                "document_structure": {},
                "extraction_statistics": {
                    "extraction_success": False,
                    "error_message": str(e)
                }
            }

    def process_all_pdfs(self) -> Dict[str, str]:
        """
        Process all PDF files in the directory.

        Returns:
            Mapping of input file name -> path of the JSON file written for it
            (a JSON file is written for failed extractions too; only the
            successful ones are recorded in ``self.extracted_files``).

        Raises:
            FileNotFoundError: if the directory is missing or holds no PDFs.
        """
        print(f"🔍 Looking for PDF files in: {self.pdf_directory}")

        if not self.pdf_directory.exists():
            raise FileNotFoundError(f"PDF directory not found: {self.pdf_directory}")

        # Match the extension case-insensitively (.pdf, .PDF, ...) and sort so
        # the processing order is the same on every run.
        pdf_files = sorted(
            entry for entry in self.pdf_directory.iterdir()
            if entry.is_file() and entry.suffix.lower() == ".pdf"
        )

        if not pdf_files:
            raise FileNotFoundError(f"No PDF files found in: {self.pdf_directory}")

        print(f"📁 Found {len(pdf_files)} PDF files to process")
        print("=" * 70)

        output_files = {}
        processing_summary = {
            "total_files": len(pdf_files),
            "successful_extractions": 0,
            "failed_extractions": 0,
            "total_characters_extracted": 0,
            "files_processed": [],
            "processing_date": datetime.now().isoformat()
        }

        for pdf_file in pdf_files:
            try:
                # Process the PDF
                structured_data = self.process_single_pdf(pdf_file)
                raw_content = structured_data.get("raw_content", "")
                succeeded = bool(raw_content)

                # Create output filename
                output_filename = f"{pdf_file.stem}_extracted.json"
                output_path = self.output_directory / output_filename

                # Save the structured data
                with open(output_path, 'w', encoding='utf-8') as f:
                    json.dump(structured_data, f, indent=2, ensure_ascii=False)

                output_files[pdf_file.name] = str(output_path)

                # Update summary; only real extractions count as "extracted"
                if succeeded:
                    self.extracted_files.append(str(output_path))
                    processing_summary["successful_extractions"] += 1
                    processing_summary["total_characters_extracted"] += len(raw_content)
                else:
                    processing_summary["failed_extractions"] += 1

                processing_summary["files_processed"].append({
                    "input_file": pdf_file.name,
                    "output_file": output_filename,
                    "success": succeeded,
                    "characters_extracted": len(raw_content),
                    "headers_found": structured_data.get("document_structure", {}).get("total_headers", 0)
                })

                print(f"✅ Saved: {output_path}")

            except Exception as e:
                print(f"❌ Failed to process {pdf_file.name}: {e}")
                processing_summary["failed_extractions"] += 1
                processing_summary["files_processed"].append({
                    "input_file": pdf_file.name,
                    "output_file": None,
                    "success": False,
                    "error": str(e)
                })

        # Save processing summary
        summary_path = self.output_directory / "extraction_summary.json"
        with open(summary_path, 'w', encoding='utf-8') as f:
            json.dump(processing_summary, f, indent=2, ensure_ascii=False)

        print("\n" + "=" * 70)
        print("📊 EXTRACTION SUMMARY")
        print("=" * 70)
        print(f"📁 Total files: {processing_summary['total_files']}")
        print(f"✅ Successful: {processing_summary['successful_extractions']}")
        print(f"❌ Failed: {processing_summary['failed_extractions']}")
        print(f"📝 Total characters: {processing_summary['total_characters_extracted']:,}")
        print(f"💾 Summary saved: {summary_path}")
        print("=" * 70)

        return output_files

    def get_extracted_files(self) -> list:
        """Get list of successfully extracted files"""
        return self.extracted_files

def main():
    """Run the extractor over the Drive PDF folder and print what to do next."""
    rule = "=" * 70
    print("📄 PDF Text Extractor for Pharmaceutical Documents")
    print(rule)

    # Input and output both live on the mounted Google Drive
    extractor = PDFTextExtractor(
        pdf_directory="./drive/MyDrive/pdf",
        output_directory="./drive/MyDrive/extracted_data",
    )

    try:
        output_files = extractor.process_all_pdfs()
    except Exception as e:
        print(f"❌ Extraction failed: {e}")
    else:
        print("\n🎉 Extraction completed!")
        print(f"📁 Output directory: {extractor.output_directory}")
        print(f"📄 Files ready for LLM processing: {len(output_files)}")

        # Point the reader at the follow-up steps
        for message in (
            "\n📋 Next steps:",
            "1. Run the LLM processor on the extracted data",
            "2. Check the extraction_summary.json for detailed results",
        ):
            print(message)

if __name__ == "__main__":
    main()

📄 PDF Text Extractor for Pharmaceutical Documents
🔍 Looking for PDF files in: drive/MyDrive/pdf
📁 Found 341 PDF files to process

📄 Processing: bula_1755192077396.pdf
✅ Extracted 5 pages using pdfplumber
📝 Extracted 12,011 characters
📊 Found 40 headers and 40 sections
✅ Saved: drive/MyDrive/extracted_data/bula_1755192077396_extracted.json

📄 Processing: bula_1755192097944.pdf
✅ Extracted 16 pages using pdfplumber
📝 Extracted 47,853 characters
📊 Found 86 headers and 86 sections
✅ Saved: drive/MyDrive/extracted_data/bula_1755192097944_extracted.json

📄 Processing: bula_1755195358088.pdf
✅ Extracted 12 pages using pdfplumber
📝 Extracted 29,326 characters
📊 Found 45 headers and 45 sections
✅ Saved: drive/MyDrive/extracted_data/bula_1755195358088_extracted.json

📄 Processing: bula_1755195361693.pdf
✅ Extracted 11 pages using pdfplumber
📝 Extracted 22,964 characters
📊 Found 96 headers and 96 sections
✅ Saved: drive/MyDrive/extracted_data/bula_1755195361693_extracted.json

📄 Processing: bula_

In [None]:
#!/usr/bin/env python3
"""
Batch Automated Pharmaceutical Document Parser with Google Drive Integration
Processes multiple PDFs from a directory and saves results to Google Drive
Optimized for Google Colab environment
"""

import json
import re
import subprocess
import shlex
import os
import glob
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
import pymupdf4llm
import pdfplumber
from google.colab import drive
import time

class BatchAutomatedPharmaParser:
    def __init__(self, model_name: str = "llama3.2:3b"):
        """
        Initialize batch parser with Ollama integration for Google Colab
        """
        self.model_name = model_name
        self.batch_results = {}
        self.failed_files = []
        self.processing_stats = {
            'total_files': 0,
            'successful': 0,
            'failed': 0,
            'start_time': None,
            'end_time': None
        }
        self.setup_environment()

    def setup_environment(self):
        """Setup Google Colab environment"""
        print("🔧 Setting up Google Colab environment...")

        # Mount Google Drive
        try:
            drive.mount('/content/drive')
            print("✅ Google Drive mounted successfully")
        except Exception as e:
            print(f"⚠️ Drive mount warning: {e}")

        # Setup Ollama
        self.setup_ollama()

    def setup_ollama(self):
        """Setup Ollama model automatically"""
        print(f"Setting up Ollama model: {self.model_name}")

        try:
            # Check if ollama is available
            subprocess.run(["ollama", "--version"], capture_output=True, check=True)
            print("✅ Ollama CLI found")
        except (FileNotFoundError, subprocess.CalledProcessError):
            print("❌ Ollama CLI not found. Installing...")
            # Install Ollama in Colab
            os.system("curl -fsSL https://ollama.ai/install.sh | sh")
            # Start Ollama service
            os.system("ollama serve &")
            time.sleep(5)

        try:
            # Pull model if not available
            print(f"Pulling model {self.model_name}...")
            result = subprocess.run(
                ["ollama", "pull", self.model_name],
                capture_output=True,
                text=True,
                timeout=300
            )
            if result.returncode == 0:
                print(f"✅ Model {self.model_name} ready")
            else:
                print(f"⚠️ Pull result: {result.stderr}")
        except subprocess.TimeoutExpired:
            print("⚠️ Model pull timed out, but model might already be available")
        except Exception as e:
            print(f"⚠️ Error pulling model: {e}")

    def call_ollama_raw(self, prompt: str, extra_flags: str = "") -> str:
        """Call ollama with exact prompt - no modifications"""
        cmd = ["ollama", "run", self.model_name]
        if extra_flags:
            cmd += shlex.split(extra_flags)

        try:
            proc = subprocess.run(
                cmd,
                input=prompt,
                text=True,
                capture_output=True,
                timeout=120
            )
            output = proc.stdout.strip()
            if not output:
                output = proc.stderr.strip()
            return output
        except subprocess.TimeoutExpired:
            raise RuntimeError("Ollama call timed out")
        except Exception as e:
            raise RuntimeError(f"Error calling ollama: {e}")

    def extract_pdf_content(self, pdf_path: str) -> str:
        """Extract text content from PDF"""
        try:
            # Try pdfplumber first
            with pdfplumber.open(pdf_path) as pdf:
                all_text = []
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        all_text.append(page_text)

                if all_text:
                    return "\n\n".join(all_text)
        except Exception as e:
            print(f"pdfplumber failed: {e}")

        try:
            # Fallback to pymupdf4llm
            return pymupdf4llm.to_markdown(pdf_path)
        except Exception as e:
            print(f"pymupdf4llm failed: {e}")
            raise Exception("All extraction methods failed")

    def create_entity_extraction_prompt(self, text: str, chunk_size: int = 3000) -> List[str]:
        """Create prompts for entity extraction from pharmaceutical text"""

        # Base prompt for pharmaceutical entity extraction
        base_prompt = """System: You are a parser. For each Text below, extract entities, relation, value triples as a JSON array.
Only output valid JSON. DO NOT include any extra text, commentary, or code fences. Output must be parseable by json.loads().

Format:
[
  {"entity": "...", "relation": "...", "value": "..."},
  ...
]

Focus on pharmaceutical information:
- Medication names and active ingredients
- Dosages, concentrations, and administration routes
- Indications, contraindications, and side effects
- Age groups, patient populations
- Storage conditions and expiration
- Manufacturer information

Text: """

        # Split text into chunks if too long
        text_chunks = []
        if len(text) <= chunk_size:
            text_chunks.append(text)
        else:
            words = text.split()
            current_chunk = []
            current_length = 0

            for word in words:
                if current_length + len(word) + 1 > chunk_size:
                    if current_chunk:
                        text_chunks.append(" ".join(current_chunk))
                        current_chunk = []
                        current_length = 0

                current_chunk.append(word)
                current_length += len(word) + 1

            if current_chunk:
                text_chunks.append(" ".join(current_chunk))

        # Create prompts for each chunk
        prompts = []
        for i, chunk in enumerate(text_chunks):
            prompt = f"{base_prompt}{chunk}"
            prompts.append(prompt)

        return prompts

    def create_structure_analysis_prompt(self, text: str) -> str:
        """Create prompt for document structure analysis"""

        structure_prompt = f"""System: You are a pharmaceutical document analyzer. Analyze the document structure and create a JSON summary.
Only output valid JSON. DO NOT include any extra text, commentary, or code fences.

Format:
{{
  "document_type": "...",
  "main_sections": [
    {{
      "section_number": "...",
      "section_title": "...",
      "content_type": "...",
      "key_points": ["...", "..."]
    }}
  ],
  "medication_info": {{
    "name": "...",
    "active_ingredient": "...",
    "forms": ["...", "..."],
    "concentrations": ["...", "..."]
  }},
  "critical_information": {{
    "contraindications": ["...", "..."],
    "serious_warnings": ["...", "..."],
    "storage_conditions": "..."
  }}
}}

Text: {text[:4000]}"""

        return structure_prompt

    def parse_json_response(self, response: str) -> Any:
        """Parse JSON response, handling common formatting issues"""
        # Clean up response
        cleaned = response.strip()

        # Remove code fences if present
        if cleaned.startswith("```"):
            lines = cleaned.split('\n')
            cleaned = '\n'.join(lines[1:-1] if lines[-1].strip() == "```" else lines[1:])

        # Remove any leading/trailing text that's not JSON
        start_idx = cleaned.find('[') if cleaned.find('[') != -1 else cleaned.find('{')
        end_idx = cleaned.rfind(']') if cleaned.rfind(']') != -1 else cleaned.rfind('}')

        if start_idx != -1 and end_idx != -1:
            cleaned = cleaned[start_idx:end_idx+1]

        try:
            return json.loads(cleaned)
        except json.JSONDecodeError as e:
            print(f"JSON parse error: {e}")
            print(f"Problematic text: {cleaned[:200]}...")
            return None

    def extract_entities_from_document(self, text: str) -> List[Dict]:
        """Extract entities from entire document"""
        print("🔍 Extracting entities using LLM...")

        prompts = self.create_entity_extraction_prompt(text)
        all_entities = []

        for i, prompt in enumerate(prompts):
            print(f"Processing chunk {i+1}/{len(prompts)}...")

            try:
                response = self.call_ollama_raw(prompt)
                entities = self.parse_json_response(response)

                if entities and isinstance(entities, list):
                    all_entities.extend(entities)
                    print(f"  Extracted {len(entities)} entities from chunk {i+1}")
                else:
                    print(f"  No valid entities from chunk {i+1}")

            except Exception as e:
                print(f"  Error processing chunk {i+1}: {e}")
                continue

        # Remove duplicates
        unique_entities = []
        seen = set()
        for entity in all_entities:
            key = (entity.get('entity', ''), entity.get('relation', ''), entity.get('value', ''))
            if key not in seen:
                seen.add(key)
                unique_entities.append(entity)

        print(f"✅ Total unique entities extracted: {len(unique_entities)}")
        return unique_entities

    def analyze_document_structure(self, text: str) -> Dict:
        """Analyze document structure using LLM"""
        print("📋 Analyzing document structure using LLM...")

        prompt = self.create_structure_analysis_prompt(text)

        try:
            response = self.call_ollama_raw(prompt)
            structure = self.parse_json_response(response)

            if structure and isinstance(structure, dict):
                print("✅ Document structure analyzed successfully")
                return structure
            else:
                print("⚠️ Could not parse structure analysis")
                return {}

        except Exception as e:
            print(f"❌ Error analyzing structure: {e}")
            return {}

    def create_summary_prompt(self, entities: List[Dict], structure: Dict) -> str:
        """Create prompt for generating document summary"""

        entities_text = json.dumps(entities[:50], indent=2)  # Limit to first 50 entities
        structure_text = json.dumps(structure, indent=2)

        summary_prompt = f"""System: You are a pharmaceutical document summarizer. Based on the extracted entities and document structure, create a comprehensive summary.
Only output valid JSON. DO NOT include any extra text, commentary, or code fences.

Format:
{{
  "executive_summary": "...",
  "medication_details": {{
    "name": "...",
    "active_ingredients": ["...", "..."],
    "therapeutic_class": "...",
    "indications": ["...", "..."],
    "dosage_forms": ["...", "..."],
    "key_dosages": ["...", "..."]
  }},
  "safety_information": {{
    "contraindications": ["...", "..."],
    "warnings": ["...", "..."],
    "common_side_effects": ["...", "..."],
    "serious_reactions": ["...", "..."]
  }},
  "administration_info": {{
    "routes": ["...", "..."],
    "dosing_schedule": "...",
    "special_populations": {{
      "pediatric": "...",
      "geriatric": "...",
      "renal_impairment": "...",
      "hepatic_impairment": "..."
    }}
  }},
  "storage_and_handling": "...",
  "manufacturer": "..."
}}

Extracted Entities:
{entities_text}

Document Structure:
{structure_text}"""

        return summary_prompt

    def generate_comprehensive_summary(self, entities: List[Dict], structure: Dict) -> Dict:
        """Generate comprehensive summary using LLM"""
        print("📝 Generating comprehensive summary...")

        prompt = self.create_summary_prompt(entities, structure)

        try:
            response = self.call_ollama_raw(prompt, extra_flags="--temperature 0.1")
            summary = self.parse_json_response(response)

            if summary and isinstance(summary, dict):
                print("✅ Summary generated successfully")
                return summary
            else:
                print("⚠️ Could not parse summary")
                return {}

        except Exception as e:
            print(f"❌ Error generating summary: {e}")
            return {}

    def process_single_document(self, pdf_path: str) -> Optional[Dict]:
        """Process a single document and return structured data"""
        print(f"\n📄 Processing: {Path(pdf_path).name}")

        if not Path(pdf_path).exists():
            print(f"❌ File not found: {pdf_path}")
            return None

        try:
            # Extract text
            raw_content = self.extract_pdf_content(pdf_path)

            if not raw_content:
                print("❌ No text content extracted")
                return None

            print(f"✅ Extracted {len(raw_content)} characters")

            # Analyze structure
            structure = self.analyze_document_structure(raw_content)

            # Extract entities
            entities = self.extract_entities_from_document(raw_content)

            # Generate summary
            summary = self.generate_comprehensive_summary(entities, structure)

            # Compile final structure
            structured_data = {
                "metadata": {
                    "file_path": pdf_path,
                    "file_name": Path(pdf_path).name,
                    "processing_date": datetime.now().isoformat(),
                    "total_text_length": len(raw_content),
                    "total_entities": len(entities),
                    "model_used": self.model_name
                },
                "document_structure": structure,
                "extracted_entities": entities,
                "comprehensive_summary": summary,
                "processing_statistics": {
                    "entities_by_type": self._count_entities_by_type(entities),
                    "structure_sections": len(structure.get('main_sections', [])),
                    "processing_method": "automated_llm_analysis"
                }
            }

            print(f"✅ Successfully processed: {Path(pdf_path).name}")
            return structured_data

        except Exception as e:
            print(f"❌ Error processing {Path(pdf_path).name}: {e}")
            return None

    def get_pdf_files(self, pdf_directory: str) -> List[str]:
        """Get all PDF files from directory"""
        pdf_dir = Path(pdf_directory)

        if not pdf_dir.exists():
            raise FileNotFoundError(f"Directory not found: {pdf_directory}")

        # Find all PDF files
        pdf_files = []
        for pattern in ['*.pdf', '*.PDF']:
            pdf_files.extend(glob.glob(str(pdf_dir / pattern)))

        pdf_files.sort()
        print(f"📁 Found {len(pdf_files)} PDF files in {pdf_directory}")

        return pdf_files

    def process_batch(self,
                     pdf_directory: str = "/content/pdf",
                     output_drive_path: str = "/content/drive/MyDrive/pharma_analysis",
                     batch_size: int = 5,
                     save_individual: bool = True,
                     save_combined: bool = True) -> Dict:
        """
        Process every PDF in a directory and persist the results.

        Args:
            pdf_directory: Directory containing PDF files
            output_drive_path: Google Drive path to save results
            batch_size: Number of files to process before saving interim results;
                a value of 0 or less disables interim saves
            save_individual: Save individual analysis files
            save_combined: Save interim and final combined results files

        Returns:
            ``self.batch_results`` (keyed by file stem), or an empty dict when the
            directory is missing or contains no PDF files.
        """

        print("🚀 Starting batch processing...")
        print("=" * 70)

        # Setup
        self.processing_stats['start_time'] = datetime.now()

        # Create output directory and the sub-directory for per-file results
        output_dir = Path(output_drive_path)
        output_dir.mkdir(parents=True, exist_ok=True)

        individual_dir = output_dir / "individual_analyses"
        individual_dir.mkdir(exist_ok=True)

        # Get PDF files
        try:
            pdf_files = self.get_pdf_files(pdf_directory)
        except FileNotFoundError as e:
            print(f"❌ {e}")
            return {}

        if not pdf_files:
            print("❌ No PDF files found")
            return {}

        total = len(pdf_files)
        self.processing_stats['total_files'] = total

        # A non-positive batch_size would make the modulo below raise
        # ZeroDivisionError; treat it as "no interim saves" instead.
        interim_saves_enabled = save_combined and batch_size > 0

        for i, pdf_path in enumerate(pdf_files):
            done = i + 1
            print(f"\n📊 Progress: {done}/{total} ({done/total*100:.1f}%)")

            # Process single document
            result = self.process_single_document(pdf_path)

            if result:
                file_key = Path(pdf_path).stem
                self.batch_results[file_key] = result
                self.processing_stats['successful'] += 1

                # Save individual file if requested
                if save_individual:
                    individual_file = individual_dir / f"{file_key}_analysis.json"
                    with open(individual_file, 'w', encoding='utf-8') as f:
                        json.dump(result, f, indent=2, ensure_ascii=False)
                    print(f"💾 Saved individual analysis: {individual_file.name}")

            else:
                self.failed_files.append(pdf_path)
                self.processing_stats['failed'] += 1

            # Save interim results every batch_size files
            if interim_saves_enabled and done % batch_size == 0:
                self.save_interim_results(output_dir, done)

            # Small delay between documents to prevent overloading; there is
            # nothing left to throttle after the last one.
            if done < total:
                time.sleep(1)

        # Final save
        self.processing_stats['end_time'] = datetime.now()

        if save_combined:
            self.save_final_results(output_dir)

        # Generate processing report
        self.generate_processing_report(output_dir)

        print(f"\n🎉 Batch processing completed!")
        self._show_batch_summary()

        return self.batch_results

    def save_interim_results(self, output_dir: Path, files_processed: int):
        """Write a snapshot of the batch state so far into ``output_dir``.

        The snapshot goes to ``interim_results_<files_processed>_files.json`` and
        holds the success/failure counters, every result collected so far and
        the list of failed files.

        Args:
            output_dir: Directory that receives the snapshot file.
            files_processed: Number of files handled so far (used in the file name).
        """
        counters = self.processing_stats
        snapshot = {
            "processing_info": {
                "files_processed": files_processed,
                "successful": counters['successful'],
                "failed": counters['failed'],
                "interim_save_time": datetime.now().isoformat()
            },
            "results": self.batch_results,
            "failed_files": self.failed_files
        }

        target = output_dir / f"interim_results_{files_processed}_files.json"
        with open(target, 'w', encoding='utf-8') as handle:
            json.dump(snapshot, handle, indent=2, ensure_ascii=False)

        print(f"💾 Interim results saved: {target.name}")

    def save_final_results(self, output_dir: Path) -> Path:
        """Save the final combined results of the batch as one JSON file.

        The file is named ``combined_pharma_analysis_<timestamp>.json`` and holds
        batch metadata, the processing statistics, the failed files and every
        document analysis.

        Args:
            output_dir: Directory that receives the combined results file.

        Returns:
            Path of the file that was written.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        final_file = output_dir / f"combined_pharma_analysis_{timestamp}.json"

        # start_time / end_time are datetime objects, which json cannot encode;
        # store them as ISO-8601 strings without touching the live stats dict.
        serializable_stats = {
            key: value.isoformat() if isinstance(value, datetime) else value
            for key, value in self.processing_stats.items()
        }

        final_data = {
            "batch_metadata": {
                "processing_date": datetime.now().isoformat(),
                "total_files_attempted": self.processing_stats['total_files'],
                "successful_files": self.processing_stats['successful'],
                "failed_files": self.processing_stats['failed'],
                "model_used": self.model_name,
                "processing_time_minutes": self._get_processing_time_minutes()
            },
            "processing_statistics": serializable_stats,
            "failed_files": self.failed_files,
            "document_analyses": self.batch_results
        }

        with open(final_file, 'w', encoding='utf-8') as f:
            json.dump(final_data, f, indent=2, ensure_ascii=False)

        print(f"💾 Final results saved: {final_file}")
        return final_file

    def generate_processing_report(self, output_dir: Path):
        """Write a human-readable processing report into ``output_dir``.

        The report (``processing_report_<timestamp>.txt``) lists the model, the
        elapsed time, the summary counters, the failed files and one short
        section per processed document.

        Args:
            output_dir: Directory that receives the report file.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_file = output_dir / f"processing_report_{timestamp}.txt"

        total = self.processing_stats['total_files']
        successful = self.processing_stats['successful']
        # Guard the percentage: a report requested before any file was counted
        # must not fail with ZeroDivisionError.
        success_rate = (successful / total * 100) if total else 0.0

        with open(report_file, 'w', encoding='utf-8') as f:
            f.write("PHARMACEUTICAL DOCUMENT PROCESSING REPORT\n")
            f.write("=" * 70 + "\n\n")

            f.write(f"Processing Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"Model Used: {self.model_name}\n")
            f.write(f"Processing Time: {self._get_processing_time_minutes():.1f} minutes\n\n")

            f.write("SUMMARY STATISTICS:\n")
            f.write(f"- Total files attempted: {total}\n")
            f.write(f"- Successfully processed: {successful}\n")
            f.write(f"- Failed to process: {self.processing_stats['failed']}\n")
            f.write(f"- Success rate: {success_rate:.1f}%\n\n")

            if self.failed_files:
                f.write("FAILED FILES:\n")
                for failed_file in self.failed_files:
                    f.write(f"- {Path(failed_file).name}\n")
                f.write("\n")

            f.write("PROCESSED FILES SUMMARY:\n")
            for file_key, data in self.batch_results.items():
                metadata = data.get('metadata', {})
                stats = data.get('processing_statistics', {})
                f.write(f"\n📄 {metadata.get('file_name', file_key)}:\n")
                f.write(f"   - Text length: {metadata.get('total_text_length', 0):,} chars\n")
                f.write(f"   - Entities extracted: {metadata.get('total_entities', 0)}\n")
                f.write(f"   - Structure sections: {stats.get('structure_sections', 0)}\n")

        print(f"📊 Processing report saved: {report_file}")

    def _count_entities_by_type(self, entities: List[Dict]) -> Dict[str, int]:
        """Count entities by relation type"""
        counts = {}
        for entity in entities:
            relation = entity.get('relation', 'unknown')
            counts[relation] = counts.get(relation, 0) + 1
        return counts

    def _get_processing_time_minutes(self) -> float:
        """Calculate processing time in minutes"""
        if self.processing_stats['start_time'] and self.processing_stats['end_time']:
            delta = self.processing_stats['end_time'] - self.processing_stats['start_time']
            return delta.total_seconds() / 60
        return 0

    def _show_batch_summary(self):
        """Show batch processing summary"""
        print("\n" + "=" * 70)
        print("📊 BATCH PROCESSING SUMMARY")
        print("=" * 70)

        print(f"🤖 Model: {self.model_name}")
        print(f"📁 Total files: {self.processing_stats['total_files']}")
        print(f"✅ Successful: {self.processing_stats['successful']}")
        print(f"❌ Failed: {self.processing_stats['failed']}")
        print(f"📈 Success rate: {(self.processing_stats['successful']/self.processing_stats['total_files']*100):.1f}%")
        print(f"⏱️ Processing time: {self._get_processing_time_minutes():.1f} minutes")

        if self.failed_files:
            print(f"\n❌ Failed files:")
            for failed_file in self.failed_files:
                print(f"   - {Path(failed_file).name}")

        print("=" * 70)

    def query_batch_results(self, question: str, file_name: Optional[str] = None) -> str:
        """Answer a question from the processed batch.

        With ``file_name`` the question is routed to that single document
        (looked up by file stem); without it, all processed documents are
        searched. A message string is returned when nothing has been processed
        or the requested file is unknown.
        """
        if not self.batch_results:
            return "❌ No documents processed. Please run batch processing first."

        # No file given: search across the whole batch.
        if not file_name:
            return self._query_all_documents(question)

        # Results are keyed by stem, so any directory part or extension is ignored.
        key = Path(file_name).stem
        if key in self.batch_results:
            return self._query_single_document(self.batch_results[key], question)

        return f"❌ File not found in processed results: {file_name}"

    def _query_single_document(self, document_data: Dict, question: str) -> str:
        """Query a single document"""
        # Create context from processed data
        context_parts = []

        # Add summary
        summary = document_data.get("comprehensive_summary", {})
        if summary:
            context_parts.append("DOCUMENT SUMMARY:")
            context_parts.append(json.dumps(summary, indent=2))

        # Add relevant entities
        entities = document_data.get("extracted_entities", [])
        question_words = question.lower().split()
        relevant_entities = []

        for entity in entities:
            entity_text = f"{entity.get('entity', '')} {entity.get('relation', '')} {entity.get('value', '')}".lower()
            if any(word in entity_text for word in question_words):
                relevant_entities.append(entity)

        if relevant_entities:
            context_parts.append("\nRELEVANT ENTITIES:")
            context_parts.append(json.dumps(relevant_entities[:10], indent=2))

        context = "\n".join(context_parts)

        # Create query prompt
        query_prompt = f"""System: You are a pharmaceutical document assistant. Answer the question based on the provided document context.
Be precise and cite specific information when possible.

Question: {question}

Document Context:
{context[:6000]}

Answer:"""

        try:
            response = self.call_ollama_raw(query_prompt, extra_flags="--temperature 0.1")
            return response.strip()
        except Exception as e:
            return f"❌ Error processing query: {e}"

    def _query_all_documents(self, question: str) -> str:
        """Query across all processed documents"""
        print(f"🔍 Searching across {len(self.batch_results)} documents...")

        relevant_docs = []
        question_words = question.lower().split()

        for file_key, doc_data in self.batch_results.items():
            # Check if document is relevant to question
            summary = doc_data.get("comprehensive_summary", {})
            entities = doc_data.get("extracted_entities", [])

            # Simple relevance scoring
            relevance_score = 0

            # Check summary
            summary_text = json.dumps(summary).lower()
            for word in question_words:
                relevance_score += summary_text.count(word)

            # Check entities
            for entity in entities:
                entity_text = f"{entity.get('entity', '')} {entity.get('value', '')}".lower()
                for word in question_words:
                    if word in entity_text:
                        relevance_score += 1

            if relevance_score > 0:
                relevant_docs.append((file_key, doc_data, relevance_score))

        # Sort by relevance
        relevant_docs.sort(key=lambda x: x[2], reverse=True)

        if not relevant_docs:
            return "❌ No relevant documents found for this question."

        # Create combined context from top relevant documents
        context_parts = [f"Found {len(relevant_docs)} relevant documents:\n"]

        for i, (file_key, doc_data, score) in enumerate(relevant_docs[:3]):  # Top 3 most relevant
            context_parts.append(f"\nDOCUMENT {i+1}: {doc_data['metadata']['file_name']}")

            summary = doc_data.get("comprehensive_summary", {})
            if summary:
                context_parts.append(json.dumps(summary, indent=2))

        context = "\n".join(context_parts)

        # Create cross-document query prompt
        query_prompt = f"""System: You are a pharmaceutical document assistant. Answer the question based on multiple document contexts.
Compare information across documents when relevant and cite which document(s) contain specific information.

Question: {question}

Multi-Document Context:
{context[:8000]}

Answer:"""

        try:
            response = self.call_ollama_raw(query_prompt, extra_flags="--temperature 0.1")
            return response.strip()
        except Exception as e:
            return f"❌ Error processing cross-document query: {e}"

def main():
    """Run the batch pipeline with the Colab defaults, then try a few sample queries.

    Any exception raised while processing or querying is reported on stdout
    instead of propagating.
    """
    print("🤖 Batch Pharmaceutical Document Parser for Google Colab")
    print("=" * 70)

    # Initialize parser
    parser = BatchAutomatedPharmaParser(model_name="llama3.2:3b")

    # Configuration
    PDF_DIRECTORY = "/content/drive/MyDrive/pdf"  # Input PDFs on the mounted Google Drive
    DRIVE_OUTPUT_PATH = "/content/drive/MyDrive/pharma_analysis"  # Google Drive path
    BATCH_SIZE = 3  # Process 3 files before saving interim results

    # Process batch
    try:
        results = parser.process_batch(
            pdf_directory=PDF_DIRECTORY,
            output_drive_path=DRIVE_OUTPUT_PATH,
            batch_size=BATCH_SIZE,
            save_individual=True,
            save_combined=True
        )

        if results:
            print("\n🔍 Testing batch querying...")

            # Test queries
            test_questions = [
                "Quais medicamentos têm contraindicações para crianças?",
                "Qual é a dosagem recomendada mais comum?",
                "Quais são os efeitos colaterais mais frequentes?"
            ]

            for question in test_questions:
                print(f"\nQ: {question}")
                answer = parser.query_batch_results(question)
                print(f"A: {answer[:300]}...")

    # This handler must line up with the `try` above; it was previously nested
    # one level too deep, which made the whole cell fail to parse.
    except Exception as e:
        print(f"❌ Batch processing failed: {e}")

# Additional utility functions for Google Colab

def setup_colab_environment():
    """Install the Python dependencies and make sure Ollama is available.

    Packages are installed with the pip of the running interpreter. When the
    ``ollama`` binary is missing, the official install script is run and the
    server is started in the background. Failures are reported as warnings;
    nothing is raised for a failed package install.
    """
    import sys

    print("🔧 Setting up complete Google Colab environment...")

    # "pathlib" is deliberately not listed: it is part of the standard library,
    # and the PyPI package of that name is an obsolete backport.
    packages_to_install = [
        "pymupdf4llm",
        "pdfplumber",
    ]

    for package in packages_to_install:
        print(f"Installing {package}...")
        try:
            # check=True turns a failed install into an exception that is
            # reported below; os.system would have returned silently.
            subprocess.run([sys.executable, "-m", "pip", "install", package], check=True)
        except (OSError, subprocess.CalledProcessError) as e:
            print(f"Warning: Could not install {package}: {e}")

    # Install Ollama if not present
    try:
        subprocess.run(["ollama", "--version"], capture_output=True, check=True)
        print("✅ Ollama already installed")
    except (OSError, subprocess.CalledProcessError):
        # Binary missing (OSError) or not working (non-zero exit status).
        print("📦 Installing Ollama...")
        os.system("curl -fsSL https://ollama.ai/install.sh | sh")
        print("🚀 Starting Ollama service...")
        os.system("nohup ollama serve > /dev/null 2>&1 &")
        time.sleep(10)  # Wait for service to start

    print("✅ Environment setup complete!")

def create_sample_directory_structure():
    """Create the default input and output folders used by the batch helpers.

    Returns:
        Tuple ``(pdf_dir, output_dir)`` with both paths as strings.
    """
    print("📁 Creating sample directory structure...")

    # Input folder for PDFs and output folder on the mounted Drive
    pdf_dir = Path("/content/pdf")
    output_dir = Path("/content/drive/MyDrive/pharma_analysis")

    pdf_dir.mkdir(exist_ok=True)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"✅ Created directories:")
    for label, folder in (("PDF input", pdf_dir), ("Drive output", output_dir)):
        print(f"   - {label}: {folder}")

    return str(pdf_dir), str(output_dir)

def process_pharma_pdfs_batch(pdf_directory: str = "/content/pdf",
                            output_drive_path: str = "/content/drive/MyDrive/pharma_analysis",
                            model_name: str = "llama3.2:3b",
                            batch_size: int = 3):
    """
    One-call entry point: set up the environment, then process a folder of PDFs.

    Individual and combined result files are always written.

    Returns:
        Tuple ``(parser, results)`` so the parser can be queried afterwards.

    Usage in Google Colab:
    ```python
    # Upload PDFs to /content/pdf directory first
    parser, results = process_pharma_pdfs_batch()
    ```
    """
    setup_colab_environment()

    parser = BatchAutomatedPharmaParser(model_name=model_name)

    batch_options = dict(
        pdf_directory=pdf_directory,
        output_drive_path=output_drive_path,
        batch_size=batch_size,
        save_individual=True,
        save_combined=True,
    )
    return parser, parser.process_batch(**batch_options)

def quick_query(parser, question: str, file_name: Optional[str] = None):
    """Ask the parser a question, print the exchange and return the answer.

    Returns ``None`` (after printing a notice) when the parser holds no
    processed documents.
    """
    if not getattr(parser, 'batch_results', None):
        print("❌ No processed documents available")
        return None

    answer = parser.query_batch_results(question, file_name)

    lines = [f"\n❓ Question: {question}"]
    if file_name:
        lines.append(f"📄 Document: {file_name}")
    lines.append(f"💬 Answer: {answer}")
    for line in lines:
        print(line)

    return answer

def list_processed_files(parser):
    """Print and return the file names of all processed documents.

    Each printed line also shows the document's entity count. An empty list is
    returned (after printing a notice) when nothing has been processed.
    """
    processed = getattr(parser, 'batch_results', None)
    if not processed:
        print("❌ No processed documents available")
        return []

    print("📁 Processed files:")
    names = []
    for record in processed.values():
        name = record['metadata']['file_name']
        total = record['metadata']['total_entities']
        names.append(name)
        print(f"   - {name} ({total} entities)")

    return names

def export_entities_to_csv(parser, output_path: str = "/content/drive/MyDrive/pharma_analysis/all_entities.csv"):
    """Export every extracted entity of every processed document to one CSV file.

    The CSV has the columns ``source_file``, ``entity``, ``relation``, ``value``
    and ``processing_date``. A header-only file is written when no document
    yielded entities.

    Args:
        parser: Object exposing ``batch_results`` as produced by batch processing.
        output_path: Destination CSV path; missing parent folders are created.

    Returns:
        ``output_path``, or ``None`` when there are no processed documents.
    """
    if not hasattr(parser, 'batch_results') or not parser.batch_results:
        print("❌ No processed documents available")
        return None

    import pandas as pd

    columns = ['source_file', 'entity', 'relation', 'value', 'processing_date']

    all_entities = []
    for file_key, data in parser.batch_results.items():
        metadata = data.get('metadata', {})
        file_name = metadata.get('file_name', file_key)
        processing_date = metadata.get('processing_date', '')

        for entity in data.get('extracted_entities', []):
            all_entities.append({
                'source_file': file_name,
                'entity': entity.get('entity', ''),
                'relation': entity.get('relation', ''),
                'value': entity.get('value', ''),
                'processing_date': processing_date
            })

    # Naming the columns explicitly keeps the frame well-formed when no
    # entities exist; otherwise df['relation'] below raises KeyError.
    df = pd.DataFrame(all_entities, columns=columns)

    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False, encoding='utf-8')

    print(f"📊 Exported {len(all_entities)} entities to: {output_path}")
    print(f"📈 Entity types distribution:")
    print(df['relation'].value_counts().head(10))

    return output_path

# Google Colab specific helper functions

def upload_pdfs_to_colab():
    """Open Colab's upload dialog and move the uploaded PDFs into /content/pdf.

    Files whose name does not end in ``.pdf`` (case-insensitive) are left where
    the upload put them.

    Returns:
        List of destination paths (strings) of the PDFs that were moved.
    """
    from google.colab import files

    print("📤 Upload your PDF files...")
    uploaded = files.upload()

    # Target folder for the batch parser's input
    pdf_dir = Path("/content/pdf")
    pdf_dir.mkdir(exist_ok=True)

    moved_files = []
    for filename in uploaded:
        if not filename.lower().endswith('.pdf'):
            continue
        destination = pdf_dir / filename
        Path(f"/content/{filename}").rename(destination)
        moved_files.append(str(destination))
        print(f"📁 Moved {filename} to pdf directory")

    print(f"✅ Ready to process {len(moved_files)} PDF files")
    return moved_files

def download_results_from_drive():
    """Download the newest combined-results JSON and the newest report from Drive.

    "Newest" is decided by file modification time. Nothing is downloaded for a
    kind of file that does not exist; a notice is printed when the results
    folder itself is missing.
    """
    from google.colab import files

    results_dir = Path("/content/drive/MyDrive/pharma_analysis")
    if not results_dir.exists():
        print("❌ No results directory found")
        return

    # (glob pattern, label used in the progress message), in download order
    wanted = (
        ("combined_pharma_analysis_*.json", "results"),
        ("processing_report_*.txt", "report"),
    )
    for pattern, label in wanted:
        candidates = list(results_dir.glob(pattern))
        if not candidates:
            continue
        newest = max(candidates, key=lambda f: f.stat().st_mtime)
        print(f"📥 Downloading latest {label}: {newest.name}")
        files.download(str(newest))

# Example usage for Google Colab
# (kept inside a string literal so that none of it runs when the cell is executed)
"""
Google Colab Usage Example:

# 1. Setup and upload files
setup_colab_environment()
create_sample_directory_structure()
upload_pdfs_to_colab()  # Upload your PDFs

# 2. Process all PDFs
parser, results = process_pharma_pdfs_batch(
    pdf_directory="/content/pdf",
    output_drive_path="/content/drive/MyDrive/pharma_analysis",
    batch_size=3
)

# 3. Query results
list_processed_files(parser)
quick_query(parser, "Qual é a dosagem recomendada?")
quick_query(parser, "Quais são as contraindicações?", "specific_file.pdf")

# 4. Export data
export_entities_to_csv(parser)

# 5. Download results
download_results_from_drive()
"""

if __name__ == "__main__":
    # get_ipython only exists inside IPython/Jupyter kernels; in a plain Python
    # process the lookup raises NameError, which means "not in Colab".
    try:
        running_in_colab = 'google.colab' in str(get_ipython())
    except NameError:
        running_in_colab = False

    if running_in_colab:
        # For direct execution in Google Colab
        print("🔍 Detected Google Colab environment")
        print("Run the following commands to get started:")
        print("1. setup_colab_environment()")
        print("2. upload_pdfs_to_colab() or place PDFs in /content/pdf/")
        print("3. parser, results = process_pharma_pdfs_batch()")
    else:
        # Run normally for local execution
        main()

SyntaxError: invalid syntax (ipython-input-2891258971.py, line 808)

In [None]:
#!/usr/bin/env python3
"""
Improved Batch Automated Pharmaceutical Document Parser with Google Drive Integration
Fixed version with timeout handling, progress saving, and optimized prompts
Optimized for Google Colab environment
"""

import json
import re
import subprocess
import shlex
import os
import glob
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
import pymupdf4llm
import pdfplumber
from google.colab import drive
import time
import pickle

class ImprovedBatchPharmaParser:
    def __init__(self, model_name: str = "llama3.2:3b"):
        """
        Initialize improved batch parser with better timeout handling
        """
        self.model_name = model_name
        self.batch_results = {}
        self.failed_files = []
        self.processing_stats = {
            'total_files': 0,
            'successful': 0,
            'failed': 0,
            'start_time': None,
            'end_time': None
        }
        self.checkpoint_file = None
        self.setup_environment()

    def setup_environment(self):
        """Mount Google Drive, then prepare Ollama.

        A failed Drive mount is reported as a warning and does not stop the
        Ollama setup that follows.
        """
        print("🔧 Setting up improved Google Colab environment...")

        # Step 1: Google Drive (best effort)
        try:
            drive.mount('/content/drive')
        except Exception as mount_error:
            print(f"⚠️ Drive mount warning: {mount_error}")
        else:
            print("✅ Google Drive mounted successfully")

        # Step 2: Ollama service and model
        self.setup_ollama_improved()

    def setup_ollama_improved(self):
        """Make sure the Ollama server is running and the model is available.

        The server is probed with ``ollama list``; if the probe fails it is
        started in the background. The model is pulled when its name does not
        appear in the ``ollama list`` output. Problems during the model step
        are printed as warnings rather than raised.
        """
        print(f"Setting up Ollama model: {self.model_name}")

        try:
            # Check if ollama is running
            subprocess.run(["ollama", "list"], capture_output=True, check=True, timeout=10)
            print("✅ Ollama service is running")
        except (OSError, subprocess.SubprocessError):
            # Binary missing, non-zero exit or probe timeout. Catching only
            # these lets Ctrl-C / SystemExit interrupt the setup instead of
            # being swallowed.
            print("🚀 Starting Ollama service...")
            os.system("nohup ollama serve > /tmp/ollama.log 2>&1 &")
            time.sleep(10)

        try:
            # Check if model exists
            result = subprocess.run(
                ["ollama", "list"],
                capture_output=True,
                text=True,
                timeout=30
            )

            if self.model_name in result.stdout:
                print(f"✅ Model {self.model_name} already available")
            else:
                print(f"📥 Pulling model {self.model_name}...")
                subprocess.run(
                    ["ollama", "pull", self.model_name],
                    timeout=600,  # 10 minutes for model download
                    check=True
                )
                print(f"✅ Model {self.model_name} ready")

        except subprocess.TimeoutExpired:
            print("⚠️ Model setup timed out, continuing anyway...")
        except Exception as e:
            print(f"⚠️ Model setup error: {e}")

    def call_ollama_with_retry(self, prompt: str, max_retries: int = 3, timeout: int = 180) -> str:
        """Send a prompt to ``ollama run`` and return the model's stripped output.

        The call is repeated up to ``max_retries`` times. After an empty answer
        or a timeout the timeout grows by 50% (capped at 300 seconds); after any
        other error the method waits two seconds before the next attempt.

        Args:
            prompt: Text passed to the model on standard input.
            max_retries: Maximum number of attempts.
            timeout: Timeout of the first attempt, in seconds.

        Returns:
            The non-empty, stripped standard output of the model.

        Raises:
            RuntimeError: If no attempt produced output; the last underlying
                problem is attached as ``__cause__``.
        """
        # Use simpler command structure; the prompt is supplied on stdin
        cmd = ["ollama", "run", self.model_name, "--"]
        last_error = None

        for attempt in range(max_retries):
            try:
                print(f"  Attempt {attempt + 1}/{max_retries} (timeout: {timeout}s)")

                proc = subprocess.run(
                    cmd,
                    input=prompt,
                    text=True,
                    capture_output=True,
                    timeout=timeout
                )

                output = proc.stdout.strip()
                if output:
                    return output
                if proc.stderr:
                    print(f"  Warning: {proc.stderr.strip()}")
                last_error = RuntimeError("ollama returned no output")

            except subprocess.TimeoutExpired as e:
                last_error = e
                print(f"  Timeout after {timeout}s, retrying...")
            except Exception as e:
                print(f"  Error on attempt {attempt + 1}: {e}")
                if attempt == max_retries - 1:
                    # Chain the root cause so the traceback shows what failed.
                    raise RuntimeError(f"All {max_retries} attempts failed") from e
                last_error = e
                time.sleep(2)
                continue

            # Empty answer or timeout: give the next attempt more time.
            # int() keeps the value (and the progress message) a whole number.
            timeout = min(int(timeout * 1.5), 300)  # Max 5 minutes

        raise RuntimeError("All retry attempts failed") from last_error

    def extract_pdf_content_safe(self, pdf_path: str) -> str:
        """Extract the text of a PDF, trying pdfplumber first and pymupdf4llm second.

        pdfplumber reads at most the first 50 pages and skips pages that fail.
        If it yields no text or fails, pymupdf4llm is used instead. Text longer
        than 50,000 characters is cut and marked as truncated.

        Raises:
            Exception: If pymupdf4llm fails as well.
        """
        def clip(text):
            # Limit content size to prevent huge prompts (50k characters max)
            if len(text) > 50000:
                return text[:50000] + "\n[CONTENT TRUNCATED]"
            return text

        # First choice: pdfplumber (usually faster)
        try:
            print(f"  📖 Extracting text from {Path(pdf_path).name}")

            with pdfplumber.open(pdf_path) as pdf:
                page_texts = []
                for page_number, page in enumerate(pdf.pages[:50], start=1):
                    try:
                        extracted = page.extract_text()
                        if extracted:
                            page_texts.append(extracted)
                    except Exception as e:
                        print(f"    Warning: Page {page_number} extraction failed: {e}")
                        continue

                if page_texts:
                    return clip("\n\n".join(page_texts))

        except Exception as e:
            print(f"    pdfplumber failed: {e}")

        # Fallback: pymupdf4llm
        try:
            print("    Trying pymupdf4llm...")
            return clip(pymupdf4llm.to_markdown(pdf_path))
        except Exception as e:
            print(f"    pymupdf4llm failed: {e}")
            raise Exception("All extraction methods failed")

    def create_simple_entity_prompt(self, text: str) -> str:
        """Build the entity-extraction prompt from the first 2000 characters of ``text``.

        The prompt asks for a JSON array of ``name``/``type``/``value`` objects
        and is kept deliberately short to improve the success rate.
        """
        excerpt = text[:2000]
        sections = [
            'Extract key pharmaceutical information as JSON array. Only return valid JSON, no other text.',
            '',
            'Format: [{"name": "medication name", "type": "category", "value": "details"}]',
            '',
            'Categories: medication, dosage, indication, contraindication, side_effect, manufacturer, storage',
            '',
            'Text (first 2000 chars):',
            excerpt,
            '',
            'JSON:',
        ]
        return "\n".join(sections)

    def create_simple_summary_prompt(self, entities: List[Dict]) -> str:
        """Create simplified summary prompt"""

        entities_text = json.dumps(entities[:20], indent=1)  # Limit entities

        summary_prompt = f"""Create medication summary as JSON. Only return valid JSON, no other text.

Format:
{{"medication_name": "...", "main_use": "...", "key_warnings": "...", "dosage_info": "..."}}

Entities:
{entities_text}

JSON:"""

        return summary_prompt

    def process_single_document_improved(self, pdf_path: str) -> Optional[Dict]:
        """Run text extraction, entity extraction and summarisation for one PDF.

        Entity and summary failures are tolerated: the document is still
        returned with an empty entity list and/or an empty summary. The summary
        step only runs when entities were found.

        Returns:
            A dict with ``metadata``, ``entities``, ``summary`` and
            ``status == "success"``; ``None`` when no text could be extracted;
            or a dict with ``status == "failed"`` and the error text in
            ``metadata`` when extraction raised.
        """
        file_name = Path(pdf_path).name
        print(f"\n📄 Processing: {file_name}")

        began = time.time()

        try:
            # Text extraction (size-limited by the extractor)
            text = self.extract_pdf_content_safe(pdf_path)
            if not text:
                print("❌ No text content extracted")
                return None

            print(f"✅ Extracted {len(text):,} characters")

            # Stage 1: entity extraction
            found_entities = []
            try:
                print("🔍 Extracting entities (simplified approach)...")
                reply = self.call_ollama_with_retry(
                    self.create_simple_entity_prompt(text), max_retries=2, timeout=120
                )
                candidate = self.parse_json_response_safe(reply)

                if isinstance(candidate, list) and candidate:
                    found_entities = candidate
                    print(f"✅ Extracted {len(found_entities)} entities")
                else:
                    print("⚠️ Entity extraction returned no valid results")
            except Exception as e:
                print(f"⚠️ Entity extraction failed: {e}")

            # Stage 2: summary (needs entities to work from)
            overview = {}
            try:
                if found_entities:
                    print("📝 Generating summary...")
                    reply = self.call_ollama_with_retry(
                        self.create_simple_summary_prompt(found_entities), max_retries=2, timeout=60
                    )
                    candidate = self.parse_json_response_safe(reply)

                    if isinstance(candidate, dict) and candidate:
                        overview = candidate
                        print("✅ Summary generated")
                    else:
                        print("⚠️ Summary generation failed")
            except Exception as e:
                print(f"⚠️ Summary generation failed: {e}")

            # Assemble the result record
            elapsed = time.time() - began
            record = {
                "metadata": {
                    "file_path": pdf_path,
                    "file_name": file_name,
                    "processing_date": datetime.now().isoformat(),
                    "text_length": len(text),
                    "entity_count": len(found_entities),
                    "model_used": self.model_name,
                    "processing_time_seconds": round(elapsed, 2)
                },
                "entities": found_entities,
                "summary": overview,
                "status": "success"
            }

            print(f"✅ Successfully processed in {elapsed:.1f}s: {file_name}")
            return record

        except Exception as e:
            print(f"❌ Error processing {file_name}: {e}")
            failure_metadata = {
                "file_path": pdf_path,
                "file_name": file_name,
                "processing_date": datetime.now().isoformat(),
                "error": str(e),
                "model_used": self.model_name
            }
            return {"metadata": failure_metadata, "status": "failed"}

    def parse_json_response_safe(self, response: str) -> Any:
        """Extract the first JSON array or object from a model response.

        Markdown code-fence markers are removed (their content is kept), then
        each top-level ``[`` / ``{`` in the text is tried in order: first a
        strict parse, then a parse after light repairs (single quotes to double
        quotes, trailing commas removed).

        Args:
            response: Raw text returned by the model.

        Returns:
            The decoded JSON value, or ``None`` when nothing could be parsed.
        """
        if not response:
            return None

        # Models often wrap JSON in ``` fences. Remove only the fence markers
        # (with an optional language tag); the payload between them is the
        # part that must be kept.
        cleaned = re.sub(r'```[A-Za-z0-9_+-]*', '', response.strip())

        def matching_end(text: str, start: int) -> int:
            """Index of the bracket closing text[start], or -1 if unbalanced."""
            opener = text[start]
            closer = ']' if opener == '[' else '}'
            depth = 0
            in_string = False
            escaped = False
            for idx in range(start, len(text)):
                ch = text[idx]
                if in_string:
                    # Brackets inside string values must not be counted.
                    if escaped:
                        escaped = False
                    elif ch == '\\':
                        escaped = True
                    elif ch == '"':
                        in_string = False
                elif ch == '"':
                    in_string = True
                elif ch == opener:
                    depth += 1
                elif ch == closer:
                    depth -= 1
                    if depth == 0:
                        return idx
            return -1

        decoder = json.JSONDecoder()
        last_failure = None
        position = 0

        while position < len(cleaned):
            start = next(
                (i for i in range(position, len(cleaned)) if cleaned[i] in '[{'),
                -1
            )
            if start == -1:
                break

            # Strict parse; raw_decode ignores any prose after the JSON value.
            try:
                value, _ = decoder.raw_decode(cleaned, start)
                return value
            except json.JSONDecodeError:
                pass

            json_end = matching_end(cleaned, start)
            if json_end == -1:
                # Unbalanced (e.g. truncated) output: nothing reliable to repair.
                break

            # Try to fix common JSON issues
            json_str = cleaned[start:json_end + 1]
            json_str = json_str.replace("'", '"')  # Single to double quotes
            json_str = re.sub(r',\s*}', '}', json_str)  # Remove trailing commas
            json_str = re.sub(r',\s*]', ']', json_str)

            try:
                return json.loads(json_str)
            except json.JSONDecodeError as e:
                last_failure = (e, json_str)

            # This span was not JSON (e.g. "[note]" in prose); look further on.
            position = json_end + 1

        if last_failure is not None:
            error, json_str = last_failure
            print(f"JSON parse error: {error}")
            print(f"Problematic JSON: {json_str[:200]}...")
        else:
            print("Could not find valid JSON in response")
        return None

    def save_checkpoint(self, output_dir: Path, current_index: int):
        """Save current progress to avoid losing work"""
        checkpoint_data = {
            'batch_results': self.batch_results,
            'failed_files': self.failed_files,
            'processing_stats': self.processing_stats,
            'current_index': current_index,
            'timestamp': datetime.now().isoformat()
        }

        checkpoint_file = output_dir / "processing_checkpoint.pkl"
        with open(checkpoint_file, 'wb') as f:
            pickle.dump(checkpoint_data, f)

        print(f"💾 Checkpoint saved at file {current_index}")

    def load_checkpoint(self, output_dir: Path) -> int:
        """Load previous progress if available"""
        checkpoint_file = output_dir / "processing_checkpoint.pkl"

        if checkpoint_file.exists():
            try:
                with open(checkpoint_file, 'rb') as f:
                    checkpoint_data = pickle.load(f)

                self.batch_results = checkpoint_data.get('batch_results', {})
                self.failed_files = checkpoint_data.get('failed_files', [])
                self.processing_stats = checkpoint_data.get('processing_stats', self.processing_stats)

                current_index = checkpoint_data.get('current_index', 0)
                print(f"📂 Loaded checkpoint: resuming from file {current_index + 1}")
                return current_index + 1

            except Exception as e:
                print(f"⚠️ Could not load checkpoint: {e}")

        return 0

    def process_batch_improved(self,
                             pdf_directory: str = "/content/drive/MyDrive/pdf",
                             output_drive_path: str = "/content/drive/MyDrive/pharma_analysis",
                             batch_size: int = 5,
                             max_files: Optional[int] = None,
                             resume: bool = True) -> Dict:
        """
        Improved batch processing with checkpoint support

        Args:
            pdf_directory: Directory containing PDF files
            output_drive_path: Google Drive path to save results
            batch_size: Number of files to process before saving checkpoint
            max_files: Maximum number of files to process (None for all)
            resume: Resume from checkpoint if available
        """

        print("🚀 Starting improved batch processing...")
        print("=" * 70)

        # Setup
        self.processing_stats['start_time'] = datetime.now()

        # Create output directory
        output_dir = Path(output_drive_path)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Create subdirectories
        individual_dir = output_dir / "individual_analyses"
        individual_dir.mkdir(exist_ok=True)

        # Load checkpoint if resuming
        start_index = 0
        if resume:
            start_index = self.load_checkpoint(output_dir)

        # Get PDF files
        try:
            pdf_files = self.get_pdf_files(pdf_directory)
        except FileNotFoundError as e:
            print(f"❌ {e}")
            return {}

        if not pdf_files:
            print("❌ No PDF files found")
            return {}

        # Limit files if specified
        if max_files:
            pdf_files = pdf_files[:max_files]

        self.processing_stats['total_files'] = len(pdf_files)

        print(f"📁 Processing {len(pdf_files)} files starting from index {start_index}")

        # Process files starting from checkpoint
        for i in range(start_index, len(pdf_files)):
            pdf_path = pdf_files[i]

            print(f"\n📊 Progress: {i+1}/{len(pdf_files)} ({(i+1)/len(pdf_files)*100:.1f}%)")
            print(f"⏱️ Estimated time remaining: {self._estimate_time_remaining(i, len(pdf_files))} minutes")

            # Process single document
            result = self.process_single_document_improved(pdf_path)

            if result and result.get('status') == 'success':
                file_key = Path(pdf_path).stem
                self.batch_results[file_key] = result
                self.processing_stats['successful'] += 1

                # Save individual file
                individual_file = individual_dir / f"{file_key}_analysis.json"
                with open(individual_file, 'w', encoding='utf-8') as f:
                    json.dump(result, f, indent=2, ensure_ascii=False)

            else:
                self.failed_files.append(pdf_path)
                self.processing_stats['failed'] += 1

            # Save checkpoint every batch_size files
            if (i + 1) % batch_size == 0:
                self.save_checkpoint(output_dir, i)

            # Show current stats
            success_rate = (self.processing_stats['successful'] / (i + 1)) * 100
            print(f"📈 Current success rate: {success_rate:.1f}%")

            # Add delay to prevent overloading
            time.sleep(2)

        # Final save
        self.processing_stats['end_time'] = datetime.now()
        final_results_path = self.save_final_results_improved(output_dir)
        self.generate_processing_report_improved(output_dir)

        print(f"\n🎉 Batch processing completed!")
        self._show_batch_summary()

        return self.batch_results

    def get_pdf_files(self, pdf_directory: str) -> List[str]:
        """List the PDF files in a directory that are worth processing.

        Files are matched by a case-insensitive ``.pdf`` extension, each path is
        returned once, and hidden files (names starting with ``.``) and
        non-files are ignored. Files of 1,000 bytes or less, or 50,000,000 bytes
        or more, are skipped with a message.

        Args:
            pdf_directory: Directory to scan (not recursive).

        Returns:
            Sorted list of file paths as strings.

        Raises:
            FileNotFoundError: If ``pdf_directory`` does not exist.
        """
        pdf_dir = Path(pdf_directory)

        if not pdf_dir.exists():
            raise FileNotFoundError(f"Directory not found: {pdf_directory}")

        # Matching on the lowered suffix catches .pdf/.PDF/.Pdf and avoids the
        # duplicates two separate patterns produce on case-insensitive filesystems.
        pdf_files = []
        if pdf_dir.is_dir():
            for entry in pdf_dir.iterdir():
                if entry.name.startswith('.'):
                    continue  # hidden files are not matched by a '*' glob either
                if entry.suffix.lower() == '.pdf' and entry.is_file():
                    pdf_files.append(str(entry))

        # Filter out very small or very large files
        valid_files = []
        for pdf_file in pdf_files:
            try:
                file_size = Path(pdf_file).stat().st_size
                if 1000 < file_size < 50_000_000:  # Between 1KB and 50MB
                    valid_files.append(pdf_file)
                else:
                    print(f"⚠️ Skipping {Path(pdf_file).name} (size: {file_size:,} bytes)")
            except Exception as e:
                print(f"⚠️ Error checking {Path(pdf_file).name}: {e}")

        valid_files.sort()
        print(f"📁 Found {len(valid_files)} valid PDF files in {pdf_directory}")

        return valid_files

    def save_final_results_improved(self, output_dir: Path) -> Path:
        """Save final results with better organization"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        final_file = output_dir / f"batch_analysis_results_{timestamp}.json"

        # Create summary statistics
        entity_counts = {}
        total_entities = 0

        for file_key, data in self.batch_results.items():
            entities = data.get('entities', [])
            total_entities += len(entities)

            for entity in entities:
                entity_type = entity.get('type', 'unknown')
                entity_counts[entity_type] = entity_counts.get(entity_type, 0) + 1

        final_data = {
            "batch_info": {
                "processing_date": datetime.now().isoformat(),
                "model_used": self.model_name,
                "total_files_attempted": self.processing_stats['total_files'],
                "successful_files": self.processing_stats['successful'],
                "failed_files": self.processing_stats['failed'],
                "success_rate_percent": round((self.processing_stats['successful']/self.processing_stats['total_files'])*100, 1),
                "total_processing_time_minutes": self._get_processing_time_minutes(),
                "total_entities_extracted": total_entities
            },
            "entity_statistics": entity_counts,
            "failed_file_list": [Path(f).name for f in self.failed_files],
            "document_results": self.batch_results
        }

        with open(final_file, 'w', encoding='utf-8') as f:
            json.dump(final_data, f, indent=2, ensure_ascii=False)

        print(f"💾 Final results saved: {final_file}")
        return final_file

    def generate_processing_report_improved(self, output_dir: Path):
        """Generate improved processing report"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_file = output_dir / f"processing_report_{timestamp}.md"

        with open(report_file, 'w', encoding='utf-8') as f:
            f.write("# Pharmaceutical Document Processing Report\n\n")

            f.write(f"**Processing Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"**Model Used:** {self.model_name}\n")
            f.write(f"**Processing Time:** {self._get_processing_time_minutes():.1f} minutes\n\n")

            f.write("## Summary Statistics\n\n")
            total = self.processing_stats['total_files']
            successful = self.processing_stats['successful']
            failed = self.processing_stats['failed']

            f.write(f"- **Total files:** {total}\n")
            f.write(f"- **Successfully processed:** {successful} ({successful/total*100:.1f}%)\n")
            f.write(f"- **Failed:** {failed} ({failed/total*100:.1f}%)\n\n")

            if self.failed_files:
                f.write("## Failed Files\n\n")
                for failed_file in self.failed_files:
                    f.write(f"- {Path(failed_file).name}\n")
                f.write("\n")

            f.write("## Successfully Processed Files\n\n")
            for file_key, data in self.batch_results.items():
                metadata = data.get('metadata', {})
                f.write(f"### {metadata.get('file_name', file_key)}\n\n")
                f.write(f"- **Text length:** {metadata.get('text_length', 0):,} characters\n")
                f.write(f"- **Entities extracted:** {metadata.get('entity_count', 0)}\n")
                f.write(f"- **Processing time:** {metadata.get('processing_time_seconds', 0):.1f}s\n\n")

        print(f"📊 Processing report saved: {report_file}")

    def _estimate_time_remaining(self, current_index: int, total_files: int) -> float:
        """Estimate time remaining based on current progress"""
        if current_index == 0:
            return 0

        elapsed_time = (datetime.now() - self.processing_stats['start_time']).total_seconds() / 60
        avg_time_per_file = elapsed_time / (current_index + 1)
        remaining_files = total_files - current_index - 1

        return avg_time_per_file * remaining_files

    def _get_processing_time_minutes(self) -> float:
        """Calculate processing time in minutes"""
        if self.processing_stats['start_time'] and self.processing_stats['end_time']:
            delta = self.processing_stats['end_time'] - self.processing_stats['start_time']
            return delta.total_seconds() / 60
        return 0

    def _show_batch_summary(self):
        """Show improved batch processing summary"""
        print("\n" + "=" * 70)
        print("📊 BATCH PROCESSING SUMMARY")
        print("=" * 70)

        total = self.processing_stats['total_files']
        successful = self.processing_stats['successful']
        failed = self.processing_stats['failed']

        print(f"🤖 Model: {self.model_name}")
        print(f"📁 Total files: {total}")
        print(f"✅ Successful: {successful} ({successful/total*100:.1f}%)")
        print(f"❌ Failed: {failed} ({failed/total*100:.1f}%)")
        print(f"⏱️ Total time: {self._get_processing_time_minutes():.1f} minutes")

        if successful > 0:
            avg_time = self._get_processing_time_minutes() / successful
            print(f"📈 Average time per file: {avg_time:.1f} minutes")

        total_entities = sum(len(data.get('entities', [])) for data in self.batch_results.values())
        print(f"🔍 Total entities extracted: {total_entities:,}")

        if self.failed_files:
            print(f"\n❌ Failed files ({len(self.failed_files)}):")
            for failed_file in self.failed_files[:5]:  # Show first 5
                print(f"   - {Path(failed_file).name}")
            if len(self.failed_files) > 5:
                print(f"   ... and {len(self.failed_files) - 5} more")

        print("=" * 70)

    def quick_search(self, query: str, limit: int = 5) -> List[Dict]:
        """Quick search across processed results"""
        if not self.batch_results:
            print("❌ No processed results available")
            return []

        query_words = query.lower().split()
        results = []

        for file_key, data in self.batch_results.items():
            score = 0
            matches = []

            # Search in entities
            entities = data.get('entities', [])
            for entity in entities:
                entity_text = f"{entity.get('name', '')} {entity.get('value', '')}".lower()
                for word in query_words:
                    if word in entity_text:
                        score += 1
                        matches.append(entity)

            # Search in summary
            summary = data.get('summary', {})
            summary_text = json.dumps(summary).lower()
            for word in query_words:
                score += summary_text.count(word)

            if score > 0:
                results.append({
                    'file_name': data['metadata']['file_name'],
                    'score': score,
                    'relevant_entities': matches[:3],  # Top 3 matches
                    'summary': summary
                })

        # Sort by relevance
        results.sort(key=lambda x: x['score'], reverse=True)
        return results[:limit]


# Improved utility functions

def setup_improved_environment():
    """Install the PDF parsing packages and make sure the Ollama CLI is present.

    Installs ``pymupdf4llm`` and ``pdfplumber`` with pip for the running
    interpreter, then runs ``ollama --version``; if that fails for any reason
    the official install script is run. Failures are reported as warnings and
    do not raise.
    """
    import sys  # local import: only needed to locate the running interpreter

    print("🔧 Setting up improved environment...")

    # Install packages
    for package in ("pymupdf4llm", "pdfplumber"):
        try:
            # Use the kernel's own interpreter and check the exit status so a
            # failed install is actually reported.
            completed = subprocess.run(
                [sys.executable, "-m", "pip", "install", "-q", package],
                check=False
            )
            if completed.returncode != 0:
                print(f"Warning: {package} installation issue: pip exited with code {completed.returncode}")
        except OSError as e:
            print(f"Warning: {package} installation issue: {e}")

    # Check Ollama
    try:
        result = subprocess.run(["ollama", "--version"], capture_output=True, timeout=10)
        ollama_available = result.returncode == 0
    except (OSError, subprocess.TimeoutExpired):
        # Binary missing/not executable, or the version check hung.
        ollama_available = False

    if ollama_available:
        print("✅ Ollama available")
    else:
        print("📦 Installing Ollama...")
        exit_status = os.system("curl -fsSL https://ollama.ai/install.sh | sh")
        if exit_status != 0:
            print(f"Warning: Ollama installation issue: installer exited with status {exit_status}")

    print("✅ Environment ready!")

def process_pharma_batch_safe(pdf_directory: str = "/content/drive/MyDrive/pdf",
                            output_path: str = "/content/drive/MyDrive/pharma_analysis",
                            max_files: int = 10,
                            model: str = "llama3.2:3b"):
    """
    Run a size-limited batch with conservative settings.

    Builds an ``ImprovedBatchPharmaParser`` for ``model`` and runs its batch
    processing with a checkpoint every 3 files and resume enabled.

    Args:
        pdf_directory: Input directory with PDFs
        output_path: Output directory on Google Drive
        max_files: Maximum files to process (to prevent overload)
        model: Ollama model to use

    Returns:
        Tuple ``(parser, results)`` where ``results`` is whatever
        ``process_batch_improved`` returned.
    """
    print(f"🚀 Starting safe batch processing (max {max_files} files)")

    batch_options = {
        "pdf_directory": pdf_directory,
        "output_drive_path": output_path,
        "batch_size": 3,      # Save progress every 3 files
        "max_files": max_files,
        "resume": True,       # Resume from checkpoint if available
    }

    parser = ImprovedBatchPharmaParser(model_name=model)
    return parser, parser.process_batch_improved(**batch_options)

def search_results(parser, query: str):
    """Run ``parser.quick_search`` for ``query`` and print the top hits.

    For every hit the file name and relevance score are printed, followed by
    the matching entities and, when the summary has a ``medication_name``, the
    medication and its main use. Nothing is returned.
    """
    if not hasattr(parser, 'batch_results'):
        print("❌ No processed results available")
        return

    hits = parser.quick_search(query, limit=5)
    if not hits:
        print(f"❌ No results found for: {query}")
        return

    print(f"🔍 Search results for '{query}':")
    print("=" * 50)

    rank = 0
    for hit in hits:
        rank += 1
        print(f"\n{rank}. **{hit['file_name']}** (relevance: {hit['score']})")

        # Matching entities, if any
        related = hit['relevant_entities']
        if related:
            print("   Relevant info:")
            for item in related:
                print(f"   - {item.get('name', 'N/A')}: {item.get('value', 'N/A')}")

        # Medication details from the summary, if present
        overview = hit.get('summary', {})
        if not overview:
            continue
        if overview.get('medication_name'):
            print(f"   Medication: {overview.get('medication_name')}")
            print(f"   Use: {overview.get('main_use', 'N/A')}")

def show_processing_stats(parser):
    """Print processing statistics for a parser that has run a batch.

    Shows files processed, success rate and total time, plus total and average
    entity counts when results exist. The success rate is reported as 0.0% when
    no files were attempted (for example when the input directory was missing),
    instead of raising ZeroDivisionError.

    Args:
        parser: Object exposing ``processing_stats``, ``batch_results`` and
            ``_get_processing_time_minutes()``.
    """
    if not hasattr(parser, 'processing_stats'):
        print("❌ No processing stats available")
        return

    stats = parser.processing_stats
    # batch_results is read below but was never checked for existence.
    results = getattr(parser, 'batch_results', None) or {}

    total_files = stats['total_files']
    success_rate = (stats['successful'] / total_files * 100) if total_files else 0.0

    print("📊 PROCESSING STATISTICS")
    print("=" * 40)
    print(f"Files processed: {stats['successful']}/{total_files}")
    print(f"Success rate: {success_rate:.1f}%")
    print(f"Total time: {parser._get_processing_time_minutes():.1f} minutes")

    if results:
        total_entities = sum(len(data.get('entities', [])) for data in results.values())
        avg_entities = total_entities / len(results)
        print(f"Total entities: {total_entities:,}")
        print(f"Average entities per file: {avg_entities:.1f}")

# Example usage for Google Colab
if __name__ == "__main__":

    # Step 1: make sure the packages and Ollama are installed
    setup_improved_environment()

    # Step 2: run a small trial batch before committing to the full corpus
    print("\n🚀 Starting pharmaceutical document processing...")
    print("=" * 70)

    trial_settings = dict(
        pdf_directory="/content/drive/MyDrive/pdf",
        output_path="/content/drive/MyDrive/pharma_analysis",
        max_files=5,  # Start small to test
        model="llama3.2:3b",
    )
    parser, results = process_pharma_batch_safe(**trial_settings)

    # Step 3: report how the batch went
    show_processing_stats(parser)

    # Step 4: exercise the search helper with a few Portuguese terms
    if results:
        print("\n🔍 Testing search functionality...")
        for term in ("dosagem", "contraindicação", "efeitos colaterais"):
            search_results(parser, term)

    print("\n✅ Processing complete! Check your Google Drive for results.")

# Additional helper functions

def continue_processing(parser, max_additional_files: int = 20,
                        pdf_directory: Optional[str] = None,
                        output_drive_path: Optional[str] = None):
    """Continue processing more files from where the previous run left off.

    Raises the file limit by ``max_additional_files`` over the previous run's
    ``total_files`` and resumes from the saved checkpoint.

    Args:
        parser: Parser that already ran a batch (its
            ``processing_stats['total_files']`` is the previous limit).
        max_additional_files: How many more files to allow.
        pdf_directory: Input directory; when None the parser's own default is
            used. Pass the directory of the original run if it was not the
            default.
        output_drive_path: Output directory holding the checkpoint; when None
            the parser's own default is used.

    Returns:
        Whatever ``parser.process_batch_improved`` returns.
    """
    options = {
        "max_files": parser.processing_stats['total_files'] + max_additional_files,
        "resume": True,
    }
    # Forward the paths only when given, so the parser's defaults stay in effect.
    if pdf_directory is not None:
        options["pdf_directory"] = pdf_directory
    if output_drive_path is not None:
        options["output_drive_path"] = output_drive_path

    return parser.process_batch_improved(**options)

def export_to_excel(parser, output_path: str = "/content/drive/MyDrive/pharma_analysis/entities.xlsx"):
    """Flatten every extracted entity into one table and write it to Excel.

    One row is produced per entity, with the source file name, the entity's
    name/type/value and the document's processing date.

    Args:
        parser: Object whose ``batch_results`` maps keys to analysis dicts.
        output_path: Destination ``.xlsx`` path.

    Returns:
        ``output_path`` on success; None if an import is missing or the export
        fails (a message is printed in both cases).
    """
    try:
        import pandas as pd

        rows = []
        for document in parser.batch_results.values():
            source_name = document['metadata']['file_name']

            for item in document.get('entities', []):
                rows.append(dict(
                    source_file=source_name,
                    entity_name=item.get('name', ''),
                    entity_type=item.get('type', ''),
                    entity_value=item.get('value', ''),
                    processing_date=document['metadata']['processing_date'],
                ))

        pd.DataFrame(rows).to_excel(output_path, index=False)

        print(f"📊 Exported {len(rows)} entities to Excel: {output_path}")
        return output_path

    except ImportError:
        print("❌ pandas not available. Install with: !pip install pandas openpyxl")
    except Exception as e:
        print(f"❌ Export failed: {e}")

def cleanup_checkpoints(output_dir: str = "/content/drive/MyDrive/pharma_analysis"):
    """Delete the checkpoint pickle and any ``interim_results_*.json`` files.

    Only the top level of ``output_dir`` is searched. A file that cannot be
    removed produces a warning and does not stop the cleanup.
    """
    base = Path(output_dir)

    # Collect everything up front: the checkpoint first, then interim results.
    stale_files = [
        *base.glob("processing_checkpoint.pkl"),
        *base.glob("interim_results_*.json"),
    ]

    for stale in stale_files:
        try:
            stale.unlink()
        except Exception as e:
            print(f"Warning: Could not remove {stale.name}: {e}")
        else:
            print(f"🗑️ Removed checkpoint: {stale.name}")

# Usage banner shown once the cell has finished defining the helpers.
print(
    "🎉 Improved Pharmaceutical Document Parser loaded!",
    "📝 Usage:",
    "1. setup_improved_environment()",
    "2. parser, results = process_pharma_batch_safe(max_files=5)",
    "3. search_results(parser, 'your_search_term')",
    "4. export_to_excel(parser)",
    sep="\n",
)

🔧 Setting up improved environment...
✅ Ollama available
✅ Environment ready!

🚀 Starting pharmaceutical document processing...
🚀 Starting safe batch processing (max 5 files)
🔧 Setting up improved Google Colab environment...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Google Drive mounted successfully
Setting up Ollama model: llama3.2:3b
🚀 Starting Ollama service...
✅ Model llama3.2:3b already available
🚀 Starting improved batch processing...
📁 Found 341 valid PDF files in /content/drive/MyDrive/pdf
📁 Processing 5 files starting from index 0

📊 Progress: 1/5 (20.0%)
⏱️ Estimated time remaining: 0 minutes

📄 Processing: bula_1755192077396.pdf
  📖 Extracting text from bula_1755192077396.pdf
✅ Extracted 11,936 characters
🔍 Extracting entities (simplified approach)...
  Attempt 1/2 (timeout: 120s)
  Timeout after 120s, retrying...
  Attempt 2/2 (timeout: 180.0s)
  Timeout after 180.0s, retrying...
⚠️ Enti

helloouu

In [None]:
#!/usr/bin/env python3
"""
Enhanced Pharmaceutical Knowledge Graph Extractor for Google Colab - Phrase-Based JSON Processing

Processes phrase-optimized JSON files from the enhanced pharmaceutical document parser.
Optimized for small language models with robust extraction and error handling.
Includes detailed logging of prompts and responses for debugging model performance.
"""

import json
import os
import re
import requests
import time
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
import logging
from google.colab import drive

# ==============================================================================
# CORE LOGIC: Enhanced PharmaceuticalKnowledgeExtractor CLASS
# ==============================================================================

class EnhancedPharmaceuticalKnowledgeExtractor:
    """Extract [entity, relation, value] triples from phrase-optimized JSON files.

    Each phrase block of an input file is filtered for pharmaceutical relevance,
    turned into a Portuguese extraction prompt, sent to an Ollama ``/api/generate``
    endpoint, and the model's JSON answer is parsed and validated into triples.
    Running counters are kept in ``self.stats``.
    """

    def __init__(self,
                 model_name: str = "llama3:8b", # Default is now 8B
                 ollama_url: str = "http://localhost:11434/api/generate",
                 max_retries: int = 3,
                 request_delay: float = 0.5):
        """
        Initialize the enhanced knowledge extractor optimized for phrase-based JSON files.

        Args:
            model_name: Ollama model tag sent with every request.
            ollama_url: Full URL of the Ollama generate endpoint.
            max_retries: Number of attempts per API call before giving up.
            request_delay: Seconds slept after each phrase and added to the
                retry back-off.

        Raises:
            ConnectionError: If the Ollama endpoint cannot be reached.
        """
        self.model_name = model_name
        self.ollama_url = ollama_url
        self.max_retries = max_retries
        self.request_delay = request_delay

        self.stats = {
            'files_processed': 0,
            'phrase_blocks_processed': 0,
            'table_blocks_processed': 0,
            'phrases_processed': 0,
            'successful_extractions': 0,
            'failed_extractions': 0,
            'total_triples': 0,
            'skipped_irrelevant': 0
        }

        # Enhanced patterns for better pharmaceutical content detection
        # (matched as lowercase substrings by _should_process_phrase)
        self.pharma_keywords = [
            'mg', 'ml', 'g/', 'mcg', 'μg', '%', 'dose', 'dosagem', 'posologia',
            'comprimido', 'cápsula', 'medicamento', 'fármaco', 'droga',
            'indicação', 'indicado', 'tratamento', 'terapia',
            'contraindicação', 'contraindicado', 'não usar', 'evitar',
            'efeito', 'reação', 'adverso', 'colateral', 'indesejável',
            'alergia', 'hipersensibilidade', 'intolerância',
            'administração', 'aplicar', 'tomar', 'ingerir',
            'composição', 'princípio ativo', 'substância', 'excipiente',
            'interação', 'interagir', 'incompatível', 'interferir',
            'gravidez', 'gestação', 'lactação', 'amamentação',
            'criança', 'pediátrico', 'adulto', 'idoso', 'geriátrico'
        ]

        self._setup_logging()
        self._test_ollama_connection()

    def _setup_logging(self):
        """Configure the general logger and the dedicated prompt/response logger.

        Safe to call repeatedly (e.g. when the cell is re-run in Colab): handlers
        from a previous call are removed *and closed* before new ones are
        installed, so no file handles are leaked.
        """
        # Remove existing handlers to avoid duplicates in Colab; close them so
        # the log file opened by a previous instance is released.
        for handler in logging.root.handlers[:]:
            logging.root.removeHandler(handler)
            handler.close()

        # General logger for progress and errors
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('enhanced_pharma_extraction.log', encoding='utf-8'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

        # Dedicated logger for prompts and responses
        self.prompt_logger = logging.getLogger('prompt_logger')
        self.prompt_logger.setLevel(logging.INFO)
        # Keep full prompts out of the console and the general log file; they
        # belong only in the dedicated prompt log.
        self.prompt_logger.propagate = False

        # Replace any handler left over from a previous instance. Opening a
        # second handle with mode='w' while the old one is still attached would
        # truncate the file underneath the live handler.
        for stale_handler in self.prompt_logger.handlers[:]:
            self.prompt_logger.removeHandler(stale_handler)
            stale_handler.close()

        prompt_handler = logging.FileHandler(
            'enhanced_prompts_and_responses.log', mode='w', encoding='utf-8'
        )
        prompt_handler.setFormatter(logging.Formatter('%(message)s'))
        self.prompt_logger.addHandler(prompt_handler)

    def _test_ollama_connection(self):
        """Send a tiny request to Ollama to verify that the endpoint is reachable.

        A non-200 answer is only logged as a warning; transport-level failures
        are re-raised as ``ConnectionError`` (chained to the original error).

        Raises:
            ConnectionError: If the request cannot be completed.
        """
        try:
            test_payload = {
                "model": self.model_name,
                "prompt": "Teste de conexão. Responda apenas 'OK'.",
                "stream": False,
                "format": "json",
                "options": {"temperature": 0.0, "num_predict": 10}
            }
            response = requests.post(self.ollama_url, json=test_payload, timeout=15)
            if response.status_code == 200:
                self.logger.info(f"✅ Successfully connected to Ollama with {self.model_name}")
                result = response.json()
                self.logger.debug(f"Test response: {result.get('response', 'No response')}")
            else:
                self.logger.warning(f"⚠️ Ollama connection test failed: {response.status_code} - {response.text}")
        except requests.exceptions.ConnectionError as e:
            self.logger.error("❌ Cannot connect to Ollama API. Ensure it's running and accessible from this Colab notebook.")
            raise ConnectionError("Cannot connect to Ollama API. Ensure it's running and accessible (e.g., via ngrok).") from e
        except Exception as e:
            self.logger.error(f"❌ Failed to connect to Ollama: {e}")
            raise ConnectionError(f"Ollama connection failed: {e}") from e

    def _call_ollama_api(self, prompt: str, max_tokens: int = 200) -> Optional[str]:
        """Call the Ollama generate endpoint, retrying with exponential back-off.

        Args:
            prompt: Prompt text to send.
            max_tokens: Value passed as the ``num_predict`` option.

        Returns:
            The stripped ``response`` text, or ``None`` if every attempt failed
            or returned an empty answer.
        """
        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "stream": False,
            "format": "json",
            "options": {
                "temperature": 0.0,  # Deterministic for structured output
                "top_p": 0.9,
                "top_k": 20,
                "num_predict": max_tokens,
                "stop": ["\n\n", "---", "Exemplos:", "Examples:", "Nota:", "Note:"],
                "repeat_penalty": 1.1,
                "num_ctx": 4096,  # Context window for 8B model
            }
        }

        for attempt in range(self.max_retries):
            try:
                self.logger.debug(f"API call attempt {attempt + 1}/{self.max_retries}")
                response = requests.post(self.ollama_url, json=payload, timeout=120)

                if response.status_code == 200:
                    result = response.json()
                    response_text = result.get('response', '').strip()
                    if response_text:
                        return response_text
                    else:
                        self.logger.warning("Empty response from API")
                else:
                    self.logger.warning(f"API error {response.status_code}: {response.text[:200]}...")

            except requests.exceptions.Timeout:
                self.logger.warning(f"Request timeout on attempt {attempt + 1}")
            except requests.exceptions.ConnectionError:
                self.logger.warning(f"Connection error on attempt {attempt + 1}")
            except Exception as e:
                self.logger.warning(f"Request error on attempt {attempt + 1}: {e}")

            # Back off 1s, 2s, 4s, ... (plus the base delay) between attempts,
            # but do not sleep after the final one.
            if attempt < self.max_retries - 1:
                wait_time = (2 ** attempt) + self.request_delay
                self.logger.debug(f"Waiting {wait_time:.1f}s before retry...")
                time.sleep(wait_time)

        self.logger.error("All API call attempts failed")
        return None

    def _create_enhanced_extraction_prompt(self, phrase: str, context: Dict, phrase_type: Optional[str] = None) -> str:
        """Build the Portuguese extraction prompt for one phrase.

        Args:
            phrase: The phrase text to analyse.
            context: Phrase context; ``breadcrumb`` is used as the section label
                and ``metadata.phrase_type`` as a fallback phrase category.
            phrase_type: Phrase category; selects the category-specific
                instruction. Unknown categories use the general instruction.

        Returns:
            The complete prompt string, ending with ``JSON:``.
        """
        section_info = context.get('breadcrumb', 'Seção Desconhecida')
        phrase_category = phrase_type or context.get('metadata', {}).get('phrase_type', 'geral')

        # Create more specific instructions based on phrase type
        specific_instructions = {
            'dosage_instruction': 'Foque em doses, quantidades, frequências de administração.',
            'indication': 'Extraia para que condições ou doenças o medicamento é indicado.',
            'contraindication': 'Identifique quando o medicamento NÃO deve ser usado.',
            'side_effect': 'Extraia efeitos adversos, reações indesejáveis.',
            'precaution': 'Identifique cuidados, precauções, advertências.',
            'numerical_data': 'Extraia dados numéricos relevantes (doses, concentrações).',
            'general_information': 'Extraia qualquer informação farmacêutica relevante.'
        }

        instruction = specific_instructions.get(phrase_category, specific_instructions['general_information'])

        return f"""Você é um especialista em extrair informações farmacêuticas. Analise esta frase e extraia APENAS fatos reais como triplas JSON.

CONTEXTO: {section_info}
TIPO: {phrase_category}
INSTRUÇÃO: {instruction}

FRASE: "{phrase}"

REGRAS IMPORTANTES:
1. Extraia SOMENTE informações que estão EXPLÍCITAS na frase
2. NÃO invente ou suponha informações
3. Use nomes de medicamentos exatos quando mencionados
4. Para doses, inclua unidades (mg, ml, etc.)
5. Se não há informação farmacêutica específica, retorne []

FORMATO: Array JSON de triplas [entidade, relação, valor]

EXEMPLOS DE FORMATO (NÃO COPIE O CONTEÚDO):
- [["Paracetamol", "tem_dose", "500mg"]]
- [["medicamento", "é_indicado_para", "dor de cabeça"]]
- [["substância", "pode_causar", "náusea"]]

JSON:"""

    def _collect_triple_candidates(self, node: Any) -> List[List[Any]]:
        """Recursively collect every 3-item list of scalars found in parsed JSON.

        Walks lists and dict values so that triples are found whether the model
        answered with a bare triple, a list of triples, or an object wrapping
        such a list. Items that are not 3-element scalar lists are ignored
        instead of invalidating their siblings.
        """
        if isinstance(node, dict):
            children = list(node.values())
        elif isinstance(node, list):
            is_triple = len(node) == 3 and all(
                isinstance(item, (str, int, float)) and not isinstance(item, bool)
                for item in node
            )
            if is_triple:
                return [node]
            children = node
        else:
            return []

        found: List[List[Any]] = []
        for child in children:
            found.extend(self._collect_triple_candidates(child))
        return found

    def _find_embedded_triples(self, text: str) -> List[List[Any]]:
        """Fallback: locate a JSON array embedded in free text and parse it."""
        # Find the JSON array within the response string
        match = re.search(r'\[\s*(\[.*?\])\s*\]', text, re.DOTALL)
        if not match:
            # Fallback for single triple
            match = re.search(r'(\[.*?\])', text, re.DOTALL)

        if not match:
            self.logger.warning(f"Could not find a valid JSON array/object in response: {text[:100]}...")
            return []

        clean_json_str = match.group(0)
        try:
            return self._collect_triple_candidates(json.loads(clean_json_str))
        except ValueError:  # json.JSONDecodeError is a ValueError
            self.logger.warning(f"Could not parse response: {clean_json_str[:100]}...")
            return []

    def _parse_triples_response_enhanced(self, response: str) -> List[List[str]]:
        """Parse a model response into validated triples.

        The whole response is first parsed as JSON (requests are sent with
        ``format: json``), which avoids the regex cutting a triple short at a
        ``]`` inside a string value. Only if that yields nothing is the
        regex-based search for an embedded array used.

        Args:
            response: Raw response text from the model.

        Returns:
            A list of ``[entity, relation, value]`` string triples that passed
            ``_validate_and_filter_triples``; empty if none were found.
        """
        if not response:
            return []

        text = response.strip()
        try:
            candidates = self._collect_triple_candidates(json.loads(text))
        except ValueError:
            # Not pure JSON (e.g. prose around the array); try the regex path.
            candidates = []

        if not candidates:
            candidates = self._find_embedded_triples(text)

        return self._validate_and_filter_triples(candidates)


    def _validate_and_filter_triples(self, triples: List[List[str]]) -> List[List[str]]:
        """Drop malformed, template-echo, placeholder and trivially generic triples.

        Args:
            triples: Candidate triples (each expected to be a 3-item list).

        Returns:
            The surviving triples with every element converted to a stripped string.
        """
        valid_triples = []
        # Entities/values that appear in the prompt's format examples; a triple
        # combining them is treated as the model echoing the template.
        template_entities = ['medication', 'medicamento', 'paracetamol', 'substância', 'fármaco']
        template_values = ['500mg', 'comprimidos', 'dor de cabeça', 'náusea', 'exemplo']

        for triple in triples:
            if not isinstance(triple, list) or len(triple) != 3:
                continue

            entity, relation, value = [str(x).strip() for x in triple]

            # Reject empty or single-character components
            if not all([entity, relation, value]) or any(len(x) < 2 for x in [entity, relation, value]):
                continue

            if (entity.lower() in template_entities and
                any(tv in value.lower() for tv in template_values)):
                continue

            # Reject unfilled placeholders such as "<entidade>"
            if any(x.startswith('<') and x.endswith('>') for x in [entity, relation, value]):
                continue

            generic_relations = ['é', 'tem', 'faz', 'usa']
            if relation.lower() in generic_relations and len(value) < 5:
                continue

            valid_triples.append([entity, relation, value])

        return valid_triples

    def _should_process_phrase(self, phrase: str, metadata: Optional[Dict] = None) -> bool:
        """Decide whether a phrase is worth sending to the model.

        A phrase is processed when it is at least 15 characters long (stripped)
        and either its ``phrase_type`` is one of the core pharmaceutical
        categories, it contains a pharma keyword, a number followed by a unit,
        or any digit in a phrase longer than 30 characters.
        """
        if len(phrase.strip()) < 15:
            return False

        phrase_lower = phrase.lower()
        has_pharma_content = any(kw in phrase_lower for kw in self.pharma_keywords)
        if metadata:
            phrase_type = metadata.get('phrase_type', '')
            if phrase_type in ['dosage_instruction', 'indication', 'contraindication', 'side_effect']:
                return True

        has_numbers = bool(re.search(r'\d', phrase))
        has_units = bool(re.search(r'\d+\s*(mg|ml|g|%|mcg|μg)', phrase_lower))

        return has_pharma_content or has_units or (has_numbers and len(phrase) > 30)

    def _extract_phrase_knowledge(self, phrase_data: Dict) -> Dict:
        """Extract triples from a single phrase block.

        Args:
            phrase_data: Phrase block with optional ``phrase_id``, ``content``,
                ``context`` and ``metadata`` keys. Missing or ``None`` values are
                treated as empty.

        Returns:
            A dict with ``phrase_id``, ``triples`` and ``status``. Every phrase
            that was actually sent to the model additionally carries
            ``phrase_text``, ``phrase_type`` and ``context``. ``status`` is one
            of ``skipped_irrelevant``, ``success``, ``no_triples_found``,
            ``api_failed`` or ``error: <message>``.
        """
        phrase_id = phrase_data.get('phrase_id', 'unknown')
        # Explicit nulls in the JSON must not crash the batch.
        phrase_content = str(phrase_data.get('content') or '').strip()
        context = phrase_data.get('context') or {}
        metadata = phrase_data.get('metadata') or {}

        if not self._should_process_phrase(phrase_content, metadata):
            self.stats['skipped_irrelevant'] += 1
            return {
                'phrase_id': phrase_id,
                'triples': [],
                'status': 'skipped_irrelevant'
            }

        self.logger.debug(f"Processing phrase {phrase_id}: {phrase_content[:50]}...")
        self.stats['phrases_processed'] += 1

        phrase_type = metadata.get('phrase_type')
        # Common shape for every outcome below so downstream consumers always
        # find the same keys.
        result = {
            'phrase_id': phrase_id,
            'phrase_text': phrase_content,
            'phrase_type': phrase_type,
            'context': context.get('breadcrumb'),
            'triples': [],
            'status': 'api_failed'
        }

        try:
            prompt = self._create_enhanced_extraction_prompt(phrase_content, context, phrase_type)

            self.prompt_logger.info(f"--- START PHRASE: {phrase_id} ---")
            self.prompt_logger.info(f"PHRASE TEXT: {phrase_content}")
            self.prompt_logger.info(f"PHRASE TYPE: {phrase_type}")
            self.prompt_logger.info(f"CONTEXT: {context.get('breadcrumb', 'N/A')}")
            self.prompt_logger.info(f"PROMPT SENT:\n{prompt}")

            response = self._call_ollama_api(prompt, max_tokens=300)

            self.prompt_logger.info(f"RAW RESPONSE RECEIVED:\n{response}")
            self.prompt_logger.info(f"--- END PHRASE: {phrase_id} ---\n")

            if not response:
                self.stats['failed_extractions'] += 1
                return result

            triples = self._parse_triples_response_enhanced(response)
            if triples:
                self.stats['successful_extractions'] += 1
                self.stats['total_triples'] += len(triples)
                self.logger.debug(f"✅ Extracted {len(triples)} triples from phrase {phrase_id}")
            else:
                self.stats['failed_extractions'] += 1

            result['triples'] = triples
            result['status'] = 'success' if triples else 'no_triples_found'
            return result

        except Exception as e:
            self.logger.error(f"Error processing phrase {phrase_id}: {e}")
            self.stats['failed_extractions'] += 1
            result['triples'] = []
            result['status'] = f'error: {str(e)}'
            return result
        finally:
            # Throttle between phrases regardless of the outcome
            time.sleep(self.request_delay)

    # ... (Keep other methods like _extract_table_knowledge if you need them) ...

    def process_phrase_based_json(self, input_file: Path) -> Optional[Dict]:
        """Process one phrase-based JSON file produced by the enhanced parser.

        Args:
            input_file: Path to a JSON file whose
                ``document_structure.phrase_blocks`` holds the phrase blocks.

        Returns:
            A dict with the document metadata, an extraction summary, the
            per-phrase results and the flattened list of all triples; ``None``
            if the file cannot be loaded or has no phrase blocks.
        """
        self.logger.info(f"📄 Processing phrase-based file: {input_file.name}")
        try:
            with open(input_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception as e:
            self.logger.error(f"Failed to load {input_file}: {e}")
            return None

        if not isinstance(data, dict):
            self.logger.error(f"Failed to load {input_file}: top-level JSON value is not an object")
            return None

        phrase_blocks = (data.get('document_structure') or {}).get('phrase_blocks', [])
        if not phrase_blocks:
            self.logger.warning(f"No phrase_blocks found in {input_file}")
            return None
        self.logger.info(f"Found {len(phrase_blocks)} phrase blocks")

        phrase_extractions = []
        for phrase_data in phrase_blocks:
            result = self._extract_phrase_knowledge(phrase_data)
            phrase_extractions.append(result)
            self.stats['phrase_blocks_processed'] += 1

        all_triples = [triple for extraction in phrase_extractions for triple in extraction.get('triples', [])]

        result = {
            'document_metadata': data.get('document_metadata', {}),
            'extraction_summary': {
                'extraction_timestamp': datetime.now().isoformat(),
                'model_used': self.model_name,
                'total_triples_extracted': len(all_triples),
            },
            'phrase_extractions': phrase_extractions,
            'all_extracted_triples': all_triples,
        }

        self.stats['files_processed'] += 1
        self.logger.info(f"✅ Completed {input_file.name}: {len(all_triples)} total triples extracted")
        return result

    def process_directory(self, input_dir: Path, output_dir: Path):
        """Process every ``*_phrase_optimized.json`` file in a directory.

        For each input file that yields a result, a
        ``*_enhanced_graph_data.json`` file is written to ``output_dir``
        (created if missing). A final report is generated afterwards.

        Args:
            input_dir: Directory containing the phrase-optimized JSON files.
            output_dir: Directory that receives the extraction results.
        """
        output_dir.mkdir(parents=True, exist_ok=True)
        json_files = sorted(input_dir.glob('*_phrase_optimized.json'))
        if not json_files:
            self.logger.warning(f"No *_phrase_optimized.json files found in {input_dir}")
            return

        self.logger.info(f"Found {len(json_files)} phrase-optimized files to process")

        for i, json_file in enumerate(json_files):
            self.logger.info(f"\n📊 Progress: Processing file {i + 1}/{len(json_files)}")
            result = self.process_phrase_based_json(json_file)
            if result:
                output_name = json_file.stem.replace('_phrase_optimized', '_enhanced_graph_data') + '.json'
                output_file = output_dir / output_name
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(result, f, indent=2, ensure_ascii=False)
                self.logger.info(f"💾 Saved results to: {output_file}")

        self._generate_enhanced_report(output_dir)

    def _generate_enhanced_report(self, output_dir: Path):
        """Write the run statistics to ``enhanced_extraction_report.json``.

        The report contains a timestamp, the model name and a copy of
        ``self.stats``. A write failure is logged, not raised, so a finished
        batch is never lost because of the report. The summary is printed to
        the console afterwards.

        Args:
            output_dir: Directory in which the report file is created.
        """
        report = {
            'report_timestamp': datetime.now().isoformat(),
            'model_used': self.model_name,
            'statistics': dict(self.stats),
        }
        report_file = output_dir / 'enhanced_extraction_report.json'
        try:
            with open(report_file, 'w', encoding='utf-8') as f:
                json.dump(report, f, indent=2, ensure_ascii=False)
            self.logger.info(f"📋 Saved extraction report to: {report_file}")
        except OSError as e:
            self.logger.error(f"Failed to write extraction report to {report_file}: {e}")

        self._print_summary_stats()

    def _print_summary_stats(self):
        """Print every counter in ``self.stats`` to the console."""
        print("\n📊 Extraction summary")
        for name, value in self.stats.items():
            print(f"  {name}: {value}")


# ==============================================================================
# 🚀 MAIN EXECUTION SECTION
# ==============================================================================

# --- CONFIGURATION ---
# Ollama model tag used for extraction ("llama3:8b" replaced the earlier
# "llama3.2:3b" for better performance).
OLLAMA_MODEL = "llama3:8b"

# Request tuning: attempts per API call and base delay (seconds) between calls.
MAX_RETRIES = 3
REQUEST_DELAY = 0.5

# IMPORTANT: Google Drive folder names; change them to match your own Drive.
DRIVE_OUTPUT_DIR = "enhanced_graph_data"
DRIVE_INPUT_DIR = "processed_pdfs"

def main():
    """Mount Google Drive, build the extractor and process the input directory.

    Paths are built from ``DRIVE_INPUT_DIR`` / ``DRIVE_OUTPUT_DIR`` under
    ``/content/drive/MyDrive/``. Any exception is printed with its traceback
    instead of propagating, so the notebook cell itself does not fail.
    """
    print("🚀 Starting Enhanced Pharmaceutical Knowledge Graph Extractor")
    try:
        print("🔧 Mounting Google Drive...")
        drive.mount('/content/drive', force_remount=True)
        print("✅ Google Drive mounted successfully.")

        drive_base_path = Path('/content/drive/MyDrive/')
        full_input_path = drive_base_path / DRIVE_INPUT_DIR
        full_output_path = drive_base_path / DRIVE_OUTPUT_DIR

        print(f"📁 Input Directory: {full_input_path}")
        print(f"📂 Output Directory: {full_output_path}")
        print(f"🤖 Model: {OLLAMA_MODEL}")

        if not full_input_path.exists():
            print(f"❌ ERROR: Input directory not found at '{full_input_path}'")
            return

        print("🤖 Initializing enhanced knowledge extractor...")
        extractor = EnhancedPharmaceuticalKnowledgeExtractor(
            model_name=OLLAMA_MODEL,
            max_retries=MAX_RETRIES,
            request_delay=REQUEST_DELAY
        )

        print("🚀 Starting batch processing...")
        extractor.process_directory(
            input_dir=full_input_path,
            output_dir=full_output_path
        )

        # process_directory returns nothing, and also returns early when no
        # input files match, so report the real outcome from the counters
        # instead of claiming success unconditionally.
        files_processed = extractor.stats.get('files_processed', 0)
        if files_processed:
            total_triples = extractor.stats.get('total_triples', 0)
            print(f"\n🎉 Finished: {files_processed} file(s) processed, {total_triples} triples extracted.")
        else:
            print("\n⚠️ No files were processed. Check that the input directory "
                  "contains valid *_phrase_optimized.json files.")

    except Exception as e:
        print(f"❌ An unexpected error occurred: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

🚀 Starting Enhanced Pharmaceutical Knowledge Graph Extractor
🔧 Mounting Google Drive...
Mounted at /content/drive
✅ Google Drive mounted successfully.
📁 Input Directory: /content/drive/MyDrive/processed_pdfs
📂 Output Directory: /content/drive/MyDrive/enhanced_graph_data
🤖 Model: llama3:8b
🤖 Initializing enhanced knowledge extractor...


2025-09-01 17:54:37,662 - INFO - ✅ Successfully connected to Ollama with llama3:8b
2025-09-01 17:54:37,719 - INFO - Found 339 phrase-optimized files to process
2025-09-01 17:54:37,719 - INFO - 
📊 Progress: Processing file 1/339
2025-09-01 17:54:37,721 - INFO - 📄 Processing phrase-based file: bula_1755192077396_phrase_optimized.json
2025-09-01 17:54:37,737 - INFO - Found 120 phrase blocks
2025-09-01 17:54:37,738 - INFO - --- START PHRASE: phrase_2 ---
2025-09-01 17:54:37,739 - INFO - PHRASE TEXT: O colesterol é uma das várias substâncias gordurosas encontradas na corrente sanguínea.
2025-09-01 17:54:37,741 - INFO - PHRASE TYPE: general_information
2025-09-01 17:54:37,742 - INFO - CONTEXT: Section 1: PARA QUE ESTE MEDICAMENTO É INDICADO?
2025-09-01 17:54:37,743 - INFO - PROMPT SENT:
Você é um especialista em extrair informações farmacêuticas. Analise esta frase e extraia APENAS fatos reais como triplas JSON.

CONTEXTO: Section 1: PARA QUE ESTE MEDICAMENTO É INDICADO?
TIPO: general_inform

🚀 Starting batch processing...


2025-09-01 17:54:38,840 - INFO - RAW RESPONSE RECEIVED:
{"colesterol": "é_substância_gordurosa_encontrada_na_corrente_sanguínea"}
2025-09-01 17:54:38,841 - INFO - --- END PHRASE: phrase_2 ---

2025-09-01 17:54:39,344 - INFO - --- START PHRASE: phrase_7 ---
2025-09-01 17:54:39,345 - INFO - PHRASE TEXT: O colesterol HDL, por sua vez, é frequentemente chamado de “bom colesterol" porque ajuda a evitar o depósito de “mau colesterol" nas artérias e protege contra doenças do coração.
2025-09-01 17:54:39,346 - INFO - PHRASE TYPE: general_information
2025-09-01 17:54:39,347 - INFO - CONTEXT: Section 1: PARA QUE ESTE MEDICAMENTO É INDICADO?
2025-09-01 17:54:39,347 - INFO - PROMPT SENT:
Você é um especialista em extrair informações farmacêuticas. Analise esta frase e extraia APENAS fatos reais como triplas JSON.

CONTEXTO: Section 1: PARA QUE ESTE MEDICAMENTO É INDICADO?
TIPO: general_information
INSTRUÇÃO: Extraia qualquer informação farmacêutica relevante.

FRASE: "O colesterol HDL, por sua vez

KeyboardInterrupt: 

In [None]:
# Install necessary libraries
!pip install pymupdf4llm pdfplumber pandas requests -q

# Download and install Ollama
!curl -fsSL https://ollama.com/install.sh | sh

import subprocess
import time

# Start the ollama server process in the background
# Its output will be redirected to a log file
server_process = subprocess.Popen(
    ["ollama", "serve"],
    stdout=open("ollama_server.log", "w"),
    stderr=subprocess.STDOUT
)

print("✅ Ollama server started in the background.")
time.sleep(5) # Give the server a moment to initialize

# --- CHANGE MADE HERE ---
# Pull the Llama 3 8B model instead of the 3B version.
print("📥 Pulling the Llama 3 8B model. This may take a few minutes...")
!ollama pull llama3:8b
print("✅ Model download complete.")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[?25h>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama 

## Enhanced pharmaceutical knowledge graph extractor (phrase-based JSON, Llama 3 8B)

In [None]:
#!/usr/bin/env python3
"""
Enhanced Pharmaceutical Knowledge Graph Extractor for Google Colab - Phrase-Based JSON Processing

Processes phrase-optimized JSON files from the enhanced pharmaceutical document parser.
Optimized for small language models with robust extraction and error handling.
Includes detailed logging of prompts and responses for debugging model performance.
"""

import json
import os
import re
import requests
import time
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
import logging
from google.colab import drive

# ==============================================================================
# CORE LOGIC: Enhanced PharmaceuticalKnowledgeExtractor CLASS
# ==============================================================================

class EnhancedPharmaceuticalKnowledgeExtractor:
    def __init__(self,
                 model_name: str = "llama3:8b",  # UPDATED for Llama 3 8B
                 ollama_url: str = "http://localhost:11434/api/generate",
                 max_retries: int = 3,
                 request_delay: float = 0.5):
        """
        Initialize the enhanced knowledge extractor optimized for phrase-based JSON files.
        """
        self.model_name = model_name
        self.ollama_url = ollama_url
        self.max_retries = max_retries
        self.request_delay = request_delay

        self.stats = {
            'files_processed': 0,
            'phrase_blocks_processed': 0,
            'table_blocks_processed': 0,
            'phrases_processed': 0,
            'successful_extractions': 0,
            'failed_extractions': 0,
            'total_triples': 0,
            'skipped_irrelevant': 0
        }

        # Enhanced patterns for better pharmaceutical content detection
        self.pharma_keywords = [
            'mg', 'ml', 'g/', 'mcg', 'μg', '%', 'dose', 'dosagem', 'posologia',
            'comprimido', 'cápsula', 'medicamento', 'fármaco', 'droga',
            'indicação', 'indicado', 'tratamento', 'terapia',
            'contraindicação', 'contraindicado', 'não usar', 'evitar',
            'efeito', 'reação', 'adverso', 'colateral', 'indesejável',
            'alergia', 'hipersensibilidade', 'intolerância',
            'administração', 'aplicar', 'tomar', 'ingerir',
            'composição', 'princípio ativo', 'substância', 'excipiente',
            'interação', 'interagir', 'incompatível', 'interferir',
            'gravidez', 'gestação', 'lactação', 'amamentação',
            'criança', 'pediátrico', 'adulto', 'idoso', 'geriátrico'
        ]

        self._setup_logging()
        self._test_ollama_connection()

    def _setup_logging(self):
        """Setup enhanced logging configuration."""
        # Remove existing handlers to avoid duplicates in Colab
        for handler in logging.root.handlers[:]:
            logging.root.removeHandler(handler)

        # General logger for progress and errors
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('enhanced_pharma_extraction.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

        # Dedicated logger for prompts and responses
        self.prompt_logger = logging.getLogger('prompt_logger')
        self.prompt_logger.setLevel(logging.INFO)
        prompt_handler = logging.FileHandler('enhanced_prompts_and_responses.log', mode='w')
        prompt_formatter = logging.Formatter('%(message)s')
        prompt_handler.setFormatter(prompt_formatter)

        # Avoid adding handlers if they already exist
        if not self.prompt_logger.handlers:
            self.prompt_logger.addHandler(prompt_handler)

    def _test_ollama_connection(self):
        """Send one tiny generation request to check that the Ollama endpoint answers.

        A non-200 reply is only logged as a warning. Transport failures, and
        any other exception raised while probing, are logged and re-raised as
        the built-in ConnectionError.

        Raises:
            ConnectionError: If the request cannot be completed.
        """
        try:
            probe = dict(
                model=self.model_name,
                prompt="Teste de conexão. Responda apenas 'OK'.",
                stream=False,
                format="json",
                options=dict(temperature=0.0, num_predict=10),
            )
            reply = requests.post(self.ollama_url, json=probe, timeout=15)

            if reply.status_code != 200:
                self.logger.warning(f"⚠️ Ollama connection test failed: {reply.status_code} - {reply.text}")
            else:
                self.logger.info(f"✅ Successfully connected to Ollama with {self.model_name}")
                body = reply.json()
                self.logger.debug(f"Test response: {body.get('response', 'No response')}")
        except requests.exceptions.ConnectionError:
            # Server not reachable at all.
            self.logger.error("❌ Cannot connect to Ollama API. Ensure it's running and accessible from this Colab notebook.")
            raise ConnectionError("Cannot connect to Ollama API. Ensure it's running and accessible (e.g., via ngrok).")
        except Exception as e:
            # Timeouts, malformed JSON bodies, and anything else unexpected.
            self.logger.error(f"❌ Failed to connect to Ollama: {e}")
            raise ConnectionError(f"Ollama connection failed: {e}")

    def _call_ollama_api(self, prompt: str, max_tokens: int = 200) -> Optional[str]:
        """Enhanced API call with better error handling and retry logic."""
        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "stream": False,
            "format": "json",
            "options": {
                "temperature": 0.0,  # Deterministic for structured output
                "top_p": 0.9,
                "top_k": 20,
                "num_predict": max_tokens,
                "stop": ["\n\n", "---", "Exemplos:", "Examples:", "Nota:", "Note:"],
                "repeat_penalty": 1.1,
                "num_ctx": 4096,  # UPDATED: Context window increased for Llama 3 8B
            }
        }

        for attempt in range(self.max_retries):
            try:
                self.logger.debug(f"API call attempt {attempt + 1}/{self.max_retries}")
                response = requests.post(self.ollama_url, json=payload, timeout=120)

                if response.status_code == 200:
                    result = response.json()
                    response_text = result.get('response', '').strip()
                    if response_text:
                        return response_text
                    else:
                        self.logger.warning("Empty response from API")
                else:
                    self.logger.warning(f"API error {response.status_code}: {response.text[:200]}...")

            except requests.exceptions.Timeout:
                self.logger.warning(f"Request timeout on attempt {attempt + 1}")
            except requests.exceptions.ConnectionError:
                self.logger.warning(f"Connection error on attempt {attempt + 1}")
            except Exception as e:
                self.logger.warning(f"Request error on attempt {attempt + 1}: {e}")

            if attempt < self.max_retries - 1:
                wait_time = (2 ** attempt) + self.request_delay
                self.logger.debug(f"Waiting {wait_time:.1f}s before retry...")
                time.sleep(wait_time)

        self.logger.error("All API call attempts failed")
        return None

    def _create_enhanced_extraction_prompt(self, phrase: str, context: Dict, phrase_type: str = None) -> str:
        """Create an enhanced, more specific prompt for small language models."""
        section_info = context.get('breadcrumb', 'Seção Desconhecida')
        phrase_category = phrase_type or context.get('metadata', {}).get('phrase_type', 'geral')

        # Create more specific instructions based on phrase type
        specific_instructions = {
            'dosage_instruction': 'Foque em doses, quantidades, frequências de administração.',
            'indication': 'Extraia para que condições ou doenças o medicamento é indicado.',
            'contraindication': 'Identifique quando o medicamento NÃO deve ser usado.',
            'side_effect': 'Extraia efeitos adversos, reações indesejáveis.',
            'precaution': 'Identifique cuidados, precauções, advertências.',
            'numerical_data': 'Extraia dados numéricos relevantes (doses, concentrações).',
            'general_information': 'Extraia qualquer informação farmacêutica relevante.'
        }

        instruction = specific_instructions.get(phrase_category, specific_instructions['general_information'])

        return f"""Você é um especialista em extrair informações farmacêuticas. Analise esta frase e extraia APENAS fatos reais como triplas JSON.

CONTEXTO: {section_info}
TIPO: {phrase_category}
INSTRUÇÃO: {instruction}

FRASE: "{phrase}"

REGRAS IMPORTANTES:
1. Extraia SOMENTE informações que estão EXPLÍCITAS na frase
2. NÃO invente ou suponha informações
3. Use nomes de medicamentos exatos quando mencionados
4. Para doses, inclua unidades (mg, ml, etc.)
5. Se não há informação farmacêutica específica, retorne []

FORMATO: Array JSON de triplas [entidade, relação, valor]

EXEMPLOS DE FORMATO (NÃO COPIE O CONTEÚDO):
- [["Paracetamol", "tem_dose", "500mg"]]
- [["medicamento", "é_indicado_para", "dor de cabeça"]]
- [["substância", "pode_causar", "náusea"]]

JSON:"""

    def _parse_triples_response_enhanced(self, response: str) -> List[List[str]]:
        """Enhanced parsing with better error handling and validation."""
        if not response:
            return []

        # Clean the response
        cleaned = re.sub(r'```json\s*|```\s*', '', response.strip())
        cleaned = re.sub(r'^[^[]*', '', cleaned)  # Remove text before first [
        cleaned = re.sub(r'[^]]*$', ']', cleaned)  # Ensure ends with ]

        # Try multiple parsing strategies
        strategies = [
            self._parse_json_array,
            self._parse_regex_triples,
            self._parse_fallback_patterns
        ]

        for strategy in strategies:
            try:
                triples = strategy(cleaned)
                if triples:
                    return self._validate_and_filter_triples(triples)
            except Exception as e:
                self.logger.debug(f"Parsing strategy failed: {e}")
                continue

        self.logger.warning(f"Could not parse response: {cleaned[:100]}...")
        return []

    def _parse_json_array(self, text: str) -> List[List[str]]:
        """Parse JSON array directly."""
        # Find the JSON array pattern
        array_match = re.search(r'\[.*?\]', text, re.DOTALL)
        if array_match:
            json_str = array_match.group(0)
            parsed = json.loads(json_str)
            if isinstance(parsed, list):
                return parsed
        return []

    def _parse_regex_triples(self, text: str) -> List[List[str]]:
        """Parse using regex patterns for triple extraction."""
        patterns = [
            r'\["([^"]+)",\s*"([^"]+)",\s*"([^"]+)"\]',  # Standard format
            r'\[\"([^\"]+)\",\s*\"([^\"]+)\",\s*\"([^\"]+)\"\]',  # Escaped quotes
            r'<([^>]+)>\s*,\s*<([^>]+)>\s*,\s*<([^>]+)>'  # Angle bracket format
        ]

        for pattern in patterns:
            matches = re.findall(pattern, text)
            if matches:
                return [[str(x).strip() for x in match] for match in matches]
        return []

    def _parse_fallback_patterns(self, text: str) -> List[List[str]]:
        """Fallback parsing for malformed but recognizable patterns."""
        # Look for entity-relation-value patterns
        lines = text.split('\n')
        triples = []

        for line in lines:
            # Pattern: "entity" relation "value"
            pattern = r'"([^"]+)"\s+(\w+)\s+"([^"]+)"'
            match = re.search(pattern, line)
            if match:
                triples.append([match.group(1), match.group(2), match.group(3)])

        return triples

    def _validate_and_filter_triples(self, triples: List[List[str]]) -> List[List[str]]:
        """Validate and filter extracted triples for quality."""
        valid_triples = []

        # Filter out common template examples and invalid entries
        template_entities = ['medication', 'medicamento', 'paracetamol', 'substância', 'fármaco']
        template_values = ['500mg', 'comprimidos', 'dor de cabeça', 'náusea', 'exemplo']

        for triple in triples:
            if not isinstance(triple, list) or len(triple) != 3:
                continue

            entity, relation, value = [str(x).strip() for x in triple]

            # Skip if any component is empty or too short
            if not all([entity, relation, value]) or any(len(x) < 2 for x in [entity, relation, value]):
                continue

            # Skip template examples
            if (entity.lower() in template_entities and
                any(tv in value.lower() for tv in template_values)):
                continue

            # Skip placeholder patterns
            if any(x.startswith('<') and x.endswith('>') for x in [entity, relation, value]):
                continue

            # Skip overly generic relations
            generic_relations = ['é', 'tem', 'faz', 'usa']
            if relation.lower() in generic_relations and len(value) < 5:
                continue

            valid_triples.append([entity, relation, value])

        return valid_triples

    def _should_process_phrase(self, phrase: str, metadata: Dict = None) -> bool:
        """Enhanced logic to determine if a phrase should be processed."""
        if len(phrase.strip()) < 15:
            return False

        phrase_lower = phrase.lower()

        # Check for pharmaceutical keywords
        has_pharma_content = any(kw in phrase_lower for kw in self.pharma_keywords)

        # Check phrase type from metadata
        if metadata:
            phrase_type = metadata.get('phrase_type', '')
            if phrase_type in ['dosage_instruction', 'indication', 'contraindication', 'side_effect']:
                return True

        # Additional checks for numerical data that might be relevant
        has_numbers = bool(re.search(r'\d', phrase))
        has_units = bool(re.search(r'\d+\s*(mg|ml|g|%|mcg|μg)', phrase_lower))

        return has_pharma_content or has_units or (has_numbers and len(phrase) > 30)

    def _extract_phrase_knowledge(self, phrase_data: Dict) -> Dict:
        """Extract knowledge from a single phrase block with enhanced processing."""
        phrase_id = phrase_data.get('phrase_id', 'unknown')
        phrase_content = phrase_data.get('content', '').strip()
        context = phrase_data.get('context', {})
        metadata = phrase_data.get('metadata', {})

        if not self._should_process_phrase(phrase_content, metadata):
            self.stats['skipped_irrelevant'] += 1
            return {
                'phrase_id': phrase_id,
                'triples': [],
                'status': 'skipped_irrelevant'
            }

        self.logger.debug(f"Processing phrase {phrase_id}: {phrase_content[:50]}...")
        self.stats['phrases_processed'] += 1

        try:
            phrase_type = metadata.get('phrase_type')
            prompt = self._create_enhanced_extraction_prompt(phrase_content, context, phrase_type)

            # Log the interaction
            self.prompt_logger.info(f"--- START PHRASE: {phrase_id} ---")
            self.prompt_logger.info(f"PHRASE TEXT: {phrase_content}")
            self.prompt_logger.info(f"PHRASE TYPE: {phrase_type}")
            self.prompt_logger.info(f"CONTEXT: {context.get('breadcrumb', 'N/A')}")
            self.prompt_logger.info(f"PROMPT SENT:\n{prompt}")

            response = self._call_ollama_api(prompt, max_tokens=300)

            self.prompt_logger.info(f"RAW RESPONSE RECEIVED:\n{response}")
            self.prompt_logger.info(f"--- END PHRASE: {phrase_id} ---\n")

            if response:
                triples = self._parse_triples_response_enhanced(response)
                if triples:
                    self.stats['successful_extractions'] += 1
                    self.stats['total_triples'] += len(triples)
                    self.logger.debug(f"✅ Extracted {len(triples)} triples from phrase {phrase_id}")
                else:
                    self.stats['failed_extractions'] += 1

                return {
                    'phrase_id': phrase_id,
                    'phrase_text': phrase_content,
                    'phrase_type': phrase_type,
                    'context': context.get('breadcrumb'),
                    'triples': triples,
                    'status': 'success' if triples else 'no_triples_found'
                }
            else:
                self.stats['failed_extractions'] += 1
                return {
                    'phrase_id': phrase_id,
                    'phrase_text': phrase_content,
                    'triples': [],
                    'status': 'api_failed'
                }

        except Exception as e:
            self.logger.error(f"Error processing phrase {phrase_id}: {e}")
            self.stats['failed_extractions'] += 1
            return {
                'phrase_id': phrase_id,
                'phrase_text': phrase_content,
                'triples': [],
                'status': f'error: {str(e)}'
            }
        finally:
            time.sleep(self.request_delay)

    def _extract_table_knowledge(self, table_data: Dict) -> Dict:
        """Extract knowledge from table blocks with structured data handling."""
        table_id = table_data.get('table_id', 'unknown')
        content = table_data.get('content', {})
        context = table_data.get('context', {})
        metadata = table_data.get('metadata', {})

        self.logger.info(f"Processing table {table_id}")
        self.stats['table_blocks_processed'] += 1

        # Convert table to text for processing
        formatted_text = content.get('formatted_text', '')
        header = content.get('header', [])
        data_rows = content.get('data_rows', [])

        if not formatted_text and not data_rows:
            return {
                'table_id': table_id,
                'triples': [],
                'status': 'empty_table'
            }

        # Process table as structured text
        table_text = formatted_text or self._format_table_as_text(header, data_rows)

        # Use table-specific processing
        try:
            prompt = self._create_table_extraction_prompt(table_text, context, metadata)

            self.prompt_logger.info(f"--- START TABLE: {table_id} ---")
            self.prompt_logger.info(f"TABLE CONTENT:\n{table_text}")
            self.prompt_logger.info(f"PROMPT SENT:\n{prompt}")

            response = self._call_ollama_api(prompt, max_tokens=400)

            self.prompt_logger.info(f"RAW RESPONSE RECEIVED:\n{response}")
            self.prompt_logger.info(f"--- END TABLE: {table_id} ---\n")

            if response:
                triples = self._parse_triples_response_enhanced(response)
                if triples:
                    self.stats['successful_extractions'] += 1
                    self.stats['total_triples'] += len(triples)

                return {
                    'table_id': table_id,
                    'table_type': metadata.get('table_type'),
                    'context': context.get('breadcrumb'),
                    'triples': triples,
                    'status': 'success' if triples else 'no_triples_found'
                }

        except Exception as e:
            self.logger.error(f"Error processing table {table_id}: {e}")
            self.stats['failed_extractions'] += 1

        return {
            'table_id': table_id,
            'triples': [],
            'status': 'error'
        }

    def _create_table_extraction_prompt(self, table_text: str, context: Dict, metadata: Dict) -> str:
        """Create specialized prompt for table data extraction."""
        table_type = metadata.get('table_type', 'general_data')
        section_info = context.get('breadcrumb', 'Tabela')

        type_instructions = {
            'dosage_schedule': 'Extraia informações de dosagem, horários, frequências.',
            'dosage_information': 'Foque em doses, concentrações, quantidades.',
            'age_specific_data': 'Extraia dados específicos por idade ou grupo.',
            'general_data': 'Extraia qualquer informação farmacêutica estruturada.'
        }

        instruction = type_instructions.get(table_type, type_instructions['general_data'])

        return f"""Analise esta tabela farmacêutica e extraia informações estruturadas como triplas JSON.

CONTEXTO: {section_info}
TIPO DE TABELA: {table_type}
INSTRUÇÃO: {instruction}

TABELA:
{table_text}

REGRAS:
1. Extraia APENAS dados que estão na tabela
2. Para doses, mantenha unidades (mg, ml, etc.)
3. Preserve nomes de medicamentos exatos
4. Se há múltiplas linhas, extraia informação de cada linha relevante
5. Use "linha_N" ou "item_N" para distinguir entradas quando necessário

FORMATO: Array JSON de triplas [entidade, relação, valor]

JSON:"""

    def _format_table_as_text(self, header: List[str], data_rows: List[List[str]]) -> str:
        """Format table data as readable text."""
        if not data_rows:
            return ""

        lines = []
        if header:
            lines.append(" | ".join(header))
            lines.append("-" * (len(" | ".join(header))))

        for row in data_rows:
            lines.append(" | ".join(str(cell) if cell else "" for cell in row))

        return "\n".join(lines)

    def process_phrase_based_json(self, input_file: Path) -> Optional[Dict]:
        """Process a phrase-based JSON file from the enhanced parser."""
        self.logger.info(f"📄 Processing phrase-based file: {input_file.name}")

        try:
            with open(input_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception as e:
            self.logger.error(f"Failed to load {input_file}: {e}")
            return None

        # Get the document structure
        doc_structure = data.get('document_structure', {})
        phrase_blocks = doc_structure.get('phrase_blocks', [])
        table_blocks = doc_structure.get('table_blocks', [])

        if not phrase_blocks and not table_blocks:
            self.logger.warning(f"No phrase_blocks or table_blocks found in {input_file}")
            return None

        self.logger.info(f"Found {len(phrase_blocks)} phrase blocks and {len(table_blocks)} table blocks")

        # Process phrase blocks
        phrase_extractions = []
        for phrase_data in phrase_blocks:
            result = self._extract_phrase_knowledge(phrase_data)
            phrase_extractions.append(result)
            self.stats['phrase_blocks_processed'] += 1

        # Process table blocks
        table_extractions = []
        for table_data in table_blocks:
            result = self._extract_table_knowledge(table_data)
            table_extractions.append(result)

        # Collect all triples
        all_triples = []
        for extraction in phrase_extractions + table_extractions:
            all_triples.extend(extraction.get('triples', []))

        result = {
            'document_metadata': data.get('document_metadata', {}),
            'extraction_summary': {
                'extraction_timestamp': datetime.now().isoformat(),
                'model_used': self.model_name,
                'processing_method': 'enhanced_phrase_based',
                'total_phrase_blocks': len(phrase_blocks),
                'total_table_blocks': len(table_blocks),
                'total_phrases_processed': self.stats['phrases_processed'],
                'total_triples_extracted': len(all_triples),
                'successful_extractions': self.stats['successful_extractions'],
                'failed_extractions': self.stats['failed_extractions'],
                'skipped_irrelevant': self.stats['skipped_irrelevant']
            },
            'phrase_extractions': phrase_extractions,
            'table_extractions': table_extractions,
            'all_extracted_triples': all_triples,
            'metadata': data.get('metadata', {})
        }

        self.stats['files_processed'] += 1
        self.logger.info(f"✅ Completed {input_file.name}: {len(all_triples)} total triples extracted")
        return result

    def process_directory(self, input_dir: Path, output_dir: Path):
        """Process all phrase-optimized JSON files in a directory."""
        if not input_dir.is_dir():
            raise FileNotFoundError(f"Input directory not found: {input_dir}")
        output_dir.mkdir(parents=True, exist_ok=True)

        # Look for phrase-optimized JSON files
        json_files = list(input_dir.glob('*_phrase_optimized.json'))
        if not json_files:
            self.logger.warning(f"No *_phrase_optimized.json files found in {input_dir}")
            return

        self.logger.info(f"Found {len(json_files)} phrase-optimized files to process")

        for i, json_file in enumerate(json_files):
            self.logger.info(f"\n📊 Progress: Processing file {i + 1}/{len(json_files)}")

            # Reset per-file counters
            prev_phrases = self.stats['phrases_processed']
            prev_successful = self.stats['successful_extractions']

            result = self.process_phrase_based_json(json_file)

            if result:
                output_name = json_file.stem.replace('_phrase_optimized', '_enhanced_graph_data') + '.json'
                output_file = output_dir / output_name

                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(result, f, indent=2, ensure_ascii=False)

                # Log file-specific stats
                phrases_this_file = self.stats['phrases_processed'] - prev_phrases
                successful_this_file = self.stats['successful_extractions'] - prev_successful

                self.logger.info(f"💾 Saved results to: {output_file}")
                self.logger.info(f"📊 File stats: {phrases_this_file} phrases processed, {successful_this_file} successful extractions")

        self._generate_enhanced_report(output_dir)

    def _generate_enhanced_report(self, output_dir: Path):
        """Generate comprehensive final report."""
        report = {
            'summary': {
                'timestamp': datetime.now().isoformat(),
                'model_used': self.model_name,
                'processing_method': 'enhanced_phrase_based',
                'total_files_processed': self.stats['files_processed'],
            },
            'detailed_statistics': self.stats,
            'performance_metrics': {
                'success_rate': (
                    self.stats['successful_extractions'] /
                    max(self.stats['phrases_processed'], 1) * 100
                ),
                'avg_triples_per_successful_extraction': (
                    self.stats['total_triples'] /
                    max(self.stats['successful_extractions'], 1)
                ),
                'processing_efficiency': {
                    'phrases_processed': self.stats['phrases_processed'],
                    'relevant_phrases': self.stats['phrases_processed'] - self.stats['skipped_irrelevant'],
                    'relevance_rate': (
                        (self.stats['phrases_processed'] - self.stats['skipped_irrelevant']) /
                        max(self.stats['phrases_processed'], 1) * 100
                    )
                }
            }
        }

        report_file = output_dir / 'enhanced_final_extraction_report.json'
        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)

        self.logger.info(f"📊 Enhanced final report saved: {report_file}")
        self._print_summary_stats()

    def _print_summary_stats(self):
        """Print summary statistics to console."""
        print("\n" + "=" * 70)
        print("🎯 EXTRACTION SUMMARY")
        print("=" * 70)
        print(f"📁 Files processed: {self.stats['files_processed']}")
        print(f"🧩 Phrase blocks processed: {self.stats['phrase_blocks_processed']}")
        print(f"📋 Table blocks processed: {self.stats['table_blocks_processed']}")
        print(f"✨ Total phrases analyzed: {self.stats['phrases_processed']}")
        print(f"✅ Successful extractions: {self.stats['successful_extractions']}")
        print(f"❌ Failed extractions: {self.stats['failed_extractions']}")
        print(f"⏭️ Skipped irrelevant: {self.stats['skipped_irrelevant']}")
        print(f"🔗 Total triples extracted: {self.stats['total_triples']}")

        if self.stats['phrases_processed'] > 0:
            success_rate = (self.stats['successful_extractions'] / self.stats['phrases_processed']) * 100
            print(f"📈 Success rate: {success_rate:.1f}%")

        print("=" * 70)


# ==============================================================================
# 🚀 MAIN EXECUTION SECTION
# ==============================================================================

# --- CONFIGURATION ---
# IMPORTANT: Change these paths to match your Google Drive folders.
# Both folder names are resolved relative to /content/drive/MyDrive/ in main().
DRIVE_INPUT_DIR = "processed_pdfs"      # Folder with *_phrase_optimized.json files
DRIVE_OUTPUT_DIR = "enhanced_graph_data" # Where enhanced results will be saved
OLLAMA_MODEL = "llama3:8b"              # UPDATED: The Ollama model you are running
REQUEST_DELAY = 0.5                     # Seconds to wait between API calls
MAX_RETRIES = 3                         # Number of times to retry a failed API call

def main():
    """Mount Google Drive, validate the input folder, and run the batch extraction.

    Progress and troubleshooting hints are printed to the console. The
    function returns early when the input directory is missing or holds no
    ``*_phrase_optimized.json`` files. FileNotFoundError, ConnectionError,
    KeyboardInterrupt and any other exception are caught and reported
    instead of propagating.
    """
    rule = "=" * 70
    print("🚀 Starting Enhanced Pharmaceutical Knowledge Graph Extractor")
    print("🧩 Optimized for phrase-based JSON processing")
    print(rule)

    try:
        # Mount Google Drive
        print("🔧 Mounting Google Drive...")
        drive.mount('/content/drive', force_remount=True)
        print("✅ Google Drive mounted successfully.")

        # Resolve the configured folders under the Drive root.
        drive_root = Path('/content/drive/MyDrive/')
        source_dir = drive_root / DRIVE_INPUT_DIR
        target_dir = drive_root / DRIVE_OUTPUT_DIR

        for banner_line in (
            f"📁 Input Directory: {source_dir}",
            f"📂 Output Directory: {target_dir}",
            f"🤖 Model: {OLLAMA_MODEL}",
            "🧩 Processing Method: Enhanced phrase-by-phrase",
            f"⏱️  Request Delay: {REQUEST_DELAY}s",
            rule,
        ):
            print(banner_line)

        # Validate input directory exists
        if not source_dir.exists():
            print(f"❌ ERROR: Input directory not found at '{source_dir}'")
            print("Please ensure you have run the phrase-based document parser first.")
            print("The input directory should contain *_phrase_optimized.json files.")
            return

        # Check for expected files
        candidates = list(source_dir.glob('*_phrase_optimized.json'))
        if not candidates:
            print(f"⚠️  WARNING: No *_phrase_optimized.json files found in {source_dir}")
            print("Please ensure you have run the enhanced document parser to generate phrase-based JSON files.")

            # Show what JSON files are actually there (first five only).
            other_json = list(source_dir.glob('*.json'))
            if other_json:
                print(f"Found {len(other_json)} JSON files:")
                for present in other_json[:5]:
                    print(f"  - {present.name}")
                remaining = len(other_json) - 5
                if remaining > 0:
                    print(f"  ... and {remaining} more")
            return

        print(f"🔍 Found {len(candidates)} phrase-optimized files to process:")
        for candidate in candidates:
            print(f"  - {candidate.name}")
        print()

        # Initialize the enhanced extractor
        print("🤖 Initializing enhanced knowledge extractor...")
        extractor = EnhancedPharmaceuticalKnowledgeExtractor(
            model_name=OLLAMA_MODEL,
            max_retries=MAX_RETRIES,
            request_delay=REQUEST_DELAY
        )

        # Process all files
        print("🚀 Starting batch processing...")
        extractor.process_directory(input_dir=source_dir, output_dir=target_dir)

        print("\n🎉 All files processed successfully!")
        print(f"📂 Results saved to: {target_dir}")
        print("📊 Check the enhanced_final_extraction_report.json for detailed statistics.")

    except FileNotFoundError as e:
        print("❌ ERROR: Directory or file not found.")
        print(f"Details: {e}")
        print("\nTroubleshooting:")
        for hint in (
            "1. Ensure Google Drive is properly mounted",
            "2. Check that the input directory path is correct",
            "3. Verify that phrase-optimized JSON files exist",
        ):
            print(hint)

    except ConnectionError as e:
        print("❌ ERROR: Could not connect to the Ollama server.")
        print("Please ensure your local Ollama instance is running and accessible.")
        print("If using Google Colab, you may need to use ngrok to tunnel the connection.")
        print(f"Details: {e}")

    except KeyboardInterrupt:
        print("\n⚠️  Processing interrupted by user.")
        print("Partial results may have been saved.")

    except Exception as e:
        print(f"❌ An unexpected error occurred: {e}")
        import traceback
        print("Full error traceback:")
        traceback.print_exc()

if __name__ == "__main__":
    main()

🚀 Starting Enhanced Pharmaceutical Knowledge Graph Extractor
🧩 Optimized for phrase-based JSON processing
🔧 Mounting Google Drive...
Mounted at /content/drive
✅ Google Drive mounted successfully.
📁 Input Directory: /content/drive/MyDrive/processed_pdfs
📂 Output Directory: /content/drive/MyDrive/enhanced_graph_data
🤖 Model: llama3:8b
🧩 Processing Method: Enhanced phrase-by-phrase
⏱️  Request Delay: 0.5s
Please ensure you have run the enhanced document parser to generate phrase-based JSON files.
Found 339 JSON files:
  - bula_1755192077396_llm_optimized.json
  - bula_1755192097944_llm_optimized.json
  - bula_1755195358088_llm_optimized.json
  - bula_1755195361693_llm_optimized.json
  - bula_1755195365369_llm_optimized.json
  ... and 334 more


In [None]:
#!/usr/bin/env python3
"""
Section-Aware Pharmaceutical Document Parser
Processes documents sentence by sentence while tracking document sections/headers
"""

import json
import re
import subprocess
import shlex
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
from datetime import datetime
import pymupdf4llm
import pdfplumber

class SectionAwarePharmaParser:
    def __init__(self, model_name: str = "llama3:8b"):
        """
        Initialize section-aware parser with header tracking
        """
        self.model_name = model_name
        self.raw_content = ""
        self.structured_data = {}
        self.document_loaded = False

        # Common pharmaceutical abbreviations that should NOT end sentences
        self.pharma_abbreviations = {
            'mg', 'ml', 'mcg', 'kg', 'g', 'l', 'dl', 'mmol', 'mol',  # Units
            'q.s.p', 'c.q.s', 'q.s', 'c.s.p',  # Pharmaceutical Latin
            'ltda', 'ltd', 'inc', 'corp', 'sa', 'co',  # Company abbreviations
            'dr', 'dra', 'prof', 'sr', 'sra',  # Titles
            'etc', 'ex', 'vs', 'e.g', 'i.e',  # Common abbreviations
            'cnpj', 'cpf', 'rg', 'crf', 'crm',  # Brazilian document types
            'anvisa', 'ms', 'rdc', 'vp', 'vps',  # Brazilian regulatory
            'd.d', 'p.ex', 'n°', 'nº'  # Other common abbreviations
        }

        # Brazilian pharmaceutical document section patterns
        self.section_patterns = {
            # Primary numbered sections
            r'^\s*I+\)\s*(.+)$': 'primary_section',  # I), II), III)
            r'^\s*\d+\.\s*(.+)$': 'numbered_section',  # 1., 2., 3.

            # Common pharmaceutical sections
            r'^\s*(IDENTIFICAÇÃO|IDENTIFICACAO)\s*(DO\s*MEDICAMENTO)?\s*$': 'identification',
            r'^\s*(INFORMAÇÕES|INFORMACOES)\s*(AO\s*PACIENTE)?\s*$': 'patient_info',
            r'^\s*(COMPOSIÇÃO|COMPOSICAO)\s*$': 'composition',
            r'^\s*(APRESENTAÇÕES|APRESENTACOES)\s*$': 'presentations',
            r'^\s*(INDICAÇÕES|INDICACOES)\s*$': 'indications',
            r'^\s*(CONTRAINDICAÇÕES|CONTRAINDICACOES)\s*$': 'contraindications',
            r'^\s*(PRECAUÇÕES|PRECAUCOES)\s*$': 'precautions',
            r'^\s*(REAÇÕES\s*ADVERSAS|REACOES\s*ADVERSAS|EFEITOS\s*ADVERSOS)\s*$': 'adverse_effects',
            r'^\s*(INTERAÇÕES|INTERACOES)\s*(MEDICAMENTOSAS)?\s*$': 'drug_interactions',
            r'^\s*(POSOLOGIA|DOSAGEM)\s*$': 'dosage',
            r'^\s*(SUPERDOSAGEM|SUPERDOSE)\s*$': 'overdose',
            r'^\s*ARMAZENAMENTO\s*$': 'storage',
            r'^\s*DIZERES\s*LEGAIS\s*$': 'legal_info',

            # Question-style headers
            r'^\s*\d+\.\s*(PARA\s*QUE|O\s*QUE|COMO|QUANDO|ONDE|QUAIS)\s*.*\?\s*$': 'question_header'
        }

        self.setup_ollama()

    def setup_ollama(self):
        """Setup Ollama model automatically"""
        print(f"Setting up Ollama model: {self.model_name}")
        try:
            subprocess.run(["ollama", "--version"], capture_output=True, check=True)
            print("✅ Ollama CLI found")
        except (FileNotFoundError, subprocess.CalledProcessError):
            raise RuntimeError("❌ Ollama CLI not found. Please install Ollama first.")

        try:
            print(f"Pulling model {self.model_name}...")
            result = subprocess.run(
                ["ollama", "pull", self.model_name],
                capture_output=True, text=True, timeout=300
            )
            if result.returncode == 0:
                print(f"✅ Model {self.model_name} ready")
        except Exception as e:
            print(f"⚠️ Error with model setup: {e}")

    def call_ollama_raw(self, prompt: str, extra_flags: str = "") -> str:
        """Call ollama with exact prompt"""
        cmd = ["ollama", "run", self.model_name]
        if extra_flags:
            cmd += shlex.split(extra_flags)

        try:
            proc = subprocess.run(
                cmd, input=prompt, text=True, capture_output=True, timeout=60
            )
            return proc.stdout.strip() or proc.stderr.strip()
        except subprocess.TimeoutExpired:
            raise RuntimeError("Ollama call timed out")
        except Exception as e:
            raise RuntimeError(f"Error calling ollama: {e}")

    def extract_pdf_content(self, pdf_path: str) -> str:
        """
        Return the text of a PDF, trying pdfplumber first and pymupdf4llm second.

        pdfplumber is asked for the text of every page; non-empty pages are
        joined with a blank line. If pdfplumber raises or yields no text at
        all, the result of ``pymupdf4llm.to_markdown`` is returned instead.

        Raises:
            Exception: "All extraction methods failed" when the pymupdf4llm
                fallback raises as well.
        """
        # First choice: page-by-page extraction with pdfplumber.
        try:
            with pdfplumber.open(pdf_path) as document:
                page_texts = [
                    extracted
                    for extracted in (page.extract_text() for page in document.pages)
                    if extracted
                ]
                if page_texts:
                    return "\n\n".join(page_texts)
        except Exception as e:
            print(f"pdfplumber failed: {e}")

        # Fallback: let pymupdf4llm render the whole file.
        try:
            markdown_text = pymupdf4llm.to_markdown(pdf_path)
        except Exception as e:
            print(f"pymupdf4llm failed: {e}")
            raise Exception("All extraction methods failed")
        return markdown_text

    def detect_section_header(self, text: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Detect if text is a section header and return (section_type, section_title)
        """
        text_clean = text.strip()

        # Skip very short lines
        if len(text_clean) < 3:
            return None, None

        # Check against section patterns
        for pattern, section_type in self.section_patterns.items():
            match = re.match(pattern, text_clean, re.IGNORECASE)
            if match:
                if section_type == 'primary_section' or section_type == 'numbered_section':
                    section_title = match.group(1).strip()
                else:
                    section_title = text_clean
                return section_type, section_title

        # Check for all-caps headers (common in pharmaceutical docs)
        if (text_clean.isupper() and
            len(text_clean) > 5 and
            len(text_clean) < 100 and
            not re.search(r'\d{2,}', text_clean)):  # Not just numbers
            return 'caps_header', text_clean

        return None, None

    def is_likely_abbreviation(self, text: str) -> bool:
        """Check if text ending with period is likely an abbreviation"""
        if not text or len(text) < 2:
            return False

        word = text.rstrip('.').lower()

        if word in self.pharma_abbreviations:
            return True

        patterns = [
            r'^[a-z]{1,4}$',  # Short lowercase words
            r'^[A-Z]{2,6}$',  # All caps short words
            r'^[A-Z][a-z]{1,3}$',  # Capitalized short words
            r'^\d+[a-z]+$',  # Numbers with letters
            r'^[a-z]\.[a-z]',  # Pattern like q.s.p
            r'[0-9]$'  # Ends with number
        ]

        for pattern in patterns:
            if re.match(pattern, word):
                return True

        return False

    def smart_sentence_split_with_sections(self, text: str) -> List[Dict]:
        """
        Split text into sentences with section awareness
        Returns list of dicts with sentence and section info
        """
        print("📋 Splitting text with section tracking...")

        # First split by lines to identify headers
        lines = text.split('\n')

        current_section_type = 'unknown'
        current_section_title = 'Document Start'
        sentence_data = []
        current_sentence = ""

        for line_num, line in enumerate(lines):
            line = line.strip()

            if not line:  # Skip empty lines
                continue

            # Check if this line is a section header
            section_type, section_title = self.detect_section_header(line)

            if section_type and section_title:
                # This is a header - finish current sentence if any
                if current_sentence.strip():
                    sentences = self._split_sentence_safely(current_sentence)
                    for sent in sentences:
                        if sent.strip() and len(sent.strip()) > 10:
                            sentence_data.append({
                                'sentence': sent.strip(),
                                'section_type': current_section_type,
                                'section_title': current_section_title,
                                'line_number': line_num,
                                'is_header': False
                            })
                    current_sentence = ""

                # Update current section
                current_section_type = section_type
                current_section_title = section_title

                # Add header as special sentence
                sentence_data.append({
                    'sentence': line,
                    'section_type': section_type,
                    'section_title': section_title,
                    'line_number': line_num,
                    'is_header': True
                })

                print(f"📍 Section detected: {section_type} - {section_title}")

            else:
                # Regular content line - add to current sentence
                if current_sentence:
                    current_sentence += " " + line
                else:
                    current_sentence = line

        # Process any remaining sentence
        if current_sentence.strip():
            sentences = self._split_sentence_safely(current_sentence)
            for sent in sentences:
                if sent.strip() and len(sent.strip()) > 10:
                    sentence_data.append({
                        'sentence': sent.strip(),
                        'section_type': current_section_type,
                        'section_title': current_section_title,
                        'line_number': len(lines),
                        'is_header': False
                    })

        # Filter out headers from regular processing
        content_sentences = [s for s in sentence_data if not s['is_header']]

        print(f"✅ Found {len(sentence_data)} total items ({len(content_sentences)} content sentences)")
        print(f"📊 Sections identified: {len(set(s['section_title'] for s in sentence_data))}")

        return content_sentences

    def _split_sentence_safely(self, text: str) -> List[str]:
        """Split text into sentences with abbreviation awareness"""
        sentences = []
        current_sentence = ""

        # Split by potential sentence endings
        parts = re.split(r'([.!?]+)', text)

        i = 0
        while i < len(parts):
            if i % 2 == 0:  # Text part
                current_sentence += parts[i]
            else:  # Punctuation part
                punctuation = parts[i]
                current_sentence += punctuation

                if '.' in punctuation:
                    words = current_sentence.split()
                    if words:
                        last_word = words[-1]
                        if not self.is_likely_abbreviation(last_word):
                            if current_sentence.strip():
                                sentences.append(current_sentence.strip())
                            current_sentence = ""
                    else:
                        if current_sentence.strip():
                            sentences.append(current_sentence.strip())
                        current_sentence = ""
                else:
                    # ! or ? - definitely sentence endings
                    if current_sentence.strip():
                        sentences.append(current_sentence.strip())
                    current_sentence = ""
            i += 1

        # Add any remaining sentence
        if current_sentence.strip():
            sentences.append(current_sentence.strip())

        return [s for s in sentences if s.strip() and len(s.strip()) > 10]

    def create_section_aware_prompt(self, sentence_data: Dict) -> str:
        """Create a prompt that includes section context"""
        sentence = sentence_data['sentence']
        section_type = sentence_data['section_type']
        section_title = sentence_data['section_title']

        prompt = f"""Analyze this sentence from a Brazilian pharmaceutical document. The sentence comes from the "{section_title}" section.

RESPOND ONLY WITH VALID JSON. No explanations, no markdown.

Context: This sentence is from the {section_type} section titled "{section_title}".

Extract relevant pharmaceutical information considering the section context.

Format:
{{
  "entities": [
    {{"type": "medication_name", "value": "...", "confidence": "high|medium|low"}},
    {{"type": "dosage", "value": "...", "confidence": "high|medium|low"}},
    {{"type": "indication", "value": "...", "confidence": "high|medium|low"}},
    {{"type": "contraindication", "value": "...", "confidence": "high|medium|low"}},
    {{"type": "side_effect", "value": "...", "confidence": "high|medium|low"}},
    {{"type": "manufacturer", "value": "...", "confidence": "high|medium|low"}},
    {{"type": "storage", "value": "...", "confidence": "high|medium|low"}},
    {{"type": "administration", "value": "...", "confidence": "high|medium|low"}}
  ],
  "section_relevance": "high|medium|low",
  "key_info_found": true/false
}}

Sentence: "{sentence}"

JSON:"""

        return prompt

    def parse_json_response(self, response: str) -> Any:
        """Enhanced JSON parsing with better error handling"""
        if not response or not response.strip():
            return None

        cleaned = response.strip()

        # Remove markdown code blocks
        if "```json" in cleaned:
            start = cleaned.find("```json") + 7
            end = cleaned.rfind("```")
            if start < end:
                cleaned = cleaned[start:end].strip()
        elif "```" in cleaned:
            start = cleaned.find("```") + 3
            end = cleaned.rfind("```")
            if start < end:
                cleaned = cleaned[start:end].strip()

        # Find JSON boundaries
        json_start = cleaned.find('{')
        json_end = cleaned.rfind('}') + 1

        if json_start != -1 and json_end > json_start:
            cleaned = cleaned[json_start:json_end]

        try:
            return json.loads(cleaned)
        except json.JSONDecodeError as e:
            try:
                # Fix common issues
                fixed = re.sub(r',(\s*[}\]])', r'\1', cleaned)
                fixed = re.sub(r'(?<!\\)"(?=[^,}\]]*[,}\]])', r'\\"', fixed)
                return json.loads(fixed)
            except:
                print(f"JSON parse error: {cleaned[:100]}...")
                return None

    def analyze_sentence_with_context(self, sentence_data: Dict, index: int) -> Dict:
        """Analyze a sentence with section context"""
        try:
            prompt = self.create_section_aware_prompt(sentence_data)
            response = self.call_ollama_raw(prompt)

            result = self.parse_json_response(response)

            if result and isinstance(result, dict):
                # Add metadata with null checks
                result['sentence_index'] = index
                result['original_sentence'] = sentence_data.get('sentence', '')
                result['section_type'] = sentence_data.get('section_type', 'unknown')
                result['section_title'] = sentence_data.get('section_title', 'Unknown Section')
                result['line_number'] = sentence_data.get('line_number', 0)
                return result
            else:
                return self._create_empty_result(sentence_data, index, 'parsing_error')

        except Exception as e:
            print(f"Error analyzing sentence {index}: {e}")
            return self._create_empty_result(sentence_data, index, f'analysis_error: {e}')

    def _create_empty_result(self, sentence_data: Dict, index: int, error_type: str = None) -> Dict:
        """Create empty result structure with safe defaults"""
        return {
            'entities': [],
            'section_relevance': 'low',
            'key_info_found': False,
            'sentence_index': index,
            'original_sentence': sentence_data.get('sentence', ''),
            'section_type': sentence_data.get('section_type', 'unknown'),
            'section_title': sentence_data.get('section_title', 'Unknown Section'),
            'line_number': sentence_data.get('line_number', 0),
            'error': error_type
        }

    def process_sentences_with_context(self, sentence_data_list: List[Dict]) -> List[Dict]:
        """Process all sentences with section context"""
        print(f"🔍 Analyzing {len(sentence_data_list)} sentences with section context...")

        analyses = []
        successful_analyses = 0

        for i, sentence_data in enumerate(sentence_data_list):
            sentence = sentence_data.get('sentence', '')[:60]
            section = sentence_data.get('section_title', 'Unknown')

            print(f"Processing {i+1}/{len(sentence_data_list)} in [{section}]: {sentence}...")

            analysis = self.analyze_sentence_with_context(sentence_data, i)
            analyses.append(analysis)

            if analysis.get('key_info_found') and not analysis.get('error'):
                successful_analyses += 1
                entity_count = len(analysis.get('entities', []))
                if entity_count > 0:
                    print(f"  ✅ Found {entity_count} entities")

            import time
            time.sleep(0.1)

        print(f"✅ Completed analysis: {successful_analyses}/{len(sentence_data_list)} sentences with entities")
        return analyses

    def aggregate_entities_by_section(self, analyses: List[Dict]) -> Dict:
        """Aggregate entities by section with null-safe processing"""
        print("📊 Aggregating entities by section...")

        section_entities = {}
        all_entities = []
        section_stats = {}

        for analysis in analyses:
            if not analysis or analysis.get('error'):
                continue

            section_title = analysis.get('section_title', 'Unknown Section')

            # Initialize section if not exists
            if section_title not in section_entities:
                section_entities[section_title] = {
                    'medication_names': set(), 'dosages': set(), 'indications': set(),
                    'contraindications': set(), 'side_effects': set(), 'manufacturers': set(),
                    'storage_conditions': set(), 'administration_info': set()
                }
                section_stats[section_title] = {'sentences': 0, 'entities': 0}

            section_stats[section_title]['sentences'] += 1

            entities = analysis.get('entities', [])
            if not entities:
                continue

            section_stats[section_title]['entities'] += len(entities)

            for entity in entities:
                # --- START: FIX ---
                # 1. Check if the entity is a valid dictionary
                if not isinstance(entity, dict):
                    continue

                # 2. Get the type and value, which could be None
                entity_type = entity.get('type')
                entity_value = entity.get('value')

                # 3. Ensure both type and value are not None or empty before stripping
                if not entity_type or not entity_value:
                    continue

                entity_type = entity_type.strip()
                entity_value = str(entity_value).strip() # Convert to string to be safe
                # --- END: FIX ---

                type_mapping = {
                    'medication_name': 'medication_names', 'dosage': 'dosages',
                    'indication': 'indications', 'contraindication': 'contraindications',
                    'side_effect': 'side_effects', 'manufacturer': 'manufacturers',
                    'storage': 'storage_conditions', 'administration': 'administration_info'
                }

                if entity_type in type_mapping:
                    collection_key = type_mapping[entity_type]
                    section_entities[section_title][collection_key].add(entity_value)

                    all_entities.append({
                        'type': entity_type, 'value': entity_value,
                        'section': section_title,
                        'confidence': entity.get('confidence', 'medium'),
                        'sentence_index': analysis.get('sentence_index', -1)
                    })

        # Convert sets to lists for JSON serialization
        for section in section_entities:
            for entity_type in section_entities[section]:
                section_entities[section][entity_type] = list(section_entities[section][entity_type])

        result = {
            'entities_by_section': section_entities, 'all_entities': all_entities,
            'section_statistics': section_stats, 'total_entities': len(all_entities),
            'sections_processed': len(section_entities)
        }

        print(f"✅ Aggregated {len(all_entities)} entities across {len(section_entities)} sections")
        return result

    def process_document(self, pdf_path: str) -> bool:
        """Main document processing with section awareness"""
        print(f"📄 Processing document with section-aware analysis: {pdf_path}")

        if not Path(pdf_path).exists():
            print(f"❌ File not found: {pdf_path}")
            return False

        try:
            # Extract text
            print("📖 Extracting text from PDF...")
            self.raw_content = self.extract_pdf_content(pdf_path)

            if not self.raw_content:
                print("❌ No text content extracted")
                return False

            print(f"✅ Extracted {len(self.raw_content)} characters")

            # Split with section awareness
            sentence_data_list = self.smart_sentence_split_with_sections(self.raw_content)

            # Process sentences with context
            analyses = self.process_sentences_with_context(sentence_data_list)

            # Aggregate by sections
            aggregated_data = self.aggregate_entities_by_section(analyses)

            # Compile results
            self.structured_data = {
                "metadata": {
                    "file_path": pdf_path,
                    "file_name": Path(pdf_path).name,
                    "processing_date": datetime.now().isoformat(),
                    "total_text_length": len(self.raw_content),
                    "model_used": self.model_name,
                    "extraction_method": "section_aware_sentence_analysis"
                },
                "sentence_analyses": analyses,
                "section_entities": aggregated_data,
                "processing_statistics": {
                    "total_sentences": len(sentence_data_list),
                    "sentences_with_entities": len([a for a in analyses if a.get('key_info_found')]),
                    "total_entities_found": aggregated_data.get('total_entities', 0),
                    "sections_identified": aggregated_data.get('sections_processed', 0)
                }
            }

            self.document_loaded = True
            print("✅ Section-aware document processing completed!")
            self._show_processing_summary()
            return True

        except Exception as e:
            print(f"❌ Error processing document: {e}")
            import traceback
            traceback.print_exc()
            return False

    def _show_processing_summary(self):
        """Show processing summary with section information"""
        print("\n" + "=" * 70)
        print("📊 SECTION-AWARE PROCESSING SUMMARY")
        print("=" * 70)

        metadata = self.structured_data.get("metadata", {})
        stats = self.structured_data.get("processing_statistics", {})
        section_data = self.structured_data.get("section_entities", {})

        print(f"📁 File: {metadata.get('file_name', 'Unknown')}")
        print(f"📝 Text length: {metadata.get('total_text_length', 0):,} characters")
        print(f"✂️ Total sentences: {stats.get('total_sentences', 0)}")
        print(f"🏷️ Sentences with entities: {stats.get('sentences_with_entities', 0)}")
        print(f"🔢 Total entities found: {stats.get('total_entities_found', 0)}")
        print(f"📋 Sections identified: {stats.get('sections_identified', 0)}")

        # Show section statistics
        section_stats = section_data.get('section_statistics', {})
        if section_stats:
            print(f"\n📊 Entity Distribution by Section:")
            for section, stat in section_stats.items():
                print(f"   {section}: {stat['entities']} entities from {stat['sentences']} sentences")

        print("=" * 70)

    def save_results(self, output_path: Optional[str] = None) -> str:
        """Save processing results"""
        if not self.document_loaded:
            raise Exception("No document processed")

        if not output_path:
            file_name = self.structured_data["metadata"]["file_name"]
            pdf_name = Path(file_name).stem
            output_path = f"{pdf_name}_section_aware_analysis.json"

        output_file = Path(output_path)
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(self.structured_data, f, indent=2, ensure_ascii=False)

        print(f"\n📄 Results saved to: {output_file}")
        print(f"📊 File size: {output_file.stat().st_size:,} bytes")
        return str(output_file)

    def query_document(self, question: str) -> str:
        """Query with section context"""
        if not self.document_loaded:
            return "❌ No document processed."

        section_entities = self.structured_data.get("section_entities", {}).get("entities_by_section", {})

        context_parts = ["MEDICATION INFORMATION BY SECTION:\n"]

        for section, entities in section_entities.items():
            if any(entities.values()):  # Only show sections with entities
                context_parts.append(f"[{section}]")
                for entity_type, values in entities.items():
                    if values:
                        context_parts.append(f"  {entity_type}: {', '.join(values[:3])}")
                context_parts.append("")

        context = "\n".join(context_parts)

        query_prompt = f"""Answer about this medication based on the section-organized information.

Question: {question}

Available Information:
{context[:4000]}

Provide a clear answer in Portuguese, mentioning the relevant sections when appropriate."""

        try:
            response = self.call_ollama_raw(query_prompt)
            return response.strip() if response else "No response received"
        except Exception as e:
            return f"❌ Error: {e}"

def main():
    """
    Demo run: process one sample leaflet, save the analysis and ask a few
    questions about it.

    Nothing further happens when processing the PDF fails.
    """
    parser = SectionAwarePharmaParser(model_name="llama3:8b")
    pdf_path = "bula_1755192077396.pdf"

    print("🎯 Section-Aware Pharmaceutical Document Parser")
    print("=" * 70)

    if not parser.process_document(pdf_path):
        return

    parser.save_results()

    print(f"\n🔍 Testing section-aware querying...")
    test_questions = (
        "Qual é o nome do medicamento e sua concentração?",
        "Quais são as contraindicações principais?",
        "Como deve ser administrado?",
        "Quais são os efeitos adversos mais comuns?",
        "Quem é o fabricante?",
    )

    divider = "-" * 50
    for question in test_questions:
        print(f"\n❓ {question}")
        print(f"💡 {parser.query_document(question)}")
        print(divider)

# Run the demo only when this code is executed directly (not on import).
if __name__ == "__main__":
    main()

Setting up Ollama model: llama3:8b
✅ Ollama CLI found
Pulling model llama3:8b...
✅ Model llama3:8b ready
🎯 Section-Aware Pharmaceutical Document Parser
📄 Processing document with section-aware analysis: bula_1755192077396.pdf
📖 Extracting text from PDF...
✅ Extracted 11936 characters
📋 Splitting text with section tracking...
📍 Section detected: primary_section - IDENTIFICAÇÃO DO MEDICAMENTO
📍 Section detected: presentations - APRESENTAÇÕES
📍 Section detected: caps_header - USO ORAL
📍 Section detected: caps_header - USO ADULTO E PEDIÁTRICO ACIMA DE 6 ANOS DE IDADE
📍 Section detected: composition - COMPOSIÇÃO
📍 Section detected: primary_section - INFORMAÇÕES AO PACIENTE
📍 Section detected: numbered_section - PARA QUE ESTE MEDICAMENTO É INDICADO?
📍 Section detected: numbered_section - COMO ESTE MEDICAMENTO FUNCIONA?
📍 Section detected: numbered_section - QUANDO NÃO DEVO USAR ESTE MEDICAMENTO?
📍 Section detected: numbered_section - O QUE DEVO SABER ANTES DE USAR ESTE MEDICAMENTO?
📍 Section

In [None]:
# Install necessary libraries
!pip install pymupdf4llm pdfplumber pandas requests -q

# Download and install Ollama
!curl -fsSL https://ollama.com/install.sh | sh

import subprocess
import time

# Launch `ollama serve` as a background child process so the remaining
# statements in this cell (and later cells) can run while it is up.
# Both stdout and stderr of the server are written to ollama_server.log.
# NOTE(review): the log file object handed to Popen is never closed explicitly.
server_process = subprocess.Popen(
    ["ollama", "serve"],
    stdout=open("ollama_server.log", "w"),
    stderr=subprocess.STDOUT
)

print("✅ Ollama server started in the background.")
time.sleep(5) # Fixed 5-second wait to give the server a moment to initialize

# Pull the Llama 3 8B model (`llama3:8b`, the default model of the parser
# classes in this notebook) instead of the 3B version.
print("📥 Pulling the Llama 3 8B model. This may take a few minutes...")
!ollama pull llama3:8b
print("✅ Model download complete.")

>>> Cleaning up old version at /usr/local/lib/ollama
>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
✅ Ollama server started in the background.
📥 Pulling the Llama 3 8B model. This may take a few minutes...
[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l
✅ Model download complete.


In [None]:
#!/usr/bin/env python3
"""
Enhanced Section-Aware Pharmaceutical Document Parser
Improved entity extraction with validation, retry logic, and better prompting
"""

import json
import re
import subprocess
import shlex
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
from datetime import datetime
import pymupdf4llm
import pdfplumber
from dataclasses import dataclass
from enum import Enum

class ConfidenceLevel(Enum):
    """Confidence labels that can be attached to an extracted entity."""
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"

@dataclass
class Entity:
    """
    One extracted entity.

    ``type`` and ``value`` must contain non-whitespace text; both are stored
    with surrounding whitespace removed.

    Raises:
        ValueError: On construction, when ``value`` or ``type`` is empty or
            whitespace-only (``value`` is checked first).
    """
    type: str
    value: str
    confidence: ConfidenceLevel
    context: Optional[str] = None

    def __post_init__(self):
        # Validate both fields before normalising either of them.
        required = (
            (self.value, "Entity value cannot be empty"),
            (self.type, "Entity type cannot be empty"),
        )
        for raw, complaint in required:
            if not (raw and raw.strip()):
                raise ValueError(complaint)

        self.value = self.value.strip()
        self.type = self.type.strip()

class EnhancedSectionAwarePharmaParser:
    def __init__(self, model_name: str = "llama3:8b", max_retries: int = 2):
        """
        Initialize enhanced parser with retry logic

        Args:
            model_name: Ollama model tag passed to ``ollama pull`` / ``ollama run``.
            max_retries: Number of extra extraction attempts per sentence after
                the first one (so each sentence gets ``max_retries + 1`` tries).

        Side effects:
            Ends by calling ``self.setup_ollama()``, which shells out to the
            ``ollama`` CLI; constructing a parser therefore requires Ollama.
        """
        self.model_name = model_name
        self.max_retries = max_retries
        # Per-document state; filled in by process_document().
        self.raw_content = ""
        self.structured_data = {}
        self.document_loaded = False

        # Enhanced entity types for pharmaceutical documents.
        # Keys are the only entity types accepted during validation and
        # aggregation; 'description' and the first two 'examples' of each
        # entry are embedded in the extraction prompt.
        self.entity_types = {
            'medication_name': {
                'description': 'Active ingredient or brand name of the medication',
                'examples': ['ezetimiba', 'Atorvastatina', 'Sinvastatina']
            },
            'dosage_strength': {
                'description': 'Concentration or strength of the medication',
                'examples': ['10 mg', '20 mg/ml', '500 mcg']
            },
            'posology': {
                'description': 'How and when to take the medication',
                'examples': ['1 comprimido ao dia', 'tomar com alimentos', 'antes das refeições']
            },
            'indication': {
                'description': 'Medical condition the medication treats',
                'examples': ['hipercolesterolemia', 'redução do colesterol', 'prevenção cardiovascular']
            },
            'contraindication': {
                'description': 'Conditions where medication should not be used',
                'examples': ['gravidez', 'insuficiência hepática', 'hipersensibilidade']
            },
            'mechanism_of_action': {
                'description': 'How the medication works in the body',
                'examples': ['inibição da absorção do colesterol', 'bloqueio de receptores']
            },
            'adverse_effect': {
                'description': 'Side effects or unwanted reactions',
                'examples': ['dor de cabeça', 'náusea', 'dor muscular']
            },
            'adverse_effect_frequency': {
                'description': 'How common side effects are',
                'examples': ['muito comum', 'raro', 'muito raro']
            },
            'drug_interaction': {
                'description': 'Other medications that interact with this drug',
                'examples': ['warfarina', 'ciclosporina', 'fibratos']
            },
            'patient_population': {
                'description': 'Specific patient groups mentioned',
                'examples': ['idosos', 'crianças', 'pacientes com diabetes']
            },
            'manufacturer': {
                'description': 'Company that makes the medication',
                'examples': ['Sandoz', 'EMS', 'Medley']
            },
            'storage_condition': {
                'description': 'How to store the medication',
                'examples': ['temperatura ambiente', 'proteger da luz', 'geladeira']
            },
            'presentation': {
                'description': 'How the medication is packaged',
                'examples': ['comprimidos', 'caixa com 30 unidades', 'frasco de 100ml']
            }
        }

        # Common pharmaceutical abbreviations.
        # Stored lowercase and without the trailing period, because
        # is_likely_abbreviation() compares against text.rstrip('.').lower().
        self.pharma_abbreviations = {
            'mg', 'ml', 'mcg', 'kg', 'g', 'l', 'dl', 'mmol', 'mol',
            'q.s.p', 'c.q.s', 'q.s', 'c.s.p',
            'ltda', 'ltd', 'inc', 'corp', 'sa', 'co',
            'dr', 'dra', 'prof', 'sr', 'sra',
            'etc', 'ex', 'vs', 'e.g', 'i.e',
            'cnpj', 'cpf', 'rg', 'crf', 'crm',
            'anvisa', 'ms', 'rdc', 'vp', 'vps',
            'd.d', 'p.ex', 'n°', 'nº'
        }

        # Brazilian pharmaceutical document section patterns.
        # Maps a regex (matched case-insensitively against a stripped line by
        # detect_section_header) to a section-type label. Insertion order
        # matters: patterns are tried in this order.
        # NOTE(review): 'I+\)' only matches runs of the letter I (I, II, III),
        # not other Roman numerals such as IV or V.
        # NOTE(review): every line the 'question_header' pattern can match is
        # also matched by the earlier, more general 'numbered_section' pattern,
        # so a first-match scan over this dict never yields 'question_header'.
        self.section_patterns = {
            r'^\s*I+\)\s*(.+)$': 'primary_section',
            r'^\s*\d+\.\s*(.+)$': 'numbered_section',
            r'^\s*(IDENTIFICAÇÃO|IDENTIFICACAO)\s*(DO\s*MEDICAMENTO)?\s*$': 'identification',
            r'^\s*(INFORMAÇÕES|INFORMACOES)\s*(AO\s*PACIENTE)?\s*$': 'patient_info',
            r'^\s*(COMPOSIÇÃO|COMPOSICAO)\s*$': 'composition',
            r'^\s*(APRESENTAÇÕES|APRESENTACOES)\s*$': 'presentations',
            r'^\s*(INDICAÇÕES|INDICACOES)\s*$': 'indications',
            r'^\s*(CONTRAINDICAÇÕES|CONTRAINDICACOES)\s*$': 'contraindications',
            r'^\s*(PRECAUÇÕES|PRECAUCOES)\s*$': 'precautions',
            r'^\s*(REAÇÕES\s*ADVERSAS|REACOES\s*ADVERSAS|EFEITOS\s*ADVERSOS)\s*$': 'adverse_effects',
            r'^\s*(INTERAÇÕES|INTERACOES)\s*(MEDICAMENTOSAS)?\s*$': 'drug_interactions',
            r'^\s*(POSOLOGIA|DOSAGEM)\s*$': 'dosage',
            r'^\s*(SUPERDOSAGEM|SUPERDOSE)\s*$': 'overdose',
            r'^\s*ARMAZENAMENTO\s*$': 'storage',
            r'^\s*DIZERES\s*LEGAIS\s*$': 'legal_info',
            r'^\s*\d+\.\s*(PARA\s*QUE|O\s*QUE|COMO|QUANDO|ONDE|QUAIS)\s*.*\?\s*$': 'question_header'
        }

        # Verifies the CLI and pulls the model (runs external processes).
        self.setup_ollama()

    def setup_ollama(self):
        """Setup Ollama model automatically"""
        print(f"Setting up Ollama model: {self.model_name}")
        try:
            subprocess.run(["ollama", "--version"], capture_output=True, check=True)
            print("Ollama CLI found")
        except (FileNotFoundError, subprocess.CalledProcessError):
            raise RuntimeError("Ollama CLI not found. Please install Ollama first.")

        try:
            print(f"Pulling model {self.model_name}...")
            result = subprocess.run(
                ["ollama", "pull", self.model_name],
                capture_output=True, text=True, timeout=300
            )
            if result.returncode == 0:
                print(f"Model {self.model_name} ready")
        except Exception as e:
            print(f"Error with model setup: {e}")

    def call_ollama_raw(self, prompt: str, extra_flags: str = "") -> str:
        """Call ollama with exact prompt"""
        cmd = ["ollama", "run", self.model_name]
        if extra_flags:
            cmd += shlex.split(extra_flags)

        try:
            proc = subprocess.run(
                cmd, input=prompt, text=True, capture_output=True, timeout=120
            )
            return proc.stdout.strip() or proc.stderr.strip()
        except subprocess.TimeoutExpired:
            raise RuntimeError("Ollama call timed out")
        except Exception as e:
            raise RuntimeError(f"Error calling ollama: {e}")

    def create_enhanced_prompt(self, sentence_data: Dict) -> str:
        """Create an enhanced prompt with detailed entity descriptions and examples"""
        sentence = sentence_data['sentence']
        section_type = sentence_data['section_type']
        section_title = sentence_data['section_title']

        # Build entity type documentation
        entity_docs = []
        for entity_type, info in self.entity_types.items():
            examples_str = ', '.join(f'"{ex}"' for ex in info['examples'][:2])
            entity_docs.append(f'- {entity_type}: {info["description"]} (examples: {examples_str})')

        entity_documentation = '\n'.join(entity_docs)

        prompt = f"""You are analyzing a sentence from a Brazilian pharmaceutical document (bula).

SECTION CONTEXT: This sentence is from "{section_title}" ({section_type})

ENTITY TYPES TO EXTRACT:
{entity_documentation}

EXTRACTION RULES:
1. Only extract entities that are EXPLICITLY mentioned in the sentence
2. Do not infer or assume information not directly stated
3. Extract the exact text, don't paraphrase
4. Set confidence based on clarity: "high" for clear/direct mentions, "medium" for somewhat clear, "low" for uncertain
5. If no entities found, return empty entities array

SENTENCE TO ANALYZE: "{sentence}"

RESPOND WITH VALID JSON ONLY (no explanations, no markdown):
{{
  "entities": [
    {{"type": "entity_type", "value": "exact_text_from_sentence", "confidence": "high|medium|low"}}
  ],
  "section_relevance": "high|medium|low",
  "key_info_found": true/false
}}

JSON:"""

        return prompt

    def validate_and_clean_entities(self, entities: List[Dict]) -> List[Entity]:
        """Validate and clean extracted entities"""
        validated_entities = []

        for entity_dict in entities:
            try:
                # Basic validation
                if not isinstance(entity_dict, dict):
                    continue

                entity_type = entity_dict.get('type', '').strip()
                entity_value = entity_dict.get('value', '').strip()
                confidence_str = entity_dict.get('confidence', 'medium').strip().lower()

                # Skip empty or invalid entities
                if not entity_type or not entity_value:
                    continue

                # Validate entity type
                if entity_type not in self.entity_types:
                    continue

                # Clean and validate confidence
                try:
                    confidence = ConfidenceLevel(confidence_str)
                except ValueError:
                    confidence = ConfidenceLevel.MEDIUM

                # Additional cleaning
                entity_value = self.clean_entity_value(entity_value)
                if not entity_value:
                    continue

                # Create validated entity
                entity = Entity(
                    type=entity_type,
                    value=entity_value,
                    confidence=confidence
                )

                validated_entities.append(entity)

            except Exception as e:
                print(f"Error validating entity {entity_dict}: {e}")
                continue

        return validated_entities

    def clean_entity_value(self, value: str) -> str:
        """Clean entity values"""
        if not value:
            return ""

        # Remove extra whitespace
        cleaned = re.sub(r'\s+', ' ', value.strip())

        # Remove trailing periods (except for abbreviations)
        if cleaned.endswith('.') and len(cleaned) > 4:
            cleaned = cleaned.rstrip('.')

        # Remove quotation marks
        cleaned = cleaned.strip('"\'')

        # Skip very short or meaningless values
        if len(cleaned) < 2:
            return ""

        return cleaned

    def parse_json_response_with_retry(self, response: str) -> Optional[Dict]:
        """Enhanced JSON parsing with better error handling"""
        if not response or not response.strip():
            return None

        cleaned = response.strip()

        # Remove markdown code blocks
        if "```json" in cleaned:
            start = cleaned.find("```json") + 7
            end = cleaned.rfind("```")
            if start < end:
                cleaned = cleaned[start:end].strip()
        elif "```" in cleaned:
            start = cleaned.find("```") + 3
            end = cleaned.rfind("```")
            if start < end:
                cleaned = cleaned[start:end].strip()

        # Find JSON boundaries
        json_start = cleaned.find('{')
        json_end = cleaned.rfind('}') + 1

        if json_start != -1 and json_end > json_start:
            cleaned = cleaned[json_start:json_end]

        try:
            parsed = json.loads(cleaned)

            # Validate structure
            if not isinstance(parsed, dict):
                return None

            if 'entities' not in parsed:
                return None

            if not isinstance(parsed['entities'], list):
                return None

            return parsed

        except json.JSONDecodeError:
            try:
                # Try to fix common JSON issues
                fixed = re.sub(r',(\s*[}\]])', r'\1', cleaned)  # Remove trailing commas
                fixed = re.sub(r'(\w+):', r'"\1":', fixed)  # Add quotes to keys
                fixed = re.sub(r': *([^",\[\{][^,}\]]*)', r': "\1"', fixed)  # Add quotes to values
                return json.loads(fixed)
            except:
                return None

    def analyze_sentence_with_retries(self, sentence_data: Dict, index: int) -> Dict:
        """Analyze sentence with retry logic for failed extractions"""

        for attempt in range(self.max_retries + 1):
            try:
                if attempt == 0:
                    prompt = self.create_enhanced_prompt(sentence_data)
                else:
                    # Modified prompt for retry
                    original_prompt = self.create_enhanced_prompt(sentence_data)
                    prompt = f"{original_prompt}\n\nNOTE: Previous attempt failed. Please ensure your response is valid JSON with the exact structure shown above."

                response = self.call_ollama_raw(prompt)
                result = self.parse_json_response_with_retry(response)

                if result and isinstance(result, dict) and 'entities' in result:
                    # Validate and clean entities
                    raw_entities = result.get('entities', [])
                    validated_entities = self.validate_and_clean_entities(raw_entities)

                    # Convert back to dict format for consistency
                    cleaned_entities = []
                    for entity in validated_entities:
                        cleaned_entities.append({
                            'type': entity.type,
                            'value': entity.value,
                            'confidence': entity.confidence.value
                        })

                    # Build final result
                    final_result = {
                        'entities': cleaned_entities,
                        'section_relevance': result.get('section_relevance', 'medium'),
                        'key_info_found': len(cleaned_entities) > 0,
                        'sentence_index': index,
                        'original_sentence': sentence_data.get('sentence', ''),
                        'section_type': sentence_data.get('section_type', 'unknown'),
                        'section_title': sentence_data.get('section_title', 'Unknown Section'),
                        'line_number': sentence_data.get('line_number', 0),
                        'extraction_attempts': attempt + 1
                    }

                    if len(cleaned_entities) > 0:
                        print(f"  Success on attempt {attempt + 1}: {len(cleaned_entities)} entities")

                    return final_result

            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {e}")
                if attempt == self.max_retries:
                    break
                continue

        # All attempts failed
        print(f"  All {self.max_retries + 1} attempts failed")
        return self._create_empty_result(sentence_data, index, 'extraction_failed_all_attempts')

    def extract_pdf_content(self, pdf_path: str) -> str:
        """Return the text of a PDF, trying pdfplumber first and pymupdf4llm second.

        pdfplumber's per-page text is joined with blank lines. If pdfplumber
        raises or yields no text at all, the result of
        ``pymupdf4llm.to_markdown`` is returned instead.

        Raises:
            Exception: "All extraction methods failed" when the pymupdf4llm
                fallback raises as well.
        """
        try:
            with pdfplumber.open(pdf_path) as pdf:
                # Keep only pages that actually produced text.
                page_texts = [
                    text
                    for text in (page.extract_text() for page in pdf.pages)
                    if text
                ]
                if page_texts:
                    return "\n\n".join(page_texts)
        except Exception as e:
            print(f"pdfplumber failed: {e}")

        # Fallback extractor.
        try:
            markdown_text = pymupdf4llm.to_markdown(pdf_path)
        except Exception as e:
            print(f"pymupdf4llm failed: {e}")
            raise Exception("All extraction methods failed")
        return markdown_text

    def detect_section_header(self, text: str) -> Tuple[Optional[str], Optional[str]]:
        """Detect if text is a section header"""
        text_clean = text.strip()

        if len(text_clean) < 3:
            return None, None

        for pattern, section_type in self.section_patterns.items():
            match = re.match(pattern, text_clean, re.IGNORECASE)
            if match:
                if section_type == 'primary_section' or section_type == 'numbered_section':
                    section_title = match.group(1).strip()
                else:
                    section_title = text_clean
                return section_type, section_title

        if (text_clean.isupper() and
            len(text_clean) > 5 and
            len(text_clean) < 100 and
            not re.search(r'\d{2,}', text_clean)):
            return 'caps_header', text_clean

        return None, None

    def is_likely_abbreviation(self, text: str) -> bool:
        """Check if text ending with period is likely an abbreviation"""
        if not text or len(text) < 2:
            return False

        word = text.rstrip('.').lower()

        if word in self.pharma_abbreviations:
            return True

        patterns = [
            r'^[a-z]{1,4}$',
            r'^[A-Z]{2,6}$',
            r'^[A-Z][a-z]{1,3}$',
            r'^\d+[a-z]+$',
            r'^[a-z]\.[a-z]',
            r'[0-9]$'
        ]

        for pattern in patterns:
            if re.match(pattern, word):
                return True

        return False

    def smart_sentence_split_with_sections(self, text: str) -> List[Dict]:
        """Split text into sentences with section awareness"""
        print("Splitting text with section tracking...")

        lines = text.split('\n')
        current_section_type = 'unknown'
        current_section_title = 'Document Start'
        sentence_data = []
        current_sentence = ""

        for line_num, line in enumerate(lines):
            line = line.strip()

            if not line:
                continue

            section_type, section_title = self.detect_section_header(line)

            if section_type and section_title:
                if current_sentence.strip():
                    sentences = self._split_sentence_safely(current_sentence)
                    for sent in sentences:
                        if sent.strip() and len(sent.strip()) > 10:
                            sentence_data.append({
                                'sentence': sent.strip(),
                                'section_type': current_section_type,
                                'section_title': current_section_title,
                                'line_number': line_num,
                                'is_header': False
                            })
                    current_sentence = ""

                current_section_type = section_type
                current_section_title = section_title

                sentence_data.append({
                    'sentence': line,
                    'section_type': section_type,
                    'section_title': section_title,
                    'line_number': line_num,
                    'is_header': True
                })

                print(f"Section detected: {section_type} - {section_title}")

            else:
                if current_sentence:
                    current_sentence += " " + line
                else:
                    current_sentence = line

        if current_sentence.strip():
            sentences = self._split_sentence_safely(current_sentence)
            for sent in sentences:
                if sent.strip() and len(sent.strip()) > 10:
                    sentence_data.append({
                        'sentence': sent.strip(),
                        'section_type': current_section_type,
                        'section_title': current_section_title,
                        'line_number': len(lines),
                        'is_header': False
                    })

        content_sentences = [s for s in sentence_data if not s['is_header']]
        print(f"Found {len(sentence_data)} total items ({len(content_sentences)} content sentences)")

        return content_sentences

    def _split_sentence_safely(self, text: str) -> List[str]:
        """Split text into sentences with abbreviation awareness"""
        sentences = []
        current_sentence = ""

        parts = re.split(r'([.!?]+)', text)

        i = 0
        while i < len(parts):
            if i % 2 == 0:
                current_sentence += parts[i]
            else:
                punctuation = parts[i]
                current_sentence += punctuation

                if '.' in punctuation:
                    words = current_sentence.split()
                    if words:
                        last_word = words[-1]
                        if not self.is_likely_abbreviation(last_word):
                            if current_sentence.strip():
                                sentences.append(current_sentence.strip())
                            current_sentence = ""
                    else:
                        if current_sentence.strip():
                            sentences.append(current_sentence.strip())
                        current_sentence = ""
                else:
                    if current_sentence.strip():
                        sentences.append(current_sentence.strip())
                    current_sentence = ""
            i += 1

        if current_sentence.strip():
            sentences.append(current_sentence.strip())

        return [s for s in sentences if s.strip() and len(s.strip()) > 10]

    def _create_empty_result(self, sentence_data: Dict, index: int, error_type: str = None) -> Dict:
        """Create empty result structure"""
        return {
            'entities': [],
            'section_relevance': 'low',
            'key_info_found': False,
            'sentence_index': index,
            'original_sentence': sentence_data.get('sentence', ''),
            'section_type': sentence_data.get('section_type', 'unknown'),
            'section_title': sentence_data.get('section_title', 'Unknown Section'),
            'line_number': sentence_data.get('line_number', 0),
            'error': error_type,
            'extraction_attempts': self.max_retries + 1
        }

    def process_sentences_with_enhanced_extraction(self, sentence_data_list: List[Dict]) -> List[Dict]:
        """Process all sentences with enhanced extraction"""
        print(f"Analyzing {len(sentence_data_list)} sentences with enhanced extraction...")

        analyses = []
        successful_analyses = 0
        total_entities = 0

        for i, sentence_data in enumerate(sentence_data_list):
            sentence = sentence_data.get('sentence', '')[:60]
            section = sentence_data.get('section_title', 'Unknown')

            print(f"Processing {i+1}/{len(sentence_data_list)} in [{section}]: {sentence}...")

            analysis = self.analyze_sentence_with_retries(sentence_data, i)
            analyses.append(analysis)

            if analysis.get('key_info_found') and not analysis.get('error'):
                successful_analyses += 1
                entity_count = len(analysis.get('entities', []))
                total_entities += entity_count

            import time
            time.sleep(0.1)

        print(f"Completed: {successful_analyses}/{len(sentence_data_list)} sentences with entities")
        print(f"Total entities extracted: {total_entities}")

        return analyses

    def aggregate_entities_by_section(self, analyses: List[Dict]) -> Dict:
        """Aggregate entities by section with improved processing"""
        print("Aggregating entities by section...")

        section_entities = {}
        all_entities = []
        section_stats = {}

        for analysis in analyses:
            if not analysis or analysis.get('error'):
                continue

            section_title = analysis.get('section_title', 'Unknown Section')

            if section_title not in section_entities:
                section_entities[section_title] = {}
                for entity_type in self.entity_types.keys():
                    section_entities[section_title][entity_type] = set()
                section_stats[section_title] = {'sentences': 0, 'entities': 0}

            section_stats[section_title]['sentences'] += 1

            entities = analysis.get('entities', [])
            if not entities:
                continue

            section_stats[section_title]['entities'] += len(entities)

            for entity in entities:
                if not isinstance(entity, dict):
                    continue

                entity_type = entity.get('type', '').strip()
                entity_value = entity.get('value', '').strip()

                if not entity_type or not entity_value:
                    continue

                if entity_type in self.entity_types:
                    section_entities[section_title][entity_type].add(entity_value)

                    all_entities.append({
                        'type': entity_type,
                        'value': entity_value,
                        'section': section_title,
                        'confidence': entity.get('confidence', 'medium'),
                        'sentence_index': analysis.get('sentence_index', -1)
                    })

        # Convert sets to lists for JSON serialization
        for section in section_entities:
            for entity_type in section_entities[section]:
                section_entities[section][entity_type] = list(section_entities[section][entity_type])

        result = {
            'entities_by_section': section_entities,
            'all_entities': all_entities,
            'section_statistics': section_stats,
            'total_entities': len(all_entities),
            'sections_processed': len(section_entities)
        }

        print(f"Aggregated {len(all_entities)} entities across {len(section_entities)} sections")
        return result

    def process_document(self, pdf_path: str) -> bool:
        """Main document processing with enhanced extraction"""
        print(f"Processing document with enhanced extraction: {pdf_path}")

        if not Path(pdf_path).exists():
            print(f"File not found: {pdf_path}")
            return False

        try:
            print("Extracting text from PDF...")
            self.raw_content = self.extract_pdf_content(pdf_path)

            if not self.raw_content:
                print("No text content extracted")
                return False

            print(f"Extracted {len(self.raw_content)} characters")

            sentence_data_list = self.smart_sentence_split_with_sections(self.raw_content)
            analyses = self.process_sentences_with_enhanced_extraction(sentence_data_list)
            aggregated_data = self.aggregate_entities_by_section(analyses)

            self.structured_data = {
                "metadata": {
                    "file_path": pdf_path,
                    "file_name": Path(pdf_path).name,
                    "processing_date": datetime.now().isoformat(),
                    "total_text_length": len(self.raw_content),
                    "model_used": self.model_name,
                    "extraction_method": "enhanced_section_aware_extraction",
                    "max_retries": self.max_retries
                },
                "sentence_analyses": analyses,
                "section_entities": aggregated_data,
                "processing_statistics": {
                    "total_sentences": len(sentence_data_list),
                    "sentences_with_entities": len([a for a in analyses if a.get('key_info_found')]),
                    "total_entities_found": aggregated_data.get('total_entities', 0),
                    "sections_identified": aggregated_data.get('sections_processed', 0),
                    "extraction_success_rate": len([a for a in analyses if a.get('key_info_found')]) / len(sentence_data_list) * 100 if sentence_data_list else 0
                }
            }

            self.document_loaded = True
            print("Enhanced document processing completed!")
            self._show_processing_summary()
            return True

        except Exception as e:
            print(f"Error processing document: {e}")
            import traceback
            traceback.print_exc()
            return False

    def _show_processing_summary(self):
        """Show enhanced processing summary"""
        print("\n" + "=" * 70)
        print("ENHANCED PROCESSING SUMMARY")
        print("=" * 70)

        metadata = self.structured_data.get("metadata", {})
        stats = self.structured_data.get("processing_statistics", {})
        section_data = self.structured_data.get("section_entities", {})

        print(f"File: {metadata.get('file_name', 'Unknown')}")
        print(f"Text length: {metadata.get('total_text_length', 0):,} characters")
        print(f"Total sentences: {stats.get('total_sentences', 0)}")
        print(f"Sentences with entities: {stats.get('sentences_with_entities', 0)}")
        print(f"Total entities found: {stats.get('total_entities_found', 0)}")
        print(f"Sections identified: {stats.get('sections_identified', 0)}")
        print(f"Success rate: {stats.get('extraction_success_rate', 0):.1f}%")

        section_stats = section_data.get('section_statistics', {})
        if section_stats:
            print(f"\nEntity Distribution by Section:")
            for section, stat in section_stats.items():
                print(f"   {section}: {stat['entities']} entities from {stat['sentences']} sentences")

        print("=" * 70)

    def save_results(self, output_path: Optional[str] = None) -> str:
        """Save enhanced processing results"""
        if not self.document_loaded:
            raise Exception("No document processed")

        if not output_path:
            file_name = self.structured_data["metadata"]["file_name"]
            pdf_name = Path(file_name).stem
            output_path = f"{pdf_name}_enhanced_analysis.json"

        output_file = Path(output_path)
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(self.structured_data, f, indent=2, ensure_ascii=False)

        print(f"\nResults saved to: {output_file}")
        print(f"File size: {output_file.stat().st_size:,} bytes")
        return str(output_file)

    def query_document(self, question: str) -> str:
        """Query with enhanced section context"""
        if not self.document_loaded:
            return "No document processed."

        section_entities = self.structured_data.get("section_entities", {}).get("entities_by_section", {})

        context_parts = ["MEDICATION INFORMATION BY SECTION:\n"]

        for section, entities in section_entities.items():
            if any(entities.values()):
                context_parts.append(f"[{section}]")
                for entity_type, values in entities.items():
                    if values:
                        # Show more entities for better context
                        display_values = values[:5] if len(values) > 5 else values
                        context_parts.append(f"  {entity_type}: {', '.join(display_values)}")
                context_parts.append("")

        context = "\n".join(context_parts)

        query_prompt = f"""Answer about this medication based on the detailed section-organized information.

Question: {question}

Available Information:
{context[:6000]}

Provide a comprehensive answer in Portuguese, mentioning the relevant sections and specific details when appropriate. Be precise and cite the information source."""

        try:
            response = self.call_ollama_raw(query_prompt)
            return response.strip() if response else "No response received"
        except Exception as e:
            return f"Error: {e}"

    def get_entity_statistics(self) -> Dict:
        """Get detailed statistics about extracted entities"""
        if not self.document_loaded:
            return {}

        all_entities = self.structured_data.get("section_entities", {}).get("all_entities", [])

        # Count by type
        type_counts = {}
        for entity in all_entities:
            entity_type = entity.get('type', 'unknown')
            type_counts[entity_type] = type_counts.get(entity_type, 0) + 1

        # Count by confidence
        confidence_counts = {}
        for entity in all_entities:
            confidence = entity.get('confidence', 'medium')
            confidence_counts[confidence] = confidence_counts.get(confidence, 0) + 1

        # Count by section
        section_counts = {}
        for entity in all_entities:
            section = entity.get('section', 'unknown')
            section_counts[section] = section_counts.get(section, 0) + 1

        return {
            'total_entities': len(all_entities),
            'by_type': type_counts,
            'by_confidence': confidence_counts,
            'by_section': section_counts
        }

def main(pdf_path: str = "bula_1755192077396.pdf",
         model_name: str = "llama3:8b",
         max_retries: int = 2):
    """Demonstrate enhanced entity extraction on a single PDF.

    Processes the document, saves the results, prints entity statistics and
    then runs a fixed list of Portuguese test questions through
    ``query_document``.

    Args:
        pdf_path: PDF to process. Defaults to the sample file this demo was
            written against.
        model_name: Model name handed to the parser.
        max_retries: Retry count handed to the parser.

    Returns:
        None. All results are printed; a failed ``process_document`` call is
        reported and ends the demo early.
    """
    parser = EnhancedSectionAwarePharmaParser(model_name=model_name, max_retries=max_retries)

    print("Enhanced Pharmaceutical Document Parser")
    print("=" * 70)

    if not parser.process_document(pdf_path):
        # Previously this case ended silently, which made a failed run look
        # like a successful one with no output.
        print(f"Could not process document: {pdf_path}")
        return

    parser.save_results()

    # Show entity statistics
    stats = parser.get_entity_statistics()
    print("\nENTITY EXTRACTION STATISTICS:")
    print(f"Total entities: {stats.get('total_entities', 0)}")

    print("\nBy type:")
    for entity_type, count in sorted(stats.get('by_type', {}).items()):
        print(f"  {entity_type}: {count}")

    print("\nBy confidence:")
    for confidence, count in sorted(stats.get('by_confidence', {}).items()):
        print(f"  {confidence}: {count}")

    print("\nTesting enhanced querying...")
    test_questions = [
        "Qual é o nome do medicamento e sua concentração?",
        "Quais são as contraindicações principais?",
        "Como deve ser administrado e qual a posologia?",
        "Quais são os efeitos adversos e sua frequência?",
        "Quem é o fabricante e como armazenar?",
        "Qual é o mecanismo de ação do medicamento?",
        "Há interações medicamentosas importantes?"
    ]

    for question in test_questions:
        print(f"\n{question}")
        answer = parser.query_document(question)
        print(f"{answer}")
        print("-" * 50)

# Entry point: run the demo only when this code is executed directly
# (module name "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

Setting up Ollama model: llama3:8b
Ollama CLI found
Pulling model llama3:8b...
Model llama3:8b ready
Enhanced Pharmaceutical Document Parser
Processing document with enhanced extraction: bula_1755192077396.pdf
Extracting text from PDF...
Extracted 11936 characters
Splitting text with section tracking...
Section detected: primary_section - IDENTIFICAÇÃO DO MEDICAMENTO
Section detected: presentations - APRESENTAÇÕES
Section detected: caps_header - USO ORAL
Section detected: caps_header - USO ADULTO E PEDIÁTRICO ACIMA DE 6 ANOS DE IDADE
Section detected: composition - COMPOSIÇÃO
Section detected: primary_section - INFORMAÇÕES AO PACIENTE
Section detected: numbered_section - PARA QUE ESTE MEDICAMENTO É INDICADO?
Section detected: numbered_section - COMO ESTE MEDICAMENTO FUNCIONA?
Section detected: numbered_section - QUANDO NÃO DEVO USAR ESTE MEDICAMENTO?
Section detected: numbered_section - O QUE DEVO SABER ANTES DE USAR ESTE MEDICAMENTO?
Section detected: caps_header - QUE ESTE MEDICAMENT

In [None]:
#!/usr/bin/env python3
"""
Bula Entity-Relation-Value Extractor for Graph Database
Processes JSON output from bula analysis and extracts structured data for Neo4j
"""

import json
import re
from pathlib import Path
from typing import Dict, List, Tuple, Any, Set
from dataclasses import dataclass, asdict
from datetime import datetime
import uuid

@dataclass
class Entity:
    """A node of the extracted graph.

    Attributes:
        id: Identifier of the node.
        type: Node type, e.g. ``'Medication'``.
        properties: Free-form key/value payload attached to the node.
        source_sentence: Sentence the entity was taken from; empty by default.
        source_section: Document section the entity was found in; empty by
            default.
        confidence: Confidence label of the extraction; ``"medium"`` by
            default.
    """
    id: str
    type: str
    properties: Dict[str, Any]
    source_sentence: str = ""
    source_section: str = ""
    confidence: str = "medium"

@dataclass
class Relation:
    """A directed, typed edge between two graph entities.

    Attributes:
        source_entity_id: ID of the entity the edge starts from.
        target_entity_id: ID of the entity the edge points to.
        relation_type: Name of the relationship, e.g. ``'HAS_STRENGTH'``.
        properties: Key/value payload of the edge. ``None`` (the default) is
            replaced by a new empty dict after construction.
        confidence: Confidence label of the edge; ``"medium"`` by default.
    """
    source_entity_id: str
    target_entity_id: str
    relation_type: str
    properties: Dict[str, Any] = None
    confidence: str = "medium"

    def __post_init__(self):
        # ``None`` is only a sentinel: every instance gets its own dict so no
        # mutable default is shared between relations.
        self.properties = {} if self.properties is None else self.properties

class BulaGraphExtractor:
    """Extracts entities, relations, and values from bula JSON for graph database"""

    def __init__(self):
        self.entities: Dict[str, Entity] = {}
        self.relations: List[Relation] = []
        self.medication_id = None

        # Entity type mappings for graph nodes
        self.entity_type_mapping = {
            'medication_name': 'Medication',
            'dosage_strength': 'DosageStrength',
            'posology': 'Posology',
            'indication': 'Indication',
            'contraindication': 'Contraindication',
            'mechanism_of_action': 'MechanismOfAction',
            'adverse_effect': 'AdverseEffect',
            'adverse_effect_frequency': 'AdverseEffectFrequency',
            'drug_interaction': 'DrugInteraction',
            'patient_population': 'PatientPopulation',
            'manufacturer': 'Manufacturer',
            'storage_condition': 'StorageCondition',
            'presentation': 'Presentation'
        }

        # Relation type mappings
        self.relation_mappings = {
            ('Medication', 'DosageStrength'): 'HAS_STRENGTH',
            ('Medication', 'Posology'): 'HAS_POSOLOGY',
            ('Medication', 'Indication'): 'INDICATED_FOR',
            ('Medication', 'Contraindication'): 'CONTRAINDICATED_FOR',
            ('Medication', 'MechanismOfAction'): 'WORKS_BY',
            ('Medication', 'AdverseEffect'): 'MAY_CAUSE',
            ('Medication', 'DrugInteraction'): 'INTERACTS_WITH',
            ('Medication', 'PatientPopulation'): 'SUITABLE_FOR',
            ('Medication', 'Manufacturer'): 'MANUFACTURED_BY',
            ('Medication', 'StorageCondition'): 'REQUIRES_STORAGE',
            ('Medication', 'Presentation'): 'AVAILABLE_AS',
            ('AdverseEffect', 'AdverseEffectFrequency'): 'HAS_FREQUENCY',
            ('Indication', 'PatientPopulation'): 'AFFECTS_POPULATION',
            ('Posology', 'PatientPopulation'): 'FOR_POPULATION'
        }

    def generate_entity_id(self, entity_type: str, value: str) -> str:
        """Generate a unique entity ID"""
        # Create deterministic ID based on type and value
        clean_value = re.sub(r'[^\w\s-]', '', value.lower())
        clean_value = re.sub(r'\s+', '_', clean_value.strip())
        return f"{entity_type.lower()}_{clean_value}_{str(uuid.uuid4())[:8]}"

    def clean_entity_value(self, value: str) -> str:
        """Clean and normalize entity values"""
        if not value or not isinstance(value, str):
            return ""

        # Remove extra whitespace and normalize
        cleaned = re.sub(r'\s+', ' ', value.strip())

        # Remove quotes
        cleaned = cleaned.strip('"\'')

        # Remove trailing periods for non-abbreviations
        if cleaned.endswith('.') and len(cleaned) > 4 and not re.match(r'.*\b[A-Z]{2,4}\.?$', cleaned):
            cleaned = cleaned.rstrip('.')

        return cleaned

    def extract_medication_entity(self, all_entities: List[Dict]) -> str:
        """Extract and create the main medication entity"""
        medication_names = [e for e in all_entities if e.get('type') == 'medication_name']

        if not medication_names:
            # Create a default medication entity
            med_id = self.generate_entity_id('medication', 'unknown_medication')
            self.entities[med_id] = Entity(
                id=med_id,
                type='Medication',
                properties={'name': 'Unknown Medication'},
                source_sentence="Extracted from document",
                source_section="Document"
            )
            return med_id

        # Use the first medication name found
        med_entity = medication_names[0]
        med_name = self.clean_entity_value(med_entity.get('value', 'Unknown'))
        med_id = self.generate_entity_id('medication', med_name)

        self.entities[med_id] = Entity(
            id=med_id,
            type='Medication',
            properties={
                'name': med_name,
                'confidence': med_entity.get('confidence', 'medium')
            },
            source_sentence=med_entity.get('source_sentence', ''),
            source_section=med_entity.get('section', '')
        )

        return med_id

    def extract_entities_from_json(self, json_data: Dict) -> None:
        """Extract entities from processed JSON data"""
        section_entities = json_data.get('section_entities', {})
        all_entities = section_entities.get('all_entities', [])

        if not all_entities:
            print("No entities found in JSON data")
            return

        # First, create the main medication entity
        self.medication_id = self.extract_medication_entity(all_entities)
        print(f"Created medication entity: {self.medication_id}")

        # Track created entities to avoid duplicates
        created_entities = set()

        # Process all other entities
        for entity_data in all_entities:
            entity_type = entity_data.get('type', '').strip()
            entity_value = self.clean_entity_value(entity_data.get('value', ''))

            if not entity_type or not entity_value:
                continue

            # Skip medication names as we already handled them
            if entity_type == 'medication_name':
                continue

            # Map entity type
            graph_entity_type = self.entity_type_mapping.get(entity_type, entity_type.title())

            # Create unique identifier for this entity
            entity_key = f"{graph_entity_type}:{entity_value}"
            if entity_key in created_entities:
                continue

            created_entities.add(entity_key)

            # Generate entity ID
            entity_id = self.generate_entity_id(graph_entity_type, entity_value)

            # Create entity
            entity = Entity(
                id=entity_id,
                type=graph_entity_type,
                properties={
                    'value': entity_value,
                    'confidence': entity_data.get('confidence', 'medium'),
                    'original_type': entity_type
                },
                source_sentence=entity_data.get('source_sentence', ''),
                source_section=entity_data.get('section', ''),
                confidence=entity_data.get('confidence', 'medium')
            )

            self.entities[entity_id] = entity

            # Create relation to medication
            relation_type = self.relation_mappings.get(
                ('Medication', graph_entity_type),
                'RELATED_TO'
            )

            relation = Relation(
                source_entity_id=self.medication_id,
                target_entity_id=entity_id,
                relation_type=relation_type,
                properties={
                    'extracted_from_section': entity_data.get('section', ''),
                    'confidence': entity_data.get('confidence', 'medium')
                }
            )

            self.relations.append(relation)

    def create_inter_entity_relations(self) -> None:
        """Create relations between non-medication entities where logical"""
        entity_list = list(self.entities.values())

        for i, entity1 in enumerate(entity_list):
            for entity2 in entity_list[i+1:]:
                # Skip self-relations and medication relations (already handled)
                if entity1.type == 'Medication' or entity2.type == 'Medication':
                    continue

                # Create specific inter-entity relations
                relation_type = None

                # Adverse effects can have frequencies
                if (entity1.type == 'AdverseEffect' and entity2.type == 'AdverseEffectFrequency'):
                    if self._are_related_by_context(entity1, entity2):
                        relation_type = 'HAS_FREQUENCY'
                elif (entity2.type == 'AdverseEffect' and entity1.type == 'AdverseEffectFrequency'):
                    if self._are_related_by_context(entity1, entity2):
                        relation_type = 'HAS_FREQUENCY'
                        entity1, entity2 = entity2, entity1  # Swap to maintain direction

                # Posology can be specific to patient populations
                elif (entity1.type == 'Posology' and entity2.type == 'PatientPopulation'):
                    if self._are_related_by_context(entity1, entity2):
                        relation_type = 'FOR_POPULATION'
                elif (entity2.type == 'Posology' and entity1.type == 'PatientPopulation'):
                    if self._are_related_by_context(entity1, entity2):
                        relation_type = 'FOR_POPULATION'
                        entity1, entity2 = entity2, entity1

                # Create relation if we found a logical connection
                if relation_type:
                    relation = Relation(
                        source_entity_id=entity1.id,
                        target_entity_id=entity2.id,
                        relation_type=relation_type,
                        properties={
                            'inferred': True,
                            'confidence': 'medium'
                        }
                    )
                    self.relations.append(relation)

    def _are_related_by_context(self, entity1: Entity, entity2: Entity) -> bool:
        """Check if two entities are related by appearing in similar contexts"""
        # Simple heuristic: same section or similar source sentences
        if entity1.source_section == entity2.source_section:
            return True

        # Check if values appear in each other's source sentences
        if (entity1.properties.get('value', '').lower() in entity2.source_sentence.lower() or
            entity2.properties.get('value', '').lower() in entity1.source_sentence.lower()):
            return True

        return False

    def extract_from_json_file(self, json_file_path: str) -> Dict[str, Any]:
        """Main extraction method from JSON file"""
        print(f"Extracting graph data from: {json_file_path}")

        # Load JSON data
        try:
            with open(json_file_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)
        except Exception as e:
            raise Exception(f"Failed to load JSON file: {e}")

        # Extract entities and relations
        self.extract_entities_from_json(json_data)
        self.create_inter_entity_relations()

        # Prepare output data
        graph_data = {
            'metadata': {
                'source_file': json_file_path,
                'extraction_date': datetime.now().isoformat(),
                'total_entities': len(self.entities),
                'total_relations': len(self.relations),
                'medication_id': self.medication_id
            },
            'entities': [asdict(entity) for entity in self.entities.values()],
            'relations': [asdict(relation) for relation in self.relations],
            'statistics': self._generate_statistics()
        }

        return graph_data

    def _generate_statistics(self) -> Dict[str, Any]:
        """Generate statistics about extracted data"""
        entity_type_counts = {}
        relation_type_counts = {}
        confidence_distribution = {}

        # Count entity types
        for entity in self.entities.values():
            entity_type_counts[entity.type] = entity_type_counts.get(entity.type, 0) + 1
            confidence_distribution[entity.confidence] = confidence_distribution.get(entity.confidence, 0) + 1

        # Count relation types
        for relation in self.relations:
            relation_type_counts[relation.relation_type] = relation_type_counts.get(relation.relation_type, 0) + 1

        return {
            'entity_types': entity_type_counts,
            'relation_types': relation_type_counts,
            'confidence_distribution': confidence_distribution
        }

    def save_graph_data(self, graph_data: Dict[str, Any], output_path: str = None) -> str:
        """Save extracted graph data to JSON file"""
        if not output_path:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_path = f"graph_data_{timestamp}.json"

        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(graph_data, f, indent=2, ensure_ascii=False)

            print(f"Graph data saved to: {output_path}")
            return output_path
        except Exception as e:
            raise Exception(f"Failed to save graph data: {e}")

    def generate_neo4j_queries(self, graph_data: Dict[str, Any]) -> List[str]:
        """Generate Neo4j Cypher queries for data insertion"""
        queries = []

        # Create unique constraints
        entity_types = set(entity['type'] for entity in graph_data['entities'])
        for entity_type in entity_types:
            queries.append(f"CREATE CONSTRAINT {entity_type.lower()}_id IF NOT EXISTS FOR (n:{entity_type}) REQUIRE n.id IS UNIQUE;")

        # Create entities
        for entity in graph_data['entities']:
            properties = entity['properties'].copy()
            properties.update({
                'source_sentence': entity.get('source_sentence', ''),
                'source_section': entity.get('source_section', ''),
                'confidence': entity.get('confidence', 'medium'),
                'created_at': 'datetime()'
            })

            # Build property string
            prop_parts = []
            for key, value in properties.items():
                if key == 'created_at':
                    prop_parts.append(f"{key}: {value}")
                else:
                    escaped_value = str(value).replace("'", "\\'")
                    prop_parts.append(f"{key}: '{escaped_value}'")

            prop_string = ', '.join(prop_parts)

            query = f"MERGE (n:{entity['type']} {{id: '{entity['id']}'}}) SET n += {{{prop_string}}};"
            queries.append(query)

        # Create relations
        for relation in graph_data['relations']:
            properties = relation.get('properties', {}).copy()
            properties['created_at'] = 'datetime()'
            properties['confidence'] = relation.get('confidence', 'medium')

            # Build property string for relation
            prop_parts = []
            for key, value in properties.items():
                if key == 'created_at':
                    prop_parts.append(f"{key}: {value}")
                else:
                    escaped_value = str(value).replace("'", "\\'")
                    prop_parts.append(f"{key}: '{escaped_value}'")

            prop_string = ', '.join(prop_parts) if prop_parts else ''
            rel_props = f" {{{prop_string}}}" if prop_string else ""

            query = f"""MATCH (a {{id: '{relation['source_entity_id']}'}}), (b {{id: '{relation['target_entity_id']}'}})
CREATE (a)-[:{relation['relation_type']}{rel_props}]->(b);"""
            queries.append(query)

        return queries

    def print_summary(self, graph_data: Dict[str, Any]) -> None:
        """Print extraction summary"""
        metadata = graph_data['metadata']
        stats = graph_data['statistics']

        print("\n" + "=" * 60)
        print("BULA GRAPH EXTRACTION SUMMARY")
        print("=" * 60)
        print(f"Source file: {metadata['source_file']}")
        print(f"Extraction date: {metadata['extraction_date']}")
        print(f"Total entities: {metadata['total_entities']}")
        print(f"Total relations: {metadata['total_relations']}")
        print(f"Main medication ID: {metadata['medication_id']}")

        print(f"\nEntity Types:")
        for entity_type, count in stats['entity_types'].items():
            print(f"  {entity_type}: {count}")

        print(f"\nRelation Types:")
        for relation_type, count in stats['relation_types'].items():
            print(f"  {relation_type}: {count}")

        print(f"\nConfidence Distribution:")
        for confidence, count in stats['confidence_distribution'].items():
            print(f"  {confidence}: {count}")
        print("=" * 60)

def main(json_file_path: str = None):
    """Run the graph extraction for one analysis JSON file.

    Loads the JSON, extracts the graph data, saves it, writes the matching
    Neo4j Cypher script next to it and prints a summary.

    Args:
        json_file_path: Path of the JSON file to process. When omitted the
            path is taken from the command line (``sys.argv[1]``). Pass it
            explicitly when calling from a notebook, where ``sys.argv`` holds
            the kernel's own arguments rather than a file path.

    Raises:
        SystemExit: With code 1 when no usable path is given, the file does
            not exist, or the extraction fails.
    """
    import sys

    if json_file_path is None:
        if len(sys.argv) != 2:
            print("Usage: python bula_graph_extractor.py <json_file_path>")
            print("Example: python bula_graph_extractor.py bula_enhanced_analysis.json")
            sys.exit(1)

        json_file_path = sys.argv[1]

    if not Path(json_file_path).exists():
        print(f"Error: File '{json_file_path}' not found")
        sys.exit(1)

    try:
        # Initialize extractor
        extractor = BulaGraphExtractor()

        # Extract graph data
        graph_data = extractor.extract_from_json_file(json_file_path)

        # Save graph data
        output_file = extractor.save_graph_data(graph_data)

        # Generate Neo4j queries
        queries = extractor.generate_neo4j_queries(graph_data)
        queries_file = output_file.replace('.json', '_neo4j_queries.cypher')
        with open(queries_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(queries))
        print(f"Neo4j queries saved to: {queries_file}")

        # Print summary
        extractor.print_summary(graph_data)

        print(f"\nFiles created:")
        print(f"  - Graph data: {output_file}")
        print(f"  - Neo4j queries: {queries_file}")

    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

# Entry point: call main() when this code runs as the "__main__" module.
# main() is called without arguments here.
if __name__ == "__main__":
    main()

Usage: python bula_graph_extractor.py <json_file_path>
Example: python bula_graph_extractor.py bula_enhanced_analysis.json


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
