In [None]:
!pip install pymupdf4llm pdfplumber pandas requests
!curl -fsSL https://ollama.com/install.sh | sh
import subprocess
import time

# Start the ollama serve process in the background
# We redirect its output to a log file to keep our notebook clean
server_process = subprocess.Popen(
    ["ollama", "serve"],
    stdout=open("ollama_server.log", "w"),
    stderr=subprocess.STDOUT
)

print("✅ Ollama server started in the background.")
print("You can now run other cells!")

# Give the server a few seconds to start up before you run other commands
time.sleep(5)
! ollama pull llama3.2:3b

>>> Cleaning up old version at /usr/local/lib/ollama
>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
✅ Ollama server started in the background.
You can now run other cells!
[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l


trystart

In [None]:
#!/usr/bin/env python3
"""
Enhanced Pharmaceutical Knowledge Graph Extractor for Google Colab - Phrase-Based JSON Processing

Processes phrase-optimized JSON files from the enhanced pharmaceutical document parser.
Optimized for small language models with robust extraction and error handling.
Includes detailed logging of prompts and responses for debugging model performance.
"""

import json
import os
import re
import requests
import time
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
import logging
from google.colab import drive

# ==============================================================================
# CORE LOGIC: Enhanced PharmaceuticalKnowledgeExtractor CLASS
# ==============================================================================

class EnhancedPharmaceuticalKnowledgeExtractor:
    def __init__(self,
                 model_name: str = "llama3.2:3b",
                 ollama_url: str = "http://localhost:11434/api/generate",
                 max_retries: int = 3,
                 request_delay: float = 0.5):
        """
        Initialize the enhanced knowledge extractor optimized for phrase-based JSON files.
        """
        self.model_name = model_name
        self.ollama_url = ollama_url
        self.max_retries = max_retries
        self.request_delay = request_delay

        self.stats = {
            'files_processed': 0,
            'phrase_blocks_processed': 0,
            'table_blocks_processed': 0,
            'phrases_processed': 0,
            'successful_extractions': 0,
            'failed_extractions': 0,
            'total_triples': 0,
            'skipped_irrelevant': 0
        }

        # Enhanced patterns for better pharmaceutical content detection
        self.pharma_keywords = [
            'mg', 'ml', 'g/', 'mcg', 'μg', '%', 'dose', 'dosagem', 'posologia',
            'comprimido', 'cápsula', 'medicamento', 'fármaco', 'droga',
            'indicação', 'indicado', 'tratamento', 'terapia',
            'contraindicação', 'contraindicado', 'não usar', 'evitar',
            'efeito', 'reação', 'adverso', 'colateral', 'indesejável',
            'alergia', 'hipersensibilidade', 'intolerância',
            'administração', 'aplicar', 'tomar', 'ingerir',
            'composição', 'princípio ativo', 'substância', 'excipiente',
            'interação', 'interagir', 'incompatível', 'interferir',
            'gravidez', 'gestação', 'lactação', 'amamentação',
            'criança', 'pediátrico', 'adulto', 'idoso', 'geriátrico'
        ]

        self._setup_logging()
        self._test_ollama_connection()

    def _setup_logging(self):
        """Setup enhanced logging configuration."""
        # Remove existing handlers to avoid duplicates in Colab
        for handler in logging.root.handlers[:]:
            logging.root.removeHandler(handler)

        # General logger for progress and errors
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('enhanced_pharma_extraction.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

        # Dedicated logger for prompts and responses
        self.prompt_logger = logging.getLogger('prompt_logger')
        self.prompt_logger.setLevel(logging.INFO)
        prompt_handler = logging.FileHandler('enhanced_prompts_and_responses.log', mode='w')
        prompt_formatter = logging.Formatter('%(message)s')
        prompt_handler.setFormatter(prompt_formatter)

        # Avoid adding handlers if they already exist
        if not self.prompt_logger.handlers:
            self.prompt_logger.addHandler(prompt_handler)

    def _test_ollama_connection(self):
        """Probe the Ollama endpoint with a minimal generation request.

        A non-200 reply is only logged as a warning. A transport-level
        connection failure, or any other exception raised while probing, is
        logged and re-raised as ``ConnectionError``.
        """
        try:
            probe = {
                "model": self.model_name,
                "prompt": "Teste de conexão. Responda apenas 'OK'.",
                "stream": False,
                "format": "json",
                "options": {"temperature": 0.0, "num_predict": 10}
            }
            reply = requests.post(self.ollama_url, json=probe, timeout=15)
            if reply.status_code != 200:
                self.logger.warning(f"⚠️ Ollama connection test failed: {reply.status_code} - {reply.text}")
            else:
                self.logger.info(f"✅ Successfully connected to Ollama with {self.model_name}")
                body = reply.json()
                self.logger.debug(f"Test response: {body.get('response', 'No response')}")
        except requests.exceptions.ConnectionError:
            self.logger.error("❌ Cannot connect to Ollama API. Ensure it's running and accessible from this Colab notebook.")
            raise ConnectionError("Cannot connect to Ollama API. Ensure it's running and accessible (e.g., via ngrok).")
        except Exception as e:
            self.logger.error(f"❌ Failed to connect to Ollama: {e}")
            raise ConnectionError(f"Ollama connection failed: {e}")

    def _call_ollama_api(self, prompt: str, max_tokens: int = 200) -> Optional[str]:
        """POST one prompt to Ollama, retrying with exponential back-off.

        Args:
            prompt: Full prompt text sent to the model.
            max_tokens: Value passed as the ``num_predict`` option.

        Returns:
            The stripped ``response`` field of the first HTTP 200 reply that
            carries non-empty text, or ``None`` once every attempt has failed.
        """
        generation_options = {
            "temperature": 0.0,  # deterministic decoding for structured output
            "top_p": 0.9,
            "top_k": 20,
            "num_predict": max_tokens,
            "stop": ["\n\n", "---", "Exemplos:", "Examples:", "Nota:", "Note:"],
            "repeat_penalty": 1.1,
            "num_ctx": 2048,  # context window size
        }
        request_body = {
            "model": self.model_name,
            "prompt": prompt,
            "stream": False,
            "format": "json",
            "options": generation_options
        }

        final_index = self.max_retries - 1
        for attempt_index in range(self.max_retries):
            attempt_number = attempt_index + 1
            try:
                self.logger.debug(f"API call attempt {attempt_number}/{self.max_retries}")
                reply = requests.post(self.ollama_url, json=request_body, timeout=120)

                if reply.status_code != 200:
                    self.logger.warning(f"API error {reply.status_code}: {reply.text[:200]}...")
                else:
                    generated = reply.json().get('response', '').strip()
                    if generated:
                        return generated
                    self.logger.warning("Empty response from API")

            except requests.exceptions.Timeout:
                self.logger.warning(f"Request timeout on attempt {attempt_number}")
            except requests.exceptions.ConnectionError:
                self.logger.warning(f"Connection error on attempt {attempt_number}")
            except Exception as e:
                self.logger.warning(f"Request error on attempt {attempt_number}: {e}")

            # Back off before the next attempt; no wait after the last one.
            if attempt_index < final_index:
                backoff = (2 ** attempt_index) + self.request_delay
                self.logger.debug(f"Waiting {backoff:.1f}s before retry...")
                time.sleep(backoff)

        self.logger.error("All API call attempts failed")
        return None

    def _create_enhanced_extraction_prompt(self, phrase: str, context: Dict, phrase_type: str = None) -> str:
        """Create an enhanced, more specific prompt for small language models."""
        section_info = context.get('breadcrumb', 'Seção Desconhecida')
        phrase_category = phrase_type or context.get('metadata', {}).get('phrase_type', 'geral')

        # Create more specific instructions based on phrase type
        specific_instructions = {
            'dosage_instruction': 'Foque em doses, quantidades, frequências de administração.',
            'indication': 'Extraia para que condições ou doenças o medicamento é indicado.',
            'contraindication': 'Identifique quando o medicamento NÃO deve ser usado.',
            'side_effect': 'Extraia efeitos adversos, reações indesejáveis.',
            'precaution': 'Identifique cuidados, precauções, advertências.',
            'numerical_data': 'Extraia dados numéricos relevantes (doses, concentrações).',
            'general_information': 'Extraia qualquer informação farmacêutica relevante.'
        }

        instruction = specific_instructions.get(phrase_category, specific_instructions['general_information'])

        return f"""Você é um especialista em extrair informações farmacêuticas. Analise esta frase e extraia APENAS fatos reais como triplas JSON.

CONTEXTO: {section_info}
TIPO: {phrase_category}
INSTRUÇÃO: {instruction}

FRASE: "{phrase}"

REGRAS IMPORTANTES:
1. Extraia SOMENTE informações que estão EXPLÍCITAS na frase
2. NÃO invente ou suponha informações
3. Use nomes de medicamentos exatos quando mencionados
4. Para doses, inclua unidades (mg, ml, etc.)
5. Se não há informação farmacêutica específica, retorne []

FORMATO: Array JSON de triplas [entidade, relação, valor]

EXEMPLOS DE FORMATO (NÃO COPIE O CONTEÚDO):
- [["Paracetamol", "tem_dose", "500mg"]]
- [["medicamento", "é_indicado_para", "dor de cabeça"]]
- [["substância", "pode_causar", "náusea"]]

JSON:"""

    def _parse_triples_response_enhanced(self, response: str) -> List[List[str]]:
        """Enhanced parsing with better error handling and validation."""
        if not response:
            return []

        # Clean the response
        cleaned = re.sub(r'```json\s*|```\s*', '', response.strip())
        cleaned = re.sub(r'^[^[]*', '', cleaned)  # Remove text before first [
        cleaned = re.sub(r'[^]]*$', ']', cleaned)  # Ensure ends with ]

        # Try multiple parsing strategies
        strategies = [
            self._parse_json_array,
            self._parse_regex_triples,
            self._parse_fallback_patterns
        ]

        for strategy in strategies:
            try:
                triples = strategy(cleaned)
                if triples:
                    return self._validate_and_filter_triples(triples)
            except Exception as e:
                self.logger.debug(f"Parsing strategy failed: {e}")
                continue

        self.logger.warning(f"Could not parse response: {cleaned[:100]}...")
        return []

    def _parse_json_array(self, text: str) -> List[List[str]]:
        """Parse JSON array directly."""
        # Find the JSON array pattern
        array_match = re.search(r'\[.*?\]', text, re.DOTALL)
        if array_match:
            json_str = array_match.group(0)
            parsed = json.loads(json_str)
            if isinstance(parsed, list):
                return parsed
        return []

    def _parse_regex_triples(self, text: str) -> List[List[str]]:
        """Parse using regex patterns for triple extraction."""
        patterns = [
            r'\["([^"]+)",\s*"([^"]+)",\s*"([^"]+)"\]',  # Standard format
            r'\[\"([^\"]+)\",\s*\"([^\"]+)\",\s*\"([^\"]+)\"\]',  # Escaped quotes
            r'<([^>]+)>\s*,\s*<([^>]+)>\s*,\s*<([^>]+)>'  # Angle bracket format
        ]

        for pattern in patterns:
            matches = re.findall(pattern, text)
            if matches:
                return [[str(x).strip() for x in match] for match in matches]
        return []

    def _parse_fallback_patterns(self, text: str) -> List[List[str]]:
        """Fallback parsing for malformed but recognizable patterns."""
        # Look for entity-relation-value patterns
        lines = text.split('\n')
        triples = []

        for line in lines:
            # Pattern: "entity" relation "value"
            pattern = r'"([^"]+)"\s+(\w+)\s+"([^"]+)"'
            match = re.search(pattern, line)
            if match:
                triples.append([match.group(1), match.group(2), match.group(3)])

        return triples

    def _validate_and_filter_triples(self, triples: List[List[str]]) -> List[List[str]]:
        """Validate and filter extracted triples for quality."""
        valid_triples = []

        # Filter out common template examples and invalid entries
        template_entities = ['medication', 'medicamento', 'paracetamol', 'substância', 'fármaco']
        template_values = ['500mg', 'comprimidos', 'dor de cabeça', 'náusea', 'exemplo']

        for triple in triples:
            if not isinstance(triple, list) or len(triple) != 3:
                continue

            entity, relation, value = [str(x).strip() for x in triple]

            # Skip if any component is empty or too short
            if not all([entity, relation, value]) or any(len(x) < 2 for x in [entity, relation, value]):
                continue

            # Skip template examples
            if (entity.lower() in template_entities and
                any(tv in value.lower() for tv in template_values)):
                continue

            # Skip placeholder patterns
            if any(x.startswith('<') and x.endswith('>') for x in [entity, relation, value]):
                continue

            # Skip overly generic relations
            generic_relations = ['é', 'tem', 'faz', 'usa']
            if relation.lower() in generic_relations and len(value) < 5:
                continue

            valid_triples.append([entity, relation, value])

        return valid_triples

    def _should_process_phrase(self, phrase: str, metadata: Dict = None) -> bool:
        """Enhanced logic to determine if a phrase should be processed."""
        if len(phrase.strip()) < 15:
            return False

        phrase_lower = phrase.lower()

        # Check for pharmaceutical keywords
        has_pharma_content = any(kw in phrase_lower for kw in self.pharma_keywords)

        # Check phrase type from metadata
        if metadata:
            phrase_type = metadata.get('phrase_type', '')
            if phrase_type in ['dosage_instruction', 'indication', 'contraindication', 'side_effect']:
                return True

        # Additional checks for numerical data that might be relevant
        has_numbers = bool(re.search(r'\d', phrase))
        has_units = bool(re.search(r'\d+\s*(mg|ml|g|%|mcg|μg)', phrase_lower))

        return has_pharma_content or has_units or (has_numbers and len(phrase) > 30)

    def _extract_phrase_knowledge(self, phrase_data: Dict) -> Dict:
        """Extract knowledge from a single phrase block with enhanced processing."""
        phrase_id = phrase_data.get('phrase_id', 'unknown')
        phrase_content = phrase_data.get('content', '').strip()
        context = phrase_data.get('context', {})
        metadata = phrase_data.get('metadata', {})

        if not self._should_process_phrase(phrase_content, metadata):
            self.stats['skipped_irrelevant'] += 1
            return {
                'phrase_id': phrase_id,
                'triples': [],
                'status': 'skipped_irrelevant'
            }

        self.logger.debug(f"Processing phrase {phrase_id}: {phrase_content[:50]}...")
        self.stats['phrases_processed'] += 1

        try:
            phrase_type = metadata.get('phrase_type')
            prompt = self._create_enhanced_extraction_prompt(phrase_content, context, phrase_type)

            # Log the interaction
            self.prompt_logger.info(f"--- START PHRASE: {phrase_id} ---")
            self.prompt_logger.info(f"PHRASE TEXT: {phrase_content}")
            self.prompt_logger.info(f"PHRASE TYPE: {phrase_type}")
            self.prompt_logger.info(f"CONTEXT: {context.get('breadcrumb', 'N/A')}")
            self.prompt_logger.info(f"PROMPT SENT:\n{prompt}")

            response = self._call_ollama_api(prompt, max_tokens=300)

            self.prompt_logger.info(f"RAW RESPONSE RECEIVED:\n{response}")
            self.prompt_logger.info(f"--- END PHRASE: {phrase_id} ---\n")

            if response:
                triples = self._parse_triples_response_enhanced(response)
                if triples:
                    self.stats['successful_extractions'] += 1
                    self.stats['total_triples'] += len(triples)
                    self.logger.debug(f"✅ Extracted {len(triples)} triples from phrase {phrase_id}")
                else:
                    self.stats['failed_extractions'] += 1

                return {
                    'phrase_id': phrase_id,
                    'phrase_text': phrase_content,
                    'phrase_type': phrase_type,
                    'context': context.get('breadcrumb'),
                    'triples': triples,
                    'status': 'success' if triples else 'no_triples_found'
                }
            else:
                self.stats['failed_extractions'] += 1
                return {
                    'phrase_id': phrase_id,
                    'phrase_text': phrase_content,
                    'triples': [],
                    'status': 'api_failed'
                }

        except Exception as e:
            self.logger.error(f"Error processing phrase {phrase_id}: {e}")
            self.stats['failed_extractions'] += 1
            return {
                'phrase_id': phrase_id,
                'phrase_text': phrase_content,
                'triples': [],
                'status': f'error: {str(e)}'
            }
        finally:
            time.sleep(self.request_delay)

    def _extract_table_knowledge(self, table_data: Dict) -> Dict:
        """Extract knowledge from table blocks with structured data handling."""
        table_id = table_data.get('table_id', 'unknown')
        content = table_data.get('content', {})
        context = table_data.get('context', {})
        metadata = table_data.get('metadata', {})

        self.logger.info(f"Processing table {table_id}")
        self.stats['table_blocks_processed'] += 1

        # Convert table to text for processing
        formatted_text = content.get('formatted_text', '')
        header = content.get('header', [])
        data_rows = content.get('data_rows', [])

        if not formatted_text and not data_rows:
            return {
                'table_id': table_id,
                'triples': [],
                'status': 'empty_table'
            }

        # Process table as structured text
        table_text = formatted_text or self._format_table_as_text(header, data_rows)

        # Use table-specific processing
        try:
            prompt = self._create_table_extraction_prompt(table_text, context, metadata)

            self.prompt_logger.info(f"--- START TABLE: {table_id} ---")
            self.prompt_logger.info(f"TABLE CONTENT:\n{table_text}")
            self.prompt_logger.info(f"PROMPT SENT:\n{prompt}")

            response = self._call_ollama_api(prompt, max_tokens=400)

            self.prompt_logger.info(f"RAW RESPONSE RECEIVED:\n{response}")
            self.prompt_logger.info(f"--- END TABLE: {table_id} ---\n")

            if response:
                triples = self._parse_triples_response_enhanced(response)
                if triples:
                    self.stats['successful_extractions'] += 1
                    self.stats['total_triples'] += len(triples)

                return {
                    'table_id': table_id,
                    'table_type': metadata.get('table_type'),
                    'context': context.get('breadcrumb'),
                    'triples': triples,
                    'status': 'success' if triples else 'no_triples_found'
                }

        except Exception as e:
            self.logger.error(f"Error processing table {table_id}: {e}")
            self.stats['failed_extractions'] += 1

        return {
            'table_id': table_id,
            'triples': [],
            'status': 'error'
        }

    def _create_table_extraction_prompt(self, table_text: str, context: Dict, metadata: Dict) -> str:
        """Create specialized prompt for table data extraction."""
        table_type = metadata.get('table_type', 'general_data')
        section_info = context.get('breadcrumb', 'Tabela')

        type_instructions = {
            'dosage_schedule': 'Extraia informações de dosagem, horários, frequências.',
            'dosage_information': 'Foque em doses, concentrações, quantidades.',
            'age_specific_data': 'Extraia dados específicos por idade ou grupo.',
            'general_data': 'Extraia qualquer informação farmacêutica estruturada.'
        }

        instruction = type_instructions.get(table_type, type_instructions['general_data'])

        return f"""Analise esta tabela farmacêutica e extraia informações estruturadas como triplas JSON.

CONTEXTO: {section_info}
TIPO DE TABELA: {table_type}
INSTRUÇÃO: {instruction}

TABELA:
{table_text}

REGRAS:
1. Extraia APENAS dados que estão na tabela
2. Para doses, mantenha unidades (mg, ml, etc.)
3. Preserve nomes de medicamentos exatos
4. Se há múltiplas linhas, extraia informação de cada linha relevante
5. Use "linha_N" ou "item_N" para distinguir entradas quando necessário

FORMATO: Array JSON de triplas [entidade, relação, valor]

JSON:"""

    def _format_table_as_text(self, header: List[str], data_rows: List[List[str]]) -> str:
        """Format table data as readable text."""
        if not data_rows:
            return ""

        lines = []
        if header:
            lines.append(" | ".join(header))
            lines.append("-" * (len(" | ".join(header))))

        for row in data_rows:
            lines.append(" | ".join(str(cell) if cell else "" for cell in row))

        return "\n".join(lines)

    def process_phrase_based_json(self, input_file: Path) -> Optional[Dict]:
        """Process a phrase-based JSON file from the enhanced parser."""
        self.logger.info(f"📄 Processing phrase-based file: {input_file.name}")

        try:
            with open(input_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception as e:
            self.logger.error(f"Failed to load {input_file}: {e}")
            return None

        # Get the document structure
        doc_structure = data.get('document_structure', {})
        phrase_blocks = doc_structure.get('phrase_blocks', [])
        table_blocks = doc_structure.get('table_blocks', [])

        if not phrase_blocks and not table_blocks:
            self.logger.warning(f"No phrase_blocks or table_blocks found in {input_file}")
            return None

        self.logger.info(f"Found {len(phrase_blocks)} phrase blocks and {len(table_blocks)} table blocks")

        # Process phrase blocks
        phrase_extractions = []
        for phrase_data in phrase_blocks:
            result = self._extract_phrase_knowledge(phrase_data)
            phrase_extractions.append(result)
            self.stats['phrase_blocks_processed'] += 1

        # Process table blocks
        table_extractions = []
        for table_data in table_blocks:
            result = self._extract_table_knowledge(table_data)
            table_extractions.append(result)

        # Collect all triples
        all_triples = []
        for extraction in phrase_extractions + table_extractions:
            all_triples.extend(extraction.get('triples', []))

        result = {
            'document_metadata': data.get('document_metadata', {}),
            'extraction_summary': {
                'extraction_timestamp': datetime.now().isoformat(),
                'model_used': self.model_name,
                'processing_method': 'enhanced_phrase_based',
                'total_phrase_blocks': len(phrase_blocks),
                'total_table_blocks': len(table_blocks),
                'total_phrases_processed': self.stats['phrases_processed'],
                'total_triples_extracted': len(all_triples),
                'successful_extractions': self.stats['successful_extractions'],
                'failed_extractions': self.stats['failed_extractions'],
                'skipped_irrelevant': self.stats['skipped_irrelevant']
            },
            'phrase_extractions': phrase_extractions,
            'table_extractions': table_extractions,
            'all_extracted_triples': all_triples,
            'metadata': data.get('metadata', {})
        }

        self.stats['files_processed'] += 1
        self.logger.info(f"✅ Completed {input_file.name}: {len(all_triples)} total triples extracted")
        return result

    def process_directory(self, input_dir: Path, output_dir: Path):
        """Process all phrase-optimized JSON files in a directory."""
        if not input_dir.is_dir():
            raise FileNotFoundError(f"Input directory not found: {input_dir}")
        output_dir.mkdir(parents=True, exist_ok=True)

        # Look for phrase-optimized JSON files
        json_files = list(input_dir.glob('*_phrase_optimized.json'))
        if not json_files:
            self.logger.warning(f"No *_phrase_optimized.json files found in {input_dir}")
            return

        self.logger.info(f"Found {len(json_files)} phrase-optimized files to process")

        for i, json_file in enumerate(json_files):
            self.logger.info(f"\n📊 Progress: Processing file {i + 1}/{len(json_files)}")

            # Reset per-file counters
            prev_phrases = self.stats['phrases_processed']
            prev_successful = self.stats['successful_extractions']

            result = self.process_phrase_based_json(json_file)

            if result:
                output_name = json_file.stem.replace('_phrase_optimized', '_enhanced_graph_data') + '.json'
                output_file = output_dir / output_name

                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(result, f, indent=2, ensure_ascii=False)

                # Log file-specific stats
                phrases_this_file = self.stats['phrases_processed'] - prev_phrases
                successful_this_file = self.stats['successful_extractions'] - prev_successful

                self.logger.info(f"💾 Saved results to: {output_file}")
                self.logger.info(f"📊 File stats: {phrases_this_file} phrases processed, {successful_this_file} successful extractions")

        self._generate_enhanced_report(output_dir)

    def _generate_enhanced_report(self, output_dir: Path):
        """Generate comprehensive final report."""
        report = {
            'summary': {
                'timestamp': datetime.now().isoformat(),
                'model_used': self.model_name,
                'processing_method': 'enhanced_phrase_based',
                'total_files_processed': self.stats['files_processed'],
            },
            'detailed_statistics': self.stats,
            'performance_metrics': {
                'success_rate': (
                    self.stats['successful_extractions'] /
                    max(self.stats['phrases_processed'], 1) * 100
                ),
                'avg_triples_per_successful_extraction': (
                    self.stats['total_triples'] /
                    max(self.stats['successful_extractions'], 1)
                ),
                'processing_efficiency': {
                    'phrases_processed': self.stats['phrases_processed'],
                    'relevant_phrases': self.stats['phrases_processed'] - self.stats['skipped_irrelevant'],
                    'relevance_rate': (
                        (self.stats['phrases_processed'] - self.stats['skipped_irrelevant']) /
                        max(self.stats['phrases_processed'], 1) * 100
                    )
                }
            }
        }

        report_file = output_dir / 'enhanced_final_extraction_report.json'
        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)

        self.logger.info(f"📊 Enhanced final report saved: {report_file}")
        self._print_summary_stats()

    def _print_summary_stats(self):
        """Print summary statistics to console."""
        print("\n" + "=" * 70)
        print("🎯 EXTRACTION SUMMARY")
        print("=" * 70)
        print(f"📁 Files processed: {self.stats['files_processed']}")
        print(f"🧩 Phrase blocks processed: {self.stats['phrase_blocks_processed']}")
        print(f"📋 Table blocks processed: {self.stats['table_blocks_processed']}")
        print(f"✨ Total phrases analyzed: {self.stats['phrases_processed']}")
        print(f"✅ Successful extractions: {self.stats['successful_extractions']}")
        print(f"❌ Failed extractions: {self.stats['failed_extractions']}")
        print(f"⏭️ Skipped irrelevant: {self.stats['skipped_irrelevant']}")
        print(f"🔗 Total triples extracted: {self.stats['total_triples']}")

        if self.stats['phrases_processed'] > 0:
            success_rate = (self.stats['successful_extractions'] / self.stats['phrases_processed']) * 100
            print(f"📈 Success rate: {success_rate:.1f}%")

        print("=" * 70)


# ==============================================================================
# 🚀 MAIN EXECUTION SECTION
# ==============================================================================

# --- CONFIGURATION ---
# IMPORTANT: Change these paths to match your Google Drive folders.
# Both directory names are relative: main() joins them onto
# /content/drive/MyDrive/ after mounting Google Drive.
DRIVE_INPUT_DIR = "processed_pdfs"  # Folder with *_phrase_optimized.json files
DRIVE_OUTPUT_DIR = "enhanced_graph_data"  # Where enhanced results will be saved
OLLAMA_MODEL = "llama3.2:3b"        # The Ollama model you are running (the setup cell pulls this same tag)
REQUEST_DELAY = 0.5                 # Seconds to wait between API calls
MAX_RETRIES = 3                     # Number of times to retry a failed API call

def main():
    """Run the phrase-based extraction pipeline end to end inside Colab.

    Mounts Google Drive, resolves the configured input/output folders under
    ``MyDrive``, checks that ``*_phrase_optimized.json`` files are present and
    then hands both directories to ``EnhancedPharmaceuticalKnowledgeExtractor``.
    Failures are reported on stdout rather than propagated, so the cell
    finishes without an exception escaping.
    """
    banner = "=" * 70
    print("🚀 Starting Enhanced Pharmaceutical Knowledge Graph Extractor")
    print("🧩 Optimized for phrase-based JSON processing")
    print(banner)

    try:
        # Nothing under /content/drive resolves until the mount has completed.
        print("🔧 Mounting Google Drive...")
        drive.mount('/content/drive', force_remount=True)
        print("✅ Google Drive mounted successfully.")

        my_drive = Path('/content/drive/MyDrive/')
        full_input_path = my_drive / DRIVE_INPUT_DIR
        full_output_path = my_drive / DRIVE_OUTPUT_DIR

        # Echo the effective configuration so a wrong path is obvious up front.
        print(f"📁 Input Directory: {full_input_path}")
        print(f"📂 Output Directory: {full_output_path}")
        print(f"🤖 Model: {OLLAMA_MODEL}")
        print("🧩 Processing Method: Enhanced phrase-by-phrase")
        print(f"⏱️  Request Delay: {REQUEST_DELAY}s")
        print(banner)

        # Guard 1: the input folder itself must exist.
        if not full_input_path.exists():
            print(f"❌ ERROR: Input directory not found at '{full_input_path}'")
            for hint in (
                "Please ensure you have run the phrase-based document parser first.",
                "The input directory should contain *_phrase_optimized.json files.",
            ):
                print(hint)
            return

        # Guard 2: it must hold at least one phrase-optimized file.
        phrase_files = list(full_input_path.glob('*_phrase_optimized.json'))
        if not phrase_files:
            print(f"⚠️  WARNING: No *_phrase_optimized.json files found in {full_input_path}")
            print("Please ensure you have run the enhanced document parser to generate phrase-based JSON files.")

            # List a few of the JSON files that are there to expose naming mismatches.
            other_json = list(full_input_path.glob('*.json'))
            if other_json:
                print(f"Found {len(other_json)} JSON files:")
                for candidate in other_json[:5]:
                    print(f"  - {candidate.name}")
                if len(other_json) > 5:
                    print(f"  ... and {len(other_json) - 5} more")
            return

        print(f"🔍 Found {len(phrase_files)} phrase-optimized files to process:")
        for candidate in phrase_files:
            print(f"  - {candidate.name}")
        print()

        # Constructing the extractor also verifies that Ollama is reachable.
        print("🤖 Initializing enhanced knowledge extractor...")
        extractor = EnhancedPharmaceuticalKnowledgeExtractor(
            model_name=OLLAMA_MODEL,
            max_retries=MAX_RETRIES,
            request_delay=REQUEST_DELAY,
        )

        print("🚀 Starting batch processing...")
        extractor.process_directory(input_dir=full_input_path, output_dir=full_output_path)

        print("\n🎉 All files processed successfully!")
        print(f"📂 Results saved to: {full_output_path}")
        print("📊 Check the enhanced_final_extraction_report.json for detailed statistics.")

    except FileNotFoundError as e:
        print("❌ ERROR: Directory or file not found.")
        print(f"Details: {e}")
        for hint in (
            "\nTroubleshooting:",
            "1. Ensure Google Drive is properly mounted",
            "2. Check that the input directory path is correct",
            "3. Verify that phrase-optimized JSON files exist",
        ):
            print(hint)

    except ConnectionError as e:
        for hint in (
            "❌ ERROR: Could not connect to the Ollama server.",
            "Please ensure your local Ollama instance is running and accessible.",
            "If using Google Colab, you may need to use ngrok to tunnel the connection.",
        ):
            print(hint)
        print(f"Details: {e}")

    except KeyboardInterrupt:
        print("\n⚠️  Processing interrupted by user.")
        print("Partial results may have been saved.")

    except Exception as e:
        # Last-resort handler: show the full traceback so the failure is debuggable.
        print(f"❌ An unexpected error occurred: {e}")
        import traceback
        print("Full error traceback:")
        traceback.print_exc()

if __name__ == "__main__":
    main()

In [None]:
#!/usr/bin/env python3
"""
Pharmaceutical Knowledge Graph Extractor for Google Colab

Processes JSON files from a specified Google Drive directory to extract knowledge triples.
Optimized for Llama 3.2 3B model limitations with smart batching and focused prompts.
"""

import json
import os
import re
import requests
import time
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
import logging
from google.colab import drive

# ==============================================================================
# CORE LOGIC: PharmaceuticalKnowledgeExtractor CLASS
# ==============================================================================

class PharmaceuticalKnowledgeExtractor:
    def __init__(self,
                 model_name: str = "llama3.2:3b",
                 ollama_url: str = "http://localhost:11434/api/generate",
                 max_retries: int = 3,
                 request_delay: float = 0.5):
        """
        Store the settings, zero the counters, then set up logging and probe Ollama.

        Args:
            model_name: Ollama model tag sent with every request.
            ollama_url: Full URL of the Ollama ``/api/generate`` endpoint.
            max_retries: Attempts made per API call before giving up.
            request_delay: Seconds slept after each processed block; also added
                to the back-off between retries.

        Raises:
            ConnectionError: If the connectivity probe cannot reach the endpoint.
        """
        self.model_name = model_name
        self.ollama_url = ollama_url
        self.max_retries = max_retries
        self.request_delay = request_delay

        # Run-wide counters; all start at zero.
        counter_names = (
            'files_processed',
            'blocks_processed',
            'successful_extractions',
            'failed_extractions',
            'total_triples',
        )
        self.stats = dict.fromkeys(counter_names, 0)

        # The logger must exist before the probe, which logs its outcome.
        self._setup_logging()
        self._test_ollama_connection()

    def _setup_logging(self):
        """Setup logging configuration."""
        # Remove any existing handlers to avoid duplicate logs in Colab
        for handler in logging.root.handlers[:]:
            logging.root.removeHandler(handler)

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('pharma_extraction.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

    def _test_ollama_connection(self):
        """Send one small generate request to confirm the endpoint answers.

        A non-200 reply is only logged as a warning. Any exception raised while
        probing is logged and converted into ``ConnectionError``.

        Raises:
            ConnectionError: If the probe raises for any reason.
        """
        try:
            probe = {
                "model": self.model_name,
                "prompt": "Test connection",
                "stream": False,
                "format": "json",
            }
            reply = requests.post(self.ollama_url, json=probe, timeout=15)
            if reply.status_code != 200:
                self.logger.warning(f"⚠️ Ollama connection test failed: {reply.status_code}")
            else:
                self.logger.info(f"✅ Successfully connected to Ollama with {self.model_name}")
        except Exception as e:
            self.logger.error(f"❌ Failed to connect to Ollama: {e}")
            raise ConnectionError("Cannot connect to Ollama API. Ensure it's running and accessible from this Colab notebook (e.g., via ngrok).")

    def _call_ollama_api(self, prompt: str, max_tokens: int = 500) -> Optional[str]:
        """POST ``prompt`` to Ollama, retrying with exponential back-off.

        Args:
            prompt: Text sent as the ``prompt`` field.
            max_tokens: Passed to Ollama as ``num_predict``.

        Returns:
            The stripped ``response`` field of the first HTTP 200 reply, or
            ``None`` once ``self.max_retries`` attempts have all failed.
        """
        generation_options = {
            "temperature": 0.1,
            "top_p": 0.9,
            "num_predict": max_tokens,
            "stop": ["\n\n", "---"],
        }
        request_body = {
            "model": self.model_name,
            "prompt": prompt,
            "stream": False,
            "format": "json",
            "options": generation_options,
        }

        for attempt in range(self.max_retries):
            attempt_no = attempt + 1
            try:
                self.logger.debug(f"API call attempt {attempt_no}/{self.max_retries}")
                reply = requests.post(self.ollama_url, json=request_body, timeout=120)

                if reply.status_code == 200:
                    return reply.json().get('response', '').strip()
                self.logger.warning(f"API error {reply.status_code}: {reply.text}")

            except requests.exceptions.Timeout:
                self.logger.warning(f"Request timeout on attempt {attempt_no}")
            except Exception as e:
                self.logger.warning(f"Request error on attempt {attempt_no}: {e}")

            # Back off 1s, 2s, 4s... (plus the configured delay), but not after
            # the final attempt, where there is nothing left to wait for.
            is_last_attempt = attempt >= self.max_retries - 1
            if not is_last_attempt:
                time.sleep((2 ** attempt) + self.request_delay)

        self.logger.error("All API call attempts failed")
        return None

    def _create_focused_extraction_prompt(self, content: str, context: Dict) -> str:
        """Create a focused prompt based on section keywords."""
        content = content[:800].strip() + "..." if len(content) > 800 else content.strip()
        section = context.get('breadcrumb', 'Unknown Section')

        focus_map = {
            "composition": ['composição', 'composition'],
            "dosage": ['dosagem', 'posologia', 'como usar'],
            "indication": ['indicação', 'indication', 'para que'],
            "contraindication": ['contraindicação', 'contraindication', 'não devo usar'],
            "side_effect": ['efeito', 'reação', 'adverse', 'males'],
            "interaction": ['interação', 'interaction']
        }

        focus_type = "general"
        for f_type, keywords in focus_map.items():
            if any(kw in section.lower() for kw in keywords):
                focus_type = f_type
                break

        return f"""Extract pharmaceutical facts as JSON triples from this text.

Section: {section}
Focus: {focus_type}

Text: "{content}"

Extract ONLY factual triples in format [entity, relation, value]. Return a valid JSON array. No explanations.

Example:
[["Amoxicilina", "has_dosage", "500mg 3x ao dia"], ["Amoxicilina", "treats", "infecções respiratórias"]]

JSON:"""

    def _create_comprehensive_prompt(self, content: str, context: Dict) -> str:
        """Create a comprehensive prompt for important sections."""
        content = content[:600].strip() + "..." if len(content) > 600 else content.strip()
        section = context.get('breadcrumb', 'Unknown Section')

        return f"""Extract all pharmaceutical information from this text as knowledge triples.

Section: {section}
Content: "{content}"

Extract triples for medication names, dosages, conditions (indications/contraindications), side effects, and interactions.
Format: [["entity", "relation", "value"], ...]
Return only a valid JSON array:"""

    def _parse_triples_response(self, response: str) -> List[List[str]]:
        """Parse the API response to robustly extract a list of triples."""
        if not response:
            return []

        cleaned = re.sub(r'```json\s*|```\s*', '', response.strip())

        try:
            start_idx = cleaned.find('[')
            end_idx = cleaned.rfind(']')
            if start_idx == -1 or end_idx == -1: return []

            json_str = cleaned[start_idx:end_idx + 1]
            parsed = json.loads(json_str)

            valid_triples = []
            if isinstance(parsed, list):
                for item in parsed:
                    if isinstance(item, list) and len(item) == 3:
                        entity, relation, value = [str(x).strip() for x in item]
                        if all([entity, relation, value]):
                            valid_triples.append([entity, relation, value])
            return valid_triples

        except json.JSONDecodeError as e:
            self.logger.warning(f"JSON parsing failed: {e}. Falling back to regex.")
            pattern = r'\["([^"]+)",\s*"([^"]+)",\s*"([^"]+)"\]'
            matches = re.findall(pattern, cleaned)
            return [[m[0], m[1], m[2]] for m in matches]

    def _should_process_block(self, block: Dict) -> bool:
        """Determine if a block has relevant content worth processing."""
        content = block.get('content', '').strip()
        if len(content) < 25: return False

        pharma_keywords = [
            'mg', 'ml', 'dose', 'comprimido', 'cápsula', 'medicamento', 'indicação',
            'contraindicação', 'efeito', 'reação', 'alergia', 'administração',
            'posologia', 'composição', 'princípio ativo'
        ]

        return any(kw in content.lower() for kw in pharma_keywords) or len(content) > 150

    def _extract_block_knowledge(self, block: Dict, block_index: int) -> Dict:
        """Extract knowledge from a single block."""
        block_id = block.get('id', f'block_{block_index}')
        if not self._should_process_block(block):
            return {
                'block_id': block_id, 'triples': [], 'status': 'skipped_irrelevant'
            }

        content = block.get('content', '').strip()
        context = block.get('context', {})
        self.logger.info(f"Processing block {block_id}: {content[:60]}...")

        try:
            if len(content) > 500:
                prompt = self._create_comprehensive_prompt(content, context)
            else:
                prompt = self._create_focused_extraction_prompt(content, context)

            response = self._call_ollama_api(prompt)

            if response:
                triples = self._parse_triples_response(response)
                self.stats['successful_extractions'] += 1
                self.stats['total_triples'] += len(triples)
                self.logger.info(f"✅ Extracted {len(triples)} triples from {block_id}")
                return {
                    'block_id': block_id, 'block_type': block.get('type'),
                    'breadcrumb': context.get('breadcrumb'), 'triples': triples, 'status': 'success'
                }
            else:
                self.stats['failed_extractions'] += 1
                return {
                    'block_id': block_id, 'triples': [], 'status': 'api_failed'
                }
        except Exception as e:
            self.logger.error(f"Error in block {block_id}: {e}")
            self.stats['failed_extractions'] += 1
            return {
                'block_id': block_id, 'triples': [], 'status': f'error: {str(e)}'
            }
        finally:
            time.sleep(self.request_delay)

    def process_json_file(self, input_file: Path) -> Optional[Dict]:
        """Process a single JSON file."""
        self.logger.info(f"📄 Processing file: {input_file.name}")
        try:
            with open(input_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception as e:
            self.logger.error(f"Failed to load {input_file}: {e}")
            return None

        blocks = data.get('representations', {}).get('flat_blocks', [])
        if not blocks:
            self.logger.warning(f"No flat_blocks found in {input_file}")
            return None

        extractions = [self._extract_block_knowledge(block, i) for i, block in enumerate(blocks)]

        total_triples = sum(len(e['triples']) for e in extractions)
        result = {
            'document_metadata': data.get('document_metadata', {}),
            'extraction_summary': {
                'extraction_timestamp': datetime.now().isoformat(),
                'total_triples_extracted': total_triples
            },
            'graph_extractions': extractions
        }

        self.stats['files_processed'] += 1
        self.logger.info(f"✅ Completed {input_file.name}: {total_triples} triples extracted.")
        return result

    def process_directory(self, input_dir: Path, output_dir: Path):
        """Process all relevant JSON files in a directory."""
        if not input_dir.is_dir():
            raise FileNotFoundError(f"Input directory not found: {input_dir}")
        output_dir.mkdir(parents=True, exist_ok=True)

        json_files = list(input_dir.glob('*_llm_optimized.json'))
        if not json_files:
            self.logger.warning(f"No *_llm_optimized.json files found in {input_dir}")
            return

        self.logger.info(f"Found {len(json_files)} files to process.")

        for i, json_file in enumerate(json_files):
            self.logger.info(f"\n📊 Progress: Processing file {i + 1}/{len(json_files)}")
            result = self.process_json_file(json_file)

            if result:
                output_name = json_file.stem.replace('_llm_optimized', '_graph_data') + '.json'
                output_file = output_dir / output_name
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(result, f, indent=2, ensure_ascii=False)
                self.logger.info(f"💾 Saved results to: {output_file}")

        self._generate_final_report(output_dir)

    def _generate_final_report(self, output_dir: Path):
        """Generate and save a final summary report."""
        report = {
            'summary': {
                'timestamp': datetime.now().isoformat(),
                'model_used': self.model_name,
                'total_files_processed': self.stats['files_processed'],
            },
            'statistics': self.stats,
        }
        report_file = output_dir / 'final_extraction_report.json'
        with open(report_file, 'w') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)
        self.logger.info(f"📊 Final report saved: {report_file}")


# ==============================================================================
# 🚀 RUN THE EXTRACTION PROCESS
# ==============================================================================

# --- 1. CONFIGURATION ---
# IMPORTANT: Change these paths to match your Google Drive folders.
# Both folder names are joined onto /content/drive/MyDrive/ further down.
DRIVE_INPUT_DIR = "processed_pdfs"  # The folder inside 'My Drive' containing your JSONs
DRIVE_OUTPUT_DIR = "graph_data"     # The folder inside 'My Drive' where results will be saved
OLLAMA_MODEL = "llama3.2:3b"        # The Ollama model you are running
REQUEST_DELAY = 0.5                 # Seconds to wait between API calls
MAX_RETRIES = 3                     # Number of times to retry a failed API call

# --- 2. SETUP AND EXECUTION ---
# Runs at cell execution time: mount Drive, build the extractor (which probes
# Ollama and raises ConnectionError if unreachable), then process the folder.
print("🚀 Starting Pharmaceutical Knowledge Graph Extractor")
print("=" * 60)

try:
    # Mount Google Drive
    print("🔧 Mounting Google Drive...")
    drive.mount('/content/drive', force_remount=True)
    print("✅ Google Drive mounted successfully.")

    # Define full paths
    drive_base_path = Path('/content/drive/MyDrive/')
    full_input_path = drive_base_path / DRIVE_INPUT_DIR
    full_output_path = drive_base_path / DRIVE_OUTPUT_DIR

    print(f"Input path: {full_input_path}")
    print(f"Output path: {full_output_path}")
    print(f"Model: {OLLAMA_MODEL}")
    print("=" * 60)

    # Initialize and run the extractor
    extractor = PharmaceuticalKnowledgeExtractor(
        model_name=OLLAMA_MODEL,
        max_retries=MAX_RETRIES,
        request_delay=REQUEST_DELAY
    )

    # Only files named *_llm_optimized.json in the input folder are picked up.
    extractor.process_directory(
        input_dir=full_input_path,
        output_dir=full_output_path
    )

    print("\n🎉 All files processed successfully!")

# process_directory raises FileNotFoundError when the input folder is missing.
except FileNotFoundError as e:
    print(f"❌ ERROR: A directory was not found. Please check your paths.")
    print(f"Details: {e}")
# Raised by the extractor's constructor when the Ollama probe fails.
except ConnectionError as e:
    print(f"❌ ERROR: Could not connect to the Ollama server.")
    print("Please ensure your local Ollama instance is running and accessible (e.g., via ngrok).")
    print(f"Details: {e}")
# Anything else is reported with its message only (no traceback is printed).
except Exception as e:
    print(f"❌ An unexpected error occurred: {e}")

🚀 Starting Pharmaceutical Knowledge Graph Extractor
🔧 Mounting Google Drive...


2025-09-01 04:10:23,244 - ERROR - ❌ Failed to connect to Ollama: HTTPConnectionPool(host='localhost', port=11434): Max retries exceeded with url: /api/generate (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x79997957f2c0>: Failed to establish a new connection: [Errno 111] Connection refused'))


Mounted at /content/drive
✅ Google Drive mounted successfully.
Input path: /content/drive/MyDrive/processed_pdfs
Output path: /content/drive/MyDrive/graph_data
Model: llama3.2:3b
❌ ERROR: Could not connect to the Ollama server.
Please ensure your local Ollama instance is running and accessible (e.g., via ngrok).
Details: Cannot connect to Ollama API. Ensure it's running and accessible from this Colab notebook (e.g., via ngrok).


start

In [None]:
# Colab cell: send EXACT user prompt to local Ollama model (no extra prompt added)
# Paste into Colab/Jupyter and run.
# Requires ollama CLI available in the environment and model pulled (e.g. !ollama pull llama3.2:3b)
# If ollama is not available in Colab, run this in a local notebook where ollama is installed.

import subprocess, shlex
from IPython.display import display, HTML
import ipywidgets as widgets

# Model tag passed to `ollama run` by call_ollama_raw.
MODEL = "llama3.2:3b"  # change if you want another local model ref
# Conversation log shared by the callbacks below; on_send appends to it and
# on_clear_history empties it.
history = []  # stores tuples (user_prompt, assistant_output) — only used if "Include history" is checked

# Widgets
# Free-text prompt input.
prompt_box = widgets.Textarea(
    placeholder="Paste/type the exact prompt you want the model to see (I will NOT modify it).",
    layout=widgets.Layout(width='100%', height='120px')
)
send_btn = widgets.Button(description="Send → model", button_style="primary")
include_history_chk = widgets.Checkbox(value=False, description="Include history (send previous turns)", indent=False)
# Optional values; on_send turns each non-empty one into a CLI flag of the same name.
temp_input = widgets.Text(value="", placeholder="leave empty for default", description="--temperature")
num_predict_input = widgets.Text(value="", placeholder="leave empty for default", description="--num-predict")
clear_hist_btn = widgets.Button(description="Clear history", button_style="warning")
# All callback output (prompt preview, model reply, errors) is printed here.
output_area = widgets.Output(layout=widgets.Layout(border='1px solid lightgray'))

def call_ollama_raw(prompt, extra_flags=""):
    """Pipe ``prompt`` into ``ollama run MODEL`` and return what it printed.

    Args:
        prompt: Text written to the process's stdin unchanged.
        extra_flags: Optional extra command-line flags as one string; split
            with shell quoting rules and appended to the command.

    Returns:
        Stripped stdout, or stripped stderr when stdout is empty.

    Raises:
        FileNotFoundError: If the ``ollama`` executable cannot be found.
        RuntimeError: If the call takes longer than 120 seconds.
    """
    argv = ["ollama", "run", MODEL]
    if extra_flags:
        # shlex keeps quoted flag values together.
        argv.extend(shlex.split(extra_flags))

    try:
        completed = subprocess.run(
            argv,
            input=prompt,
            text=True,
            capture_output=True,
            timeout=120,
        )
    except FileNotFoundError:
        raise FileNotFoundError("`ollama` CLI not found in this environment.")
    except subprocess.TimeoutExpired:
        raise RuntimeError("ollama call timed out.")

    # Prefer the model's answer; fall back to stderr so failures stay visible.
    return completed.stdout.strip() or completed.stderr.strip()

def on_send(b):
    """Button callback: send the text box contents to the model and show the reply.

    When "Include history" is checked and earlier turns exist, the prompt sent
    is every stored (user, assistant) pair followed by the new prompt, joined
    with newlines; otherwise the text is sent unchanged. The prompt preview,
    the reply and any error are printed into ``output_area``.

    Args:
        b: The clicked button (unused; required by the ``on_click`` signature).
    """
    user_prompt = prompt_box.value
    if not user_prompt.strip():
        with output_area:
            print("Please paste or type the prompt you want to send to the model.")
        return

    # If include-history is checked, build a concatenation of previous turns + current prompt.
    if include_history_chk.value and history:
        # simple concatenation: previous_user \n previous_assistant \n ... then current prompt
        parts = []
        for u, a in history:
            parts.append(u)
            parts.append(a)
        parts.append(user_prompt)
        prompt_to_send = "\n".join(parts)
    else:
        # send exactly user's prompt (no modifications)
        prompt_to_send = user_prompt

    # Build extra flags string if provided
    # NOTE(review): `ollama run` may not accept --temperature / --num-predict
    # (sampling options are normally set through the HTTP API or a Modelfile);
    # confirm against the installed CLI, since a rejected flag ends up shown
    # as the "response".
    extra_flags = ""
    if temp_input.value.strip():
        extra_flags += f" --temperature {temp_input.value.strip()}"
    if num_predict_input.value.strip():
        extra_flags += f" --num-predict {num_predict_input.value.strip()}"

    with output_area:
        print("=== Sending EXACT prompt to model ===")
        print(prompt_to_send if len(prompt_to_send) < 3000 else prompt_to_send[:3000] + "\n... (truncated preview)\n")
        print("=== Model response (raw) ===")
    try:
        response = call_ollama_raw(prompt_to_send, extra_flags=extra_flags)
    except FileNotFoundError:
        with output_area:
            print("\nERROR: `ollama` CLI not found in this environment.")
            print("Run this notebook where ollama is installed (local machine) or provide a remote endpoint.")
        return
    except Exception as e:
        with output_area:
            print("\nERROR calling ollama:", str(e))
        return

    with output_area:
        print(response)
        print("\n--- (end response) ---\n")

    # Store only this turn's own prompt. Storing prompt_to_send would embed the
    # already-replayed history inside the new entry, so every later turn would
    # resend earlier turns twice and the prompt would grow exponentially.
    history.append((user_prompt, response))

    # keep history bounded (optional)
    if len(history) > 40:
        history[:] = history[-40:]

def on_clear_history(b):
    """Button callback: forget all stored turns and confirm in the output area.

    Args:
        b: The clicked button (unused; required by the ``on_click`` signature).
    """
    # Empty the shared list in place so other references to it stay valid.
    del history[:]
    with output_area:
        print("History cleared.")

# Hook the callbacks up to their buttons.
send_btn.on_click(on_send)
clear_hist_btn.on_click(on_clear_history)

# Layout rows: action buttons with the history toggle, then the optional flag inputs.
controls = widgets.HBox([send_btn, clear_hist_btn, include_history_chk])
flags_row = widgets.HBox([temp_input, num_predict_input])

# Render the console: heading, hint, then the widgets in reading order.
display(HTML("<h4>Send EXACT prompt to raw llama3.2:3b (via ollama)</h4>"))
display(HTML("<i>I will not modify your prompt. Paste/type it below and press 'Send → model'.</i>"))
display(prompt_box, controls, flags_row, output_area)

# Initial usage hint, shown inside the output area.
with output_area:
    print("Ready. Paste your prompt above and press 'Send → model'.\n"
          "If you want the model to see previous turns, check 'Include history' (history is stored only after you send a prompt).")


Textarea(value='', layout=Layout(height='120px', width='100%'), placeholder='Paste/type the exact prompt you w…

HBox(children=(Button(button_style='primary', description='Send → model', style=ButtonStyle()), Button(button_…

HBox(children=(Text(value='', description='--temperature', placeholder='leave empty for default'), Text(value=…

Output(layout=Layout(border='1px solid lightgray'))

entity extraction

In [None]:
! pip install pymupdf4llm pdfplumber



In [None]:
#!/usr/bin/env python3
"""
Automated Pharmaceutical Document Parser with Ollama Integration
Automatically processes documents and extracts structured information using local LLM
"""

import json
import re
import subprocess
import shlex
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
import pymupdf4llm
import pdfplumber

class AutomatedPharmaParser:
    def __init__(self, model_name: str = "llama3.2:3b"):
        """
        Create a parser bound to one Ollama model and prepare that model.

        Args:
            model_name: Ollama model tag used for every LLM call.

        Raises:
            RuntimeError: From ``setup_ollama`` when the Ollama CLI is missing.
        """
        self.model_name = model_name
        # No document has been loaded yet, so the document state starts empty.
        self.raw_content = str()
        self.structured_data = dict()
        self.document_loaded = False
        # Last step: check the CLI and pull the model.
        self.setup_ollama()

    def setup_ollama(self):
        """Check that the Ollama CLI exists, then try to pull the model.

        A missing or failing CLI is fatal. Problems while pulling (non-zero
        exit, timeout after 300 seconds, any other error) are only printed,
        because the model may already be available locally.

        Raises:
            RuntimeError: If ``ollama --version`` cannot be run successfully.
        """
        model = self.model_name
        print(f"Setting up Ollama model: {model}")

        # Step 1: the CLI must be present and runnable.
        try:
            subprocess.run(["ollama", "--version"], capture_output=True, check=True)
        except (FileNotFoundError, subprocess.CalledProcessError):
            raise RuntimeError("❌ Ollama CLI not found. Please install Ollama first.")
        else:
            print("✅ Ollama CLI found")

        # Step 2: best-effort pull; failures are reported but not raised.
        try:
            print(f"Pulling model {model}...")
            pull = subprocess.run(
                ["ollama", "pull", model],
                capture_output=True,
                text=True,
                timeout=300,
            )
            if pull.returncode != 0:
                print(f"⚠️ Pull result: {pull.stderr}")
            else:
                print(f"✅ Model {model} ready")
        except subprocess.TimeoutExpired:
            print("⚠️ Model pull timed out, but model might already be available")
        except Exception as e:
            print(f"⚠️ Error pulling model: {e}")

    def call_ollama_raw(self, prompt: str, extra_flags: str = "") -> str:
        """Pipe ``prompt`` unchanged into ``ollama run <model>`` and return the reply.

        Args:
            prompt: Text written to the process's stdin.
            extra_flags: Optional extra command-line flags as one string; split
                with shell quoting rules and appended to the command.

        Returns:
            Stripped stdout, or stripped stderr when stdout is empty.

        Raises:
            RuntimeError: If the call times out after 120 seconds, cannot be
                started, or the process exits with a non-zero status.
        """
        cmd = ["ollama", "run", self.model_name]
        if extra_flags:
            cmd += shlex.split(extra_flags)

        try:
            proc = subprocess.run(
                cmd,
                input=prompt,
                text=True,
                capture_output=True,
                timeout=120
            )
        except subprocess.TimeoutExpired as e:
            raise RuntimeError("Ollama call timed out") from e
        except Exception as e:
            raise RuntimeError(f"Error calling ollama: {e}") from e

        # A failed run (unknown model, server down, bad flag) writes only an
        # error to stderr. Returning that text would hand an error message to
        # the JSON parser as if it were model output, so fail loudly instead.
        if proc.returncode != 0:
            detail = proc.stderr.strip() or proc.stdout.strip() or "no output"
            raise RuntimeError(f"ollama exited with status {proc.returncode}: {detail}")

        output = proc.stdout.strip()
        if not output:
            output = proc.stderr.strip()
        return output

    def extract_pdf_content(self, pdf_path: str) -> str:
        """Return the text of a PDF, trying pdfplumber first and pymupdf4llm second.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            Page texts joined by blank lines (pdfplumber), or the Markdown
            produced by pymupdf4llm when pdfplumber fails or finds no text.

        Raises:
            Exception: If the pymupdf4llm fallback fails as well.
        """
        # First choice: pdfplumber, keeping only pages that yielded text.
        try:
            with pdfplumber.open(pdf_path) as pdf:
                page_texts = [
                    extracted
                    for extracted in (page.extract_text() for page in pdf.pages)
                    if extracted
                ]
                if page_texts:
                    return "\n\n".join(page_texts)
        except Exception as e:
            print(f"pdfplumber failed: {e}")

        # Fallback, also reached when pdfplumber opened the file but found no text.
        try:
            return pymupdf4llm.to_markdown(pdf_path)
        except Exception as e:
            print(f"pymupdf4llm failed: {e}")
            raise Exception("All extraction methods failed")

    def create_entity_extraction_prompt(self, text: str, chunk_size: int = 3000) -> List[str]:
        """Create prompts for entity extraction from pharmaceutical text"""

        # Base prompt for pharmaceutical entity extraction
        base_prompt = """System: You are a parser. For each Text below, extract entities, relation, value triples as a JSON array.
Only output valid JSON. DO NOT include any extra text, commentary, or code fences. Output must be parseable by json.loads().

Format:
[
  {"entity": "...", "relation": "...", "value": "..."},
  ...
]

Focus on pharmaceutical information:
- Medication names and active ingredients
- Dosages, concentrations, and administration routes
- Indications, contraindications, and side effects
- Age groups, patient populations
- Storage conditions and expiration
- Manufacturer information

Text: """

        # Split text into chunks if too long
        text_chunks = []
        if len(text) <= chunk_size:
            text_chunks.append(text)
        else:
            words = text.split()
            current_chunk = []
            current_length = 0

            for word in words:
                if current_length + len(word) + 1 > chunk_size:
                    if current_chunk:
                        text_chunks.append(" ".join(current_chunk))
                        current_chunk = []
                        current_length = 0

                current_chunk.append(word)
                current_length += len(word) + 1

            if current_chunk:
                text_chunks.append(" ".join(current_chunk))

        # Create prompts for each chunk
        prompts = []
        for i, chunk in enumerate(text_chunks):
            prompt = f"{base_prompt}{chunk}"
            prompts.append(prompt)

        return prompts

    def create_structure_analysis_prompt(self, text: str) -> str:
        """Create prompt for document structure analysis"""

        structure_prompt = f"""System: You are a pharmaceutical document analyzer. Analyze the document structure and create a JSON summary.
Only output valid JSON. DO NOT include any extra text, commentary, or code fences.

Format:
{{
  "document_type": "...",
  "main_sections": [
    {{
      "section_number": "...",
      "section_title": "...",
      "content_type": "...",
      "key_points": ["...", "..."]
    }}
  ],
  "medication_info": {{
    "name": "...",
    "active_ingredient": "...",
    "forms": ["...", "..."],
    "concentrations": ["...", "..."]
  }},
  "critical_information": {{
    "contraindications": ["...", "..."],
    "serious_warnings": ["...", "..."],
    "storage_conditions": "..."
  }}
}}

Text: {text[:4000]}"""

        return structure_prompt

    def parse_json_response(self, response: str) -> Any:
        """Parse the JSON value embedded in a model reply.

        Code fences are removed, then decoding starts at whichever of ``[`` or
        ``{`` appears first, so an object that contains arrays is decoded as
        the whole object rather than cut at its first inner array. Text after
        the decoded value is ignored. If the earliest opener does not decode,
        the other opener is tried, and finally the whole text.

        Args:
            response: Raw model output.

        Returns:
            The decoded value (typically a list or dict), or ``None`` when no
            JSON could be decoded; the parse error is printed in that case.
        """
        # Clean up response
        cleaned = response.strip()

        # Remove code fences if present
        if cleaned.startswith("```"):
            lines = cleaned.split('\n')
            cleaned = '\n'.join(lines[1:-1] if lines[-1].strip() == "```" else lines[1:])

        # Candidate start positions: the first '[' and the first '{', earliest
        # first. Trying only the first of each kind avoids silently returning
        # a deeply nested fragment of a truncated reply.
        candidates = sorted(
            pos for pos in (cleaned.find('['), cleaned.find('{')) if pos != -1
        )
        decoder = json.JSONDecoder()
        for pos in candidates:
            try:
                # raw_decode stops at the end of the value, so trailing
                # commentary after the JSON does not break parsing.
                value, _ = decoder.raw_decode(cleaned, pos)
                return value
            except json.JSONDecodeError:
                continue

        try:
            return json.loads(cleaned)
        except json.JSONDecodeError as e:
            print(f"JSON parse error: {e}")
            print(f"Problematic text: {cleaned[:200]}...")
            return None

    def extract_entities_from_document(self, text: str) -> List[Dict]:
        """Extract entities from the whole document, chunk by chunk.

        Each prompt produced by ``create_entity_extraction_prompt`` is sent to
        the model; replies that parse to a JSON list contribute their dict
        items. Malformed items (anything that is not a dict) are dropped so
        that one bad element in a model reply cannot abort the whole document.

        Args:
            text: Full document text.

        Returns:
            De-duplicated entity dicts, in first-seen order. Two entities are
            duplicates when their ``entity``, ``relation`` and ``value`` fields
            serialize to the same JSON.
        """
        print("🔍 Extracting entities using LLM...")

        prompts = self.create_entity_extraction_prompt(text)
        all_entities = []

        for i, prompt in enumerate(prompts):
            print(f"Processing chunk {i+1}/{len(prompts)}...")

            try:
                response = self.call_ollama_raw(prompt)
                parsed = self.parse_json_response(response)
            except Exception as e:
                print(f"  Error processing chunk {i+1}: {e}")
                continue

            # Model output is untrusted: keep only well-formed (dict) items.
            entities = [item for item in parsed if isinstance(item, dict)] if isinstance(parsed, list) else []

            if entities:
                all_entities.extend(entities)
                print(f"  Extracted {len(entities)} entities from chunk {i+1}")
            else:
                print(f"  No valid entities from chunk {i+1}")

        # Remove duplicates. The key is a JSON string rather than a tuple so
        # that unhashable field values (lists, dicts) cannot raise TypeError.
        unique_entities = []
        seen = set()
        for entity in all_entities:
            key = json.dumps(
                [entity.get('entity', ''), entity.get('relation', ''), entity.get('value', '')],
                sort_keys=True, default=str, ensure_ascii=False
            )
            if key not in seen:
                seen.add(key)
                unique_entities.append(entity)

        print(f"✅ Total unique entities extracted: {len(unique_entities)}")
        return unique_entities

    def analyze_document_structure(self, text: str) -> Dict:
        """Ask the LLM for a structural outline of the document.

        Returns the parsed JSON object, or an empty dict when the model call
        fails or the reply does not parse to a non-empty dict.
        """
        print("📋 Analyzing document structure using LLM...")

        prompt = self.create_structure_analysis_prompt(text)

        try:
            outline = self.parse_json_response(self.call_ollama_raw(prompt))
        except Exception as e:
            print(f"❌ Error analyzing structure: {e}")
            return {}

        if isinstance(outline, dict) and outline:
            print("✅ Document structure analyzed successfully")
            return outline

        print("⚠️ Could not parse structure analysis")
        return {}

    def create_summary_prompt(self, entities: List[Dict], structure: Dict) -> str:
        """Create prompt for generating document summary"""

        entities_text = json.dumps(entities[:50], indent=2)  # Limit to first 50 entities
        structure_text = json.dumps(structure, indent=2)

        summary_prompt = f"""System: You are a pharmaceutical document summarizer. Based on the extracted entities and document structure, create a comprehensive summary.
Only output valid JSON. DO NOT include any extra text, commentary, or code fences.

Format:
{{
  "executive_summary": "...",
  "medication_details": {{
    "name": "...",
    "active_ingredients": ["...", "..."],
    "therapeutic_class": "...",
    "indications": ["...", "..."],
    "dosage_forms": ["...", "..."],
    "key_dosages": ["...", "..."]
  }},
  "safety_information": {{
    "contraindications": ["...", "..."],
    "warnings": ["...", "..."],
    "common_side_effects": ["...", "..."],
    "serious_reactions": ["...", "..."]
  }},
  "administration_info": {{
    "routes": ["...", "..."],
    "dosing_schedule": "...",
    "special_populations": {{
      "pediatric": "...",
      "geriatric": "...",
      "renal_impairment": "...",
      "hepatic_impairment": "..."
    }}
  }},
  "storage_and_handling": "...",
  "manufacturer": "..."
}}

Extracted Entities:
{entities_text}

Document Structure:
{structure_text}"""

        return summary_prompt

    def generate_comprehensive_summary(self, entities: List[Dict], structure: Dict) -> Dict:
        """Ask the LLM to summarize the extracted entities and document structure.

        Returns the parsed JSON object, or an empty dict when the model call
        fails or the reply does not parse to a non-empty dict.
        """
        print("📝 Generating comprehensive summary...")

        prompt = self.create_summary_prompt(entities, structure)

        try:
            # NOTE(review): the flag string is forwarded as-is to call_ollama_raw;
            # confirm the underlying command actually accepts `--temperature`.
            reply = self.call_ollama_raw(prompt, extra_flags="--temperature 0.1")
            digest = self.parse_json_response(reply)
        except Exception as e:
            print(f"❌ Error generating summary: {e}")
            return {}

        if isinstance(digest, dict) and digest:
            print("✅ Summary generated successfully")
            return digest

        print("⚠️ Could not parse summary")
        return {}

    def process_document(self, pdf_path: str) -> bool:
        """Run the full pipeline on one PDF and store the result on the instance.

        Steps: extract text, analyze structure, extract entities, generate a
        summary, then assemble ``self.structured_data``. Returns ``True`` on
        success and ``False`` when the file is missing, no text could be
        extracted, or any step raised.
        """
        print(f"📄 Processing document: {pdf_path}")

        source = Path(pdf_path)
        if not source.exists():
            print(f"❌ File not found: {pdf_path}")
            return False

        try:
            print("📖 Extracting text from PDF...")
            self.raw_content = self.extract_pdf_content(pdf_path)
            if not self.raw_content:
                print("❌ No text content extracted")
                return False
            print(f"✅ Extracted {len(self.raw_content)} characters")

            # The three LLM passes run in this fixed order; the summary
            # consumes the outputs of the first two.
            structure = self.analyze_document_structure(self.raw_content)
            entities = self.extract_entities_from_document(self.raw_content)
            summary = self.generate_comprehensive_summary(entities, structure)

            metadata = {
                "file_path": pdf_path,
                "file_name": source.name,
                "processing_date": datetime.now().isoformat(),
                "total_text_length": len(self.raw_content),
                "total_entities": len(entities),
                "model_used": self.model_name
            }
            statistics = {
                "entities_by_type": self._count_entities_by_type(entities),
                "structure_sections": len(structure.get('main_sections', [])),
                "processing_method": "automated_llm_analysis"
            }
            self.structured_data = {
                "metadata": metadata,
                "document_structure": structure,
                "extracted_entities": entities,
                "comprehensive_summary": summary,
                "processing_statistics": statistics
            }

            self.document_loaded = True
            print("✅ Document processing completed!")
            self._show_processing_summary()
            return True

        except Exception as e:
            print(f"❌ Error processing document: {e}")
            return False

    def _count_entities_by_type(self, entities: List[Dict]) -> Dict[str, int]:
        """Count entities by relation type"""
        counts = {}
        for entity in entities:
            relation = entity.get('relation', 'unknown')
            counts[relation] = counts.get(relation, 0) + 1
        return counts

    def _show_processing_summary(self):
        """Show processing summary"""
        print("\n" + "=" * 70)
        print("📊 PROCESSING SUMMARY")
        print("=" * 70)

        metadata = self.structured_data.get("metadata", {})
        stats = self.structured_data.get("processing_statistics", {})

        print(f"📁 File: {metadata.get('file_name', 'Unknown')}")
        print(f"🤖 Model: {metadata.get('model_used', 'Unknown')}")
        print(f"📝 Text length: {metadata.get('total_text_length', 0):,} characters")
        print(f"🏷️ Total entities: {metadata.get('total_entities', 0)}")
        print(f"📋 Structure sections: {stats.get('structure_sections', 0)}")

        print("\n🏷️ Entity distribution:")
        for entity_type, count in stats.get('entities_by_type', {}).items():
            print(f"   {entity_type}: {count}")

        # Show sample entities
        entities = self.structured_data.get("extracted_entities", [])
        if entities:
            print("\n📝 Sample entities:")
            for i, entity in enumerate(entities[:5]):
                print(f"   {i+1}. {entity.get('entity', 'N/A')} -> {entity.get('relation', 'N/A')} -> {entity.get('value', 'N/A')}")

        print("=" * 70)

    def save_results(self, output_path: Optional[str] = None) -> str:
        """Write ``self.structured_data`` to a JSON file and return its path.

        When ``output_path`` is omitted the file is named after the processed
        document: ``<stem>_automated_analysis.json`` in the working directory.

        Raises:
            Exception: if no document has been processed yet.
        """
        if not self.document_loaded:
            raise Exception("No document processed")

        if not output_path:
            source_name = self.structured_data["metadata"]["file_name"]
            output_path = f"{Path(source_name).stem}_automated_analysis.json"

        destination = Path(output_path)
        with destination.open('w', encoding='utf-8') as handle:
            json.dump(self.structured_data, handle, indent=2, ensure_ascii=False)

        print(f"\n📄 Results saved to: {destination}")
        print(f"📊 File size: {destination.stat().st_size:,} bytes")
        return str(destination)

    def query_document(self, question: str) -> str:
        """Query the processed document"""
        if not self.document_loaded:
            return "❌ No document processed. Please process a document first."

        # Create context from processed data
        context_parts = []

        # Add summary
        summary = self.structured_data.get("comprehensive_summary", {})
        if summary:
            context_parts.append("DOCUMENT SUMMARY:")
            context_parts.append(json.dumps(summary, indent=2))

        # Add relevant entities (simple keyword matching)
        entities = self.structured_data.get("extracted_entities", [])
        question_words = question.lower().split()
        relevant_entities = []

        for entity in entities:
            entity_text = f"{entity.get('entity', '')} {entity.get('relation', '')} {entity.get('value', '')}".lower()
            if any(word in entity_text for word in question_words):
                relevant_entities.append(entity)

        if relevant_entities:
            context_parts.append("\nRELEVANT ENTITIES:")
            context_parts.append(json.dumps(relevant_entities[:10], indent=2))

        context = "\n".join(context_parts)

        # Create query prompt
        query_prompt = f"""System: You are a pharmaceutical document assistant. Answer the question based on the provided document context.
Be precise and cite specific information when possible.

Question: {question}

Document Context:
{context[:6000]}

Answer:"""

        try:
            response = self.call_ollama_raw(query_prompt, extra_flags="--temperature 0.1")
            return response.strip()
        except Exception as e:
            return f"❌ Error processing query: {e}"

def main():
    """Process the sample leaflet, save the analysis and run a few test questions."""
    parser = AutomatedPharmaParser(model_name="llama3.2:3b")

    # Sample PDF, expected in the current working directory
    pdf_path = "bula_1755196887789.pdf"

    print("🤖 Automated Pharmaceutical Document Parser")
    print("=" * 70)

    if not parser.process_document(pdf_path):
        return

    # Persist the analysis next to the notebook
    parser.save_results()

    # Smoke-test querying with a few Portuguese questions
    print("\n🔍 Testing automated querying...")
    sample_questions = (
        "Qual é a dosagem recomendada?",
        "Quais são as contraindicações?",
        "Como deve ser armazenado o medicamento?",
    )
    for question in sample_questions:
        print(f"\nQ: {question}")
        print(f"A: {parser.query_document(question)[:200]}...")

if __name__ == "__main__":
    main()

Setting up Ollama model: llama3.2:3b
✅ Ollama CLI found
Pulling model llama3.2:3b...
✅ Model llama3.2:3b ready
🎯 Sentence-Based Pharmaceutical Document Parser
📄 Processing document with sentence-based analysis: bula_1755192077396.pdf
📖 Extracting text from PDF...
✅ Extracted 11936 characters
✂️ Splitting text into sentences with abbreviation safeguards...
✅ Identified 86 sentences

📝 Sample sentences identified:
  1. ezetimiba Sandoz do Brasil Ind. Farm. Ltda. Bula do Paciente comprimido 10 mg I) IDENTIFICAÇÃO DO MEDICAMENTO ezetimiba Medicamento genérico, Lei nº 9.787, de 1999 APRESENTAÇÕES ezetimiba comprimido 10 mg. Embalagem contendo 30 ou 60 comprimidos.
  2. USO ORAL USO ADULTO E PEDIÁTRICO ACIMA DE 6 ANOS DE IDADE COMPOSIÇÃO Cada comprimido de 10 mg contém: ezetimiba.....................................................
  3. 10 mg excipientes q.s.p. .................
🔍 Analyzing 86 sentences...
Processing sentence 1/86: ezetimiba Sandoz do Brasil Ind. Farm. Ltda. Bula do Paciente

New version: section-aware pharmaceutical document parser

In [None]:
#!/usr/bin/env python3
"""
Section-Aware Pharmaceutical Document Parser
Processes documents sentence by sentence while tracking document sections/headers
"""

import json
import re
import subprocess
import shlex
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
from datetime import datetime
import pymupdf4llm
import pdfplumber

class SectionAwarePharmaParser:
    def __init__(self, model_name: str = "llama3:8b"):
        """
        Initialize section-aware parser with header tracking
        """
        self.model_name = model_name
        self.raw_content = ""
        self.structured_data = {}
        self.document_loaded = False

        # Common pharmaceutical abbreviations that should NOT end sentences
        self.pharma_abbreviations = {
            'mg', 'ml', 'mcg', 'kg', 'g', 'l', 'dl', 'mmol', 'mol',  # Units
            'q.s.p', 'c.q.s', 'q.s', 'c.s.p',  # Pharmaceutical Latin
            'ltda', 'ltd', 'inc', 'corp', 'sa', 'co',  # Company abbreviations
            'dr', 'dra', 'prof', 'sr', 'sra',  # Titles
            'etc', 'ex', 'vs', 'e.g', 'i.e',  # Common abbreviations
            'cnpj', 'cpf', 'rg', 'crf', 'crm',  # Brazilian document types
            'anvisa', 'ms', 'rdc', 'vp', 'vps',  # Brazilian regulatory
            'd.d', 'p.ex', 'n°', 'nº'  # Other common abbreviations
        }

        # Brazilian pharmaceutical document section patterns
        self.section_patterns = {
            # Primary numbered sections
            r'^\s*I+\)\s*(.+)$': 'primary_section',  # I), II), III)
            r'^\s*\d+\.\s*(.+)$': 'numbered_section',  # 1., 2., 3.

            # Common pharmaceutical sections
            r'^\s*(IDENTIFICAÇÃO|IDENTIFICACAO)\s*(DO\s*MEDICAMENTO)?\s*$': 'identification',
            r'^\s*(INFORMAÇÕES|INFORMACOES)\s*(AO\s*PACIENTE)?\s*$': 'patient_info',
            r'^\s*(COMPOSIÇÃO|COMPOSICAO)\s*$': 'composition',
            r'^\s*(APRESENTAÇÕES|APRESENTACOES)\s*$': 'presentations',
            r'^\s*(INDICAÇÕES|INDICACOES)\s*$': 'indications',
            r'^\s*(CONTRAINDICAÇÕES|CONTRAINDICACOES)\s*$': 'contraindications',
            r'^\s*(PRECAUÇÕES|PRECAUCOES)\s*$': 'precautions',
            r'^\s*(REAÇÕES\s*ADVERSAS|REACOES\s*ADVERSAS|EFEITOS\s*ADVERSOS)\s*$': 'adverse_effects',
            r'^\s*(INTERAÇÕES|INTERACOES)\s*(MEDICAMENTOSAS)?\s*$': 'drug_interactions',
            r'^\s*(POSOLOGIA|DOSAGEM)\s*$': 'dosage',
            r'^\s*(SUPERDOSAGEM|SUPERDOSE)\s*$': 'overdose',
            r'^\s*ARMAZENAMENTO\s*$': 'storage',
            r'^\s*DIZERES\s*LEGAIS\s*$': 'legal_info',

            # Question-style headers
            r'^\s*\d+\.\s*(PARA\s*QUE|O\s*QUE|COMO|QUANDO|ONDE|QUAIS)\s*.*\?\s*$': 'question_header'
        }

        self.setup_ollama()

    def setup_ollama(self):
        """Verify the Ollama CLI is available and pull ``self.model_name``.

        A missing or broken CLI is fatal. A failed pull is best-effort: it is
        reported (including a non-zero exit code, which used to be ignored
        silently) but does not raise, so an already-present model can still
        be used.

        Raises:
            RuntimeError: if ``ollama --version`` cannot be run successfully.
        """
        print(f"Setting up Ollama model: {self.model_name}")
        try:
            subprocess.run(["ollama", "--version"], capture_output=True, check=True)
            print("✅ Ollama CLI found")
        except (FileNotFoundError, subprocess.CalledProcessError) as e:
            raise RuntimeError("❌ Ollama CLI not found. Please install Ollama first.") from e

        try:
            print(f"Pulling model {self.model_name}...")
            result = subprocess.run(
                ["ollama", "pull", self.model_name],
                capture_output=True, text=True, timeout=300
            )
        except Exception as e:
            print(f"⚠️ Error with model setup: {e}")
            return

        if result.returncode == 0:
            print(f"✅ Model {self.model_name} ready")
        else:
            # Surface the CLI's own message so the user can see why the pull failed.
            detail = (result.stderr or result.stdout or "").strip()
            print(f"⚠️ Model pull failed (exit code {result.returncode}): {detail}")

    def call_ollama_raw(self, prompt: str, extra_flags: str = "", timeout: float = 60) -> str:
        """Run ``ollama run <model>`` with ``prompt`` on stdin and return its output.

        Args:
            prompt: Text passed verbatim on the process's standard input.
            extra_flags: Extra command-line flags, split shell-style and
                appended to the command.
            timeout: Seconds to wait for the process before giving up
                (defaults to the previous hard-coded 60).

        Returns:
            Stripped stdout, or stripped stderr if stdout is empty.

        Raises:
            RuntimeError: if the call times out, cannot be started, or exits
                with a non-zero status. Previously a failed run returned its
                stderr text as if it were the model's answer.
        """
        cmd = ["ollama", "run", self.model_name]
        if extra_flags:
            cmd += shlex.split(extra_flags)

        try:
            proc = subprocess.run(
                cmd, input=prompt, text=True, capture_output=True, timeout=timeout
            )
        except subprocess.TimeoutExpired:
            raise RuntimeError("Ollama call timed out") from None
        except Exception as e:
            raise RuntimeError(f"Error calling ollama: {e}") from e

        if proc.returncode != 0:
            detail = proc.stderr.strip() or proc.stdout.strip() or "no output"
            raise RuntimeError(f"ollama exited with code {proc.returncode}: {detail}")

        return proc.stdout.strip() or proc.stderr.strip()

    def extract_pdf_content(self, pdf_path: str) -> str:
        """Return the text of a PDF, trying pdfplumber first, then pymupdf4llm.

        pdfplumber's per-page text is joined with blank lines. If pdfplumber
        raises or yields no text at all, the file is converted with
        ``pymupdf4llm.to_markdown`` instead.

        Raises:
            Exception: if the pymupdf4llm fallback also fails.
        """
        # First attempt: page-by-page plain text
        try:
            with pdfplumber.open(pdf_path) as document:
                page_texts = [
                    chunk
                    for chunk in (page.extract_text() for page in document.pages)
                    if chunk
                ]
            if page_texts:
                return "\n\n".join(page_texts)
        except Exception as e:
            print(f"pdfplumber failed: {e}")

        # Fallback: markdown conversion
        try:
            return pymupdf4llm.to_markdown(pdf_path)
        except Exception as e:
            print(f"pymupdf4llm failed: {e}")
            raise Exception("All extraction methods failed")

    def detect_section_header(self, text: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Classify a line as a section header.

        Returns ``(section_type, section_title)`` for a header, or
        ``(None, None)`` otherwise. The first entry of
        ``self.section_patterns`` that matches (case-insensitively) wins;
        failing that, an all-uppercase line of 6-99 characters without a run
        of two or more digits is reported as a ``'caps_header'``.
        """
        candidate = text.strip()

        # Too short to be a meaningful header
        if len(candidate) < 3:
            return None, None

        # Only these two kinds carry their title in capture group 1
        titled_kinds = ('primary_section', 'numbered_section')
        for regex, kind in self.section_patterns.items():
            hit = re.match(regex, candidate, re.IGNORECASE)
            if not hit:
                continue
            title = hit.group(1).strip() if kind in titled_kinds else candidate
            return kind, title

        # Fallback heuristic: short all-caps line that is not mostly numeric
        looks_like_caps_header = (
            candidate.isupper()
            and 5 < len(candidate) < 100
            and not re.search(r'\d{2,}', candidate)
        )
        if looks_like_caps_header:
            return 'caps_header', candidate

        return None, None

    def is_likely_abbreviation(self, text: str) -> bool:
        """Decide whether a token ending in a period is an abbreviation, not a sentence end.

        The token is compared with trailing periods removed and lowercased.
        It counts as an abbreviation when it is in ``self.pharma_abbreviations``
        or matches one of the heuristics below.

        Args:
            text: The last whitespace-delimited token before the period
                (the period itself may be included).

        Returns:
            True if the period should NOT be treated as a sentence boundary.
        """
        if not text or len(text) < 2:
            return False

        word = text.rstrip('.').lower()

        if word in self.pharma_abbreviations:
            return True

        # Patterns are applied to the LOWERCASED word. The former uppercase-only
        # patterns could never match for that reason and were removed; short
        # uppercase tokens are still covered by the first pattern below.
        patterns = (
            r'^[a-z]{1,4}$',  # Short words (any original case)
            r'^\d+[a-z]+$',  # Numbers with letters
            r'^[a-z]\.[a-z]',  # Pattern like q.s.p
            # Ends with a digit: a following period is most likely a decimal or
            # thousands separator ("19.787") or list numbering. This used
            # re.match, which only ever accepted a single-digit token.
            r'[0-9]$',
        )

        return any(re.search(pattern, word) for pattern in patterns)

    def smart_sentence_split_with_sections(self, text: str) -> List[Dict]:
        """
        Split text into sentences, tagging each with the section it belongs to.

        Lines are scanned in order. A line recognised by
        ``detect_section_header`` closes the text buffered so far and starts a
        new section; any other non-empty line is appended to the buffer.
        Buffered text is split with ``_split_sentence_safely`` and fragments
        of 10 characters or fewer are discarded.

        Returns the content sentences only (headers are tracked internally
        but filtered out), each as a dict with ``sentence``, ``section_type``,
        ``section_title``, ``line_number`` and ``is_header``.
        """
        print("📋 Splitting text with section tracking...")

        raw_lines = text.split('\n')

        active_type = 'unknown'
        active_title = 'Document Start'
        items = []
        pending = ""

        def emit(buffered: str, at_line: int, kind: str, title: str) -> None:
            # Split the buffered text and record every fragment longer than 10 chars.
            for fragment in self._split_sentence_safely(buffered):
                trimmed = fragment.strip()
                if trimmed and len(trimmed) > 10:
                    items.append({
                        'sentence': trimmed,
                        'section_type': kind,
                        'section_title': title,
                        'line_number': at_line,
                        'is_header': False
                    })

        for line_num, raw_line in enumerate(raw_lines):
            line = raw_line.strip()
            if not line:
                continue

            section_type, section_title = self.detect_section_header(line)

            if not (section_type and section_title):
                # Ordinary content: keep accumulating
                pending = f"{pending} {line}" if pending else line
                continue

            # Header: close out the previous section's text first. Those
            # sentences are stamped with the header's line index.
            if pending.strip():
                emit(pending, line_num, active_type, active_title)
                pending = ""

            active_type, active_title = section_type, section_title

            items.append({
                'sentence': line,
                'section_type': section_type,
                'section_title': section_title,
                'line_number': line_num,
                'is_header': True
            })

            print(f"📍 Section detected: {section_type} - {section_title}")

        # Trailing text after the last header
        if pending.strip():
            emit(pending, len(raw_lines), active_type, active_title)

        content_sentences = [entry for entry in items if not entry['is_header']]

        print(f"✅ Found {len(items)} total items ({len(content_sentences)} content sentences)")
        print(f"📊 Sections identified: {len({entry['section_title'] for entry in items})}")

        return content_sentences

    def _split_sentence_safely(self, text: str) -> List[str]:
        """Split text into sentences with abbreviation awareness"""
        sentences = []
        current_sentence = ""

        # Split by potential sentence endings
        parts = re.split(r'([.!?]+)', text)

        i = 0
        while i < len(parts):
            if i % 2 == 0:  # Text part
                current_sentence += parts[i]
            else:  # Punctuation part
                punctuation = parts[i]
                current_sentence += punctuation

                if '.' in punctuation:
                    words = current_sentence.split()
                    if words:
                        last_word = words[-1]
                        if not self.is_likely_abbreviation(last_word):
                            if current_sentence.strip():
                                sentences.append(current_sentence.strip())
                            current_sentence = ""
                    else:
                        if current_sentence.strip():
                            sentences.append(current_sentence.strip())
                        current_sentence = ""
                else:
                    # ! or ? - definitely sentence endings
                    if current_sentence.strip():
                        sentences.append(current_sentence.strip())
                    current_sentence = ""
            i += 1

        # Add any remaining sentence
        if current_sentence.strip():
            sentences.append(current_sentence.strip())

        return [s for s in sentences if s.strip() and len(s.strip()) > 10]

    def create_section_aware_prompt(self, sentence_data: Dict) -> str:
        """Create a prompt that includes section context"""
        sentence = sentence_data['sentence']
        section_type = sentence_data['section_type']
        section_title = sentence_data['section_title']

        prompt = f"""Analyze this sentence from a Brazilian pharmaceutical document. The sentence comes from the "{section_title}" section.

RESPOND ONLY WITH VALID JSON. No explanations, no markdown.

Context: This sentence is from the {section_type} section titled "{section_title}".

Extract relevant pharmaceutical information considering the section context.

Format:
{{
  "entities": [
    {{"type": "medication_name", "value": "...", "confidence": "high|medium|low"}},
    {{"type": "dosage", "value": "...", "confidence": "high|medium|low"}},
    {{"type": "indication", "value": "...", "confidence": "high|medium|low"}},
    {{"type": "contraindication", "value": "...", "confidence": "high|medium|low"}},
    {{"type": "side_effect", "value": "...", "confidence": "high|medium|low"}},
    {{"type": "manufacturer", "value": "...", "confidence": "high|medium|low"}},
    {{"type": "storage", "value": "...", "confidence": "high|medium|low"}},
    {{"type": "administration", "value": "...", "confidence": "high|medium|low"}}
  ],
  "section_relevance": "high|medium|low",
  "key_info_found": true/false
}}

Sentence: "{sentence}"

JSON:"""

        return prompt

    def parse_json_response(self, response: str) -> Any:
        """Parse a JSON object out of an LLM reply, tolerating common wrapping.

        Strips Markdown code fences and any text outside the outermost
        ``{ ... }``. If strict parsing fails, trailing commas before a closing
        brace/bracket are removed and parsing is retried once.

        (The previous repair step also ran a quote-escaping regex that
        rewrote every quote to ``\\"``, which made the retry fail
        unconditionally; it has been removed.)

        Args:
            response: Raw text returned by the model.

        Returns:
            The decoded JSON value, or ``None`` for empty input or text that
            cannot be decoded even after the repair.
        """
        if not response or not response.strip():
            return None

        cleaned = response.strip()

        # Remove markdown code blocks
        if "```json" in cleaned:
            start = cleaned.find("```json") + 7
            end = cleaned.rfind("```")
            if start < end:
                cleaned = cleaned[start:end].strip()
        elif "```" in cleaned:
            start = cleaned.find("```") + 3
            end = cleaned.rfind("```")
            if start < end:
                cleaned = cleaned[start:end].strip()

        # Find JSON boundaries
        json_start = cleaned.find('{')
        json_end = cleaned.rfind('}') + 1

        if json_start != -1 and json_end > json_start:
            cleaned = cleaned[json_start:json_end]

        try:
            return json.loads(cleaned)
        except json.JSONDecodeError:
            pass

        # Repair attempt: small models frequently emit trailing commas.
        repaired = re.sub(r',(\s*[}\]])', r'\1', cleaned)
        try:
            return json.loads(repaired)
        except json.JSONDecodeError:
            print(f"JSON parse error: {cleaned[:100]}...")
            return None

    def analyze_sentence_with_context(self, sentence_data: Dict, index: int) -> Dict:
        """Run the model on one sentence and attach its provenance metadata.

        Returns the parsed model output enriched with the sentence index,
        original text, section and line number. If the reply is not a
        non-empty dict, or anything raises, an empty result from
        ``_create_empty_result`` is returned with an ``error`` marker instead.
        """
        try:
            reply = self.call_ollama_raw(self.create_section_aware_prompt(sentence_data))
            parsed = self.parse_json_response(reply)

            if not (parsed and isinstance(parsed, dict)):
                return self._create_empty_result(sentence_data, index, 'parsing_error')

            # Provenance fields, with safe defaults for missing keys
            parsed.update({
                'sentence_index': index,
                'original_sentence': sentence_data.get('sentence', ''),
                'section_type': sentence_data.get('section_type', 'unknown'),
                'section_title': sentence_data.get('section_title', 'Unknown Section'),
                'line_number': sentence_data.get('line_number', 0),
            })
            return parsed

        except Exception as e:
            print(f"Error analyzing sentence {index}: {e}")
            return self._create_empty_result(sentence_data, index, f'analysis_error: {e}')

    def _create_empty_result(self, sentence_data: Dict, index: int, error_type: str = None) -> Dict:
        """Create empty result structure with safe defaults"""
        return {
            'entities': [],
            'section_relevance': 'low',
            'key_info_found': False,
            'sentence_index': index,
            'original_sentence': sentence_data.get('sentence', ''),
            'section_type': sentence_data.get('section_type', 'unknown'),
            'section_title': sentence_data.get('section_title', 'Unknown Section'),
            'line_number': sentence_data.get('line_number', 0),
            'error': error_type
        }

    def process_sentences_with_context(self, sentence_data_list: List[Dict]) -> List[Dict]:
        """Analyze every sentence in order and return one analysis dict per sentence.

        Progress is printed per sentence. A sentence counts towards the final
        tally when its analysis has a truthy ``key_info_found`` and no
        ``error``. There is a 0.1 s pause after each sentence.
        """
        import time

        total = len(sentence_data_list)
        print(f"🔍 Analyzing {total} sentences with section context...")

        results = []
        hits = 0

        for position, item in enumerate(sentence_data_list):
            preview = item.get('sentence', '')[:60]
            section_label = item.get('section_title', 'Unknown')
            print(f"Processing {position+1}/{total} in [{section_label}]: {preview}...")

            outcome = self.analyze_sentence_with_context(item, position)
            results.append(outcome)

            informative = outcome.get('key_info_found') and not outcome.get('error')
            if informative:
                hits += 1
                found = len(outcome.get('entities', []))
                if found > 0:
                    print(f"  ✅ Found {found} entities")

            # Short pause between model calls
            time.sleep(0.1)

        print(f"✅ Completed analysis: {hits}/{total} sentences with entities")
        return results

    def aggregate_entities_by_section(self, analyses: List[Dict]) -> Dict:
        """Group the entities found by the per-sentence analyses by section.

        Analyses that are empty or carry an ``error`` are skipped. Model
        output is untrusted, so entities that are not dicts, lack a type or
        value, or whose ``type`` is not a string are ignored rather than
        crashing the aggregation. Only the eight known entity types are kept.

        Args:
            analyses: Results from ``process_sentences_with_context``.

        Returns:
            A dict with:
              - ``entities_by_section``: per section, one sorted list of
                unique values per entity collection (sorted so repeated runs
                give identical output);
              - ``all_entities``: flat list of accepted entities in encounter
                order, each with type, value, section, confidence and
                sentence index;
              - ``section_statistics``: per section, the number of analyzed
                sentences and the raw number of entity items the model
                returned (including ones rejected above);
              - ``total_entities`` and ``sections_processed``.
        """
        print("📊 Aggregating entities by section...")

        # Entity type reported by the model -> collection name in the output.
        type_mapping = {
            'medication_name': 'medication_names', 'dosage': 'dosages',
            'indication': 'indications', 'contraindication': 'contraindications',
            'side_effect': 'side_effects', 'manufacturer': 'manufacturers',
            'storage': 'storage_conditions', 'administration': 'administration_info'
        }

        section_entities = {}
        all_entities = []
        section_stats = {}

        for analysis in analyses:
            if not analysis or analysis.get('error'):
                continue

            section_title = analysis.get('section_title', 'Unknown Section')

            # Initialize section if not exists
            if section_title not in section_entities:
                section_entities[section_title] = {key: set() for key in type_mapping.values()}
                section_stats[section_title] = {'sentences': 0, 'entities': 0}

            section_stats[section_title]['sentences'] += 1

            entities = analysis.get('entities', [])
            if not entities or not isinstance(entities, list):
                continue

            section_stats[section_title]['entities'] += len(entities)

            for entity in entities:
                if not isinstance(entity, dict):
                    continue

                entity_type = entity.get('type')
                entity_value = entity.get('value')

                # Skip missing/empty fields and non-string types (which have no .strip()).
                if not entity_type or not entity_value or not isinstance(entity_type, str):
                    continue

                entity_type = entity_type.strip()
                entity_value = str(entity_value).strip()  # values may be numbers

                collection_key = type_mapping.get(entity_type)
                if collection_key is None:
                    continue

                section_entities[section_title][collection_key].add(entity_value)
                all_entities.append({
                    'type': entity_type, 'value': entity_value,
                    'section': section_title,
                    'confidence': entity.get('confidence', 'medium'),
                    'sentence_index': analysis.get('sentence_index', -1)
                })

        # Convert sets to sorted lists: JSON-serializable and order-stable.
        for collections in section_entities.values():
            for collection_key in collections:
                collections[collection_key] = sorted(collections[collection_key])

        result = {
            'entities_by_section': section_entities, 'all_entities': all_entities,
            'section_statistics': section_stats, 'total_entities': len(all_entities),
            'sections_processed': len(section_entities)
        }

        print(f"✅ Aggregated {len(all_entities)} entities across {len(section_entities)} sections")
        return result

    def process_document(self, pdf_path: str) -> bool:
        """Run the full section-aware pipeline on one PDF.

        The work is delegated to ``extract_pdf_content``,
        ``smart_sentence_split_with_sections``,
        ``process_sentences_with_context`` and
        ``aggregate_entities_by_section``. On success the outcome is stored
        in ``self.structured_data`` (``metadata``, ``sentence_analyses``,
        ``section_entities``, ``processing_statistics``) and
        ``self.document_loaded`` is set to ``True``.

        Args:
            pdf_path: Path of the PDF file to process.

        Returns:
            ``True`` when the document was processed; ``False`` when the file
            is missing, no text could be extracted, or any step raised (the
            error and its traceback are printed, not propagated).
        """
        print(f"📄 Processing document with section-aware analysis: {pdf_path}")

        if not Path(pdf_path).exists():
            print(f"❌ File not found: {pdf_path}")
            return False

        # A new run invalidates whatever was loaded before. Without this, a
        # failed run would leave the flag set while raw_content already
        # belongs to the new file and structured_data to the old one.
        self.document_loaded = False

        try:
            # Extract text
            print("📖 Extracting text from PDF...")
            self.raw_content = self.extract_pdf_content(pdf_path)

            if not self.raw_content:
                print("❌ No text content extracted")
                return False

            print(f"✅ Extracted {len(self.raw_content)} characters")

            # Split with section awareness
            sentence_data_list = self.smart_sentence_split_with_sections(self.raw_content)

            # Process sentences with context
            analyses = self.process_sentences_with_context(sentence_data_list)

            # Aggregate by sections
            aggregated_data = self.aggregate_entities_by_section(analyses)

            # The aggregation step skips empty/None analyses; count the same
            # way here so one bad entry cannot abort the whole document.
            sentences_with_entities = sum(
                1 for analysis in analyses
                if isinstance(analysis, dict) and analysis.get('key_info_found')
            )

            # Compile results
            self.structured_data = {
                "metadata": {
                    "file_path": pdf_path,
                    "file_name": Path(pdf_path).name,
                    "processing_date": datetime.now().isoformat(),
                    "total_text_length": len(self.raw_content),
                    "model_used": self.model_name,
                    "extraction_method": "section_aware_sentence_analysis"
                },
                "sentence_analyses": analyses,
                "section_entities": aggregated_data,
                "processing_statistics": {
                    "total_sentences": len(sentence_data_list),
                    "sentences_with_entities": sentences_with_entities,
                    "total_entities_found": aggregated_data.get('total_entities', 0),
                    "sections_identified": aggregated_data.get('sections_processed', 0)
                }
            }

            self.document_loaded = True
            print("✅ Section-aware document processing completed!")
            self._show_processing_summary()
            return True

        except Exception as e:
            # Best effort by design: report the failure and let the caller
            # decide what to do with the False result.
            print(f"❌ Error processing document: {e}")
            import traceback
            traceback.print_exc()
            return False

    def _show_processing_summary(self):
        """Show processing summary with section information"""
        print("\n" + "=" * 70)
        print("📊 SECTION-AWARE PROCESSING SUMMARY")
        print("=" * 70)

        metadata = self.structured_data.get("metadata", {})
        stats = self.structured_data.get("processing_statistics", {})
        section_data = self.structured_data.get("section_entities", {})

        print(f"📁 File: {metadata.get('file_name', 'Unknown')}")
        print(f"📝 Text length: {metadata.get('total_text_length', 0):,} characters")
        print(f"✂️ Total sentences: {stats.get('total_sentences', 0)}")
        print(f"🏷️ Sentences with entities: {stats.get('sentences_with_entities', 0)}")
        print(f"🔢 Total entities found: {stats.get('total_entities_found', 0)}")
        print(f"📋 Sections identified: {stats.get('sections_identified', 0)}")

        # Show section statistics
        section_stats = section_data.get('section_statistics', {})
        if section_stats:
            print(f"\n📊 Entity Distribution by Section:")
            for section, stat in section_stats.items():
                print(f"   {section}: {stat['entities']} entities from {stat['sentences']} sentences")

        print("=" * 70)

    def save_results(self, output_path: Optional[str] = None) -> str:
        """Write ``self.structured_data`` to a UTF-8 JSON file.

        Args:
            output_path: Destination file. When omitted or empty, the file is
                named ``<source file stem>_section_aware_analysis.json`` and
                written to the current working directory.

        Returns:
            The path of the written file, as a string.

        Raises:
            RuntimeError: If no document has been processed yet
                (``self.document_loaded`` is falsy).
            TypeError: If the results contain values that cannot be
                serialized to JSON.
        """
        if not self.document_loaded:
            raise RuntimeError("No document processed")

        if not output_path:
            source_name = self.structured_data["metadata"]["file_name"]
            output_path = f"{Path(source_name).stem}_section_aware_analysis.json"

        # Serialize before touching the disk, so a failure cannot leave a
        # truncated file behind or wipe an earlier result.
        payload = json.dumps(self.structured_data, indent=2, ensure_ascii=False)

        output_file = Path(output_path)
        # Allow destinations inside folders that do not exist yet.
        output_file.parent.mkdir(parents=True, exist_ok=True)
        output_file.write_text(payload, encoding='utf-8')

        print(f"\n📄 Results saved to: {output_file}")
        print(f"📊 File size: {output_file.stat().st_size:,} bytes")
        return str(output_file)

    def query_document(self, question: str) -> str:
        """Query with section context"""
        if not self.document_loaded:
            return "❌ No document processed."

        section_entities = self.structured_data.get("section_entities", {}).get("entities_by_section", {})

        context_parts = ["MEDICATION INFORMATION BY SECTION:\n"]

        for section, entities in section_entities.items():
            if any(entities.values()):  # Only show sections with entities
                context_parts.append(f"[{section}]")
                for entity_type, values in entities.items():
                    if values:
                        context_parts.append(f"  {entity_type}: {', '.join(values[:3])}")
                context_parts.append("")

        context = "\n".join(context_parts)

        query_prompt = f"""Answer about this medication based on the section-organized information.

Question: {question}

Available Information:
{context[:4000]}

Provide a clear answer in Portuguese, mentioning the relevant sections when appropriate."""

        try:
            response = self.call_ollama_raw(query_prompt)
            return response.strip() if response else "No response received"
        except Exception as e:
            return f"❌ Error: {e}"

def main(pdf_path: str = "bula_1755192077396.pdf", model_name: str = "llama3:8b"):
    """Demonstrate section-aware processing on a single PDF.

    Processes the document, saves the results under the default file name
    and prints the answers to a fixed set of sample questions in Portuguese.
    Nothing further is printed here when processing fails;
    ``process_document`` reports its own errors.

    Args:
        pdf_path: PDF file to process. Defaults to the sample leaflet the
            demo was written for.
        model_name: Model name handed to ``SectionAwarePharmaParser``.
    """
    parser = SectionAwarePharmaParser(model_name=model_name)

    print("🎯 Section-Aware Pharmaceutical Document Parser")
    print("=" * 70)

    if not parser.process_document(pdf_path):
        return

    parser.save_results()

    print("\n🔍 Testing section-aware querying...")
    test_questions = [
        "Qual é o nome do medicamento e sua concentração?",
        "Quais são as contraindicações principais?",
        "Como deve ser administrado?",
        "Quais são os efeitos adversos mais comuns?",
        "Quem é o fabricante?"
    ]

    for question in test_questions:
        print(f"\n❓ {question}")
        answer = parser.query_document(question)
        print(f"💡 {answer}")
        print("-" * 50)

# Entry point: run the demo when this code executes as the top-level module
# (`__name__` is "__main__"), but not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Setting up Ollama model: llama3.2:3b
✅ Ollama CLI found
Pulling model llama3.2:3b...
✅ Model llama3.2:3b ready
🎯 Section-Aware Pharmaceutical Document Parser
📄 Processing document with section-aware analysis: bula_1755192077396.pdf
📖 Extracting text from PDF...
✅ Extracted 11936 characters
📋 Splitting text with section tracking...
📍 Section detected: primary_section - IDENTIFICAÇÃO DO MEDICAMENTO
📍 Section detected: presentations - APRESENTAÇÕES
📍 Section detected: caps_header - USO ORAL
📍 Section detected: caps_header - USO ADULTO E PEDIÁTRICO ACIMA DE 6 ANOS DE IDADE
📍 Section detected: composition - COMPOSIÇÃO
📍 Section detected: primary_section - INFORMAÇÕES AO PACIENTE
📍 Section detected: numbered_section - PARA QUE ESTE MEDICAMENTO É INDICADO?
📍 Section detected: numbered_section - COMO ESTE MEDICAMENTO FUNCIONA?
📍 Section detected: numbered_section - QUANDO NÃO DEVO USAR ESTE MEDICAMENTO?
📍 Section detected: numbered_section - O QUE DEVO SABER ANTES DE USAR ESTE MEDICAMENTO?
📍 S