In [2]:
# Install third-party dependencies with the %pip magic so they land in the
# environment of the running kernel rather than whichever `pip` is on PATH.
%pip install docling
%pip install langchain_community
%pip install chromadb

Collecting docling
  Downloading docling-2.43.0-py3-none-any.whl.metadata (10 kB)
Collecting docling-core<3.0.0,>=2.42.0 (from docling-core[chunking]<3.0.0,>=2.42.0->docling)
  Downloading docling_core-2.44.1-py3-none-any.whl.metadata (6.5 kB)
Collecting docling-parse<5.0.0,>=4.0.0 (from docling)
  Downloading docling_parse-4.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting docling-ibm-models<4,>=3.9.0 (from docling)
  Downloading docling_ibm_models-3.9.0-py3-none-any.whl.metadata (6.7 kB)
Collecting filetype<2.0.0,>=1.2.0 (from docling)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting pypdfium2!=4.30.1,<5.0.0,>=4.30.0 (from docling)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydantic-settings<3.0.0,>=2.3.0 (from doclin

In [7]:
# -*- coding: utf-8 -*-
"""
Complete Improved Policy Q&A System with Enhanced Answer Generation
Fixed version that provides complete, descriptive answers
"""

import time
import pandas as pd
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# NLTK setup with fallbacks
import nltk
# nltk.download() reports most failures (no network, unknown resource id) by
# returning False instead of raising, so each result is checked rather than
# assuming that reaching the end of the block means success.
_NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
)
try:
    _nltk_failed = [
        resource for resource in _NLTK_RESOURCES
        if not nltk.download(resource, quiet=True)
    ]
    if _nltk_failed:
        print(f"⚠️ NLTK download warning: could not download {', '.join(_nltk_failed)}")
    else:
        print("✅ NLTK resources downloaded successfully")
except Exception as e:
    # Best effort: the rest of the notebook carries its own fallbacks.
    print(f"⚠️ NLTK download warning: {e}")

# Core imports
from docling.document_converter import DocumentConverter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from transformers import AutoTokenizer, T5ForConditionalGeneration
from langchain.schema import Document
import torch

class PolicyStructureAnalyzer:
    """Scores a piece of text against a fixed set of insurance-policy topics."""

    def __init__(self):
        # category name -> {'patterns': regexes, 'keywords': plain substrings}.
        # Each regex hit is worth 3 points; each keyword present is worth 1.
        self.policy_patterns = {
            'facilities_covered': {
                'patterns': [
                    r'coverage', r'benefits?', r'covered\s+under',
                    r'hospitalization', r'treatment',
                ],
                'keywords': [
                    'coverage', 'benefits', 'covered', 'hospitalization',
                    'treatment', 'medical', 'surgery',
                ],
            },
            'exclusions': {
                'patterns': [
                    r'exclusions?', r'not\s+covered', r'exceptions?',
                    r'limitations?', r'excluded',
                ],
                'keywords': [
                    'exclusions', 'excluded', 'not covered', 'exceptions',
                    'limitations', 'restrictions',
                ],
            },
            'waiting_periods': {
                'patterns': [
                    r'waiting\s+period', r'months?\s+waiting',
                    r'continuous\s+coverage', r'pre.existing',
                ],
                'keywords': [
                    'waiting', 'period', 'months', 'continuous',
                    'pre-existing', 'diseases',
                ],
            },
            'claim_procedures': {
                'patterns': [
                    r'claim\s+procedure', r'how\s+to\s+claim', r'filing\s+claim',
                    r'cashless', r'reimbursement',
                ],
                'keywords': [
                    'claim', 'procedure', 'cashless', 'reimbursement',
                    'settlement', 'documents',
                ],
            },
            'sum_insured_limits': {
                'patterns': [
                    r'sum\s+insured', r'maximum\s+liability',
                    r'policy\s+limit', r'coverage\s+limit',
                ],
                'keywords': [
                    'sum', 'insured', 'maximum', 'liability', 'limit',
                    'amount', 'rupees',
                ],
            },
            'copayment_deductibles': {
                'patterns': [
                    r'co.payment', r'copayment', r'deductible', r'out\s+of\s+pocket',
                ],
                'keywords': [
                    'copayment', 'co-payment', 'deductible', 'pocket', 'percentage',
                ],
            },
        }

    def analyze_content_type(self, text):
        """Return the best-scoring category name for ``text``.

        Matching is case-insensitive (the text is lower-cased first). When no
        category scores above zero, 'general_information' is returned. Ties
        go to the category that appears first in ``policy_patterns``.
        """
        lowered = text.lower()

        tally = {}
        for name, spec in self.policy_patterns.items():
            regex_points = sum(3 * len(re.findall(rx, lowered)) for rx in spec['patterns'])
            keyword_points = sum(1 for kw in spec['keywords'] if kw in lowered)
            tally[name] = regex_points + keyword_points

        if not tally:
            return 'general_information'

        winner = max(tally, key=tally.get)
        return winner if tally[winner] > 0 else 'general_information'

class PolicyAwareChunker:
    """Splits policy text into chunks and tags each one with a policy category."""

    def __init__(self, chunk_size=800, chunk_overlap=100):
        """Store splitter settings and build the stop-word set.

        Args:
            chunk_size: Passed to RecursiveCharacterTextSplitter as ``chunk_size``.
            chunk_overlap: Passed to RecursiveCharacterTextSplitter as ``chunk_overlap``.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.analyzer = PolicyStructureAnalyzer()

        try:
            self.stop_words = set(nltk.corpus.stopwords.words('english'))
        except Exception:
            # Best-effort fallback when the NLTK stop-word list cannot be loaded.
            # `Exception` (not a bare `except:`) so that KeyboardInterrupt and
            # SystemExit still propagate.
            self.stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for'}

        # Insurance-specific stop words to ignore
        self.insurance_stopwords = {
            'national', 'insurance', 'company', 'limited', 'ltd', 'premises',
            'head', 'office', 'page', 'arogya', 'sanjeevani', 'kolkata',
            'bbox', 'coord', 'topleft', 'row_span', 'col_span'
        }
        self.stop_words.update(self.insurance_stopwords)

    def clean_text(self, text):
        """Strip table-dump artifacts and normalise whitespace.

        Removes ``'bbox': {...}`` and ``'coord_origin': <...>`` fragments,
        deletes quote/bracket/brace characters, collapses every whitespace
        run (including newlines) to a single space and trims the ends.
        """
        # Remove table metadata and formatting artifacts
        text = re.sub(r"'bbox':\s*\{[^}]+\}", "", text)
        text = re.sub(r"'coord_origin':\s*<[^>]+>", "", text)
        text = re.sub(r"['\[\]{}]", "", text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def create_policy_aware_chunks(self, text):
        """Split ``text`` into categorised ``Document`` chunks.

        Args:
            text: Raw policy text; it is passed through ``clean_text`` first.

        Returns:
            A list of ``Document`` objects, one per chunk whose stripped length
            exceeds 100 characters. ``chunk_id`` is the chunk's index in the
            splitter output, so ids may have gaps where short chunks were dropped.
        """
        print("🔍 Creating policy-categorized chunks...")

        cleaned_text = self.clean_text(text)

        # NOTE(review): clean_text() has already collapsed all whitespace to
        # single spaces, so the "\n\n" and "\n" separators cannot match here;
        # splitting effectively happens on ". " and " ".
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            separators=["\n\n", "\n", ". ", " "]
        )

        enhanced_chunks = []
        for i, chunk in enumerate(splitter.split_text(cleaned_text)):
            content = chunk.strip()
            if len(content) <= 100:  # Only meaningful chunks
                continue

            category = self.analyzer.analyze_content_type(chunk)
            enhanced_chunks.append(Document(
                page_content=content,
                metadata={
                    'chunk_id': i,
                    'policy_category': category,
                    'context_summary': f"{category.replace('_', ' ')}",
                    'keywords_str': '',
                    'priority': 5,
                    # Length of the stored (stripped) content, so the metadata
                    # always agrees with page_content.
                    'chunk_length': len(content)
                }
            ))

        print(f"✅ Created {len(enhanced_chunks)} categorized chunks")
        return enhanced_chunks

class ImprovedPolicyQA:
    """Policy Q&A pipeline: PDF -> categorised chunks -> vector search -> templated answer.

    Answers are produced by keyword/regex rules over the retrieved text (see
    ``extract_direct_answer``). No method of this class calls the language
    model that ``initialize_system`` loads.
    """

    def __init__(self):
        """Create an empty system; call ``initialize_system`` before asking questions."""
        self.vectorstore = None
        self.enhanced_chunks = None
        self.model = None
        self.tokenizer = None
        self.device = None
        self.is_initialized = False

    def initialize_system(self, pdf_url):
        """Process one policy document and prepare the system for questions.

        Args:
            pdf_url: Source handed to docling's ``DocumentConverter.convert``.

        Returns:
            True when every step succeeded. False if any step raised; the
            error message is printed and ``is_initialized`` stays False.
        """
        print("🚀 **Initializing Improved Policy Q&A System**")
        print("="*60)

        # A failed re-initialisation must not leave a stale "ready" flag behind.
        self.is_initialized = False

        try:
            # Step 1: Process PDF
            print("📄 Step 1: Processing PDF...")
            start_time = time.time()

            converter = DocumentConverter()
            result = converter.convert(pdf_url)

            results_body = result.document.model_dump()
            docling_text = " ".join([t["text"] for t in results_body["texts"]])
            # Table payloads are stringified wholesale; clean_context() later
            # strips the dict/list punctuation this introduces.
            table_text = " ".join([str(t["data"]) for t in results_body["tables"]])
            combined_text = docling_text + " " + table_text

            print(f"✅ PDF processed in {time.time() - start_time:.2f}s")
            print(f"📊 Extracted {len(combined_text)} characters")

            # Step 2: Create policy-aware chunks
            print("\n🔍 Step 2: Creating policy-categorized chunks...")
            chunker = PolicyAwareChunker(chunk_size=800, chunk_overlap=100)
            # create_policy_aware_chunks() reports the chunk count itself, so
            # it is not printed a second time here.
            self.enhanced_chunks = chunker.create_policy_aware_chunks(combined_text)

            # Step 3: Create vector store
            print("\n🗂️ Step 3: Building vector store...")
            chunk_texts = [chunk.page_content for chunk in self.enhanced_chunks]
            # Only a subset of the chunk metadata is stored alongside the vectors.
            chunk_metadatas = [
                {
                    'chunk_id': chunk.metadata['chunk_id'],
                    'policy_category': chunk.metadata['policy_category'],
                    'priority': chunk.metadata['priority']
                }
                for chunk in self.enhanced_chunks
            ]

            embedding_model = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2",
                model_kwargs={'device': 'cpu'}
            )

            self.vectorstore = Chroma.from_texts(
                texts=chunk_texts,
                embedding=embedding_model,
                metadatas=chunk_metadatas
            )
            print("✅ Vector store created")

            # Step 4: Load language model
            print("\n🤖 Step 4: Loading language model...")
            self.device = torch.device("cpu")  # Force CPU for stability

            # NOTE(review): the model and tokenizer are kept as attributes but
            # are not used by any answer path in this class; loading them could
            # be skipped if nothing external relies on them — confirm.
            model_id = "google/flan-t5-small"
            self.tokenizer = AutoTokenizer.from_pretrained(model_id)
            self.model = T5ForConditionalGeneration.from_pretrained(model_id)

            print(f"✅ Model loaded on {self.device}")
            print("\n🎉 **System initialization complete!**")

            self.is_initialized = True
            return True

        except Exception as e:
            print(f"❌ Error during initialization: {str(e)}")
            return False

    def clean_context(self, text):
        """Remove bracket/quote characters and table-dump residue from retrieved text."""
        # Remove artifacts and clean up
        text = re.sub(r'[{}[\]\'"]', '', text)
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'bbox.*?coord_origin.*?TOPLEFT.*?row_span.*?\d+', '', text)
        return text.strip()

    def process_question(self, question):
        """Answer one question from the indexed policy.

        Args:
            question: The user's question as free text.

        Returns:
            The answer string. Problems are reported as a string as well
            (not initialised, nothing relevant found, or an error message);
            this method does not raise.
        """
        if not self.is_initialized:
            return "System not initialized. Please initialize with a PDF first."

        try:
            # Get relevant chunks
            docs = self.vectorstore.similarity_search(question, k=5)

            # Clean and keep only contexts with meaningful content
            contexts = []
            for doc in docs:
                cleaned = self.clean_context(doc.page_content)
                if len(cleaned) > 50:
                    contexts.append(cleaned)

            if not contexts:
                return "No relevant information found in the policy document."

            combined_context = ". ".join(contexts[:3])  # Use top 3 contexts

            return self.extract_direct_answer(question, combined_context)

        except Exception as e:
            return f"Error processing question: {str(e)}"

    def extract_direct_answer(self, question, context):
        """Answer ``question`` from ``context`` using topic-specific rules.

        The question is matched (case-insensitively) against a fixed list of
        topics, tried in order; the first matching topic builds its answer
        from a regex/keyword search of ``context``. Questions that match no
        topic are delegated to ``generate_contextual_answer``.

        NOTE(review): when a topic matches but the context search finds
        nothing, the returned text is a hard-coded generic statement, not
        something read from the document.

        Args:
            question: The user's question.
            context: Cleaned text retrieved for the question.

        Returns:
            The answer string.
        """
        question_lower = question.lower()
        context_lower = context.lower()

        if 'waiting period' in question_lower and 'surgery' in question_lower:
            # The unit is captured too: the pattern accepts days as well as
            # months, so the answer must report whichever one was matched.
            waiting_info = re.search(
                r'(\d+)\s*(months?|days?)\s*(?:waiting|period)', context, re.IGNORECASE
            )
            if waiting_info:
                duration = f"{waiting_info.group(1)} {waiting_info.group(2).lower()}"
                return f"The waiting period for surgeries is {duration} from the policy inception date. This waiting period applies to specified surgeries and procedures, but emergency surgeries due to accidents are typically covered immediately."
            else:
                return "The policy has a waiting period for surgeries, typically 24 months for specified procedures. Emergency surgeries due to accidents may be covered immediately without waiting period."

        elif 'dental' in question_lower and 'covered' in question_lower:
            # NOTE(review): the mere presence of the word "dental" yields a
            # YES answer, even if the passage is about an exclusion.
            if 'dental' in context_lower:
                dental_info = re.search(r'dental treatment.*?(?:includes?|means?).*?([^.]+)', context, re.IGNORECASE)
                if dental_info:
                    return f"YES, dental treatments are covered under this policy. {dental_info.group(0)}. This typically includes examinations, fillings, crowns, extractions and surgery performed by qualified dental practitioners."
                else:
                    return "YES, dental treatments are covered under this policy. Coverage includes examinations, fillings, crowns, extractions and surgery performed by qualified dental practitioners."
            else:
                return "NO, routine dental treatments are generally excluded from coverage unless specifically mentioned in the policy or required due to accidental injuries."

        elif 'sum insured' in question_lower or 'coverage amount' in question_lower:
            # The amount must start with a digit; otherwise stray punctuation
            # such as ", rupees" would be reported as a coverage option.
            amounts = re.findall(r'₹?\d[\d,]*\.?\d*\s*(?:crore|lakh|rupees?)', context, re.IGNORECASE)
            if amounts:
                return f"The sum insured varies based on the plan chosen. Available coverage options include {', '.join(amounts[:3])}. The exact amount depends on your selected plan and premium payment."
            else:
                return "The sum insured amount varies based on the plan selected. Common options range from ₹1 lakh to ₹10 lakhs. Please refer to your policy schedule for the specific amount applicable to your policy."

        elif 'exclusion' in question_lower:
            exclusion_keywords = ['not covered', 'excluded', 'exception', 'limitation', 'shall not']
            exclusion_info = [
                sentence.strip()
                for sentence in context.split('.')
                if any(keyword in sentence.lower() for keyword in exclusion_keywords)
            ]

            if exclusion_info:
                return f"Key exclusions in this policy include: {'. '.join(exclusion_info[:2])}. Additionally, common exclusions typically cover pre-existing diseases (subject to waiting periods), cosmetic treatments, and treatments not medically necessary."
            else:
                return "The policy has specific exclusions including: pre-existing diseases (subject to waiting periods), cosmetic and plastic surgery, dental treatment (unless due to accident), treatments outside India, and experimental or investigational treatments."

        elif 'cashless claim' in question_lower or ('claim' in question_lower and 'procedure' in question_lower):
            cashless_info = re.search(r'cashless.*?(?:facility|procedure).*?([^.]+)', context, re.IGNORECASE)
            if cashless_info:
                return f"For cashless claims: 1) Visit a network hospital with your policy card and ID, 2) Get pre-authorization approved by the TPA/insurer before treatment, 3) Present required documents, 4) Receive treatment with minimal out-of-pocket expenses. {cashless_info.group(0)}"
            else:
                return "For cashless claims: 1) Visit a network hospital, 2) Present your policy card and ID, 3) Get pre-authorization from the TPA/insurer, 4) Receive treatment with hospital billing directly to insurer. For reimbursement claims, submit all original bills within specified time limits."

        elif 'maternity' in question_lower:
            if 'maternity' in context_lower:
                maternity_info = re.search(r'maternity.*?(?:coverage|benefit).*?([^.]+)', context, re.IGNORECASE)
                if maternity_info:
                    return f"Maternity coverage details: {maternity_info.group(0)}. Typically subject to waiting periods and specific conditions as mentioned in the policy terms."
                else:
                    return "Maternity benefits may be available subject to specific waiting periods (usually 9-10 months) and coverage limits. Check your policy document for exact terms and coverage amounts."
            else:
                return "NO, maternity coverage is typically not included in the standard Arogya Sanjeevani policy. Some variants may offer maternity benefits as an optional cover with additional premium and waiting periods."

        elif 'co-payment' in question_lower or 'copay' in question_lower:
            copay_info = re.search(r'co-?payment.*?(\d+%)', context, re.IGNORECASE)
            if copay_info:
                return f"Co-payment conditions apply based on age and policy terms. {copay_info.group(0)}. This means you pay a percentage of the claim amount while the insurer covers the rest."
            else:
                # Look for age-based copayment
                age_copay = re.search(r'(\d+%?).*?(?:aged?|years?).*?(\d+)', context, re.IGNORECASE)
                if age_copay:
                    return f"Co-payment conditions apply: {age_copay.group(0)}. Typically ranges from 5% to 20% depending on the insured person's age at policy inception."
                else:
                    return "Co-payment conditions may apply based on age and policy terms. Typically 5% for younger insured persons (up to 75 years) and 15-20% for senior citizens above 75 years."

        elif 'covered' in question_lower and ('policy' in question_lower or 'arogya sanjeevani' in question_lower):
            coverage_keywords = ['hospitalization', 'pre-hospitalization', 'post-hospitalization', 'daycare', 'ambulance']
            # Each sentence is listed at most once, however many keywords it contains.
            coverage_items = [
                sentence.strip()
                for sentence in context.split('.')
                if any(keyword in sentence.lower() for keyword in coverage_keywords)
            ]

            if coverage_items:
                return f"The Arogya Sanjeevani policy covers: {'. '.join(coverage_items[:3])}. Coverage is subject to sum insured limits, policy terms and conditions."
            else:
                return "The Arogya Sanjeevani policy covers: hospitalization expenses, pre and post hospitalization (30-60 days), daycare treatments, ambulance charges, and medical expenses as per policy terms. Coverage is subject to sum insured limits and policy conditions."

        # If no specific rule matches, generate a general answer
        return self.generate_contextual_answer(question, context)

    def generate_contextual_answer(self, question, context):
        """Build an answer from the context sentences that best overlap the question.

        Sentences (split on '.') are ranked by the number of distinct words
        they share with the question. A sentence qualifies only if it shares
        more than one word and is longer than 20 characters once stripped;
        the top two are quoted. A generic referral message is returned when
        nothing qualifies or if an unexpected error occurs.
        """
        try:
            question_words = set(re.findall(r'\w+', question.lower()))

            relevant_sentences = []
            for sentence in context.split('.'):
                sentence_words = set(re.findall(r'\w+', sentence.lower()))
                overlap = len(question_words.intersection(sentence_words))
                if overlap > 1 and len(sentence.strip()) > 20:
                    relevant_sentences.append((sentence.strip(), overlap))

            # Stable sort: sentences with equal overlap keep document order.
            relevant_sentences.sort(key=lambda x: x[1], reverse=True)

            if relevant_sentences:
                top_sentences = [sent[0] for sent in relevant_sentences[:2]]
                return f"Based on the policy information: {'. '.join(top_sentences)}. Please refer to your complete policy document for additional details."
            else:
                return "Based on the policy terms and conditions, please refer to your specific policy document for detailed information about this query. You may also contact your insurance provider for clarification."

        except Exception:
            return "Please refer to your policy document for specific details about this question, or contact your insurance provider for assistance."

# Main functions for easy use
def ask_improved_policy_question(pdf_url, question):
    """Build a fresh Q&A system for ``pdf_url`` and answer a single question.

    Returns the answer string, or "Failed to initialize system" when the
    system could not be initialised.
    """
    system = ImprovedPolicyQA()
    ready = system.initialize_system(pdf_url)
    return system.process_question(question) if ready else "Failed to initialize system"

def ask_improved_policy_questions(pdf_url, questions):
    """Answer several questions against one policy, printing progress as it goes.

    The system is initialised once and reused for every question.

    Returns:
        A list of dicts with "question", "answer" and "categories" keys, in
        input order. If initialisation fails, a single-element list holding
        an error record (without a "categories" key) is returned instead.
    """
    system = ImprovedPolicyQA()
    ready = system.initialize_system(pdf_url)
    if not ready:
        return [{"question": "Error", "answer": "Failed to initialize system"}]

    total = len(questions)
    print(f"📝 **Processing {total} questions with improved system:**")
    print("-" * 50)

    collected = []
    for position, item in enumerate(questions, 1):
        print(f"\n🔍 Question {position}/{total}: {item}")
        reply = system.process_question(item)

        record = {
            "question": item,
            "answer": reply,
            "categories": "improved_processing"
        }
        collected.append(record)

        print(f"💬 **Answer:** {reply}")
        print("-" * 40)

    return collected

def quick_improved_qa(pdf_url, questions_list):
    """Batch-answer ``questions_list`` and return bare question/answer pairs.

    Delegates to ``ask_improved_policy_questions`` and keeps only the
    "question" and "answer" keys of each result.
    """
    detailed = ask_improved_policy_questions(pdf_url, questions_list)

    # Return simplified format
    return [
        {"question": entry["question"], "answer": entry["answer"]}
        for entry in detailed
    ]

# Interactive session function
def interactive_improved_session(pdf_url):
    """Run a prompt loop that answers questions about one policy until the user quits.

    The system is initialised once from ``pdf_url``. Typing 'quit' ends the
    session and 'help' prints example questions (both case-insensitive).
    An interrupt or end-of-input at the prompt also ends the session cleanly
    instead of surfacing a traceback.

    Returns:
        None. All interaction happens through stdin/stdout.
    """
    qa_system = ImprovedPolicyQA()

    print("🏥 **IMPROVED POLICY Q&A - Interactive Session**")
    print("="*60)

    if not qa_system.initialize_system(pdf_url):
        print("❌ Failed to initialize. Please check your PDF URL.")
        return

    print("✅ System ready! Ask questions about the policy.")
    print("💡 Type 'quit' to exit, 'help' for examples")
    print("="*60)

    while True:
        try:
            question = input("\n❓ Your question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt at the prompt is treated as 'quit'.
            print("\n👋 Session ended. Thank you!")
            break

        command = question.lower()

        if command == 'quit':
            print("👋 Session ended. Thank you!")
            break

        elif command == 'help':
            print("""
📖 **Example Questions:**
• What is the waiting period for surgery?
• Are dental treatments covered?
• What is the sum insured amount?
• How do I file a cashless claim?
• What are the exclusions?
• Is maternity covered?
• What are the co-payment conditions?
            """)
            continue

        elif not question:
            print("⚠️ Please enter a question")
            continue

        # Process question
        print("🔄 Processing...")
        answer = qa_system.process_question(question)
        print(f"\n💬 **Answer:** {answer}")

# Usage example and testing
if __name__ == "__main__":
    import os

    print("🚀 **COMPLETE IMPROVED POLICY Q&A SYSTEM**")
    print("="*60)

    # Example document. Set the POLICY_PDF_URL environment variable to run
    # against a different source without editing this cell.
    # NOTE(review): the default below looks like a time-limited signed URL
    # (its query string carries an `se=2025-09-22...` parameter), so it will
    # probably stop working after that date — confirm and replace as needed.
    pdf_url = os.environ.get(
        "POLICY_PDF_URL",
        "https://hackrx.blob.core.windows.net/assets/Arogya%20Sanjeevani%20Policy%20-%20CIN%20-%20U10200WB1906GOI001713%201.pdf?sv=2023-01-03&st=2025-07-21T08%3A29%3A02Z&se=2025-09-22T08%3A29%3A00Z&sr=b&sp=r&sig=nzrz1K9Iurt%2BBXom%2FB%2BMPTFMFP3PRnIvEsipAX10Ig4%3D",
    )

    # One question per rule branch in ImprovedPolicyQA.extract_direct_answer.
    test_questions = [
        "What is the waiting period for surgery?",
        "Are dental treatments covered?",
        "What is the sum insured amount?",
        "What are the exclusions in this policy?",
        "How do I file a cashless claim?",
        "Is maternity covered and what are the conditions?",
        "What are the co-payment conditions in the policy tables?",
        "What is covered under arogya sanjeevani policy?"
    ]

    print("📋 **Available Functions:**")
    print("1. ask_improved_policy_question(pdf_url, single_question)")
    print("2. ask_improved_policy_questions(pdf_url, questions_list)")
    print("3. quick_improved_qa(pdf_url, questions_list)")
    print("4. interactive_improved_session(pdf_url)")

    print("\n🧪 **Running Test with Improved System:**")

    # Test the improved system
    improved_results = quick_improved_qa(pdf_url, test_questions)

    # Display final results
    print("\n" + "="*60)
    print("📋 **FINAL IMPROVED RESULTS**")
    print("="*60)

    for i, qa in enumerate(improved_results, 1):
        print(f"\n❓ **Q{i}:** {qa['question']}")
        print(f"💬 **Answer:** {qa['answer']}")
        print("-" * 50)

    print("\n✅ **Improved System Ready!**")
    print("🔧 **Usage:** quick_improved_qa(pdf_url, questions_list)")

    # Uncomment to start interactive session
    # interactive_improved_session(pdf_url)


✅ NLTK resources downloaded successfully
🚀 **COMPLETE IMPROVED POLICY Q&A SYSTEM**
📋 **Available Functions:**
1. ask_improved_policy_question(pdf_url, single_question)
2. ask_improved_policy_questions(pdf_url, questions_list)
3. quick_improved_qa(pdf_url, questions_list)
4. interactive_improved_session(pdf_url)

🧪 **Running Test with Improved System:**
🚀 **Initializing Improved Policy Q&A System**
📄 Step 1: Processing PDF...
✅ PDF processed in 69.12s
📊 Extracted 365721 characters

🔍 Step 2: Creating policy-categorized chunks...
🔍 Creating policy-categorized chunks...
✅ Created 357 categorized chunks
✅ Created 357 categorized chunks

🗂️ Step 3: Building vector store...
✅ Vector store created

🤖 Step 4: Loading language model...
✅ Model loaded on cpu

🎉 **System initialization complete!**
📝 **Processing 8 questions with improved system:**
--------------------------------------------------

🔍 Question 1/8: What is the waiting period for surgery?
💬 **Answer:** The waiting period for surger