In [3]:
import os
import re
import json
import nltk
import pdfplumber
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
from langchain.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import PromptTemplate

# Fetch the NLTK 'punkt' tokenizer data. The captured cell output shows this is
# a no-op when the package is already up to date.
# NOTE(review): no tokenizer call is visible in this cell — confirm it is still needed.
nltk.download('punkt')
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI credentials used by the
# client below — confirm, and keep this call ahead of the client construction.
load_dotenv()

# Shared chat model used by both the chunking and the QA-generation helpers.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5)

# Instruction text for the chunking step. `{text}` is the only placeholder; the
# model is asked to answer with a JSON list of chunk strings.
CHUNKING_TEMPLATE = """
You are a document analyzer. Split the following insurance policy text into 2–5 coherent and self-contained chunks, each suitable for generating question-answer pairs. Each chunk should:
- Be logically grouped by topic (e.g., dental, drug, exclusions)
- Not exceed ~1000 characters
- Be returned as a JSON list: ["chunk1...", "chunk2...", ...]

Text:
{text}
"""

# Prompt used by llm_informed_chunk to split one page of policy text.
chunking_prompt = PromptTemplate(
    template=CHUNKING_TEMPLATE,
    input_variables=["text"],
)

# Instruction text for QA generation. `{chunk}` and `{metadata}` are the two
# placeholders; the doubled braces in the example are literal JSON braces.
QA_TEMPLATE = """
    You are a JSON generator for question-answer pairs. Given the document chunk and metadata below, generate 2-3 QA pairs in valid JSON format. Output ONLY the JSON array.

    Chunk: {chunk}
    Metadata: {metadata}

    Example output:
    [{{"question": "What is the drug reimbursement rate?", "answer": "60% reimbursement"}},
     {{"question": "What is the annual maximum?", "answer": "$750"}}]
    """

# Prompt used by generate_llm_qa to request QA pairs for one chunk.
qa_prompt = PromptTemplate(
    template=QA_TEMPLATE,
    input_variables=["chunk", "metadata"],
)

def is_complex_chunk(text):
    """Report whether a chunk mentions exclusions, limitations or conditions.

    The check is a case-insensitive substring search, so inflected forms such
    as "Exclusions" or "conditional" also count.

    Args:
        text: Chunk text to inspect.

    Returns:
        True when any of the three keywords occurs in ``text``, else False.
    """
    keyword_hit = re.search(r'exclusion|limitation|condition', text, flags=re.IGNORECASE)
    return keyword_hit is not None

def generate_llm_qa(text, metadata, max_chars=1000):
    """Ask the chat model for question-answer pairs about one document chunk.

    The chunk (truncated to ``max_chars``) and the JSON-encoded metadata are
    rendered into ``qa_prompt`` and sent to ``llm``. The reply is parsed as
    JSON; when it is not pure JSON, the text between the first ``[`` and the
    last ``]`` is parsed instead. A reply that is a single JSON object is
    treated as a one-element list.

    Args:
        text: Chunk text to generate questions about.
        metadata: JSON-serializable dict describing the chunk's source.
        max_chars: Maximum number of characters of ``text`` placed in the
            prompt. ``None`` sends the whole chunk.

    Returns:
        A list of dicts that each contain at least the keys ``"question"``
        and ``"answer"``. Malformed entries are skipped with a printed
        warning. Any failure (model error, unparseable reply) is printed and
        results in an empty list; nothing is raised.
    """
    try:
        formatted_prompt = qa_prompt.format_prompt(
            chunk=text[:max_chars],
            metadata=json.dumps(metadata)
        ).to_string()

        # `.invoke` is the supported entry point; calling the model object
        # directly is deprecated in langchain-core.
        response = llm.invoke([HumanMessage(content=formatted_prompt)]).content
        print("\n🧾 Raw LLM Output:\n", response)

        try:
            qa_list = json.loads(response)
        except json.JSONDecodeError:
            # The model may wrap the array in prose or a code fence, so fall
            # back to the outermost bracketed span. rfind() returns -1 when
            # there is no closing bracket, hence the explicit ordering check.
            start, end = response.find("["), response.rfind("]")
            if 0 <= start < end:
                qa_list = json.loads(response[start:end + 1])
            else:
                print("⚠️ No JSON array found in LLM output")
                qa_list = []

        if isinstance(qa_list, dict):
            # A lone object instead of an array: keep it rather than
            # iterating over its keys and discarding everything.
            qa_list = [qa_list]
        elif not isinstance(qa_list, list):
            print("⚠️ Unexpected JSON payload type:", type(qa_list).__name__)
            qa_list = []

        valid_qas = []
        for qa in qa_list:
            if isinstance(qa, dict) and "question" in qa and "answer" in qa:
                valid_qas.append(qa)
            else:
                print("⚠️ Skipping malformed QA:", repr(qa))
        return valid_qas
    except Exception as e:
        # Best-effort by design: one bad chunk must not abort the whole run.
        print("❌ LLM QA generation failed:", e)
        return []

def generate_template_qa(text, metadata):
    """Build rule-based QA pairs from reimbursement and maximum mentions.

    Two patterns are recognised in ``text``:

    * ``<word> reimbursement`` produces a question about that word's
      reimbursement rate;
    * ``maximum of $<amount>`` produces a question about which benefit the
      maximum applies to. It is kept separate so a dollar amount is never
      phrased as a "reimbursement rate".

    Every pair uses the first 200 characters of ``text`` as its answer.
    Identical questions are emitted once, in order of first occurrence.

    Args:
        text: Chunk text to scan.
        metadata: Dict providing ``"plan_type"``, which is interpolated into
            each question.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts; empty when
        nothing matches.

    Raises:
        KeyError: If a pattern matches and ``metadata`` has no
            ``"plan_type"`` key.
    """
    pattern = r'(\w+)\s+reimbursement|maximum of \$([\d,]+)'
    answer = text[:200]
    qa_pairs = []
    seen_questions = set()
    for match in re.finditer(pattern, text):
        benefit, amount = match.group(1), match.group(2)
        plan = metadata['plan_type']
        if benefit:
            question = f"What is the {benefit} reimbursement rate for the {plan} plan?"
        else:
            # The character class also swallows a comma that merely follows
            # the number ("$1,000, whichever ..."); trim it.
            amount = amount.rstrip(",")
            question = f"What benefit has a maximum of ${amount} under the {plan} plan?"
        if question in seen_questions:
            # A repeated mention would yield a byte-identical QA pair.
            continue
        seen_questions.add(question)
        qa_pairs.append({"question": question, "answer": answer})
    return qa_pairs

from langchain_core.messages import HumanMessage

def llm_informed_chunk(text, min_chars=100):
    """Split text into topic-coherent chunks by asking the chat model.

    ``text`` is rendered into ``chunking_prompt`` and sent to ``llm``. The
    reply is parsed as a JSON list of strings; when it is not pure JSON, the
    span between the first ``[`` and the last ``]`` is parsed instead.

    Args:
        text: Raw text (one PDF page in this notebook) to split.
        min_chars: Chunks whose stripped length is not greater than this are
            dropped, which filters out bare headings the model may return.

    Returns:
        A list of stripped chunk strings. Empty when ``text`` is blank, when
        the reply holds no usable JSON list, or when the model call fails
        (the error is printed, not raised).
    """
    if not text or not text.strip():
        # Nothing to split; do not spend a model call on a blank page.
        return []

    prompt = chunking_prompt.format_prompt(text=text).to_string()
    try:
        # `.invoke` is the supported entry point; calling the model object
        # directly is deprecated in langchain-core.
        response = llm.invoke([HumanMessage(content=prompt)]).content
        print("🧾 Raw chunking response:\n", response)

        # Extract the JSON list from the response.
        try:
            chunks = json.loads(response)
        except json.JSONDecodeError:
            # rfind() returns -1 when there is no closing bracket (e.g. a
            # truncated reply), so check the ordering explicitly.
            start, end = response.find("["), response.rfind("]")
            chunks = json.loads(response[start:end + 1]) if 0 <= start < end else []

        if not isinstance(chunks, list):
            # A JSON object or scalar is not a chunk list.
            chunks = []

        # Final check: keep only strings that are long enough to be useful.
        return [chunk.strip() for chunk in chunks if isinstance(chunk, str) and len(chunk.strip()) > min_chars]
    except Exception as e:
        # Best-effort by design: one bad page must not abort the whole run.
        print("❌ LLM chunking failed:", e)
        return []


from langchain.document_loaders import PyPDFLoader

# --- Load the policy PDF and chunk it page by page ---------------------------
pdf_file = "phi-basic.pdf"
plan_type = "basic"
loader = PyPDFLoader(f"datastore/{pdf_file}")
pages = loader.load()

all_chunks = []
for page in pages:
    long_text = page.page_content.strip()
    if not long_text:
        # A blank page has nothing to chunk; skip it instead of spending a
        # model call on empty text.
        continue
    page_chunks = llm_informed_chunk(long_text)
    all_chunks.extend(page_chunks)

# --- Generate QA pairs for every chunk ---------------------------------------
# The metadata is the same for every chunk of this file, so build it once and
# reuse it both for the generators and for the stored records.
metadata = {"source_file": pdf_file, "plan_type": plan_type}

qa_dataset = []
for chunk in all_chunks:
    # Chunks mentioning exclusions/limitations/conditions go to the LLM; the
    # rest are handled by the cheaper regex templates.
    if is_complex_chunk(chunk):
        qa_pairs = generate_llm_qa(chunk, metadata)
    else:
        qa_pairs = generate_template_qa(chunk, metadata)

    for qa in qa_pairs:
        if isinstance(qa, dict) and "question" in qa and "answer" in qa:
            # Keep only the two QA fields plus the shared metadata, so any
            # extra keys the model may have added are dropped.
            qa_dataset.append({
                "question": qa["question"],
                "answer": qa["answer"],
                **metadata,
            })



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nandhinirajasekaran/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


🧾 Raw chunking response:
 [
  "This insurance policy is for Personal Health Insurance with Basic coverage and Semi-Private Hospital accommodation under Series 9.01. The provided policy wording is incomplete and serves as a general reference only. Actual issued policies may contain additional provisions not reflected in this sample. Policy wording is subject to periodic changes, and the actual policy issued to a client will govern the relationship.",
  "The information provided in this sample policy is not to be considered a contract or an offer to contract. It is important to note that the wording in the actual issued policy may differ from this sample. Clients should refer to their specific policy for accurate details and coverage information. Any discrepancies between this sample wording and the issued policy will be resolved in favor of the policy terms."
]
🧾 Raw chunking response:
 [
  "Chunk 1: Introduction and Definitions",
  "Sun Life Assurance Company of Canada agrees to provid

In [4]:
# Persist the collected QA records as pretty-printed JSON (2-space indent).
output_path = "raft_qa_dataset_llm_chunked.json"
with open(output_path, "w") as out_file:
    out_file.write(json.dumps(qa_dataset, indent=2))

print(f"✅ Saved {len(qa_dataset)} QA pairs using LLM-Informed Chunking.")


✅ Saved 61 QA pairs using LLM-Informed Chunking.
