In [1]:
import re
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


In [32]:
import spacy
nlp = spacy.load("en_core_web_sm")

def spacy_sent_tokenize(text):
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents]

def semantic_chunk(text, model, threshold=0.75):
    sentences = spacy_sent_tokenize(text)
    if not sentences:
        return []
    embeddings = model.encode(sentences)
    chunks = []
    current_chunk = [sentences[0]]
    for i in range(1, len(sentences)):
        sim = cosine_similarity([embeddings[i-1]], [embeddings[i]])[0][0]
        if sim < threshold:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentences[i]]
        else:
            current_chunk.append(sentences[i])
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks


In [33]:
def table_chunk(text):
    lines = text.split('\n')
    table_lines = [line for line in lines if re.search(r'\b\d{1,3}%|\$\d+', line)]
    return ["\n".join(table_lines)] if table_lines else []


In [34]:
def fixed_chunk(text, chunk_size=500):
    words = text.split()
    return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]


In [35]:
def choose_chunking_strategy(text):
    if re.search(r'\b\d{1,3}%|\$\d+', text):  # Table-like
        return "table"
    elif len(text.split()) > 800:  # Long text fallback
        return "fixed"
    else:
        return "semantic"


In [6]:
class AgenticChunker:
    def __init__(self, embedder):
        self.model = embedder

    def chunk(self, text):
        strategy = choose_chunking_strategy(text)
        if strategy == "table":
            return table_chunk(text)
        elif strategy == "fixed":
            return fixed_chunk(text)
        else:
            return semantic_chunk(text, self.model)


In [36]:
from langchain.document_loaders import PyPDFLoader

embedder = SentenceTransformer("all-MiniLM-L6-v2")
agentic_chunker = AgenticChunker(embedder)
pdf_path = "datastore/phi-basic.pdf"  # ← replace with actual PDF path

loader = PyPDFLoader(pdf_path)
pages = loader.load()

for page in pages:
    page_text = page.page_content.strip()
    chunks = agentic_chunker.chunk(page_text)
    for chunk in chunks:
        print("🔹", chunk[:200], "...\n---")


🔹 Persona l Health  Insurance  
Basic with Semi - Private Hospital  
Series 9 .01  
SAMPLE ...
---
🔹 The following policy wording is provided solely for your convenience and reference. ...
---
🔹 It is 
incomplete and reflects only some of the general provisions that may be found in some of 
our insurance policies. ...
---
🔹 We periodically make changes to policy wording and therefore this 
incomplete sample may not duplicate the wording of any actual issued policy. ...
---
🔹 It is not to be 
construed or interpreted in any manner as a contract or an offer to contract. ...
---
🔹 The actual 
policy issued to any given client will govern that relationship. ...
---
🔹 PHI Basic with Semi-Private Hospital Policy Sample Series 9.01 2023  
Sun Life Assurance Company of Canada agrees to provide the benefits of this policy according to its 
terms and conditions. ...
---
🔹 In this document, you and your mean the owner of this policy. ...
---
🔹 We, us, our, and the company mean 
Sun Life Assurance

In [22]:
def is_complex_chunk(text):
    return bool(re.search(r'exclusion|limitation|condition', text, re.IGNORECASE))

question_templates = [
    {"question": "What is the {entity} reimbursement rate for the {plan_type} plan?", "entity_key": r"(\w+)\s+reimbursement"},
    {"question": "What is the annual maximum for {entity} in the {plan_type} plan?", "entity_key": r"maximum of \$([\d,]+)"}
]

def generate_template_qa(text, metadata):
    qa_pairs = []
    entities = re.findall(r'(\w+)\s+reimbursement|maximum of \$([\d,]+)', text, re.IGNORECASE)
    for entity in entities:
        for template in question_templates:
            if re.search(template["entity_key"], text, re.IGNORECASE):
                ent = entity[0] or entity[1]
                question = template["question"].format(entity=ent, plan_type=metadata["plan_type"])
                qa_pairs.append({"question": question, "answer": text[:200]})
    return qa_pairs



In [45]:
from langchain_core.messages import HumanMessage
import json

from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import PromptTemplate  # ✅ Required for format_prompt()


llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.7)
from langchain_core.prompts import PromptTemplate

qa_prompt = PromptTemplate(
    input_variables=["chunk", "metadata"],
    template="""
    You are a JSON generator for question-answer pairs. Given the document chunk and metadata below, generate 2-3 QA pairs in valid JSON format. Output ONLY the JSON array, with no additional text, explanations, or markdown.

    Chunk: {chunk}
    Metadata: {metadata}

    Example output:
    [{"question": "What is the drug reimbursement rate?", "answer": "60% reimbursement"},
     {"question": "What is the annual maximum?", "answer": "$750"}]
    """
)

def generate_llm_qa(text, metadata):
    try:
        # Format the prompt using PromptTemplate
        formatted_prompt = qa_prompt.format_prompt(
            chunk=text[:1000],
            metadata=json.dumps(metadata)
        ).to_string()

        # Get response from ChatOpenAI
        response = llm([HumanMessage(content=formatted_prompt)]).content

        # Try direct parse
        try:
            data = json.loads(response)
            if isinstance(data, list) and all("question" in qa and "answer" in qa for qa in data):
                return data
        except json.JSONDecodeError:
            pass

        # Fallback: slice out JSON array
        start = response.find("[")
        end = response.rfind("]") + 1
        if start != -1 and end != -1:
            try:
                data = json.loads(response[start:end])
                if isinstance(data, list) and all("question" in qa and "answer" in qa for qa in data):
                    return data
            except Exception as e:
                print("⚠️ Failed to extract JSON array:", e)

        # Final fallback
        print("⚠️ Invalid JSON structure in LLM output:\n", response)
    except Exception as e:
        print(f"⚠️ LLM QA generation failed: {e}")

    return []

def generate_template_qa(text, metadata):
    qa_pairs = []
    entities = re.findall(r'(\w+)\s+reimbursement|maximum of \$([\d,]+)', text, re.IGNORECASE)
    for entity in entities:
        for template in question_templates:
            if re.search(template["entity_key"], text, re.IGNORECASE):
                ent = entity[0] or entity[1]
                question = template["question"].format(entity=ent, plan_type=metadata["plan_type"])
                qa_pairs.append({"question": question, "answer": text[:200]})
    return qa_pairs



In [46]:
qa_dataset = []

pdf_file = "phi-basic.pdf"
plan_type = "basic"  # Or infer from filename if needed
loader = PyPDFLoader(f"datastore/{pdf_file}")
pages = loader.load()

embedder = SentenceTransformer("all-MiniLM-L6-v2")
agentic_chunker = AgenticChunker(embedder)

for page in pages:
    page_text = page.page_content.strip()
    chunks = agentic_chunker.chunk(page_text)

    for chunk_text in chunks:
        metadata = {
            "plan_type": plan_type,
            "source_file": pdf_file
        }
        # Route to LLM or template QA
        if is_complex_chunk(chunk_text):
            qa_pairs = generate_llm_qa(chunk_text, metadata)
        else:
            qa_pairs = generate_template_qa(chunk_text, metadata)

        for qa in qa_pairs:
            if isinstance(qa, dict) and "question" in qa and "answer" in qa:
                qa_dataset.append({
                    "question": qa["question"],
                    "answer": qa["answer"],
                    "source_file": pdf_file,
                    "plan_type": plan_type
                })
            else:
                print("⚠️ Skipping malformed QA:", repr(qa))


⚠️ LLM QA generation failed: '"question"'
⚠️ LLM QA generation failed: '"question"'
⚠️ LLM QA generation failed: '"question"'
⚠️ LLM QA generation failed: '"question"'
⚠️ LLM QA generation failed: '"question"'
⚠️ LLM QA generation failed: '"question"'
⚠️ LLM QA generation failed: '"question"'
⚠️ LLM QA generation failed: '"question"'
⚠️ LLM QA generation failed: '"question"'
⚠️ LLM QA generation failed: '"question"'
⚠️ LLM QA generation failed: '"question"'
⚠️ LLM QA generation failed: '"question"'
⚠️ LLM QA generation failed: '"question"'
⚠️ LLM QA generation failed: '"question"'
⚠️ LLM QA generation failed: '"question"'
⚠️ LLM QA generation failed: '"question"'
⚠️ LLM QA generation failed: '"question"'
⚠️ LLM QA generation failed: '"question"'
⚠️ LLM QA generation failed: '"question"'
⚠️ LLM QA generation failed: '"question"'


In [16]:
with open("raft_qa_dataset_agentic.json", "w") as f:
    json.dump(qa_dataset, f, indent=2)

print(f"✅ Generated {len(qa_dataset)} QA pairs using Agentic Chunking.")


✅ Generated 0 QA pairs using Agentic Chunking.


In [42]:
import os
import re
import json
import nltk
import pdfplumber
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
from langchain.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
from langchain_core.prompts import PromptTemplate

nltk.download('punkt')
load_dotenv()


llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5)

qa_prompt = PromptTemplate(
    input_variables=["chunk", "metadata"],
    template="""
    You are a JSON generator for question-answer pairs. Given the document chunk and metadata below, generate 2-3 QA pairs in valid JSON format. Output ONLY the JSON array.

    Chunk: {chunk}
    Metadata: {metadata}

    Example output:
    [{{"question": "What is the drug reimbursement rate?", "answer": "60% reimbursement"}},
     {{"question": "What is the annual maximum?", "answer": "$750"}}]
    """
)

class AgenticChunker:
    def __init__(self, embedder):
        self.model = embedder

    def chunk(self, text):
        if re.search(r'\b\d{1,3}%|\$\d+', text):
            return [text]  # Table-style fallback
        elif len(text.split()) > 800:
            return [" ".join(text.split()[i:i+500]) for i in range(0, len(text.split()), 500)]
        else:
            return self.semantic_chunk(text)

    import spacy
    nlp = spacy.load("en_core_web_sm")

    def spacy_sent_tokenize(text):
        doc = nlp(text)
        return [sent.text.strip() for sent in doc.sents]
    
    def semantic_chunk(self, text, threshold=0.75):
        sentences = spacy_sent_tokenize(text)
        if not sentences:
            return []
        embeddings = self.model.encode(sentences)
        chunks = []
        current = [sentences[0]]
        for i in range(1, len(sentences)):
            sim = cosine_similarity([embeddings[i-1]], [embeddings[i]])[0][0]
            if sim < threshold:
                chunks.append(" ".join(current))
                current = [sentences[i]]
            else:
                current.append(sentences[i])
        if current:
            chunks.append(" ".join(current))
        return chunks

def is_complex_chunk(text):
    return bool(re.search(r'exclusion|limitation|condition', text, re.IGNORECASE))

def generate_llm_qa(text, metadata):
    try:
        formatted_prompt = qa_prompt.format_prompt(
            chunk=text[:1000],
            metadata=json.dumps(metadata)
        ).to_string()

        response = llm([HumanMessage(content=formatted_prompt)]).content
        print("\n🧾 Raw LLM Output:\n", response)

        try:
            qa_list = json.loads(response)
        except:
            start, end = response.find("["), response.rfind("]") + 1
            qa_list = json.loads(response[start:end]) if start != -1 and end != -1 else []

        valid_qas = []
        for qa in qa_list:
            if isinstance(qa, dict) and "question" in qa and "answer" in qa:
                valid_qas.append(qa)
            else:
                print("⚠️ Skipping malformed QA:", repr(qa))
        return valid_qas
    except Exception as e:
        print("❌ LLM QA generation failed:", e)
        return []

def generate_template_qa(text, metadata):
    pattern = r'(\w+)\s+reimbursement|maximum of \$([\d,]+)'
    matches = re.findall(pattern, text)
    qa_pairs = []
    for m in matches:
        entity = m[0] or m[1]
        question = f"What is the {entity} reimbursement rate for the {metadata['plan_type']} plan?"
        qa_pairs.append({"question": question, "answer": text[:200]})
    return qa_pairs

pdf_file = "phi-basic.pdf"  # Put in datastore/
loader = PyPDFLoader(f"datastore/{pdf_file}")
pages = loader.load()
plan_type = "basic"

embedder = SentenceTransformer("all-MiniLM-L6-v2")
chunker = AgenticChunker(embedder)

qa_dataset = []

for page in pages:
    chunks = chunker.chunk(page.page_content)
    for chunk in chunks:
        metadata = {"plan_type": plan_type, "source_file": pdf_file}
        qa_pairs = generate_llm_qa(chunk, metadata) if is_complex_chunk(chunk) else generate_template_qa(chunk, metadata)

        for qa in qa_pairs:
            qa_dataset.append({
                "question": qa["question"],
                "answer": qa["answer"],
                "source_file": pdf_file,
                "plan_type": plan_type
            })


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nandhinirajasekaran/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  response = llm([HumanMessage(content=formatted_prompt)]).content



🧾 Raw LLM Output:
 
[
    {"question": "What benefits does the PHI Basic policy provide?", "answer": "The PHI Basic policy provides benefits according to its terms and conditions."},
    {"question": "Who is the provider of the PHI Basic policy benefits?", "answer": "Sun Life Assurance Company of Canada"},
    {"question": "What year does the PHI Basic policy Sample Series 9.01 2023 cover?", "answer": "2023"}
]

🧾 Raw LLM Output:
 [
    {"question": "What does the document outline?", "answer": "The benefits payable and exclusions and limitations."},
    {"question": "What type of plan is this?", "answer": "Basic"},
    {"question": "What is the source file of this information?", "answer": "phi-basic.pdf"}
]

🧾 Raw LLM Output:
 
[
    {"question": "What are statutory conditions?", "answer": "Conditions that are mandated by law and must be included in insurance policies."},
    {"question": "What type of plan is this?", "answer": "Basic insurance plan."},
    {"question": "Where can I f

In [44]:
with open("raft_qa_dataset_agentic.json", "w") as f:
    json.dump(qa_dataset, f, indent=2)

print(f"✅ Saved {len(qa_dataset)} QA pairs using agentic chunking.")


✅ Saved 65 QA pairs using agentic chunking.
