In [None]:
import tiktoken
import os
import re
import json
import nltk
import pdfplumber
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
from langchain.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
from langchain_core.prompts import PromptTemplate

# Prompt used by generate_llm_qa: asks the model for 2-3 question/answer pairs
# about one document chunk and to reply with nothing but a JSON array.
# `{chunk}` and `{metadata}` are the two fill-in variables; the braces in the
# example output are doubled (`{{` / `}}`) so they are emitted literally instead
# of being treated as template placeholders.
qa_prompt = PromptTemplate(
    input_variables=["chunk", "metadata"],
    template="""
    You are a JSON generator for question-answer pairs. Given the document chunk and metadata below, generate 2-3 QA pairs in valid JSON format. Output ONLY the JSON array.

    Chunk: {chunk}
    Metadata: {metadata}

    Example output:
    [{{"question": "What is the drug reimbursement rate?", "answer": "60% reimbursement"}},
     {{"question": "What is the annual maximum?", "answer": "$750"}}]
    """
)

# Chat model shared by every generate_llm_qa call.
# NOTE(review): load_dotenv is imported but never called in this cell, so the
# OpenAI credentials are presumably expected in the process environment — confirm.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5)

def is_complex_chunk(text):
    """Return True when *text* mentions an exclusion, limitation or condition.

    The match is a case-insensitive substring search, so plural and derived
    forms ("Exclusions", "conditional") also count.
    """
    keyword_hit = re.search(r'exclusion|limitation|condition', text, re.IGNORECASE)
    return keyword_hit is not None

def generate_llm_qa(text, metadata):
    """Ask the chat model for QA pairs about one document chunk.

    The first 1000 characters of *text* and the JSON-serialised *metadata* are
    substituted into ``qa_prompt`` and sent to ``llm``. The reply is parsed as
    JSON; if the whole reply is not valid JSON, the span from the first ``[``
    to the last ``]`` is tried instead.

    Args:
        text: Chunk of document text to generate questions about.
        metadata: JSON-serialisable mapping describing the chunk.

    Returns:
        A list of dicts that each contain at least the keys ``"question"`` and
        ``"answer"``. Returns an empty list when the call or the parsing fails;
        this function is best-effort and never raises for those failures.
    """
    try:
        formatted_prompt = qa_prompt.format_prompt(
            chunk=text[:1000],
            metadata=json.dumps(metadata)
        ).to_string()

        # `.invoke` is the supported call style; calling the model object
        # directly is deprecated in langchain-core.
        response = llm.invoke([HumanMessage(content=formatted_prompt)]).content
        print("\n🧾 Raw LLM Output:\n", response)

        try:
            qa_list = json.loads(response)
        except json.JSONDecodeError:
            # The reply may wrap the array in prose or a code fence: retry on
            # the outermost [...] span, but only if one actually exists.
            start, end = response.find("["), response.rfind("]")
            qa_list = json.loads(response[start:end + 1]) if 0 <= start < end else []

        # A single QA object instead of an array is still usable.
        if isinstance(qa_list, dict):
            qa_list = [qa_list]
        if not isinstance(qa_list, list):
            print("⚠️ Skipping malformed QA:", repr(qa_list))
            return []

        valid_qas = []
        for qa in qa_list:
            if isinstance(qa, dict) and "question" in qa and "answer" in qa:
                valid_qas.append(qa)
            else:
                print("⚠️ Skipping malformed QA:", repr(qa))
        return valid_qas
    except Exception as e:
        # Deliberate catch-all boundary: one bad chunk must not abort the run.
        print("❌ LLM QA generation failed:", e)
        return []

def generate_template_qa(text, metadata):
    """Build rule-based QA pairs from reimbursement / maximum phrases in *text*.

    Two phrase shapes are recognised:

    * ``"<word> reimbursement"`` produces a question about that word's
      reimbursement rate.
    * ``"maximum of $<amount>"`` produces a question about the benefit that
      carries that maximum. Previously the dollar amount was inserted into the
      reimbursement-rate question, yielding text such as
      "What is the 750 reimbursement rate ...".

    Args:
        text: Chunk of document text to scan.
        metadata: Mapping that must contain ``"plan_type"`` whenever *text*
            contains at least one match.

    Returns:
        One ``{"question", "answer"}`` dict per match, in order of appearance.
        The answer is always the first 200 characters of *text*.

    Raises:
        KeyError: If a phrase matches and *metadata* has no ``"plan_type"``.
    """
    pattern = r'(\w+)\s+reimbursement|maximum of \$([\d,]+)'
    qa_pairs = []
    for match in re.finditer(pattern, text):
        # Exactly one of the two groups is set, depending on which
        # alternative of the pattern matched.
        entity, amount = match.groups()
        if entity:
            question = f"What is the {entity} reimbursement rate for the {metadata['plan_type']} plan?"
        else:
            question = f"What benefit has a maximum of ${amount} under the {metadata['plan_type']} plan?"
        qa_pairs.append({"question": question, "answer": text[:200]})
    return qa_pairs


def get_tokenizer(model="gpt-3.5-turbo"):
    """Return the tiktoken encoding that tiktoken maps to the given model name."""
    encoding = tiktoken.encoding_for_model(model)
    return encoding

# Module-level tokenizer for the default model.
# NOTE(review): context_window_chunk builds its own tokenizer through
# get_tokenizer(model) and does not read this global.
tokenizer = get_tokenizer()

def context_window_chunk(text, max_tokens=500, overlap=100, model="gpt-3.5-turbo"):
    """Split *text* into overlapping windows measured in model tokens.

    Consecutive windows start ``max_tokens - overlap`` tokens apart, so each
    window repeats the last ``overlap`` tokens of the previous one. Chunking
    stops as soon as a window reaches the end of the text; the earlier
    implementation kept stepping and emitted an extra trailing chunk that was
    entirely contained in the previous window.

    Args:
        text: Text to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by neighbouring chunks; must be
            smaller than ``max_tokens``.
        model: Model name used to select the tokenizer.

    Returns:
        The decoded, whitespace-stripped chunks in document order. Empty text
        yields an empty list.

    Raises:
        ValueError: If ``max_tokens`` is not positive or ``overlap`` is not
            smaller than ``max_tokens`` (the window would never advance, which
            previously caused an endless loop).
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be positive")
    if overlap >= max_tokens:
        raise ValueError("overlap must be smaller than max_tokens")

    tokenizer = get_tokenizer(model)
    tokens = tokenizer.encode(text)
    total = len(tokens)
    step = max_tokens - overlap
    chunks = []

    for start in range(0, total, step):
        end = min(start + max_tokens, total)
        chunks.append(tokenizer.decode(tokens[start:end]).strip())
        if end == total:
            # This window already covers the tail; another step would only
            # repeat tokens that are in this chunk.
            break

    return chunks

from langchain.document_loaders import PyPDFLoader

# Load the source PDF.
loader = PyPDFLoader("datastore/phi-basic.pdf")
pages = loader.load()

# Cut each loaded page into overlapping token windows and pool the results.
context_chunks = []
for page in pages:
    context_chunks += context_window_chunk(page.page_content, max_tokens=500, overlap=100)


# Route each chunk to the LLM generator (chunks that mention exclusions,
# limitations or conditions) or to the regex template generator, then keep
# only well-formed pairs, tagged with their source file and plan type.
qa_dataset = []
for chunk in context_chunks:
    metadata = {"source_file": "phi-basic.pdf", "plan_type": "basic"}
    if is_complex_chunk(chunk):
        qa_pairs = generate_llm_qa(chunk, metadata)
    else:
        qa_pairs = generate_template_qa(chunk, metadata)

    qa_dataset.extend(
        {
            "question": qa["question"],
            "answer": qa["answer"],
            "source_file": "phi-basic.pdf",
            "plan_type": "basic"
        }
        for qa in qa_pairs
        if isinstance(qa, dict) and "question" in qa and "answer" in qa
    )


  from .autonotebook import tqdm as notebook_tqdm


📄 Processing: phi-basic.pdf


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  >>> import nltk
  >>> nltk.download('punkt_tab')
  For more information see: https://www.nltk.org/data.html

  Attempted to load tokenizers/punkt_tab/english/

  Searched in:
    - '/Users/nandhinirajasekaran/nltk_data'
    - '/Users/nandhinirajasekaran/Desktop/LLM/RAFT/.venv/nltk_data'
    - '/Users/nandhinirajasekaran/Desktop/LLM/RAFT/.venv/share/nltk_data'
    - '/Users/nandhinirajasekaran/Desktop/LLM/RAFT/.venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [2]:
# Persist the collected QA pairs as pretty-printed JSON, then report the count.
output_path = "raft_qa_dataset_context_chunking.json"
with open(output_path, "w") as out_file:
    out_file.write(json.dumps(qa_dataset, indent=2))

print(f"✅ Saved {len(qa_dataset)} QA pairs using context-window aware chunking.")


✅ Saved 68 QA pairs using context-window aware chunking.
