In [12]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
import spacy
import re
import os
import re
import json
import nltk
import pdfplumber
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
from langchain.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import PromptTemplate

qa_prompt = PromptTemplate(
    input_variables=["chunk", "metadata"],
    template="""
    You are a JSON generator for question-answer pairs. Given the document chunk and metadata below, generate 2-3 QA pairs in valid JSON format. Output ONLY the JSON array.

    Chunk: {chunk}
    Metadata: {metadata}

    Example output:
    [{{"question": "What is the drug reimbursement rate?", "answer": "60% reimbursement"}},
     {{"question": "What is the annual maximum?", "answer": "$750"}}]
    """
)

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5)
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
nlp = spacy.load("en_core_web_sm")

loader = PyPDFLoader("datastore/phi-basic.pdf")
pages = loader.load()  # Each page is one large block

vector_store = FAISS.from_documents(pages, embedding_model)



retrieved_docs = vector_store.similarity_search("drug coverage", k=3)
print("Retrieved:\n", [d.page_content[:300] for d in retrieved_docs])

#from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


def is_complex_chunk(text):
    return bool(re.search(r'exclusion|limitation|condition', text, re.IGNORECASE))

def generate_llm_qa(text, metadata):
    try:
        formatted_prompt = qa_prompt.format_prompt(
            chunk=text[:1000],
            metadata=json.dumps(metadata)
        ).to_string()

        response = llm([HumanMessage(content=formatted_prompt)]).content
        print("\n🧾 Raw LLM Output:\n", response)

        try:
            qa_list = json.loads(response)
        except:
            start, end = response.find("["), response.rfind("]") + 1
            qa_list = json.loads(response[start:end]) if start != -1 and end != -1 else []

        valid_qas = []
        for qa in qa_list:
            if isinstance(qa, dict) and "question" in qa and "answer" in qa:
                valid_qas.append(qa)
            else:
                print("⚠️ Skipping malformed QA:", repr(qa))
        return valid_qas
    except Exception as e:
        print("❌ LLM QA generation failed:", e)
        return []

def generate_template_qa(text, metadata):
    pattern = r'(\w+)\s+reimbursement|maximum of \$([\d,]+)'
    matches = re.findall(pattern, text)
    qa_pairs = []
    for m in matches:
        entity = m[0] or m[1]
        question = f"What is the {entity} reimbursement rate for the {metadata['plan_type']} plan?"
        qa_pairs.append({"question": question, "answer": text[:200]})
    return qa_pairs


def spacy_sent_tokenize(text):
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents]

def semantic_chunk(text, model, threshold=0.75):
    sentences = spacy_sent_tokenize(text)
    if not sentences:
        return []
    embeddings = model.encode(sentences)
    chunks = []
    current = [sentences[0]]
    for i in range(1, len(sentences)):
        sim = cosine_similarity([embeddings[i-1]], [embeddings[i]])[0][0]
        if sim < threshold:
            chunks.append(" ".join(current))
            current = [sentences[i]]
        else:
            current.append(sentences[i])
    if current:
        chunks.append(" ".join(current))
    return chunks

embedder = SentenceTransformer("all-MiniLM-L6-v2")
late_chunks = []
for doc in retrieved_docs:
    late_chunks.extend(semantic_chunk(doc.page_content, embedder))


#from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


def spacy_sent_tokenize(text):
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents]

def semantic_chunk(text, model, threshold=0.75):
    sentences = spacy_sent_tokenize(text)
    if not sentences:
        return []
    embeddings = model.encode(sentences)
    chunks = []
    current = [sentences[0]]
    for i in range(1, len(sentences)):
        sim = cosine_similarity([embeddings[i-1]], [embeddings[i]])[0][0]
        if sim < threshold:
            chunks.append(" ".join(current))
            current = [sentences[i]]
        else:
            current.append(sentences[i])
    if current:
        chunks.append(" ".join(current))
    return chunks

embedder = SentenceTransformer("all-MiniLM-L6-v2")
late_chunks = []
for doc in retrieved_docs:
    late_chunks.extend(semantic_chunk(doc.page_content, embedder))

qa_dataset = []
for chunk in late_chunks:
    metadata = {"source_file": "phi-basic.pdf", "plan_type": "basic"}
    qa_pairs = generate_llm_qa(chunk, metadata) if is_complex_chunk(chunk) else generate_template_qa(chunk, metadata)

    for qa in qa_pairs:
        if isinstance(qa, dict) and "question" in qa and "answer" in qa:
            qa_dataset.append({
                "question": qa["question"],
                "answer": qa["answer"],
                "source_file": "phi-basic.pdf",
                "plan_type": "basic"
            })

with open("raft_qa_dataset_late_chunking.json", "w") as f:
    json.dump(qa_dataset, f, indent=2)


Retrieved:
 ['4 \nDrug provision \nPrescription drugs \nDrugs covered under this plan must have a Drug Identification Number (DIN). \nWe will cover the cost of the following drugs and supplies that are prescribed by a physician or \ndentist and are obtained from a pharmacist:  \n• drugs that legally require a prescript', '2 \nPlan summary  \n \n \nNote: \nWe will only reimburse medical expenses that are not covered by the insured person’s provincial \no\nr territorial health care plan. \n \n \nDrug \nThe amount we pay for the dispensing fee reimbursement is 100% but is limited to a maximum of $5 \np\ner prescription. \nDrug  (for ', '10 \nWe confirm whether the ex pense you submitted is an eligible expense. We determine if there are \nany limitations and exclusions which are described in the applicable provisions. If any of the \nexpenses aren’t eligible, we subtract that expense from the total amount you are claiming. \nFor each el']


  response = llm([HumanMessage(content=formatted_prompt)]).content



🧾 Raw LLM Output:
 [
    {"question": "What types of drugs will not be paid for even when prescribed?", "answer": "Drugs for the treatment of infertility, drugs for the treatment of sexual dysfunction, anti-obesity drugs, dietary supplements, infant formulas, minerals, proteins, vitamins, collagen treatments, contraceptives, the cost of giving injections, serums, vaccines, over-the-counter products designed to help quit smoking"},
    {"question": "Where can I find more information about what expenses will not be covered?", "answer": "In the 'When we will not pay (exclusions)' section of the Other information about your policy pages"},
    {"question": "What are some examples of expenses that will not be covered?", "answer": "Expenses incurred under any of the conditions specified in the 'When we will not pay (exclusions)' section of the Other information about your policy pages"}
]

🧾 Raw LLM Output:
 [
    {"question": "What does the document determine?", "answer": "Limitations and 

In [2]:
pip install faiss-cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp312-cp312-macosx_14_0_arm64.whl.metadata (5.0 kB)
Downloading faiss_cpu-1.11.0.post1-cp312-cp312-macosx_14_0_arm64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0.post1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
