In [18]:
import spacy
import tiktoken
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize

nlp = spacy.load("en_core_web_sm")
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def spacy_sent_tokenize(text):
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents]


def hybrid_semantic_context_chunk(text, token_limit=500, similarity_threshold=0.75):
    sentences = spacy_sent_tokenize(text)
    if not sentences:
        return []

    embeddings = embedder.encode(sentences)
    chunks = []
    current = [sentences[0]]
    current_tokens = len(tokenizer.encode(sentences[0]))

    for i in range(1, len(sentences)):
        sim = cosine_similarity([embeddings[i-1]], [embeddings[i]])[0][0]
        sentence_tokens = len(tokenizer.encode(sentences[i]))

        # If adding this sentence will exceed token limit or it's not semantically close
        if sim < similarity_threshold or (current_tokens + sentence_tokens > token_limit):
            chunks.append(" ".join(current))
            current = [sentences[i]]
            current_tokens = sentence_tokens
        else:
            current.append(sentences[i])
            current_tokens += sentence_tokens

    if current:
        chunks.append(" ".join(current))

    return chunks


In [36]:
import os
import json
import re
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import tiktoken

# === CONFIG ===
DOCS_DIR = "datastore/"
OUTPUT_FILE = "raft_qa_dataset_hybrid.json"
MODEL_NAME = "gpt-3.5-turbo"
TOKEN_LIMIT = 600
OVERLAP = 30
SIM_THRESHOLD = 0.75

nlp = spacy.load("en_core_web_sm")
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def spacy_sent_tokenize(text):
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents]

# === COMPONENTS ===
llm = ChatOpenAI(model_name=MODEL_NAME, temperature=0.5)
embedder = SentenceTransformer("all-MiniLM-L6-v2")
tokenizer = tiktoken.encoding_for_model(MODEL_NAME)

qa_prompt = PromptTemplate(
    input_variables=["chunk", "metadata"],
    template="""
You are a JSON generator for question-answer pairs. Given the document chunk and metadata below, generate 2-3 QA pairs in valid JSON format. Output ONLY the JSON array, with no additional text, explanations, or markdown.

Chunk: {chunk}
Metadata: {metadata}

Example output:
[{{"question": "What is the drug reimbursement rate?", "answer": "60% reimbursement"}},
 {{"question": "What is the annual maximum?", "answer": "$750"}}]
"""
)

# === HELPERS ===
def is_complex_chunk(text):
    return bool(re.search(r"exclusion|limitation|condition", text, re.IGNORECASE))

def clean_json_response(response):
    if hasattr(response, "content"):
        response = response.content
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        start = response.find("[")
        end = response.rfind("]") + 1
        if start != -1 and end != -1:
            try:
                return json.loads(response[start:end])
            except:
                pass
        print("⚠️ Invalid response:\n", response)
        return []

def generate_llm_qa(text, metadata):
    try:
        prompt = qa_prompt.format_prompt(
            chunk=text[:1000],
            metadata=json.dumps(metadata)
        ).to_string()
        response = llm([HumanMessage(content=prompt)])
        return clean_json_response(response)
    except Exception as e:
        print(f"❌ LLM QA generation failed: {e}")
        return []

def hybrid_semantic_context_chunk(text, token_limit=TOKEN_LIMIT, similarity_threshold=SIM_THRESHOLD):
    sentences = spacy_sent_tokenize(text)
    if not sentences:
        return []

    embeddings = embedder.encode(sentences)
    chunks = []
    current = [sentences[0]]
    current_tokens = len(tokenizer.encode(sentences[0]))

    for i in range(1, len(sentences)):
        sim = cosine_similarity([embeddings[i-1]], [embeddings[i]])[0][0]
        sentence_tokens = len(tokenizer.encode(sentences[i]))

        if sim < similarity_threshold or (current_tokens + sentence_tokens > token_limit):
            chunks.append(" ".join(current))
            current = [sentences[i]]
            current_tokens = sentence_tokens
        else:
            current.append(sentences[i])
            current_tokens += sentence_tokens

    if current:
        chunks.append(" ".join(current))

    return chunks

# === MAIN PIPELINE ===
qa_dataset = []

for pdf_file in os.listdir(DOCS_DIR):
    if not pdf_file.endswith(".pdf"):
        continue

    print(f"📄 Processing: {pdf_file}")
    file_path = os.path.join(DOCS_DIR, pdf_file)
    loader = PyPDFLoader(file_path)
    pages = loader.load()

    plan_type = "basic" if "basic" in pdf_file.lower() else "standard" if "standard" in pdf_file.lower() else "enhanced"

    for page in pages:
        chunks = hybrid_semantic_context_chunk(page.page_content)
        for chunk in chunks:
            metadata = {"source_file": pdf_file, "plan_type": plan_type}
            qa_pairs = generate_llm_qa(chunk, metadata)

            for qa in qa_pairs:
                if not isinstance(qa, dict) or "question" not in qa or "answer" not in qa:
                    print("⚠️ Skipping malformed QA:", qa)
                    continue
                qa_dataset.append({
                    "question": qa["question"],
                    "answer": qa["answer"],
                    "source_file": metadata["source_file"],
                    "plan_type": metadata["plan_type"]
                })

# === SAVE OUTPUT ===
with open(OUTPUT_FILE, "w") as f:
    json.dump(qa_dataset, f, indent=2)

print(f"\n✅ Saved {len(qa_dataset)} QA pairs to {OUTPUT_FILE}")


📄 Processing: phi-basic.pdf

✅ Saved 792 QA pairs to raft_qa_dataset_hybrid.json


In [41]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def deduplicate_qa_by_question(qa_dataset, threshold=0.8):
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    questions = [qa["question"] for qa in qa_dataset]
    
    # ⬇️ Move tensor from MPS to CPU before numpy conversion
    embeddings = embedder.encode(questions, convert_to_tensor=True).cpu().numpy()
    sim_matrix = cosine_similarity(embeddings)

    keep_indices = []
    seen = set()

    for i in range(len(qa_dataset)):
        if i in seen:
            continue
        keep_indices.append(i)
        for j in range(i + 1, len(qa_dataset)):
            if sim_matrix[i][j] >= threshold:
                seen.add(j)

    return [qa_dataset[i] for i in keep_indices]


def exact_deduplicate(qa_dataset):
    seen = set()
    deduped = []
    for qa in qa_dataset:
        key = (qa["question"].strip().lower(), qa["answer"].strip().lower())
        if key not in seen:
            seen.add(key)
            deduped.append(qa)
    return deduped



In [45]:
qa_dataset_new = exact_deduplicate(qa_dataset)
qa_dataset_new1 = deduplicate_qa_by_question(qa_dataset_new, threshold=0.8)

In [47]:
# === SAVE OUTPUT ===

OUTPUT_FILE = "raft_qa_dataset_hybrid_v1.json"

with open(OUTPUT_FILE, "w") as f:
    json.dump(qa_dataset_new1, f, indent=2)

print(f"\n✅ Saved {len(qa_dataset)} QA pairs to {OUTPUT_FILE}")


✅ Saved 792 QA pairs to raft_qa_dataset_hybrid_v1.json
