<a href="https://colab.research.google.com/github/minahilmemon/skills-introduction-to-github/blob/main/MD-User_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python3
import re
import pdfplumber
import numpy as np
from collections import namedtuple
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# 1) Define your richer Segment with a page field
# Record for one parsed clause. Fields, in positional order:
#   section_id - clause number, e.g. "11.2.1.3."
#   title      - clause heading, e.g. "Accessibility Upgrade Requirements"
#   text       - the clause text
#   page       - page number in the PDF
Segment = namedtuple("Segment", "section_id title text page")

# 2) Load & chunk the PDF, capturing page numbers
def load_segments_with_page(pdf_path: str, section_prefix: str = "11.2") -> list:
    """Split a PDF into numbered clauses, remembering the page of each one.

    Every page is scanned on its own for headings of the form
    ``<section_prefix>.<n>[.<n>...].`` (for example ``11.2.1.3.``) that begin
    a line. The first line following the number becomes the segment title;
    everything after it, up to the next heading on the same page, becomes the
    segment text.

    Args:
        pdf_path: Path of the PDF file to read.
        section_prefix: Dotted prefix shared by the clause numbers to extract.

    Returns:
        A list of ``Segment`` tuples ordered by page, then by position on the
        page. Page numbers start at 1.

    Note:
        Pages are parsed independently, so text at the top of a page that
        precedes the first heading on that page is not attached to any segment.
    """
    sec_id_re = rf"{re.escape(section_prefix)}\.\d+(?:\.\d+)*\."
    # A heading must start a line: a clause number quoted in the middle of a
    # sentence (a cross-reference) must neither open a new segment nor cut the
    # current one short. The same heading pattern is used as the terminator so
    # no text is dropped between a partial number and the next real heading.
    pattern = re.compile(
        rf"^[ \t]*({sec_id_re})\s*([\s\S]*?)(?=^[ \t]*{sec_id_re}|\Z)",
        re.MULTILINE,
    )

    segments = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # Fall back to an empty string when no text is extracted
            page_text = page.extract_text() or ""
            for sec_id, body in pattern.findall(page_text):
                # First line is the title; the remainder (possibly empty) is the text
                title, _, text = body.strip().partition("\n")
                segments.append(
                    Segment(sec_id, title.strip(), text.strip(), page_num)
                )
    return segments

# Parse the source document once at start-up
segments = load_segments_with_page("Section11.2.pdf")
if not segments:
    # An empty corpus can be neither calibrated nor searched; stop here with a
    # message that names the actual problem.
    raise RuntimeError("No sections could be extracted from Section11.2.pdf")

# 3) Compute embeddings once
model = SentenceTransformer("intfloat/e5-large-v2")
# Each chunk is embedded as "<section id> <title>" followed by its text
texts = [f"{seg.section_id} {seg.title}\n{seg.text}" for seg in segments]
# NOTE(review): the intfloat/e5-large-v2 model card asks for "passage: " /
# "query: " prefixes on inputs; none is added here or in find_best_match.
# Confirm whether that is intentional before changing it, because the
# calibrated threshold depends on how both sides are encoded.
embeddings = model.encode(texts, batch_size=16, show_progress_bar=True)

# 4) Calibrate threshold via nearest‐neighbour median
def calibrate_nn_median(embs: np.ndarray) -> float:
    """Derive a similarity cut-off from each chunk's nearest neighbour.

    For every row of ``embs`` the highest cosine similarity to any *other* row
    is taken; the median of those per-row maxima is returned.

    Args:
        embs: Embeddings, one per row.

    Returns:
        The median nearest-neighbour cosine similarity as a plain ``float``.
        With a single row there is no other chunk to compare against, so the
        result is ``-inf``.
    """
    pairwise = cosine_similarity(embs)
    # Mask every chunk's similarity with itself so it cannot win the row maximum
    np.fill_diagonal(pairwise, -np.inf)
    nearest_neighbour = np.max(pairwise, axis=1)
    return float(np.median(nearest_neighbour))

# Similarity a query must reach before a single section is returned as the answer
THRESHOLD = calibrate_nn_median(embeddings)
print(f"Calibrated similarity threshold: {THRESHOLD:.3f}")

# 5) Keep every segment next to its embedding, in the order of `segments`
embedded_segments = [*zip(segments, embeddings)]

# 6) Query function using NN threshold
def find_best_match(query: str, fallback_top_k=3):
    """Return the section that best answers ``query``, or a shortlist if unsure.

    The query is embedded with the module-level ``model`` and compared by
    cosine similarity against every entry of ``embedded_segments``.

    Args:
        query: Free-text question.
        fallback_top_k: Number of suggestions to return when the best
            similarity is below ``THRESHOLD``. Values outside
            ``0..len(embedded_segments)`` are clamped to that range.

    Returns:
        A dict with keys ``answer``, ``section``, ``title``, ``page`` and
        ``similarity`` when the best similarity reaches ``THRESHOLD``.
        Otherwise a list, best match first, of dicts with keys ``section``,
        ``title``, ``page`` and ``similarity``. The list is empty when there
        are no segments to search.
    """
    if not embedded_segments:
        # Nothing to compare against; argmax on an empty array would raise
        return []

    # NOTE(review): the intfloat/e5-large-v2 model card asks for a "query: "
    # prefix on queries; none is added here, matching how the corpus was
    # encoded. Confirm before changing either side.
    q_emb = model.encode([query])[0]
    seg_matrix = np.asarray([emb for _, emb in embedded_segments])
    # One batched call instead of one scikit-learn call per segment
    sims = cosine_similarity([q_emb], seg_matrix)[0]

    best_idx = int(sims.argmax())
    if sims[best_idx] >= THRESHOLD:
        seg = embedded_segments[best_idx][0]
        return {
            "answer": seg.text,
            "section": seg.section_id,
            "title": seg.title,
            "page": seg.page,
            "similarity": float(sims[best_idx]),
        }

    # Clamp first: slicing with [-0:] would select every segment, not none,
    # and a negative count would select an arbitrary tail.
    top_k = max(0, min(fallback_top_k, len(sims)))
    top_k_idx = sims.argsort()[::-1][:top_k]
    return [
        {
            "section": embedded_segments[i][0].section_id,
            "title": embedded_segments[i][0].title,
            "page": embedded_segments[i][0].page,
            "similarity": float(sims[i]),
        }
        for i in top_k_idx
    ]

# 7) Run interactively
if __name__ == "__main__":
    # Ask one question on stdin and report either the answer or a shortlist
    user_q = input("Enter your question: ")
    result = find_best_match(user_q)
    if not isinstance(result, dict):
        # Anything other than a dict is the list of fallback suggestions
        print("\n⚠ No confident match; here are the top suggestions:")
        for r in result:
            print(
                f" • {r['section']} (page {r['page']}), sim={r['similarity']:.3f}",
                f"   » {r['title']}",
                sep="\n",
            )
    else:
        # A dict is a single confident answer
        print(
            f"\n✔ Found answer in {result['section']} (page {result['page']})",
            result["answer"],
            f"(sim={result['similarity']:.3f})",
            sep="\n",
        )
