<a href="https://colab.research.google.com/github/minahilmemon/skills-introduction-to-github/blob/main/User_Input.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:


import re
import numpy as np
from collections import namedtuple
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# ─── Configuration ──────────────────────────────────────────────────────────────
# Upper bound on how many characters of a segment's text are printed for each
# fallback candidate in interactive_lookup (the slice is taken before trailing
# whitespace is stripped, so the printed snippet may be slightly shorter).
FALLBACK_SNIPPET_LEN = 300  # show 300 characters for each fallback chunk

# ─── Helpers ────────────────────────────────────────────────────────────────────

def extract_keywords(text):
    """
    Return the distinct keywords of *text*, in order of first appearance.

    The text is lowercased and split into ``\\w+`` tokens; tokens listed in
    ENGLISH_STOP_WORDS are discarded and later repeats of a token are dropped.
    """
    ordered_unique = {}
    for word in re.findall(r'\w+', text.lower()):
        if word in ENGLISH_STOP_WORDS:
            continue
        # dicts keep insertion order, so the first occurrence fixes the position
        ordered_unique.setdefault(word, None)
    return list(ordered_unique)

def highlight_text(text, keywords):
    """
    Wrap each whole-word, case-insensitive keyword occurrence in ``**``.

    All keywords are matched in a single regex pass, so text that has already
    been wrapped is never scanned again. This avoids the nested markers
    (e.g. ``****float****``) that a keyword-by-keyword substitution produces
    when the same keyword is supplied more than once.

    Args:
        text: The string to annotate.
        keywords: Iterable of literal strings to highlight. Empty strings are
            ignored; duplicates are harmless.

    Returns:
        *text* with every match replaced by ``**<matched text>**``. The
        original casing of the matched text is preserved. If there is nothing
        to highlight, *text* is returned unchanged.
    """
    # Skip empty keywords: r'\b\b' would match at every word boundary and
    # litter the text with empty '****' markers.
    unique_keywords = {kw for kw in keywords if kw}
    if not unique_keywords:
        return text

    # Longest first so a multi-word keyword wins over a shorter keyword that
    # is a prefix of it; the secondary sort key just keeps the order stable.
    ordered = sorted(unique_keywords, key=lambda kw: (-len(kw), kw))
    alternation = '|'.join(re.escape(kw) for kw in ordered)
    pattern = re.compile(rf'\b(?:{alternation})\b', flags=re.IGNORECASE)
    return pattern.sub(lambda m: f"**{m.group(0)}**", text)

def calibrate_nn_median(embeddings):
    """
    Nearest-neighbor based threshold calibration:
      - Compute all pairwise cosine similarities
      - For each segment, take its highest similarity to any *other* segment
      - Return the median of those neighbor similarities

    Args:
        embeddings: 2-D array-like with one embedding per row, as accepted by
            sklearn's cosine_similarity.

    Returns:
        The median nearest-neighbor cosine similarity as a Python float.

    Raises:
        ValueError: if fewer than two embeddings are supplied, because a
            nearest neighbor is undefined in that case.
    """
    if len(embeddings) < 2:
        raise ValueError(
            "calibrate_nn_median needs at least 2 embeddings, "
            f"got {len(embeddings)}"
        )
    sims = cosine_similarity(embeddings)
    # Exclude each row's self-similarity explicitly instead of assuming it is
    # the row's largest value, then take the best remaining neighbor.
    np.fill_diagonal(sims, -np.inf)
    nn_sims = sims.max(axis=1)
    return float(np.median(nn_sims))

# ─── Interactive lookup ─────────────────────────────────────────────────────────

def interactive_lookup(
    model,
    segments,
    segment_embeddings,
    similarity_threshold: float,
    boost_weight: float = 0.05
):
    """
    Interactive Q&A with:
      - SBERT embeddings
      - auto-calibrated NN-median threshold
      - keyword filtering & boosting
      - automatic fallback to the best (up to 3) chunks, each truncated to
        FALLBACK_SNIPPET_LEN characters
      - keyword highlighting
      - metadata display (title)

    Reads questions from stdin until the user types 'exit', and prints the
    best-matching segment for each question.

    Args:
        model: Object whose ``encode(list_of_str, convert_to_tensor=False)``
            returns one embedding row per input string.
        segments: Sequence of objects exposing ``section_id``, ``title`` and
            ``text`` attributes.
        segment_embeddings: Array with one row per entry of *segments*; must
            support indexing with a list of row indices.
        similarity_threshold: Minimum boosted score for an answer to be shown
            directly; lower scores trigger the fallback menu.
        boost_weight: Amount added to the cosine similarity for every query
            keyword that also occurs in the segment.
    """
    assert len(segments) == segment_embeddings.shape[0], \
        "segments length must match embeddings rows"

    # Per-segment keyword sets (sets give O(1) membership for the boost step)
    # and an inverted index keyword → set(segment indices)
    segment_tokens = [set(extract_keywords(seg.text)) for seg in segments]
    inverted_index = {}
    for idx, toks in enumerate(segment_tokens):
        for tok in toks:
            inverted_index.setdefault(tok, set()).add(idx)

    print(f"🧮 Using similarity threshold = {similarity_threshold:.3f}\n")

    while True:
        q = input("❓ Ask a question (or 'exit'): ").strip()
        if not q:
            continue
        if q.lower() == 'exit':
            print("👋 Goodbye!")
            break

        # 1) Extract & show keywords
        keywords = extract_keywords(q)
        print(f"🔑 Keywords: {keywords}")

        # 2) Candidate filter by keywords; if no keyword hits any segment,
        #    fall back to scoring every segment
        cands = set()
        for kw in keywords:
            cands |= inverted_index.get(kw, set())
        idx_list = list(cands) if cands else list(range(len(segments)))

        # 3) Embed & score + keyword boost
        q_emb      = model.encode([q], convert_to_tensor=False)
        sims_sub   = cosine_similarity(q_emb, segment_embeddings[idx_list])[0]
        boosts     = np.array([sum(1 for kw in keywords if kw in segment_tokens[i])
                               for i in idx_list])
        final_sims = sims_sub + boost_weight * boosts

        # 4) Pick best (positions in final_sims map back through idx_list)
        best_loc   = int(np.argmax(final_sims))
        best_score = float(final_sims[best_loc])
        best_idx   = idx_list[best_loc]

        # 5) If below threshold, auto-fallback to the top chunks
        if best_score < similarity_threshold:
            print("\n🤔 Not confident enough—here are the top 3 chunks instead:")
            # The keyword filter may leave fewer than 3 candidates, so this
            # can hold 1, 2 or 3 positions, best first.
            top_locs = np.argsort(final_sims)[-3:][::-1]
            for i, loc in enumerate(top_locs, 1):
                seg = segments[idx_list[loc]]
                snippet = seg.text.replace("\n", " ")[:FALLBACK_SNIPPET_LEN].rstrip()
                print(f"  {i}. [{seg.section_id}] {seg.title}\n     {snippet}…\n")

            # Only offer and accept the options that were actually listed;
            # accepting '3' with fewer entries would raise an IndexError.
            n_options   = len(top_locs)
            valid       = {str(k) for k in range(1, n_options + 1)}
            pick_range  = "1" if n_options == 1 else f"1–{n_options}"
            choice = input(f"Pick {pick_range} or Enter to retry: ").strip()
            if choice in valid:
                sel = int(top_locs[int(choice) - 1])
                best_idx   = idx_list[sel]
                best_score = float(final_sims[sel])
            else:
                print()
                continue

        # 6) Display answer with highlights
        seg = segments[best_idx]
        print(f"\n🔹 Section {seg.section_id} — {seg.title} — Score {best_score:.3f}\n")
        print(highlight_text(seg.text, keywords), "\n" + "—"*40 + "\n")

# ─── Main script ────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # 1) Read full text
    with open("Section11.2.txt", "r", encoding="utf-8") as f:
        full_text = f.read()

    # 2) Split into chunks and extract metadata.
    #    Group 1 is the section id ("11.2.<n>[.<n>...]." with trailing dot),
    #    group 2 is everything up to the next "11.2.<digit>" or end of text.
    # NOTE(review): neither the id nor the lookahead is anchored to a line
    # start, so an inline cross-reference such as "Sentence 11.2.1.3." also
    # starts a new chunk — confirm against the data file before tightening.
    pattern = r"(11\.2\.\d+(?:\.\d+)*\.)\s*([\s\S]*?)(?=(?:11\.2\.\d|\Z))"
    matches = re.findall(pattern, full_text)
    Segment = namedtuple("Segment", ["section_id", "title", "text"])
    segments = []
    for sec_id, body in matches:
        # First line of the chunk is treated as the title, the rest as text
        lines = body.strip().split("\n", 1)
        title = lines[0].strip()
        text  = lines[1].strip() if len(lines) > 1 else ""
        segments.append(Segment(sec_id, title, text))

    # Fail fast, before loading the model: threshold calibration needs a
    # nearest neighbor for every segment, i.e. at least two segments.
    if len(segments) < 2:
        raise ValueError(
            f"Expected at least 2 sections in Section11.2.txt, found {len(segments)}; "
            "cannot calibrate a similarity threshold."
        )

    # 3) Embed all segments
    model = SentenceTransformer("all-MiniLM-L6-v2")
    texts = [seg.text for seg in segments]
    segment_embeddings = model.encode(texts, convert_to_tensor=False)

    # 4) Auto-calibrate threshold via nearest-neighbor median
    threshold = calibrate_nn_median(segment_embeddings)

    # 5) Launch interactive lookup
    interactive_lookup(model, segments, segment_embeddings, threshold)



🧮 Using similarity threshold = 0.732

❓ Ask a question (or 'exit'): what does a float come in handy for?
🔑 Keywords: ['does', 'float', 'come', 'handy']

🤔 Not confident enough—here are the top 3 chunks instead:
  1. [11.2.1.11.] Specific
     Upgrade  Requirements  for  Float  Homes  and  Marinas  1) Except as permitted by Sentence (2), where a marina is altered, all new work shall comply with Subsection  12.2.2. and the marina shall be upgraded to an acceptable level as determined by the Upgrade Mechanism Model in  Notes to Part 11.  2)…

  2. [11.2.1.3.] , they shall be installed throughout the storey on which the new
     dwelling unit is to be located and all storeys below the new dwelling unit.  Rev.  14107  4) A building need not be sprinklered in accordance with Sentence (1), if the construction value of the alteration  does not exceed $250,000.   Table…

  3. [11.2.1.] 5)
     1) Where an alteration to a building is a self-contained volumetric space that is separated from the r