<a href="https://colab.research.google.com/github/ma2070-spec/Outamation_AI_Externship/blob/main/Task2_Completed_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install/upgrade the packages this notebook imports: LlamaIndex plus its
# Gemini LLM, BM25 retriever and HuggingFace embedding integrations, along
# with sentence-transformers, PyMuPDF and jedi. `-q` keeps pip output quiet.
!pip -q install -U \
  llama-index llama-index-llms-gemini \
  llama-index-retrievers-bm25 llama-index-embeddings-huggingface \
  sentence-transformers pymupdf jedi


In [None]:
from getpass import getpass
import os
from llama_index.core import Settings
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter

# Set the key once per runtime (hidden prompt).
# The prompt only appears when GOOGLE_API_KEY is unset or empty, so re-running
# this cell does not ask again. The entered value is stripped of surrounding
# whitespace before being stored in the process environment.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("🔑 Paste your Gemini API key (hidden): ").strip()
# NOTE: printed unconditionally; the key itself is not validated here.
print("Key loaded ✅")

# Light, quota-friendly defaults.
# These are stored on the global `Settings` object; later cells read
# `Settings.node_parser` (to build BM25 nodes) and `Settings.llm` (to rewrite
# queries).
# NOTE(review): confirm the Gemini integration accepts this bare model id
# (some versions document ids prefixed with "models/").
Settings.llm = Gemini(model="gemini-1.5-flash", temperature=0.15, max_tokens=256)
Settings.embed_model = HuggingFaceEmbedding("BAAI/bge-small-en-v1.5")
# Chunking: 1024-sized chunks with an overlap of 200.
Settings.node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=200)
print("Settings ready ✅")


In [None]:
# Default worksheet filename. It is replaced by the uploaded file's name when
# the default is not present in the working directory.
PDF = "LenderFeesWorksheetNew.pdf"

import os
if not os.path.exists(PDF):
    # Not on disk yet: ask the user to pick the file through Colab's uploader.
    from google.colab import files
    print("Select the Lender Fee Worksheet PDF…")
    uploaded = files.upload()
    if not uploaded:
        # A cancelled/empty upload previously failed with an opaque IndexError.
        raise FileNotFoundError(
            f"No file was uploaded and {PDF!r} does not exist in the working directory."
        )
    # `uploaded` maps filename -> content; use the first uploaded filename.
    PDF = next(iter(uploaded))

# Load the PDF into LlamaIndex document objects used by every later cell.
from llama_index.core import SimpleDirectoryReader
docs = SimpleDirectoryReader(input_files=[PDF]).load_data()
print(f"Loaded {len(docs)} document(s)")


In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import Settings

# Vector retriever: embed the loaded documents into an index and return the
# 6 most similar chunks per query.
vector_index = VectorStoreIndex.from_documents(docs, show_progress=True)
vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=6)

# Prepare nodes for the BM25 variants below using the configured node parser.
# Any failure leaves `nodes` as None, which disables option B and makes the
# clamp further down fall back to a corpus size of 1.
try:
    nodes = Settings.node_parser.get_nodes_from_documents(docs)
except Exception:
    nodes = None

# BM25 import (old/new package paths): try the standalone integration package
# first, then the location inside llama_index.core.
try:
    from llama_index.retrievers.bm25 import BM25Retriever
except Exception:
    from llama_index.core.retrievers import BM25Retriever

# Try three ways of constructing the BM25 retriever, stopping at the first one
# that works. `last_err` records the most recent failure; it is not read again
# in this cell.
bm25 = None; last_err = None
# A) build directly from documents
try:
    bm25 = BM25Retriever.from_defaults(documents=docs, similarity_top_k=6)
except Exception as e:
    last_err = e
# B) build from pre-parsed nodes (k is capped at the number of nodes)
if bm25 is None and nodes:
    try:
        bm25 = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=min(6, len(nodes)))
    except Exception as e:
        last_err = e
# C) docstore fallback: populate an in-memory docstore and build from that.
#    This call is not guarded, so a failure here propagates.
if bm25 is None:
    from llama_index.core.storage.docstore import SimpleDocumentStore
    ds = SimpleDocumentStore()
    if hasattr(ds, "add_documents"): ds.add_documents(docs)
    elif hasattr(ds, "add_nodes") and nodes: ds.add_nodes(nodes)
    bm25 = BM25Retriever.from_defaults(docstore=ds, similarity_top_k=6)

# Clamp k to the corpus size so tiny PDFs don't crash BM25.
# NOTE: when `nodes` is None or empty the corpus size is taken as 1, so k
# becomes 1. Failures while clamping are deliberately ignored (best effort).
try:
    corp_size = len(nodes) if nodes else 1
    bm25.similarity_top_k = max(1, min(getattr(bm25, "similarity_top_k", 6), corp_size))
except Exception:
    pass

# Report the effective k of both retrievers.
print(f"Vector@k={getattr(vector_retriever,'similarity_top_k',None)} | BM25@k={getattr(bm25,'similarity_top_k',None)} ✅")

# Manual hybrid (works across all versions)
try:
    from llama_index.core.retrievers import BaseRetriever
except Exception:
    BaseRetriever = object
from llama_index.core.schema import NodeWithScore

class ManualHybridRetriever(BaseRetriever):
    """Fuse the results of two retrievers with a weighted sum of their scores.

    Each retriever's scores are min-max normalised to [0, 1] independently,
    then combined as ``alpha * vector + (1 - alpha) * keyword``. A node that
    only one retriever returned contributes 0.0 for the other side.

    Args:
        v: Retriever whose normalised scores are weighted by ``alpha``
            (the vector retriever in this notebook).
        b: Retriever whose normalised scores are weighted by ``1 - alpha``
            (the BM25 retriever in this notebook).
        alpha: Weight of ``v`` in the fused score.
        k: Maximum number of fused results to return.
    """

    def __init__(self, v, b, alpha=0.5, k=6):
        # Let the base class initialise its own state; this is a harmless
        # no-op when the import fallback made the base class plain `object`.
        super().__init__()
        self.v = v
        self.b = b
        self.alpha = float(alpha)
        self.k = int(k)

    @staticmethod
    def _key(item):
        """Return a stable identity for the node wrapped by *item*.

        Prefers the node's own ``node_id`` so the same stored node returned
        by both retrievers is merged even when they hand back different
        Python objects; falls back to object identity when no id is present.
        NOTE(review): merging still requires both retrievers to share node
        ids — confirm how the two corpora were built.
        """
        node = item.node
        return getattr(node, "node_id", None) or id(node)

    def _norm(self, items):
        """Map each item's node key to its min-max normalised score."""
        scores = [(i.score or 0.0) for i in items] or [0.0]
        low, high = min(scores), max(scores)
        # Avoid division by zero when all scores are equal (or list is empty).
        span = (high - low) or 1.0
        return {self._key(i): ((i.score or 0.0) - low) / span for i in items}

    async def _aretrieve(self, q):
        """Async entry point; delegates to the synchronous implementation."""
        return self._retrieve(q)

    def _retrieve(self, q):
        """Query both retrievers and return the top ``k`` fused results."""
        vec_hits = self.v.retrieve(q)
        kw_hits = self.b.retrieve(q)
        vec_scores = self._norm(vec_hits)
        kw_scores = self._norm(kw_hits)

        # First occurrence wins, so vector hits take precedence as the
        # representative object for a node found by both retrievers.
        merged = {}
        for hit in (*vec_hits, *kw_hits):
            merged.setdefault(self._key(hit), hit)

        fused = [
            NodeWithScore(
                node=hit.node,
                score=self.alpha * vec_scores.get(key, 0.0)
                + (1 - self.alpha) * kw_scores.get(key, 0.0),
            )
            for key, hit in merged.items()
        ]
        fused.sort(key=lambda x: x.score or 0.0, reverse=True)
        return fused[:self.k]

# Equal-weight blend (alpha=0.5) of the vector and BM25 retrievers, keeping
# at most 6 fused results per query.
hybrid_retriever = ManualHybridRetriever(vector_retriever, bm25, alpha=0.5, k=6)
print("Hybrid retriever ready ✅")


In [None]:
# Cross-encoder reranker that keeps the 3 best retrieved chunks.
# Try LlamaIndex's SentenceTransformerRerank from the current package layout
# first, then the two legacy import paths; if none is usable, fall back to a
# small local wrapper around sentence-transformers' CrossEncoder.
import importlib

reranker = None
for _module_path in (
    "llama_index.core.postprocessor",
    "llama_index.postprocessor",
    "llama_index.postprocessor.sentence_transformer_rerank",
):
    try:
        _rerank_cls = importlib.import_module(_module_path).SentenceTransformerRerank
        reranker = _rerank_cls(model="cross-encoder/ms-marco-MiniLM-L-6-v2", top_n=3)
        break
    except Exception:
        # Best effort: any import or constructor failure just moves on to the
        # next candidate.
        continue

if reranker is None:
    from sentence_transformers import CrossEncoder

    class CEPostprocessor:
        """Minimal reranker exposing ``postprocess_nodes`` via a CrossEncoder.

        Args:
            model: Name of the cross-encoder model to load.
            top_n: Number of highest-scoring nodes to keep.
        """

        def __init__(self, model="cross-encoder/ms-marco-MiniLM-L-6-v2", top_n=3):
            self.ce = CrossEncoder(model)
            self.top_n = top_n

        def postprocess_nodes(self, nodes, query_bundle):
            """Score each node against the query and return the best ``top_n``."""
            if not nodes:
                # Nothing to score; skip the model call entirely.
                return []
            q = getattr(query_bundle, "query_str", str(query_bundle))
            txts = []
            for n in nodes:
                node = getattr(n, "node", n)
                txts.append(node.get_text() if hasattr(node, "get_text") else str(getattr(n, "text", "")))
            scores = self.ce.predict([[q, t] for t in txts])
            for sc, n in zip(scores, nodes):
                # Writing the score back is best effort: some node wrappers
                # may reject attribute assignment or the value.
                try:
                    n.score = float(sc)
                except (AttributeError, TypeError, ValueError):
                    pass
            ranked = [n for _, n in sorted(zip(scores, nodes), key=lambda t: t[0], reverse=True)]
            return ranked[:self.top_n]

    reranker = CEPostprocessor(top_n=3)
print("Reranker ready ✅", type(reranker).__name__)


In [None]:
import re, time, textwrap
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import Settings

# Query engine: hybrid retrieval, with the reranker applied to the retrieved
# nodes as a node postprocessor.
qe = RetrieverQueryEngine(retriever=hybrid_retriever, node_postprocessors=[reranker])

def rewrite_query(q: str) -> str:
    """Ask the configured LLM for a short, retrieval-oriented rewrite of *q*.

    The prompt instructs the model to rephrase the question for a lender fee
    worksheet and to add key synonyms. Returns the completion text with
    surrounding whitespace removed.
    """
    template = (
        "Rewrite the user's question for precise retrieval in a lender fee worksheet. "
        "Add key synonyms; keep it short.\n\n"
        "User: {question}\nRewritten:"
    )
    completion = Settings.llm.complete(template.format(question=q))
    rewritten = completion.text
    return rewritten.strip()

def ask_rag(q: str, expand=True, retries=3, base_sleep=1.2, engine=None):
    """Answer *q* with the RAG query engine and print the answer and sources.

    Args:
        q: The user's question.
        expand: When true, the question is first rewritten by
            ``rewrite_query`` and the rewritten text is sent to the engine.
        retries: Maximum number of query attempts when the engine reports a
            rate limit (an error whose text contains "429" or
            "TooManyRequests").
        base_sleep: Seconds to wait after the first rate-limited attempt; the
            wait doubles after each further rate-limited attempt.
        engine: Object exposing ``query(text)``; defaults to the module-level
            ``qe``.

    Returns:
        A ``(answer_text, source_nodes)`` tuple.

    Raises:
        ValueError: If ``retries`` is less than 1.
        Exception: Errors that are not rate limits propagate immediately; a
            rate-limit error propagates once the final attempt has failed
            (previously the function silently returned ``None`` in that case,
            which broke callers that unpack the result).
    """
    if retries < 1:
        raise ValueError("retries must be at least 1")
    engine = qe if engine is None else engine
    q2 = rewrite_query(q) if expand else q

    for attempt in range(retries):
        try:
            resp = engine.query(q2)
        except Exception as e:
            rate_limited = "429" in str(e) or "TooManyRequests" in str(e)
            # Give up on non-rate-limit errors, and on the last attempt there
            # is nothing left to wait for.
            if not rate_limited or attempt == retries - 1:
                raise
            sleep = base_sleep * (2 ** attempt)
            print(f"Rate limit: retrying in {sleep:.1f}s…")
            time.sleep(sleep)
            continue

        print(f"\nQ: {q}\nQ_expanded: {q2}\n")
        print("Answer:\n", resp, "\n")
        print("Top sources:")
        for j, sn in enumerate(resp.source_nodes, 1):
            # Accept both score wrappers (with `.node`) and bare nodes.
            node = getattr(sn, "node", sn)
            name = getattr(node, "metadata", {}).get("file_name", "(unknown)")
            score = round((getattr(sn, "score", 0) or 0), 3)
            text = node.get_text() if hasattr(node, "get_text") else str(getattr(sn, "text", ""))
            snippet = textwrap.shorten(text.replace("\n", " "), width=220)
            print(f"[{j}] score={score} file={name}\n    {snippet}")
        return str(resp), resp.source_nodes

def extract_amount(sources, keywords):
    """Pick a $ amount near given keywords from top sources.

    Only the first three sources are inspected. Each line of their text that
    contains any keyword (matched case-insensitively) is scanned for
    number-like tokens, optionally prefixed with ``$``; the last token on the
    last matching line wins. If no keyword line yields a token, the first
    token anywhere in the text is used instead.

    Args:
        sources: Retrieved items; each is either a wrapper with a ``.node``
            attribute or a node itself, providing ``get_text()`` or ``text``.
        keywords: Iterable of substrings that mark a relevant line.

    Returns:
        The matched amount with surrounding whitespace removed, or ``None``
        when the text contains no number at all.
    """
    amount_re = re.compile(r"\$?\s?\d[\d,]*(?:\.\d{2})?")
    # Lines are lower-cased before matching, so the keywords must be too.
    wanted = tuple(str(k).lower() for k in (keywords or ()))

    texts = []
    for sn in sources[:3]:
        node = getattr(sn, "node", sn)
        texts.append(node.get_text() if hasattr(node, "get_text") else str(getattr(sn, "text", "")))
    joined = "\n".join(texts)

    best = None
    for line in joined.splitlines():
        lowered = line.lower()
        if any(k in lowered for k in wanted):
            hits = amount_re.findall(line)
            if hits:
                best = hits[-1]
    if best is None:
        first = amount_re.search(joined)
        best = first.group(0) if first else None
    # The pattern may capture one leading whitespace character; drop it.
    return best.strip() if best else None


In [None]:
# Prompt 1: ask the question, then pull a supporting figure from its sources.
question_1 = "What is the total estimated monthly payment?"
ans1, src1 = ask_rag(question_1)
amt1 = extract_amount(src1, keywords=("monthly", "payment", "estimated", "total"))
print("\nEstimated monthly payment (best evidence):", amt1)

# Prompt 2: same procedure for the lender's title insurance fee.
question_2 = "How much does the borrower pay for lender's title insurance?"
ans2, src2 = ask_rag(question_2)
amt2 = extract_amount(src2, keywords=("lender", "title", "insurance", "premium"))
print("\nLender's title insurance (best evidence):", amt2)

# Final deliverables text (copy-paste into your submission form).
choices = "\n".join([
    "- Embeddings: BAAI/bge-small-en-v1.5 — fast & strong for English retrieval.",
    "- Chunking: 1024 tokens, 200 overlap — balances recall with context.",
    "- Retrieval: Hybrid (BM25 + Vector, α=0.5) — exact fee names + semantic recall; cross-encoder reranker for precision.",
])
divider = "=" * 60
print("\n" + divider)
print("Short Explanation of Design Choices:\n" + choices)
print(divider)
# One summary block per prompt: the answer text followed by its evidence amount.
for prompt_no, answer, amount in ((1, ans1, amt1), (2, ans2, amt2)):
    print(f"\nResponse to Prompt {prompt_no}:\n", answer, f"\nEvidence amount: {amount}")
print("\n(If an amount is None, the PDF likely doesn’t list it explicitly—your answer text + sources still count.)")


In [None]:
# Q1: total estimated monthly payment (query expansion enabled).
ans1, src1 = ask_rag(q="What is the total estimated monthly payment?", expand=True)

# Header first, then the full AI-generated answer on the following line.
print("\n=== Paste into 'Short Explanation 1' ===\n", str(ans1), sep="\n")


In [None]:
# Q2: lender's title insurance amount (query expansion enabled).
ans2, src2 = ask_rag(q="How much does the borrower pay for lender's title insurance?", expand=True)

# Header first, then the full AI-generated answer on the following line.
print("\n=== Paste into 'Short Explanation 2' ===\n", str(ans2), sep="\n")


In [None]:
def extract_amount(sources, keywords=()):
    """Return the first number-like amount found near *keywords*.

    Only the first three sources are inspected. Lines containing any keyword
    (matched case-insensitively) are scanned in order and the first
    number-like token, optionally prefixed with ``$``, is returned. If no
    keyword line yields a token, the first token anywhere in the text is used.

    Args:
        sources: Retrieved items; each is either a wrapper with a ``.node``
            attribute or a node itself, providing ``get_text()`` or ``text``.
        keywords: Iterable of substrings that mark a relevant line.

    Returns:
        The matched amount with surrounding whitespace removed, or the string
        ``"Not specified"`` when the text contains no number at all.
    """
    import re

    amount_re = re.compile(r"\$?\s?\d[\d,]*(?:\.\d{2})?")
    # Lines are lower-cased before matching, so the keywords must be too.
    wanted = tuple(str(k).lower() for k in (keywords or ()))

    texts = []
    for sn in sources[:3]:
        node = getattr(sn, "node", sn)
        body = node.get_text() if hasattr(node, "get_text") else ""
        # Fall back to a plain `text` attribute when get_text is absent/empty.
        texts.append(body or str(getattr(sn, "text", "")))
    text = "\n".join(texts)

    for line in text.splitlines():
        lowered = line.lower()
        if any(k in lowered for k in wanted):
            hit = amount_re.search(line)
            if hit:
                # The pattern may capture one leading whitespace character.
                return hit.group(0).strip()
    hit = amount_re.search(text)
    return hit.group(0).strip() if hit else "Not specified"

# Re-run the amount extraction on the sources kept from the two earlier
# questions (src1 / src2), using the extract_amount defined in this cell.
print("Q1 amount:", extract_amount(src1, ("monthly","payment","estimated","total")))
print("Q2 amount:", extract_amount(src2, ("lender","title","insurance","premium")))


In [None]:
# 1) Mount Google Drive at /content/drive. force_remount=True is passed so
#    the mount is redone even if Drive is already mounted in this runtime.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


In [None]:
# 2) List any .ipynb files in your Drive, searching at most 3 directory
#    levels below MyDrive. Use the output to pick a source notebook path.
!find "/content/drive/MyDrive" -maxdepth 3 -name "*.ipynb"


In [None]:
# Mount Google Drive (forcing a remount) and list candidate notebooks in one
# cell; this repeats the two preceding steps.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# List ipynb files under Drive (up to 3 levels deep)
!find "/content/drive/MyDrive" -maxdepth 3 -name "*.ipynb"


In [None]:
# --- Make a GitHub-friendly copy of your Task II notebook ---
# Reads a notebook from Drive, removes notebook-level widget metadata,
# optionally clears code-cell outputs, writes the result to /content and
# triggers a browser download.

import os, nbformat as nbf
from google.colab import files

# Choose ONE of your real notebook paths from the list you printed:
SRC = "/content/drive/MyDrive/Colab Notebooks/Task2_RAG_with_outputs.ipynb"
# (Alternatives you listed, if you prefer one of these instead:)
# SRC = "/content/drive/MyDrive/Colab Notebooks/Task II.Build & Optimize A RAG Pipeline For Doc Retrieval.ipynb"
# SRC = "/content/drive/MyDrive/Colab Notebooks/Copy of Task II.Build & Optimize A RAG Pipeline For Doc Retrieval.ipynb"

# Stop early with a readable message if the chosen path does not exist
# (for example when Drive is not mounted).
assert os.path.exists(SRC), f"Missing: {SRC}"

DST = "/content/Task2_RAG_github.ipynb"
# Load the notebook, upgrading/normalising it to nbformat version 4.
nb = nbf.read(SRC, as_version=4)

# Strip ipywidgets metadata so GitHub stops saying "Invalid Notebook".
# Several key spellings are tried; missing keys are ignored.
for k in ("widgets","widget_state","widgetsState"):
    nb.metadata.pop(k, None)

KEEP_OUTPUTS = True   # set False if you want a smaller, clean notebook with no outputs
if not KEEP_OUTPUTS:
    # Clear outputs and execution counters of every code cell.
    for c in nb.cells:
        if c.get("cell_type") == "code":
            c["outputs"] = []
            c["execution_count"] = None

nbf.write(nb, DST)
print("Wrote:", DST)

# Download so you can upload it on GitHub
files.download(DST)


In [None]:
# --- Write a cleaned, GitHub-friendly copy of the notebook ---
# Removes widget metadata and widget/upload outputs while keeping printed
# (stream) output, then downloads the result.
from google.colab import drive; drive.mount('/content/drive', force_remount=True)
import os, nbformat as nbf
from google.colab import files

# Use your real path from the Drive listing:
SRC = "/content/drive/MyDrive/Colab Notebooks/Task2_RAG_with_outputs.ipynb"
# If you prefer a different one, swap:
# SRC = "/content/drive/MyDrive/Colab Notebooks/Task II.Build & Optimize A RAG Pipeline For Doc Retrieval.ipynb"

# Explicit check instead of `assert`, which is skipped when Python runs with -O.
if not os.path.exists(SRC):
    raise FileNotFoundError(f"Missing: {SRC}")

DST = "/content/Task2_RAG_github.ipynb"
nb = nbf.read(SRC, as_version=4)

# --- Strip widget metadata at notebook root (if present)
for k in ("widgets", "widget_state", "widgetsState"):
    nb.metadata.pop(k, None)


def _keep_output(out):
    """Return True for outputs that should stay in the cleaned notebook."""
    # Keep plain text streams (the printed answers).
    if out.get("output_type") == "stream":
        return True
    data = out.get("data", {})
    # Drop widget views.
    if "application/vnd.jupyter.widget-view+json" in data:
        return False
    # Drop the big upload-widget HTML; the value may be a string or a list of lines.
    if "text/html" in data and "Upload widget" in "".join(data.get("text/html") or []):
        return False
    return True


# --- Remove widget/HTML upload outputs from each cell but KEEP normal stdout text
for cell in nb.cells:
    if cell.get("cell_type") == "code" and "outputs" in cell:
        cell["outputs"] = [out for out in cell["outputs"] if _keep_output(out)]

    # Optional: remove cell-level widget references (cosmetic). Only touch
    # cells that already carry Colab metadata, so no empty "colab" entry is
    # added to cells that never had one.
    colab_md = cell.get("metadata", {}).get("colab")
    if colab_md:
        colab_md.pop("referenced_widgets", None)

nbf.write(nb, DST)
print("Wrote cleaned notebook:", DST)
files.download(DST)
