In [None]:
!pip -q install -U \
  llama-index \
  llama-index-llms-gemini \
  llama-index-retrievers-bm25 \
  llama-index-embeddings-huggingface \
  sentence-transformers \
  pymupdf \
  jedi


In [None]:
import os
from getpass import getpass

# Read the key without echoing it: `input()` would leave the secret visible in
# the cell output, and therefore in the saved/shared notebook.
os.environ["GOOGLE_API_KEY"] = getpass("Paste Gemini API key: ").strip()
# Only report whether a non-empty key is present; never print the key itself.
print("Key loaded:", bool(os.environ.get("GOOGLE_API_KEY")))


In [None]:
from llama_index.llms.gemini import Gemini

# Use the stable Gemini wrapper; no google-genai configure needed.
# Smoke test: one tiny completion to confirm the model can be reached before
# the rest of the pipeline is built. No key is passed explicitly here —
# presumably the wrapper reads GOOGLE_API_KEY from the environment; TODO confirm.
_llm = Gemini(model="gemini-1.5-flash")   # you can swap to "gemini-1.5-pro" later
print(_llm.complete("Reply with the single word: ready").text)


In [None]:
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter

# Global LlamaIndex defaults shared by the later cells.
# NOTE: `Gemini` is not imported in this cell; it relies on the import done in
# the smoke-test cell above having been executed first.
Settings.llm = Gemini(model="gemini-1.5-flash", temperature=0.2, max_tokens=512)
# Local HuggingFace embedding model used for the vector index.
Settings.embed_model = HuggingFaceEmbedding("BAAI/bge-small-en-v1.5")
# Chunking configuration (size 1024, overlap 200) applied when documents are split.
Settings.node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=200)

print("Settings ready ✅")


In [None]:
# Colab-only: opens a browser file picker (this import fails outside Colab).
from google.colab import files
uploaded = files.upload()                   # pick 1+ PDFs
# The keys of `uploaded` are used as the file paths by the cells below.
pdf_paths = list(uploaded.keys())
print("PDFs:", pdf_paths)


In [None]:
import fitz  # PyMuPDF

def text_stats(path):
    """Return ``(word_count, char_count)`` for the extractable text of a PDF.

    Args:
        path: Path of the PDF to open with PyMuPDF.

    Returns:
        A tuple ``(words, chars)`` where ``words`` is the number of
        whitespace-separated tokens and ``chars`` the length of the page texts
        joined with newlines. A scanned PDF without a text layer yields values
        close to zero, which is the signal this helper is used to spot.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises on some page.
    with fitz.open(path) as doc:
        text = "\n".join(page.get_text() for page in doc)
    return len(text.split()), len(text)

# Print a word/character count for every uploaded PDF as a quick sanity check
# on how much text could be extracted from each file.
for p in pdf_paths:
    w,c = text_stats(p)
    print(f"{p}: ~{w} words, {c} chars")


In [None]:
# --- Step 5 (robust BM25 + Hybrid) ---

from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.node_parser import SentenceSplitter  # kept importable for later cells

# Load the uploaded PDFs unless an earlier cell already produced `docs`.
# Nothing else in the notebook defines it, so on a fresh kernel this cell would
# otherwise stop with a NameError.
if globals().get("docs") is None:
    docs = SimpleDirectoryReader(input_files=pdf_paths).load_data()
if not docs:
    raise RuntimeError("No documents were loaded from the uploaded PDFs; nothing to index.")

# Chunk ONCE and give the same node objects to both retrievers, so a chunk
# found by the vector search and by BM25 is the same node on both sides.
nodes = Settings.node_parser.get_nodes_from_documents(docs)

# Vector index + retriever
vector_index = VectorStoreIndex(nodes, show_progress=True)
vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=6)

# BM25 import (location differs by version)
try:
    from llama_index.retrievers.bm25 import BM25Retriever
except ImportError:
    from llama_index.core.retrievers import BM25Retriever


def _docstore_with_nodes():
    """Build an in-memory docstore holding the chunked nodes (not whole documents)."""
    from llama_index.core.storage.docstore import SimpleDocumentStore
    store = SimpleDocumentStore()
    store.add_documents(nodes)
    return store


# Constructor keyword sets to try, in order of preference. Each entry is lazy so
# a fallback is only prepared when the previous attempt failed.
_bm25_attempts = (
    ("nodes", lambda: {"nodes": nodes}),
    ("docstore", lambda: {"docstore": _docstore_with_nodes()}),
    ("documents", lambda: {"documents": docs}),  # last resort for signatures that accept it
)

bm25_retriever = None
last_err = None
for _label, _make_kwargs in _bm25_attempts:
    try:
        bm25_retriever = BM25Retriever.from_defaults(similarity_top_k=6, **_make_kwargs())
        break
    except Exception as e:
        # Remember the failure and move on to the next signature.
        last_err = e

if bm25_retriever is None:
    raise RuntimeError(f"Could not initialize BM25Retriever with this LlamaIndex version: {type(last_err).__name__}: {last_err}")

# Hybrid retriever (if available); otherwise we’ll continue with vector-only
try:
    from llama_index.core.retrievers import HybridRetriever
    hybrid_retriever = HybridRetriever(
        vector_retriever=vector_retriever,
        bm25_retriever=bm25_retriever,
        alpha=0.5,  # 0=keyword-heavy, 1=semantic-heavy
    )
    print("BM25 + Hybrid setup ✅")
except Exception:
    hybrid_retriever = None
    print("HybridRetriever not found in this version → using vector-only for now.")
    # Report what was actually built instead of claiming a hybrid setup.
    print("BM25 + vector retrievers ready ✅ (no built-in hybrid)")



In [None]:
# --- Manual Hybrid Retriever (Step 5.5) ---
# Fuses vector + BM25 with an alpha weight and returns top-k nodes.

try:
    from llama_index.core.retrievers import BaseRetriever
except Exception:
    BaseRetriever = object  # fallback for older versions

from llama_index.core.schema import NodeWithScore

class ManualHybridRetriever(BaseRetriever):
    """Fuse a vector retriever and a BM25 retriever with a weighted score sum.

    Each retriever's scores are min-max normalised to ``[0, 1]`` independently
    and then combined as ``alpha * vector + (1 - alpha) * bm25``. A node that
    only one retriever returned contributes 0 for the other side.

    Args:
        vector_retriever: Retriever queried for the semantic side.
        bm25_retriever: Retriever queried for the keyword side.
        alpha: Weight of the vector score (0 = keywords only, 1 = vectors only).
        k: Maximum number of fused results returned.
    """

    def __init__(self, vector_retriever, bm25_retriever, alpha=0.5, k=6):
        # Let the base class initialise whatever state it expects on instances.
        super().__init__()
        self.v = vector_retriever
        self.b = bm25_retriever
        self.alpha = float(alpha)
        self.k = int(k)

    @staticmethod
    def _key(node):
        """Return a key that identifies the same chunk across both retrievers.

        ``id(node)`` is Python object identity, which differs whenever the two
        retrievers hand back separate objects for the same chunk, so it cannot
        be used for fusion. Prefer the node's ``hash`` attribute (presumably
        content-derived in LlamaIndex — TODO confirm for your version), then
        its ``node_id``, and only fall back to identity when neither exists.
        """
        for attr in ("hash", "node_id"):
            value = getattr(node, attr, None)
            if value:
                return value
        return id(node)

    def _normalize(self, items):
        """Map each item's node key to its min-max normalised score."""
        if not items:
            return {}
        scores = [(item.score or 0.0) for item in items]
        lo, hi = min(scores), max(scores)
        # When every score is equal the span is 0; dividing by 1.0 instead
        # keeps all normalised scores at 0 rather than raising.
        span = (hi - lo) or 1.0
        return {
            self._key(item.node): (score - lo) / span
            for item, score in zip(items, scores)
        }

    async def _aretrieve(self, query_bundle):
        # The async entry point simply runs the synchronous fusion.
        return self._retrieve(query_bundle)

    def _retrieve(self, query_bundle):
        """Query both retrievers and return the top ``k`` fused results."""
        vec = self.v.retrieve(query_bundle)
        kw = self.b.retrieve(query_bundle)

        nv = self._normalize(vec)
        nk = self._normalize(kw)

        # First occurrence wins, vector hits first, so a chunk found by both
        # retrievers appears exactly once in the fused list.
        merged = {}
        for hit in (*vec, *kw):
            merged.setdefault(self._key(hit.node), hit)

        out = [
            NodeWithScore(
                node=hit.node,
                score=self.alpha * nv.get(key, 0.0) + (1 - self.alpha) * nk.get(key, 0.0),
            )
            for key, hit in merged.items()
        ]
        out.sort(key=lambda x: x.score or 0.0, reverse=True)
        return out[: self.k]

# Create a hybrid retriever instance so later steps see `hybrid_retriever`
# (this rebinds the name, replacing whatever the Step 5 cell assigned — including None).
hybrid_retriever = ManualHybridRetriever(vector_retriever, bm25_retriever, alpha=0.5, k=6)
print("Manual hybrid retriever ready ✅")


In [None]:
# STEP 6 — Robust reranker (works even if llama_index.postprocessor is missing)

# 0) Ensure deps (no-op if already installed)
# If you didn't install these earlier, uncomment:
# !pip -q install -U sentence-transformers

import importlib

_RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
_RERANK_TOP_N = 3

# 1) Try LlamaIndex's built-in reranker first. The class has lived in different
#    modules across releases, so probe the known locations in order.
reranker = None
for _module_path in (
    "llama_index.core.postprocessor",                         # current core package
    "llama_index.postprocessor",                              # some versions
    "llama_index.postprocessor.sentence_transformer_rerank",  # other versions
):
    try:
        _rerank_cls = getattr(importlib.import_module(_module_path), "SentenceTransformerRerank")
        reranker = _rerank_cls(model=_RERANK_MODEL, top_n=_RERANK_TOP_N)
        break
    except Exception:
        # Missing module/class or a constructor failure: try the next location.
        continue

# 2) Fallback: custom reranker using sentence-transformers CrossEncoder
if reranker is None:
    from sentence_transformers import CrossEncoder

    class CEPostprocessor:
        """Minimal NodePostprocessor-compatible reranker.

        Scores every (query, node text) pair with a cross-encoder and keeps the
        ``top_n`` highest-scoring nodes.
        """

        def __init__(self, model: str = _RERANK_MODEL, top_n: int = _RERANK_TOP_N):
            self.ce = CrossEncoder(model)
            self.top_n = top_n

        @staticmethod
        def _text_of(n):
            """Extract text from a NodeWithScore, a TextNode, or anything else."""
            # NodeWithScore has .node.get_text(); TextNode has .get_text()
            if hasattr(n, "node") and hasattr(n.node, "get_text"):
                return n.node.get_text()
            if hasattr(n, "get_text"):
                return n.get_text()
            # last-resort stringify
            return str(getattr(n, "text", n))

        def postprocess_nodes(self, nodes, query_bundle):
            """Return the ``top_n`` nodes ordered by cross-encoder relevance."""
            nodes = list(nodes)
            if not nodes:
                # Nothing to score; skip the model call entirely.
                return []

            q = getattr(query_bundle, "query_str", str(query_bundle))
            scores = self.ce.predict([[q, self._text_of(n)] for n in nodes])  # higher = more relevant

            # attach scores if possible
            for sc, n in zip(scores, nodes):
                try:
                    n.score = float(sc)
                except Exception:
                    # Some node types do not accept a `score` attribute.
                    pass

            # Sort indices (stable) so nodes themselves are never compared.
            order = sorted(range(len(nodes)), key=lambda i: scores[i], reverse=True)
            return [nodes[i] for i in order[: self.top_n]]

    reranker = CEPostprocessor(top_n=_RERANK_TOP_N)

print("Reranker ready ✅", type(reranker).__name__)


In [None]:
from llama_index.core.query_engine import RetrieverQueryEngine

def make_query_engine(use_llm_rerank=False):
    """Build a ``RetrieverQueryEngine`` from the notebook's current retrievers.

    Args:
        use_llm_rerank: When True, rerank with LlamaIndex's ``LLMRerank``
            (LLM-based) instead of the cross-encoder ``reranker``.

    Returns:
        A query engine using ``hybrid_retriever`` when it is set, otherwise
        ``vector_retriever``, with exactly one reranking postprocessor.
    """
    retriever = globals().get("hybrid_retriever")
    if retriever is None:
        retriever = vector_retriever

    # choose exactly one reranker
    if use_llm_rerank:
        # Imported lazily so the default (cross-encoder) path has no extra dependency.
        from llama_index.core.postprocessor import LLMRerank
        post = [LLMRerank(top_n=getattr(reranker, "top_n", 3))]
    else:
        post = [reranker]
    return RetrieverQueryEngine(retriever=retriever, node_postprocessors=post)

# Build the default engine and bind it to the global `qe` used by the helper cells below.
qe = make_query_engine()
print("Query engine ready ✅")


In [None]:
from llama_index.core.query_engine import RetrieverQueryEngine

# Rebinds `qe`, passing `hybrid_retriever` directly. Unlike make_query_engine()
# there is no fallback to `vector_retriever` here if `hybrid_retriever` is None.
qe = RetrieverQueryEngine(retriever=hybrid_retriever, node_postprocessors=[reranker])
print("Query engine now using ManualHybridRetriever ✅")


In [None]:
import textwrap
from llama_index.llms.gemini import Gemini  # you’re using this wrapper already
from llama_index.core import Settings

def rewrite_query(user_query: str) -> str:
    """Rewrite a question into a retrieval-friendly form using ``Settings.llm``.

    Use a short completion prompt (avoids role import differences).

    Args:
        user_query: The question as typed by the user.

    Returns:
        The model's rewritten query with surrounding whitespace removed, or
        ``user_query`` unchanged when the model returns nothing usable, so the
        retriever is never handed an empty query.
    """
    prompt = (
        "Rewrite the user's query to maximize retrieval recall and precision for legal/financial PDFs. "
        "Use concise synonyms/terminology and keep it brief.\n\n"
        f"User query: {user_query}\n\nRewritten query:"
    )
    rewritten = (Settings.llm.complete(prompt).text or "").strip()
    return rewritten or user_query

def show_sources(resp):
    """Print one numbered summary line per source node of a query response.

    Each line shows the node's score rounded to three decimals (0 when the
    score is missing), the ``file_name`` entry of its metadata, and a
    single-line preview of its text shortened to 220 characters.
    """
    position = 0
    for source in resp.source_nodes:
        position += 1
        file_name = source.node.metadata.get("file_name")
        relevance = round((source.score or 0), 3)
        # Flatten newlines first, then let textwrap cut the preview to width.
        flat_text = source.node.get_text().strip().replace("\n", " ")
        preview = textwrap.shorten(flat_text, width=220)
        print(f"[{position}] score={relevance} file={file_name} → {preview}")

def ask(q: str, expand: bool = True):
    """Answer ``q`` with the global query engine and print answer plus sources.

    When ``expand`` is true the question is first passed through
    ``rewrite_query``; otherwise it is sent to the engine verbatim. The
    engine's response object is returned.
    """
    if expand:
        effective_query = rewrite_query(q)
    else:
        effective_query = q

    resp = qe.query(effective_query)

    print("\nQ:", q, "\nQ_expanded:", effective_query, "\n")
    print("Answer:\n", resp, "\n")
    show_sources(resp)
    return resp

# quick smoke test (the returned response is assigned to `_` and not used further)
_ = ask("What are the penalties for late payments?", expand=True)


In [None]:
# Re-apply the LLM settings + add a gentle pause to avoid 429s.
# NOTE: these are the same model and parameters already assigned to Settings.llm
# in the settings cell, so this does not actually switch to a lighter model.
from llama_index.llms.gemini import Gemini
from llama_index.core import Settings
Settings.llm = Gemini(model="gemini-1.5-flash", temperature=0.2, max_tokens=512)

import time
def ask_slow(q, expand=True):
    """Sleep for one second, then delegate to ``ask``.

    NOTE: a later cell defines another ``ask_slow`` (with retries), which
    replaces this one once that cell has run.
    """
    time.sleep(1)   # tiny pause helps with rate limits
    return ask(q, expand=expand)

# Example:
_ = ask_slow("What are the penalties for late payments?", expand=True)


In [None]:
def set_retrieval(alpha: float = None, vector_top_k: int = None, bm25_top_k: int = None, use_llm_rerank: bool = False):
    """Adjust hybrid weight and top_k, then rebuild ``qe``.

    Args:
        alpha: New fusion weight in ``[0, 1]`` (0 = keywords, 1 = semantic), or
            None to leave it unchanged.
        vector_top_k: New ``similarity_top_k`` for ``vector_retriever``, or None.
        bm25_top_k: New ``similarity_top_k`` for ``bm25_retriever``, or None.
        use_llm_rerank: Forwarded to ``make_query_engine``.

    Raises:
        ValueError: If ``alpha`` is given but lies outside ``[0, 1]``.
    """
    global qe

    # Update top_k on existing retrievers if supported
    if vector_top_k is not None and hasattr(vector_retriever, "similarity_top_k"):
        vector_retriever.similarity_top_k = vector_top_k
    if bm25_top_k is not None and hasattr(bm25_retriever, "similarity_top_k"):
        bm25_retriever.similarity_top_k = bm25_top_k

    alpha_applied = False
    if alpha is not None:
        alpha = float(alpha)
        if not 0.0 <= alpha <= 1.0:
            raise ValueError(f"alpha must be between 0 and 1, got {alpha}")

        current = globals().get("hybrid_retriever")
        if current is not None and hasattr(current, "alpha"):
            # The retriever reads `alpha` on every call, so updating the
            # attribute in place is enough; no rebuild needed.
            current.alpha = alpha
            alpha_applied = True
        elif current is not None:
            # No `alpha` attribute: try rebuilding with a built-in HybridRetriever.
            try:
                from llama_index.core.retrievers import HybridRetriever
                globals()["hybrid_retriever"] = HybridRetriever(
                    vector_retriever=vector_retriever,
                    bm25_retriever=bm25_retriever,
                    alpha=alpha,
                )
                alpha_applied = True
            except Exception:
                # Best effort: keep the existing retriever, but say so below.
                alpha_applied = False

    # Rebuild query engine
    qe = make_query_engine(use_llm_rerank=use_llm_rerank)

    # Report the alpha that is really in effect, not merely the one requested.
    if alpha is None:
        alpha_note = "unchanged"
    elif alpha_applied:
        alpha_note = alpha
    else:
        alpha_note = f"{alpha} (NOT applied)"
    print("Retrieval set →",
          f"alpha={alpha_note};",
          f"vec@k={getattr(vector_retriever,'similarity_top_k',None)};",
          f"bm25@k={getattr(bm25_retriever,'similarity_top_k',None)};",
          f"rerank={'LLM' if use_llm_rerank else 'CE'}")

# Examples:
# set_retrieval(alpha=0.3)              # lean more on keywords
# set_retrieval(alpha=0.7)              # lean more on semantics
# set_retrieval(vector_top_k=8)         # pull more chunks before rerank
# set_retrieval(bm25_top_k=8)
# set_retrieval(use_llm_rerank=True)    # try LLM reranking instead of cross-encoder


In [None]:
# --- Task 1 Validator ---
import os, textwrap
from llama_index.core import Settings

def task1_validate(sample_q="What are the penalties for late payments?"):
    """Print a checklist of the pipeline pieces and run one sample query.

    A piece counts as present only when its global name exists AND is not
    None, so a retriever left at ``None`` by a failed setup cell is reported
    as MISSING rather than PASS.

    Args:
        sample_q: Question used for the end-to-end run.

    Returns:
        The dict mapping each check name to True/False.
    """
    ns = globals()

    def _present(name):
        return ns.get(name) is not None

    checks = {
        "GOOGLE_API_KEY_set": bool(os.environ.get("GOOGLE_API_KEY")),
        "LLM_ready": getattr(Settings, "llm", None) is not None,
        "PDFs_loaded": bool(ns.get("pdf_paths")),
        "rewrite_query_defined": callable(ns.get("rewrite_query")),
        "vector_retriever": _present("vector_retriever"),
        "bm25_retriever": _present("bm25_retriever"),
        "hybrid_retriever_present": _present("hybrid_retriever"),
        "reranker_present": _present("reranker"),
        "query_engine_ready": _present("qe"),
    }

    print("— Task 1 Checklist —")
    for k, v in checks.items():
        print(f"{k:26} -> {'PASS' if v else 'MISSING'}")

    # Run one query end-to-end and show top sources
    if checks["query_engine_ready"] and checks["rewrite_query_defined"]:
        q_expanded = rewrite_query(sample_q)
        print("\nExpanded:", q_expanded)
        resp = qe.query(q_expanded)
        print("\nAnswer:\n", resp, "\n")
        print("Top sources:")
        for i, sn in enumerate(resp.source_nodes, 1):
            name = sn.node.metadata.get("file_name")
            score = round((sn.score or 0), 3)
            snippet = textwrap.shorten(sn.node.get_text().strip().replace("\n"," "), width=220)
            print(f"[{i}] score={score} file={name} → {snippet}")

    return checks

# Run the checklist once with the default sample question.
task1_validate()


In [None]:
from llama_index.llms.gemini import Gemini
from llama_index.core import Settings
from llama_index.core.query_engine import RetrieverQueryEngine

# Same model name as before; what changes here is a lower temperature (0.15)
# and a smaller max_tokens (256) than the earlier Settings.llm assignment.
Settings.llm = Gemini(model="gemini-1.5-flash", temperature=0.15, max_tokens=256)

# use your existing retrievers/reranker
# (falls back to the vector retriever / no postprocessor when a piece is absent)
retriever = hybrid_retriever if ('hybrid_retriever' in globals() and hybrid_retriever is not None) else vector_retriever
post = [reranker] if 'reranker' in globals() else []

# Rebinds the global `qe` used by ask()/ask_slow().
qe = RetrieverQueryEngine(retriever=retriever, node_postprocessors=post)
print("Query engine rebuilt on gemini-1.5-flash ✅")


In [None]:
import time, textwrap

def ask_slow(q: str, expand: bool = True, retries: int = 3, base_sleep: float = 1.2):
    """Ask a question, retrying with exponential backoff on rate-limit errors.

    Args:
        q: The user's question.
        expand: Rewrite the question with ``rewrite_query`` before querying.
        retries: Maximum number of attempts (at least one is always made).
        base_sleep: Seconds to wait after the first rate-limit error; the wait
            doubles after each further one.

    Returns:
        The query engine's response.

    Raises:
        Exception: Any error that is not a rate limit is re-raised at once; a
            rate-limit error is re-raised when the last attempt also fails, so
            the caller never silently receives ``None``.
    """
    attempts = max(1, int(retries))
    q2 = None
    for attempt in range(attempts):
        try:
            # Keep a successful rewrite across retries so a 429 from the query
            # step does not trigger another rewrite call against the same quota.
            if q2 is None:
                q2 = rewrite_query(q) if expand else q
            resp = qe.query(q2)
        except Exception as e:
            rate_limited = "TooManyRequests" in str(e) or "429" in str(e)
            if not rate_limited or attempt == attempts - 1:
                raise
            sleep_s = base_sleep * (2 ** attempt)
            print(f"Rate limit hit (429). Retrying in {sleep_s:.1f}s...")
            time.sleep(sleep_s)
            continue

        print("\nQ:", q, "\nQ_expanded:", q2, "\n")
        print("Answer:\n", resp, "\n")
        show_sources(resp)
        return resp

# try a query (the returned response is assigned to `_` and not used further)
_ = ask_slow("What are the penalties for late payments?", expand=True)


In [None]:
# Save
# Writes the index's storage context to the local "mortgage_index" directory.
vector_index.storage_context.persist(persist_dir="mortgage_index")

# Later / new runtime — reload
# NOTE: `VectorIndexRetriever` is not imported in this cell; in a new runtime it
# must be imported first (the Step 5 cell does this).
from llama_index.core import StorageContext, load_index_from_storage
storage = StorageContext.from_defaults(persist_dir="mortgage_index")
vector_index = load_index_from_storage(storage)
# NOTE: this rebinds `vector_retriever` only. An already-built `hybrid_retriever`
# or `qe` still holds the previous retriever object until it is rebuilt.
vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=6)


In [None]:
# One-shot: align settings + write final observations markdown
from pathlib import Path

# --- (Optional) align runtime settings to match notes ---
# Each step below is best-effort: any error (for example the name not being
# defined in this runtime) is swallowed so the notes can still be written.
try:
    bm25_retriever.similarity_top_k = 6
except Exception:
    pass

try:
    # If you used a manual hybrid retriever, set alpha here
    hybrid_retriever.alpha = 0.5
except Exception:
    pass

# Rebuild QE so changes take effect (safe no-op if pieces aren't present)
try:
    from llama_index.core.query_engine import RetrieverQueryEngine
    # Prefer the hybrid retriever; fall back to the vector retriever when it is absent or None.
    retriever = hybrid_retriever if ('hybrid_retriever' in globals() and hybrid_retriever is not None) else vector_retriever
    post = [reranker] if 'reranker' in globals() else []
    qe = RetrieverQueryEngine(retriever=retriever, node_postprocessors=post)
except Exception:
    pass

# --- Final observations content ---
content = """# Task 1 - Observations & Notes (Updated)

## 1) Setup & Data
- PDFs used: LenderFeesWorksheetNew.pdf
- Embedding model: BAAI/bge-small-en-v1.5
- Chunking: `chunk_size=1024`, `chunk_overlap=200`
- LLM: `gemini-1.5-flash` (used for query expansion + answers)
- Note: This PDF is a lender fee worksheet (costs/line items), not the promissory note or security instrument.

## 2) Query Expansion
Used Gemini to rewrite questions into retrieval-friendly phrasings.

**Examples**
- Original: *What are the penalties for late payments?*
  Expanded: *Locate any late charge, delinquency fee, grace period, or default penalty terms.*
- Original: *List borrower obligations and related deadlines.*
  Expanded: *Summarize borrower covenants and due dates: payment due date, grace period, escrow/insurance requirements, disclosures, prepayment notice.*

**Observation**
Expansion added contract terminology (late charge, grace period, covenants), which improved recall vs plain wording.

## 3) Hybrid Retrieval
Manual hybrid fused BM25 (keywords) + vector (embeddings).
- Vector @k: **6**
- BM25 @k: **6** (increased so keyword matches participate fairly)
- Fusion alpha (0=keywords, 1=semantic): **0.5**

**Observation**
Lower alpha (0.3) favored exact phrase sections; higher alpha (0.7) broadened context. 0.5 was a good balance for this doc.

## 4) Reranking
- Reranker: `CEPostprocessor` (cross-encoder: `ms-marco-MiniLM-L-6-v2`)
- top_n: **3**

**Observation**
Reranking pushed the most on-topic snippet to the top and reduced duplicates.

## 5) Results (Grounded)

**Q1: What are the penalties for late payments?**
Expanded: *Locate any late charge, delinquency fee, grace period, or default penalty terms.*
**Finding:** The fee worksheet lists upfront/closing costs only. A search for "late charge", "late fee", "grace period", and "delinquen*" returned no hits in this PDF. Therefore, late-payment penalties are **not specified here** (they are typically in the **Note** or **Security Instrument**, not the fees worksheet).
**Top sources (snippets):**
- LenderFeesWorksheetNew.pdf — "*...Fee Details and Summary...*" (fee table context; no penalty terms)

**Q2: List borrower obligations and related deadlines.**
Expanded: *Summarize borrower covenants and due dates: payment due date, grace period, escrow/insurance requirements, disclosures, prepayment notice.*
**Finding:** The worksheet enumerates **fees** owed (e.g., underwriting, appraisal, title, recording, insurance premium, per-diem interest). It does **not** specify deadlines or borrower covenants (e.g., escrow maintenance, insurance proof dates).
**Top sources (snippets):**
- LenderFeesWorksheetNew.pdf — "*...underwriting fee, appraisal fee, lender's title insurance...*"

## 6) Errors, Limits, and Fixes
- 429 TooManyRequests: mitigated with `gemini-1.5-flash`, lower `max_tokens`, and exponential backoff (`ask_slow`).
- Built-in `HybridRetriever` missing: used a manual hybrid fusion of BM25+Vector.
- BM25 signature differences across versions: tried `documents` -> `nodes` -> docstore fallback.
- LlamaIndex `postprocessor` missing: used a cross-encoder fallback reranker.
- Pip "jedi" warning: resolved by installing `jedi`; cosmetic otherwise.

## 7) Next Steps
- Tune alpha per query type (0.3 for exact phrases, 0.7 for semantic recall).
- Increase `vector_top_k` to 8 on larger docs, then let the reranker filter.
- Consider OCR/table extraction if future PDFs are scanned or tabular.
- Optional: migrate to the newer Google GenAI integration when convenient.
"""

# Write the notes to the current working directory as UTF-8 (overwrites an existing file).
Path("Task1_Observations.md").write_text(content, encoding="utf-8")
print("Wrote Task1_Observations.md — check the Files pane to open/download.")


In [None]:
from google.colab import drive
drive.mount('/content/drive')  # approve once

# Choose a folder name in your Drive
SAVE_DIR = "/content/drive/MyDrive/llamaindex-task1"

import os, shutil
os.makedirs(SAVE_DIR, exist_ok=True)

to_copy = [
    "Task1_Completed.ipynb",
    "Task1_Observations.md",
    # include your saved index if you want the exact state reproducible:
    # the folder was 'mortgage_index' if you created it earlier
]

# Track what was really copied so the summary below cannot claim success for
# files that are not in the working directory.
copied, missing = [], []
for p in to_copy:
    if not os.path.exists(p):
        missing.append(p)
        continue
    if os.path.isdir(p):
        shutil.copytree(p, os.path.join(SAVE_DIR, p), dirs_exist_ok=True)
    else:
        shutil.copy2(p, SAVE_DIR)
    copied.append(p)

print("Copied to:", SAVE_DIR, "→", copied)
if missing:
    print("Skipped (not found in the working directory):", missing)


In [None]:
# Shell variant of the Drive copy; requires Drive to be mounted at /content/drive.
# create a folder in your Drive and copy files into it
!mkdir -p "/content/drive/MyDrive/llamaindex-task1"
# `cp` reports an error for any listed file that is not in the current working directory.
!cp Task1_Completed.ipynb Task1_Observations.md "/content/drive/MyDrive/llamaindex-task1"/
# (optional) include your saved index
# !cp -r mortgage_index "/content/drive/MyDrive/llamaindex-task1/"

# verify
!ls -al "/content/drive/MyDrive/llamaindex-task1"


In [None]:
# list any .ipynb files that were just saved to your Drive root
# (sed prints each matching name prefixed with the Drive root path)
!ls -1 "/content/drive/MyDrive" | sed -n 's/.*\.ipynb$/\/content\/drive\/MyDrive\/&/p'

# replace <FILENAME.ipynb> with the exact name printed above
# NOTE: SRC is a placeholder; the `mv` below fails until it is edited.
SRC = "/content/drive/MyDrive/<FILENAME.ipynb>"
DST_DIR = "/content/drive/MyDrive/llamaindex-task1"
DST = f"{DST_DIR}/Task1_Completed.ipynb"

# `$NAME` in a `!` line is filled in from the Python variable of that name.
!mkdir -p "$DST_DIR"
# Moves (not copies) the notebook, renaming it to Task1_Completed.ipynb.
!mv "$SRC" "$DST"
!ls -al "$DST_DIR"


In [None]:
# Colab-only: mount Google Drive again, forcing a remount even if it is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


In [None]:
# List notebook files up to three directory levels below the Drive root.
!find "/content/drive/MyDrive" -maxdepth 3 -name "*.ipynb"


In [None]:
# Copy (rather than move) a notebook found by the search above into the task folder.
SRC = "/content/drive/MyDrive/Colab Notebooks/Copy of YourNotebook.ipynb"  # paste your path here
DST_DIR = "/content/drive/MyDrive/llamaindex-task1"

# `$NAME` in a `!` line is filled in from the Python variable of that name;
# the quotes keep paths containing spaces intact.
!mkdir -p "$DST_DIR"
!cp "$SRC" "$DST_DIR/Task1_Completed.ipynb"
!ls -al "$DST_DIR"
