
# PLP Backend (Colab Edition): Load `corpus_docs.jsonl` from Google Drive and Query

This notebook is **Colab-ready**. It:
- Mounts Google Drive and loads your curated corpus at:  
  `My Drive/Project_3/data/corpus_docs.jsonl`
- Builds a lightweight **TF–IDF → cosine** retrieval index
- Exposes a clean `answer(query, k=4, show_steps=True)` API with citations and optional CoT-style visible reasoning
- Includes quick smoke tests and an optional interactive loop (commented out by default; uncomment it in **Step 6** to use it)

> Edit the path in **Step 2** if your folder name differs.


## Step 0 — Install Dependencies

In [None]:
!pip -q install scikit-learn

## Step 1 — Mount Google Drive

In [None]:

# google.colab is the Colab helper package; this notebook is written for a Colab runtime.
from google.colab import drive
# Mount Drive at /content/drive — the corpus path configured in Step 2 is rooted here.
drive.mount('/content/drive')
print("Drive mounted.")



## Step 2 — Configure Paths

Default target:
```
My Drive/Project_3/data/corpus_docs.jsonl
```


In [None]:

from pathlib import Path

# Location of the curated corpus on the mounted Drive; edit if your folder name differs.
CORPUS_JSONL = Path('/content/drive/My Drive/Project_3/data/corpus_docs.jsonl')
print("Using corpus file:", CORPUS_JSONL)

# Fail fast with a real exception: `assert` is skipped under `python -O`, and
# `exists()` would also accept a directory, which would only fail later at open().
if not CORPUS_JSONL.is_file():
    raise FileNotFoundError(f"Corpus file not found at: {CORPUS_JSONL}")


## Step 3 — Backend: Data Types, Index, and API

In [None]:

import json, re, textwrap
from dataclasses import dataclass
from typing import List, Dict, Any, Tuple
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

@dataclass
class Chunk:
    """A retrievable span of text cut from one corpus document.

    Instances are produced by build_chunks() and carry the parent document's
    metadata so that answers can cite where each passage came from.
    """
    doc_id: str    # "id" of the source document
    chunk_id: str  # "<doc_id>::ch<n>", where n is the chunk's position within the document
    title: str     # source document title ("" when the document has none)
    url: str       # source document URL ("" when the document has none)
    text: str      # the chunk's sentences joined by single spaces

def simple_sentence_split(text: str) -> List[str]:
    """Split text into sentences using a lightweight punctuation heuristic.

    A sentence boundary is any run of whitespace that directly follows ``.``,
    ``!`` or ``?``; the terminator stays attached to the sentence it ends.
    The heuristic is purely typographic, so abbreviations such as ``e.g. ``
    are also treated as boundaries.

    Args:
        text: Raw text. Leading and trailing whitespace is ignored.

    Returns:
        The non-empty sentences in their original order; an empty list when
        ``text`` is empty or contains only whitespace.
    """
    # Relies on the module-level `re` import instead of re-importing per call.
    pieces = re.split(r'(?<=[.!?])\s+', text.strip())
    return [piece for piece in pieces if piece]

def build_chunks(docs: List[Dict[str, str]], max_sent_per_chunk: int = 2) -> List[Chunk]:
    """Cut each document into consecutive chunks of at most N sentences.

    Args:
        docs: Document records. Each must have an ``"id"`` key; ``"title"``,
            ``"url"`` and ``"text"`` are optional and default to ``""``.
            A value of ``None`` for any of those three is treated like a
            missing key.
        max_sent_per_chunk: Maximum number of sentences per chunk (>= 1).

    Returns:
        Chunks in document order, then sentence order. Documents whose text
        is empty or whitespace-only contribute no chunks.

    Raises:
        ValueError: If ``max_sent_per_chunk`` is smaller than 1.
        KeyError: If a document has no ``"id"`` key.
    """
    # Reject bad sizes up front: 0 would fail inside range() with an unrelated
    # message, and a negative step would silently produce no chunks at all.
    if max_sent_per_chunk < 1:
        raise ValueError(f"max_sent_per_chunk must be >= 1, got {max_sent_per_chunk}")

    chunks: List[Chunk] = []
    for d in docs:
        doc_id = d["id"]
        # `or ""` also covers keys that are present but null in the JSON
        # source; without it a null text would crash on `.strip()`.
        title = d.get("title") or ""
        url = d.get("url") or ""
        text = (d.get("text") or "").strip()
        if not text:
            continue
        sents = simple_sentence_split(text)
        # `n` is the chunk's ordinal within its document and feeds the chunk id.
        for n, start in enumerate(range(0, len(sents), max_sent_per_chunk)):
            chunks.append(Chunk(
                doc_id=doc_id,
                chunk_id=f"{doc_id}::ch{n}",
                title=title,
                url=url,
                text=" ".join(sents[start:start + max_sent_per_chunk]),
            ))
    return chunks

class TfidfIndex:
    """TF-IDF retrieval index over a fixed list of chunks.

    The vectorizer is fitted on the chunk texts using unigrams and bigrams
    with English stop words removed; queries are ranked by cosine similarity
    against the fitted matrix.
    """

    def __init__(self, chunks: List[Chunk]):
        """Fit the vectorizer on ``chunks``.

        Raises:
            ValueError: If ``chunks`` is empty (there is nothing to fit on).
        """
        # Explicit guard: fitting on an empty corpus would otherwise fail
        # inside the vectorizer with a message unrelated to the real cause.
        if not chunks:
            raise ValueError("Cannot build a TF-IDF index from an empty chunk list.")
        self.chunks = chunks
        self.vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words='english')
        self.matrix = self.vectorizer.fit_transform([c.text for c in chunks])

    def query(self, q: str, top_k: int = 5) -> List[Tuple[Chunk, float]]:
        """Return up to ``top_k`` chunks ranked by similarity to ``q``.

        Args:
            q: Free-text query.
            top_k: Maximum number of results; values <= 0 yield no results.

        Returns:
            ``(chunk, score)`` pairs in descending score order. Chunks with a
            score of 0.0 are not filtered out, so a query that shares no terms
            with the corpus still returns ``top_k`` entries.
        """
        # A negative top_k would otherwise slice off the tail ([:-n]) and
        # return almost the whole corpus.
        if top_k <= 0:
            return []
        qv = self.vectorizer.transform([q])
        sims = cosine_similarity(qv, self.matrix)[0]
        # Stable sort keeps tied scores in corpus order, so results are reproducible.
        order = np.argsort(-sims, kind="stable")[:top_k]
        return [(self.chunks[i], float(sims[i])) for i in order]

def cot_plan(query: str):
    """Return the fixed four-step reasoning outline shown next to answers.

    The outline is the same for every call; ``query`` is accepted but not used.
    """
    stages = (
        "Identify the finance concept(s) asked.",
        "Retrieve definitions/formulas and usage conditions.",
        "Cross-check top chunks for consistency and specificity.",
        "Compose a concise, grounded answer with citations.",
    )
    # Number the stages "1) ...", "2) ...", and so on.
    return [f"{number}) {stage}" for number, stage in enumerate(stages, start=1)]

def synthesize_answer(query: str, retrieved, show_steps=True):
    """Compose an answer dict from a query and its retrieved chunks.

    Args:
        query: The user's question.
        retrieved: Iterable of ``(chunk, score)`` pairs; each chunk must
            expose ``doc_id``, ``chunk_id``, ``url`` and ``text``.
        show_steps: When true, add a ``"reasoning_steps"`` entry holding the
            generic plan from cot_plan() followed by one bullet per chunk.

    Returns:
        A dict with ``"query"``, ``"answer"`` and ``"citations"`` (one
        ``{"doc_id", "chunk_id", "url"}`` dict per retrieved chunk), plus
        ``"reasoning_steps"`` when ``show_steps`` is true. With no keyword
        match and nothing retrieved, ``"answer"`` is the empty string.
    """
    # `textwrap` comes from the module-level import. Importing it inside the
    # loop below made it a function-local name, so an empty `retrieved` left
    # it unbound and the fallback branch raised UnboundLocalError.
    bullets = []
    for ch, score in retrieved:
        snippet = textwrap.shorten(ch.text, width=220, placeholder="…")
        bullets.append(f"- [{ch.doc_id}] {snippet} (score={score:.3f})")

    ql = query.lower()
    lines = []
    # Canned formulas are triggered by plain substring checks; note that "irr"
    # therefore also matches inside longer words such as "irrelevant".
    if "wacc" in ql or "cost of capital" in ql:
        lines.append("WACC = E/V·Re + D/V·Rd·(1−Tc). Use it when project risk is similar to the firm’s core assets.")
    if "npv" in ql or "irr" in ql:
        lines.append("NPV = Σ CFt/(1+r)^t − initial cost; accept if NPV>0. IRR sets NPV=0; best for conventional cash flows and comparable scales.")
    if not lines:
        # No canned formula applies: fall back to a truncated concatenation
        # of the retrieved text.
        lines.append(textwrap.shorten(" ".join([ch.text for ch, _ in retrieved]), width=500, placeholder="…"))

    citations = [{"doc_id": ch.doc_id, "chunk_id": ch.chunk_id, "url": ch.url} for ch, _ in retrieved]
    out = {"query": query, "answer": " ".join(lines), "citations": citations}
    if show_steps:
        out["reasoning_steps"] = cot_plan(query) + bullets
    return out

# Module-level retrieval state. Both names start as None and are rebound by the
# ingestion cell (Step 4); answer() refuses to run while _index is still None.
# Re-running this cell resets them, so Step 4 must be executed again afterwards.
_index = None
_chunks = None

def answer(query: str, k: int = 4, show_steps: bool=True):
    """Answer ``query`` from the module-level index.

    Retrieves the ``k`` best-matching chunks and hands them to
    synthesize_answer(), whose result dict is returned unchanged.

    Raises:
        AssertionError: If the index has not been built yet.
    """
    index = _index
    assert index is not None, "Index not built. Run the ingestion cell first."
    return synthesize_answer(query, index.query(query, top_k=k), show_steps=show_steps)


## Step 4 — Load Corpus JSONL and Build Index

In [None]:

def load_jsonl(path: Path, limit=None) -> List[Dict[str,str]]:
    """Read document records from a JSON Lines file.

    Args:
        path: File to read (UTF-8, one JSON object per line). Blank lines
            are skipped.
        limit: Maximum number of documents to load; ``None`` loads all of
            them and ``0`` loads none.

    Returns:
        One dict per document with the keys ``"id"``, ``"title"``, ``"url"``
        and ``"text"``; the last three default to ``""`` when absent.

    Raises:
        KeyError: If a record has no ``"id"`` field.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    docs = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            # `is not None` so that limit=0 means "load nothing" rather than
            # being mistaken for "no limit"; counting loaded documents keeps
            # skipped blank lines from eating into the limit.
            if limit is not None and len(docs) >= limit:
                break
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would reject.
            if not line.strip():
                continue
            rec = json.loads(line)
            docs.append({
                "id": rec["id"],
                "title": rec.get("title",""),
                "url": rec.get("url",""),
                "text": rec.get("text","")
            })
    return docs

# Ingestion pipeline: read the corpus, cut it into chunks, then fit the index.
docs = load_jsonl(CORPUS_JSONL)
print(f"Loaded {len(docs)} documents")

# Rebinds the module-level _chunks declared in Step 3 (two sentences per chunk).
_chunks = build_chunks(docs, max_sent_per_chunk=2)
print("Chunks:", len(_chunks))

# Rebinds the module-level _index; answer() checks that it is no longer None.
_index = TfidfIndex(_chunks)
print("Index built.")


## Step 5 — Smoke Tests

In [None]:

# Default show_steps=True: the result also carries a "reasoning_steps" list.
demo1 = answer("How do I compute WACC and when should I use it as a discount rate?")
# show_steps=False: only "query", "answer" and "citations" are returned.
demo2 = answer("Compare NPV and IRR and mention a limitation of payback.", show_steps=False)
# Bare last expression so the notebook displays both result dicts.
demo1, demo2


## Step 6 — Optional Interactive Loop

In [None]:

# while True:
#     q = input("Ask a finance question (or 'quit'): ").strip()
#     if q.lower() in {"quit","exit"}:
#         break
#     resp = answer(q, k=4, show_steps=True)
#     print("\nAnswer:", resp["answer"])
#     print("Citations:", json.dumps(resp["citations"], indent=2))
#     if "reasoning_steps" in resp:
#         print("Reasoning:")
#         for s in resp["reasoning_steps"]:
#             print("-", s)
