
# PLP Frontend (Colab Edition): Gradio/Streamlit UI for the Corporate-Finance Backend

Works with your Colab backend corpus at `My Drive/Project_3/data/corpus_docs.jsonl`.

- **Gradio UI** (in-notebook)
- **Streamlit app** (optional, via ngrok)

Shared backend API: `init_backend(corpus_jsonl_path)`, `answer(query, k=4, show_steps=True)`.


## Step 0 — Install Dependencies

In [None]:
!pip -q install scikit-learn gradio streamlit pyngrok==4.1.1

## Step 1 — Mount Google Drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the corpus path used in Step 3
# lives under this mount point.
drive.mount('/content/drive')
print("Drive mounted.")

## Step 2 — Write Shared Backend Module (`plp_backend_colab.py`)

In [None]:

code = r'''
# plp_backend_colab.py
from dataclasses import dataclass
from typing import List, Dict, Any, Tuple
import json, re, textwrap
from pathlib import Path
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

@dataclass
class Chunk:
    """A short, retrievable slice of one corpus document.

    Chunks are the unit that gets indexed, scored, and cited.
    """
    # Value of the source record's "id" field.
    doc_id: str
    # "<doc_id>::ch<n>", where n is the chunk's position within its document.
    chunk_id: str
    # Document title; "" when the record carries none.
    title: str
    # Document URL; "" when the record carries none.
    url: str
    # The chunk's sentences joined by single spaces.
    text: str

_chunks = None
_index = None

def _simple_sentence_split(text: str):
    """Split *text* into sentences with a lightweight regex.

    A boundary is any run of whitespace that directly follows ".", "!" or
    "?". The terminator stays attached to its sentence. Empty pieces are
    dropped, so empty or whitespace-only input yields an empty list.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text.strip())
    return list(filter(None, pieces))

def _build_chunks(docs: List[Dict[str,str]], max_sent_per_chunk: int = 2) -> List[Chunk]:
    """Cut every document into consecutive groups of sentences.

    Args:
        docs: Records with a required "id" key and optional "title", "url"
            and "text" keys.
        max_sent_per_chunk: Number of sentences per chunk; the last chunk of
            a document may hold fewer.

    Returns:
        All chunks in document order. Documents whose text is empty after
        stripping contribute nothing.
    """
    built: List[Chunk] = []
    for record in docs:
        doc_id = record["id"]
        title = record.get("title","")
        url = record.get("url","")
        body = record.get("text","").strip()
        if body:
            sentences = _simple_sentence_split(body)
            window_starts = range(0, len(sentences), max_sent_per_chunk)
            # ordinal is the chunk's running number within this document.
            for ordinal, start in enumerate(window_starts):
                window = sentences[start:start+max_sent_per_chunk]
                built.append(Chunk(
                    doc_id=doc_id,
                    chunk_id=f"{doc_id}::ch{ordinal}",
                    title=title,
                    url=url,
                    text=" ".join(window)
                ))
    return built

class _TfidfIndex:
    """TF-IDF index over chunk texts, ranked by cosine similarity."""

    def __init__(self, chunks: List[Chunk]):
        """Fit a unigram+bigram TF-IDF model (English stop words removed) on *chunks*."""
        self.chunks = chunks
        self.vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words='english')
        corpus_texts = [chunk.text for chunk in chunks]
        self.matrix = self.vectorizer.fit_transform(corpus_texts)

    def query(self, q: str, top_k: int = 5) -> List[Tuple[Chunk, float]]:
        """Return up to *top_k* ``(chunk, similarity)`` pairs, best match first."""
        query_vec = self.vectorizer.transform([q])
        scores = cosine_similarity(query_vec, self.matrix)[0]
        # Negating the scores turns the ascending argsort into a descending ranking.
        ranked = np.argsort(-scores)[:top_k]
        hits = []
        for pos in ranked:
            hits.append((self.chunks[pos], float(scores[pos])))
        return hits

def _cot_plan(query: str):
    """Return the fixed four-step reasoning outline shown to the user.

    The outline is the same for every query; *query* is accepted but not
    used. A new list is returned on each call.
    """
    outline = (
        "1) Identify the finance concept(s) asked.",
        "2) Retrieve definitions/formulas and usage conditions.",
        "3) Cross-check top chunks for consistency and specificity.",
        "4) Compose a concise, grounded answer with citations.",
    )
    return list(outline)

def _synthesize_answer(query: str, retrieved, show_steps=True):
    """Build the response dict for *query* from the retrieved chunks.

    Args:
        query: The user's question.
        retrieved: Iterable of ``(chunk, score)`` pairs; each chunk exposes
            ``doc_id``, ``chunk_id``, ``url`` and ``text``.
        show_steps: When true, add a "reasoning_steps" list made of the fixed
            plan followed by one scored snippet per retrieved chunk.

    Returns:
        A dict with "query", "answer" and "citations" keys, plus
        "reasoning_steps" when *show_steps* is true. The answer is a canned
        formula when the query mentions WACC / cost of capital or NPV / IRR;
        otherwise it is the retrieved text shortened to 500 characters.
    """
    bullets = []
    for ch, score in retrieved:
        snippet = textwrap.shorten(ch.text, width=220, placeholder="…")
        bullets.append(f"- [{ch.doc_id}] {snippet} (score={score:.3f})")
    ql = query.lower()
    lines = []
    if "wacc" in ql or "cost of capital" in ql:
        lines.append("WACC = E/V·Re + D/V·Rd·(1−Tc). Use it when project risk is similar to the firm’s core assets.")
    # Match NPV/IRR as whole words (optionally plural). A plain substring test
    # also fired on unrelated words such as "irrelevant" or "irregular".
    if re.search(r"\b(?:npv|irr)s?\b", ql):
        lines.append("NPV = Σ CFt/(1+r)^t − initial cost; accept if NPV>0. IRR sets NPV=0; best for conventional cash flows and comparable scales.")
    if not lines:
        # No canned formula applies: fall back to the retrieved text itself.
        lines.append(textwrap.shorten(" ".join(ch.text for ch, _ in retrieved), width=500, placeholder="…"))
    citations = [{"doc_id": ch.doc_id, "chunk_id": ch.chunk_id, "url": ch.url} for ch, _ in retrieved]
    out = {"query": query, "answer": " ".join(lines), "citations": citations}
    if show_steps:
        out["reasoning_steps"] = _cot_plan(query) + bullets
    return out

def init_backend(corpus_jsonl_path: str, max_sent_per_chunk: int = 2) -> int:
    """Load a JSONL corpus, chunk it, and (re)build the module-level index.

    Args:
        corpus_jsonl_path: Path to a UTF-8 file with one JSON object per
            line. Each object needs an "id"; "title", "url" and "text" are
            optional and default to "".
        max_sent_per_chunk: Number of sentences grouped into each chunk.

    Returns:
        The number of chunks that were indexed.

    Raises:
        OSError: If the corpus file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "id" field.
    """
    global _chunks, _index
    docs: List[Dict[str,str]] = []
    p = Path(corpus_jsonl_path)
    with p.open("r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (typically a trailing empty line at the end of the
            # file) are not records; json.loads would raise on them.
            if not line.strip():
                continue
            rec = json.loads(line)
            docs.append({
                "id": rec["id"],
                "title": rec.get("title",""),
                "url": rec.get("url",""),
                "text": rec.get("text","")
            })
    _chunks = _build_chunks(docs, max_sent_per_chunk=max_sent_per_chunk)
    _index = _TfidfIndex(_chunks)
    return len(_chunks)

def answer(query: str, k: int = 4, show_steps: bool=True):
    """Answer *query* from the top-*k* chunks of the initialized index.

    Args:
        query: The user's question.
        k: Number of chunks to retrieve.
        show_steps: Whether to include "reasoning_steps" in the result.

    Returns:
        The response dict produced by ``_synthesize_answer``.

    Raises:
        AssertionError: If ``init_backend()`` has not been called yet.
    """
    # Raise explicitly rather than with an assert statement: asserts are
    # removed under python -O, which would turn this into an opaque
    # AttributeError on None. The exception type and message are unchanged.
    if _index is None:
        raise AssertionError("Backend not initialized. Call init_backend().")
    retrieved = _index.query(query, top_k=k)
    return _synthesize_answer(query, retrieved, show_steps=show_steps)
'''
# Write the backend source to disk so later cells (and the Streamlit app) can
# import it. The context manager closes the handle deterministically, and
# UTF-8 is set explicitly because the source contains non-ASCII characters.
with open('/content/plp_backend_colab.py', 'w', encoding='utf-8') as _fh:
    _fh.write(code)
print("Wrote /content/plp_backend_colab.py")


## Step 3 — Initialize Backend with Drive Corpus

In [None]:
from plp_backend_colab import init_backend, answer
# Location of the JSONL corpus on the mounted Drive.
CORPUS_JSONL = '/content/drive/My Drive/Project_3/data/corpus_docs.jsonl'  # change if needed
# Build the in-memory TF-IDF index for this kernel; returns the chunk count.
n_chunks = init_backend(CORPUS_JSONL, max_sent_per_chunk=2)
print("Backend ready. Chunks:", n_chunks)
# Smoke test: a single query without the reasoning steps.
print(answer("What is WACC?", show_steps=False)["answer"])

## Step 4 — Gradio UI (in-notebook)

In [None]:

import json, gradio as gr
from plp_backend_colab import answer

def gradio_answer(q, k, show):
    """Gradio callback: run the backend and format its reply for two textboxes.

    Args:
        q: The question text from the UI (may be empty or None).
        k: Slider value for the number of chunks; coerced to int.
        show: Checkbox value; when truthy the reasoning steps are appended.

    Returns:
        ``(answer_text, citations_json)``. A blank question short-circuits
        with a prompt message and an empty citations string.
    """
    if not (q and q.strip()):
        return "Please enter a question.", ""

    result = answer(q, k=int(k), show_steps=bool(show))
    citations_json = json.dumps(result["citations"], indent=2)
    steps = "\n".join(result.get("reasoning_steps", []))
    text = result["answer"]
    if show and steps:
        text = text + "\n\n-- Visible Reasoning --\n" + steps
    return text, citations_json

# Declarative Gradio layout: inputs (question, top-k, reasoning toggle),
# an "Ask" button, and two output boxes fed by gradio_answer.
with gr.Blocks() as demo:
    gr.Markdown("# 📘 PLP – Corporate Finance (Gradio)")
    q = gr.Textbox(label="Question", placeholder="How do I compute WACC and when to use it?")
    # Number of chunks to retrieve: 2 to 8 in steps of 1, default 4.
    k = gr.Slider(2, 8, value=4, step=1, label="Top-k")
    show = gr.Checkbox(label="Show Reasoning (CoT-style)", value=True)
    btn = gr.Button("Ask")
    ans = gr.Textbox(label="Answer", lines=8)
    cites = gr.Textbox(label="Citations (JSON)", lines=8)
    # The callback's two return values map onto [ans, cites] in order.
    btn.click(fn=gradio_answer, inputs=[q,k,show], outputs=[ans,cites])

# Uncomment to launch:
# demo.launch()


## Step 5 — Optional: Streamlit + ngrok

In [None]:

# Write Streamlit app file
app = r'''
import os
import streamlit as st, json
from plp_backend_colab import answer, init_backend

# Keep this default in sync with CORPUS_JSONL in the notebook; override it by
# setting the PLP_CORPUS_JSONL environment variable before launching.
CORPUS_JSONL = os.environ.get(
    "PLP_CORPUS_JSONL",
    "/content/drive/My Drive/Project_3/data/corpus_docs.jsonl",
)

st.set_page_config(page_title="PLP – Corporate Finance", page_icon="📘", layout="wide")
st.title("📘 PLP – Corporate Finance (Streamlit)")


@st.cache_resource
def _load_backend(corpus_path):
    """Build the backend index once per Streamlit server process."""
    return init_backend(corpus_path)


# `streamlit run` starts its own Python process, so the index built in the
# notebook kernel does not exist here. Without this call every query would
# fail with "Backend not initialized".
_load_backend(CORPUS_JSONL)

k = st.sidebar.slider("Top-k Chunks", 2, 8, 4, 1)
show = st.sidebar.checkbox("Show Reasoning (CoT-style)", value=True)

q = st.text_input("Enter your question:", placeholder="e.g., Compare NPV and IRR")
if st.button("Ask"):
    if not q.strip():
        st.warning("Please enter a question.")
    else:
        resp = answer(q, k=k, show_steps=show)
        st.subheader("Answer")
        st.write(resp["answer"])
        st.subheader("Citations")
        st.json(resp["citations"])
        if show and "reasoning_steps" in resp:
            st.subheader("Reasoning (Visible)")
            for step in resp["reasoning_steps"]:
                st.markdown(f"- {step}")
'''
# Close the handle deterministically and write UTF-8 explicitly: the app
# source contains non-ASCII characters (emoji, en dash).
with open('/content/plp_streamlit_app_colab.py', 'w', encoding='utf-8') as _fh:
    _fh.write(app)
print("Wrote /content/plp_streamlit_app_colab.py")

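# Optional: start Streamlit via ngrok (uncomment to run)
# from pyngrok import ngrok
# tunnel = ngrok.connect(8501)
# # pyngrok < 5 (as pinned in Step 0) returns the public URL as a plain str;
# # pyngrok >= 5 returns an NgrokTunnel object with a .public_url attribute.
# public_url = getattr(tunnel, "public_url", tunnel)
# print("Public URL:", public_url)
# !streamlit run /content/plp_streamlit_app_colab.py --server.port 8501 --server.address 0.0.0.0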



## How to Link Frontend & Backend

1. Run **Steps 0–3** to install deps, mount Drive, and initialize the backend with your JSONL corpus.
2. **Gradio** (Step 4): uncomment `demo.launch()` and run to get a local URL.
3. **Streamlit** (Step 5): write the app file; optionally start an ngrok tunnel and run Streamlit.

Both UIs import `answer()` from `plp_backend_colab.py`. You can swap TF–IDF for embeddings or plug in an LLM
later without changing UI code, as long as `answer()` keeps returning `{answer, citations, reasoning_steps?}`.
