
## 1) Install dependencies


In [2]:

%%bash
# Quote the pydantic specifier so the shell cannot glob-expand `2.*` against
# files in the working directory before pip sees it.
pip -q install --upgrade duckduckgo_search httpx trafilatura tiktoken json5 rich litellm tenacity python-dotenv "pydantic==2.*"


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.9/42.9 kB 4.0 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 87.7/87.7 kB 8.3 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 462.4/462.4 kB 34.5 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 103.5 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 132.6/132.6 kB 16.6 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 243.4/243.4 kB 28.5 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10.2/10.2 MB 130.0 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 278.1/278.1 kB 31.7 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 837.9/837.9 kB 68.9 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.3/3.3 MB 125.8 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 315.5/315.5 kB 32.6 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 274.7/274.7 kB 28.5 MB/s eta 0:00:00


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-adk 1.17.0 requires tenacity<9.0.0,>=8.0.0, but you have tenacity 9.1.2 which is incompatible.
gradio 5.49.1 requires pydantic<2.12,>=2.0, but you have pydantic 2.12.3 which is incompatible.
bigframes 2.26.0 requires rich<14,>=12.4.4, but you have rich 14.2.0 which is incompatible.


In [3]:

import os, re, math, time, json, json5, traceback, hashlib
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional, Tuple
from tenacity import retry, stop_after_attempt, wait_random_exponential
from duckduckgo_search import DDGS
import httpx
import trafilatura
from trafilatura.settings import use_config
import tiktoken
from pydantic import BaseModel, Field
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.markdown import Markdown
from rich import box
from datetime import datetime
from dotenv import load_dotenv

# Load any keys injected in the environment (e.g., via Colab's secrets)
# NOTE: this runs at import time, before the os.environ lookups below, so
# values it loads can influence every setting in this section.
load_dotenv()

# Shared rich console used for all progress output in this file.
console = Console()

# All settings below are read ONCE, when this cell/module is executed.
# Changing the environment afterwards has no effect until the cell is re-run.
# The int()/float() conversions raise ValueError if an override is not numeric.

# ---- Model selection (edit this) ----
MODEL_NAME = os.environ.get("DEEP_RESEARCH_MODEL", "gpt-4o-mini")  # Example: "gpt-4o-mini" or "claude-3-5-sonnet-20241022" or "openrouter/auto"

# ---- Scratchpad budget (tokens) ----
# Default token budget for Scratchpad; exceeding it triggers summarization.
SCRATCHPAD_TOKEN_LIMIT = int(os.environ.get("SCRATCHPAD_TOKEN_LIMIT", "6000"))

# ---- Search parameters ----
# Results requested per search query, and the timeout handed to the shared
# httpx client used for page fetches.
MAX_RESULTS_PER_QUERY = int(os.environ.get("MAX_RESULTS_PER_QUERY", "5"))
FETCH_TIMEOUT = float(os.environ.get("FETCH_TIMEOUT", "15.0"))

# ---- Final report parameters ----
# deep_research() stops once the accumulated report reaches this many words.
TARGET_REPORT_WORDS = int(os.environ.get("TARGET_REPORT_WORDS", "1200"))

# ---- Agent depth ----
# Maximum number of reasoning rounds performed by deep_research().
MAX_DEPTH = int(os.environ.get("MAX_DEPTH", "4"))

# ---- Safe search ----
# Both values are forwarded unchanged to DDGS.text() by ddg_search().
# NOTE(review): duckduckgo_search appears to document "on"/"moderate"/"off" as
# the safesearch values — confirm that "Strict" is actually accepted.
SAFESEARCH = os.environ.get("SAFESEARCH", "Moderate")  # "Off", "Moderate", "Strict"
REGION = os.environ.get("REGION", "us-en")

# ---- Provider abstraction via LiteLLM ----
from litellm import completion

def llm_generate(messages: List[Dict[str, str]],
                 model: str = MODEL_NAME,
                 temperature: float = 0.2,
                 max_tokens: int = 1200) -> str:
    """Unified chat completion via LiteLLM (works with many providers).

    Args:
        messages: Chat messages, e.g.
            ``[{"role": "system", "content": ...}, {"role": "user", "content": ...}]``.
        model: Model identifier passed straight to ``litellm.completion``.
        temperature: Sampling temperature.
        max_tokens: Upper bound on generated tokens.

    Returns:
        The assistant text. An empty string is returned when the provider
        yields no text content (``None``), so callers can always treat the
        result as ``str``.

    Raises:
        RuntimeError: If the completion call or response unpacking fails. The
            original exception is attached as ``__cause__``.
    """
    try:
        resp = completion(model=model, messages=messages, temperature=temperature, max_tokens=max_tokens)
        # LiteLLM normalizes outputs
        content = resp.choices[0].message["content"]
    except Exception as e:
        # Chain the cause so the provider traceback is not lost.
        raise RuntimeError(f"LLM call failed: {e}") from e
    # Content can be None (e.g. refusals / tool-only replies); honour `-> str`.
    return content or ""


# ---- Tokenization utilities ----
def _get_encoder():
    """Return tiktoken's ``cl100k_base`` encoding, or ``None`` if unavailable.

    Any exception raised while obtaining the encoding is swallowed on purpose:
    token counting then falls back to a character-based estimate.
    """
    try:
        encoder = tiktoken.get_encoding("cl100k_base")
    except Exception:
        encoder = None
    return encoder


# Resolved once when this cell/module runs and shared by count_tokens().
_ENCODER = _get_encoder()

def count_tokens(text: str) -> int:
    """Count the tokens in *text*.

    Empty or falsy input counts as 0. With an encoder available the exact
    encoded length is returned; otherwise the count is estimated at about four
    characters per token, never less than 1 for non-empty text.
    """
    if not text:
        return 0
    if _ENCODER is not None:
        return len(_ENCODER.encode(text))
    # Rough fallback when no tokenizer could be loaded: ~4 chars per token.
    estimated = math.ceil(len(text) / 4)
    return max(1, estimated)

# ---- Robust JSON parsing ----
def try_parse_json(s: str) -> Dict[str, Any]:
    """Best-effort conversion of LLM output into a JSON object (dict).

    Candidates are tried in order: the whole (stripped) text, then the span
    from the first ``{`` to the last ``}`` (covers Markdown fences and
    surrounding chatter). Each candidate is parsed as strict JSON first and as
    JSON5 second.

    Only ``dict`` results are accepted. A valid but non-object document (list,
    number, string, bool) would otherwise be returned as-is and break callers
    that rely on ``.get()``.

    Args:
        s: Raw model output.

    Returns:
        The parsed object; ``{}`` for empty input; ``{"notes": <stripped text>}``
        when no JSON object could be recovered.
    """
    if not s:
        return {}

    text = s.strip()
    candidates = [text]
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end > start:
        fragment = text[start:end + 1]
        if fragment != text:
            candidates.append(fragment)

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except Exception:
            # Deliberately broad: parsing is best-effort and must never raise.
            try:
                parsed = json5.loads(candidate)
            except Exception:
                continue
        if isinstance(parsed, dict):
            return parsed

    # last resort: return as note
    return {"notes": text}

def hash_url(url: str) -> str:
    """Return the first 10 hex characters of the SHA-1 digest of the UTF-8 encoded *url*."""
    digest = hashlib.sha1(url.encode("utf-8"))
    hex_digest = digest.hexdigest()
    return hex_digest[:10]

# ---- Trafi config for better extraction ----
# Configuration object passed to trafilatura.extract() in fetch_content().
trafi_cfg = use_config()
trafi_cfg.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")   # let httpx handle timeouts
# NOTE(review): confirm "EXTRACTION_TARGET_LANGUAGE" is a key the installed
# trafilatura version actually reads; if not, this line has no effect.
trafi_cfg.set("DEFAULT", "EXTRACTION_TARGET_LANGUAGE", "en")

# Single module-level HTTP client reused for every page fetch. It applies
# FETCH_TIMEOUT, follows redirects, and sends a browser-like User-Agent.
# It is created at import time and never explicitly closed in this file.
session = httpx.Client(timeout=FETCH_TIMEOUT, follow_redirects=True, headers={
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118 Safari/537.36"
})

def fetch_content(url: str) -> Tuple[str, Optional[str]]:
    """Fetch *url* and return ``(extracted_text, error)``.

    The page is downloaded with the shared ``session`` client and passed to
    trafilatura. If trafilatura extracts nothing, a crude regex fallback is
    used: ``<script>``/``<style>`` blocks and HTML comments are removed
    together with their contents, remaining tags are dropped, and whitespace
    is collapsed.

    Args:
        url: Address of the page to fetch.

    Returns:
        ``(text, None)`` on success, or ``("", "<ExceptionName>: <message>")``
        on any failure. This function never raises.
    """
    try:
        r = session.get(url)
        r.raise_for_status()
        html = r.text
        extracted = trafilatura.extract(html, url=url, config=trafi_cfg, include_comments=False, include_links=False)
        if extracted:
            return extracted, None

        # fallback: strip tags crudely. Script/style bodies and comments are
        # removed first, otherwise JS/CSS source ends up in the "text".
        text = re.sub(r"(?is)<(script|style)\b.*?</\1\s*>", " ", html)
        text = re.sub(r"(?s)<!--.*?-->", " ", text)
        text = re.sub(r"<[^>]+>", " ", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text, None
    except Exception as e:
        # Best-effort by design: report the failure to the caller as a string.
        return "", f"{type(e).__name__}: {e}"

def ddg_search(query: str, max_results: int = MAX_RESULTS_PER_QUERY, region: str = REGION, safesearch: str = SAFESEARCH):
    """Run a DuckDuckGo text search and return normalized result dicts.

    Each returned dict has exactly the keys ``title``, ``href``, ``body`` and
    ``source``; a key missing from the raw result is filled with ``""``.
    """
    wanted_keys = ("title", "href", "body", "source")
    with DDGS() as ddgs:
        hits = ddgs.text(query, region=region, safesearch=safesearch, max_results=max_results)
        return [{key: hit.get(key, "") for key in wanted_keys} for hit in hits]

class SearchDoc(BaseModel):
    """One search hit together with the text extracted from its page."""

    # Result title as shown by the search engine.
    title: str
    # Address of the page.
    url: str
    # Short summary text from the search result.
    snippet: str
    # Extracted page text; None when not provided.
    text: Optional[str] = None
    # Default is derived from the current time in milliseconds, so two
    # instances created within the same millisecond share the same default.
    # Pass `id` explicitly when uniqueness matters.
    id: str = Field(default_factory=lambda: f"doc_{int(time.time()*1000)}" )
    # Defaults to 0.0; not read anywhere in the visible code.
    score: float = 0.0

class Scratchpad:
    """Token-budgeted list of research notes.

    Each entry is a dict ``{"depth": int, "notes": str, "citations": List[str],
    "tokens": int}``. ``total_tokens`` is the running sum of the ``tokens``
    values, counted on ``notes`` only.
    """

    def __init__(self, token_limit: int = SCRATCHPAD_TOKEN_LIMIT):
        """Create an empty scratchpad with the given token budget."""
        self.token_limit = token_limit
        self.entries: List[Dict[str, Any]] = []  # {"depth":int, "notes":str, "citations":List[str], "tokens":int}
        self.total_tokens = 0

    def add(self, depth: int, notes: str, citations: Optional[List[str]] = None):
        """Append a note and add its token count to the running total.

        The citations are copied, so later changes to the caller's list do not
        alter the stored entry.
        """
        tokens = count_tokens(notes)
        self.entries.append({
            "depth": depth,
            "notes": notes,
            "citations": list(citations or []),
            "tokens": tokens,
        })
        self.total_tokens += tokens

    def to_text(self) -> str:
        """Render all entries as ``[d=<depth>] <notes> [cite] ...`` blocks separated by blank lines."""
        chunks = []
        for e in self.entries:
            cites = " ".join(f"[{c}]" for c in e.get("citations", []))
            chunks.append(f"[d={e['depth']}] {e['notes']} {cites}".strip())
        return "\n\n".join(chunks)

    def _oldest_block_indexes(self, take: int = 4) -> List[int]:
        """Return the indexes of up to *take* entries from the front of the list."""
        # Take a small window from the oldest entries for summarization
        return list(range(0, min(take, len(self.entries))))

    def over_budget(self) -> bool:
        """Return True when the running token total exceeds the limit."""
        return self.total_tokens > self.token_limit

    def entries_token_count(self, idxs: List[int]) -> int:
        """Return the summed token counts of the entries at *idxs*."""
        return sum(self.entries[i]["tokens"] for i in idxs)

    def remove_and_insert_summary(self, idxs: List[int], summary: str):
        """Replace the entries at *idxs* with one summarized entry.

        The removed entries' citations are merged (de-duplicated and sorted)
        onto the summary. The summary is appended at the END of the list with
        ``depth=-1``, so the next compaction pass picks the next-oldest raw
        notes rather than re-summarizing this summary.

        Raises:
            IndexError: If any index is out of range.
        """
        # De-duplicate and pop from the highest index down so earlier removals
        # do not shift the positions of later ones.
        removed_tokens = 0
        citations = []
        for i in sorted(set(idxs), reverse=True):
            entry = self.entries.pop(i)
            removed_tokens += entry["tokens"]
            citations.extend(entry.get("citations", []))
        self.total_tokens -= removed_tokens
        self.add(depth=-1, notes=f"[Summarized] {summary}", citations=sorted(set(citations)))

    def __len__(self):
        """Return the number of entries."""
        return len(self.entries)

class ResearchState(BaseModel):
    """Mutable state carried through one deep_research() run."""

    # The user's research question.
    question: str
    # Number of reasoning rounds started so far.
    depth: int = 0
    # URLs already processed; used to skip duplicates across search rounds.
    seen_urls: set = Field(default_factory=set)
    # Fetched documents keyed by document id.
    docs: Dict[str, SearchDoc] = Field(default_factory=dict)
    # Draft report pieces, in the order they were produced.
    report_chunks: List[str] = Field(default_factory=list)
    citations: Dict[str, str] = Field(default_factory=dict)  # id -> url
    # Consecutive rounds that produced no new documents (or no queries).
    no_new_queries_rounds: int = 0

# ---- Orchestrator ----

# System message for every reasoning round (see reason_once). The JSON keys it
# requests are the ones deep_research() reads from the parsed reply: "notes",
# "next_queries", "final_report_chunk" and "should_stop". "confidence" is
# requested but not read anywhere in the visible code.
# This string is used verbatim (never passed through str.format), so its
# braces are literal.
SYSTEM_PROMPT = """You are a meticulous research analyst. You follow this loop:
1) Read the research question and the provided search snippets.
2) Write compact, factual notes into a scratchpad (bullet points or short paragraphs).
3) Propose 3-6 NEXT web search queries to go deeper **top-down** (broad to narrow, then to key sub-questions).
4) Decide whether we can stop. Stop only if the final report is coherent, sourced, and covers the key sub-questions.

You must output STRICT JSON with these keys:
{
  "notes": "short notes capturing what matters (with inline source ids like [doc_abc123])",
  "next_queries": ["..."],
  "final_report_chunk": "<= 200 words coherent draft chunk with inline [doc_id] citations when appropriate",
  "should_stop": false,
  "confidence": 0.0
}
"""

# Template for scratchpad compaction; filled with str.format(notes=...) in
# summarize_block().
SUMMARIZE_PROMPT = """Summarize the following scratchpad notes into a compact, loss-aware synthesis that preserves specific facts and internal references.
Return **plain text** under 250 words.

NOTES:
{notes}
"""

# Template for the first planning call; filled with str.format(question=...)
# in propose_initial_queries(). The doubled braces are str.format escapes that
# produce literal "{" and "}" in the final prompt.
INITIAL_QUERY_PROMPT = """You are planning top-down research for the user question below.
Propose 4-6 broad, high-yield web search queries that map the landscape. Return **JSON**: {{"next_queries": ["..."]}}.

Question: {question}
"""


def make_md_citation(url: str, doc_id: str) -> str:
    """Return a Markdown link whose label is *doc_id* and whose target is *url*."""
    return "[{}]({})".format(doc_id, url)

def collect_context_text(docs: Dict[str, SearchDoc], limit_chars: int = 6000) -> str:
    """Build a compact context bundle of titles, URLs and text excerpts.

    Each document contributes one block with its id, title, URL and up to 800
    characters of its text (falling back to its snippet).

    When the documents do not all fit into *limit_chars*, the most recently
    added ones are kept. Filling the budget oldest-first would mean documents
    fetched in later rounds never reach the model. The selected blocks are
    still emitted in their original insertion order, so the output is
    unchanged whenever everything fits.

    Args:
        docs: Documents keyed by id, in insertion (fetch) order.
        limit_chars: Maximum total length of the returned string.

    Returns:
        The concatenated blocks, or ``""`` if nothing fits.
    """
    selected: List[str] = []
    used = 0
    # Walk newest -> oldest and stop at the first block that would overflow.
    for doc_id, d in reversed(list(docs.items())):
        excerpt = (d.text or d.snippet or "")[:800]
        piece = f"{doc_id}: {d.title}\nURL: {d.url}\nEXCERPT: {excerpt}\n\n"
        if used + len(piece) > limit_chars:
            break
        selected.append(piece)
        used += len(piece)
    # Restore chronological order for the prompt.
    selected.reverse()
    return "".join(selected)

@retry(stop=stop_after_attempt(2), wait=wait_random_exponential(min=1, max=4))
def reason_once(question: str, docs: Dict[str, SearchDoc], scratchpad: Scratchpad) -> Dict[str, Any]:
    """Run one reasoning step over the current evidence and return the parsed reply.

    The user message contains the question, the search context built from
    *docs*, and the scratchpad text. The call is attempted at most twice with
    a randomized exponential wait between attempts.
    """
    search_context = collect_context_text(docs)
    user_content = (
        f"Question:\n{question}\n\n"
        f"Search context:\n{search_context}\n\n"
        f"Scratchpad so far:\n{scratchpad.to_text()}"
    )
    system_msg = {"role": "system", "content": SYSTEM_PROMPT}
    user_msg = {"role": "user", "content": user_content}
    raw_reply = llm_generate([system_msg, user_msg], model=MODEL_NAME, temperature=0.2, max_tokens=1100)
    return try_parse_json(raw_reply)

def propose_initial_queries(question: str) -> List[str]:
    """Ask the model for broad starting queries for *question*.

    The model's ``next_queries`` value is validated before use: a bare string
    is treated as a single query, any other non-list value is ignored, and
    only non-empty string items are kept (whitespace-trimmed).

    Args:
        question: The research question.

    Returns:
        Up to six query strings. If the model supplies no usable queries,
        three generic fallback queries built from *question* are returned.

    Raises:
        RuntimeError: Propagated from ``llm_generate`` when the LLM call fails.
    """
    out = llm_generate(
        [
            {"role": "system", "content": "You help write only JSON."},
            {"role": "user", "content": INITIAL_QUERY_PROMPT.format(question=question)}
        ],
        model=MODEL_NAME,
        temperature=0.3,
        max_tokens=300,
    )
    parsed = try_parse_json(out)
    raw = parsed.get("next_queries") if isinstance(parsed, dict) else None
    if isinstance(raw, str):
        # Slicing a bare string below would otherwise return a string
        # fragment instead of a list of queries.
        raw = [raw]
    elif not isinstance(raw, (list, tuple)):
        raw = []
    qs = [q.strip() for q in raw if isinstance(q, str) and q.strip()]
    # fallback
    if not qs:
        qs = [f"Overview of {question}", f"Key studies about {question}", f"Latest developments in {question}"]
    return qs[:6]

def summarize_block(llm_text: str) -> str:
    """Return a whitespace-trimmed LLM summary of the given scratchpad notes."""
    prompt = SUMMARIZE_PROMPT.format(notes=llm_text)
    messages = [
        {"role": "system", "content": "You write compact, loss-aware summaries."},
        {"role": "user", "content": prompt},
    ]
    summary = llm_generate(messages, model=MODEL_NAME, temperature=0.2, max_tokens=300)
    return summary.strip()

def recursively_summarize_if_needed(scratchpad: Scratchpad):
    """Compress the oldest scratchpad entries until the pad fits its budget.

    While the scratchpad is over budget and holds more than two entries, a
    window of the oldest entries (at most 6, at most half of the pad) is
    summarized by the LLM and replaced with one summary entry. At most 10
    passes are made.

    Compaction is best-effort and stops early, leaving the remaining entries
    untouched, when:
      * the summarization call fails (logged, not raised),
      * the summary comes back empty (replacing notes with nothing would lose
        them), or
      * a pass does not reduce the token total (further passes would only
        spend LLM calls).
    """
    # While over budget: compress a window from the oldest entries
    guard = 0
    while scratchpad.over_budget() and guard < 10 and len(scratchpad) > 2:
        idxs = scratchpad._oldest_block_indexes(take=min(6, len(scratchpad) // 2))
        notes_text = "\n\n".join(scratchpad.entries[i]["notes"] for i in idxs)
        tokens_before = scratchpad.total_tokens
        try:
            summary = summarize_block(notes_text)
        except Exception as e:
            # markup=False: the error text may contain "[...]" sequences.
            console.log(f"Scratchpad summarization failed; keeping entries as-is: {e}",
                        style="red", markup=False)
            break
        if not summary:
            console.log("[yellow]Summarizer returned no text; keeping entries as-is.")
            break
        scratchpad.remove_and_insert_summary(idxs, summary)
        guard += 1
        console.log(f"[yellow]Scratchpad over budget → summarized {len(idxs)} entries.")
        if scratchpad.total_tokens >= tokens_before:
            console.log("[yellow]Summarization did not shrink the scratchpad; stopping compaction.")
            break


def run_search_round(queries: List[str], state: ResearchState):
    """Search each query, fetch unseen result pages, and record them in *state*.

    For every result URL not yet in ``state.seen_urls`` the page is fetched and
    stored in ``state.docs`` under ``doc_<hash>``. The stored ``SearchDoc.id``
    is set to that same key, so serialized documents match the ids used in
    citations. Page text is truncated to 6000 characters. A document is
    recorded even when its fetch fails (with empty text); the failure is
    logged.

    A search that raises is logged and skipped, so one failing query (e.g. a
    rate limit) does not abort the whole round.

    Args:
        queries: Search queries to run.
        state: Research state, mutated in place (docs, citations, seen_urls).

    Returns:
        The number of documents added during this round.
    """
    new_docs = 0
    for q in queries:
        try:
            results = ddg_search(q, max_results=MAX_RESULTS_PER_QUERY)
        except Exception as e:
            # markup=False: queries and error text may contain "[...]" sequences.
            console.log(f"Search failed for query {q!r}: {type(e).__name__}: {e}",
                        style="yellow", markup=False)
            continue
        for r in results:
            url = r.get("href") or r.get("url") or ""
            if not url or url in state.seen_urls:
                continue
            # `or ""` guards against explicit None values in a result.
            title = (r.get("title") or "").strip() or url
            snippet = (r.get("body") or "").strip()
            text, err = fetch_content(url)
            if err:
                console.log(f"Fetch failed for {url}: {err}", style="dim", markup=False)
            doc_id = f"doc_{hash_url(url)}"
            state.docs[doc_id] = SearchDoc(
                id=doc_id, title=title, url=url, snippet=snippet, text=text[:6000] if text else ""
            )
            state.citations[doc_id] = url
            state.seen_urls.add(url)
            new_docs += 1
    return new_docs

def pretty_results_table(state: ResearchState, top_k: int = 10) -> Table:
    """Build a rich table listing the first *top_k* fetched documents.

    Columns are the document id, the title and the URL; title and URL are each
    truncated to 80 characters.
    """
    table = Table(title="Fetched documents", box=box.SIMPLE_HEAVY)
    table.add_column("doc_id", style="bold")
    table.add_column("Title")
    table.add_column("URL")
    shown = list(state.docs.items())[:top_k]
    for doc_id, doc in shown:
        table.add_row(doc_id, doc.title[:80], doc.url[:80])
    return table

def render_report(report_chunks: List[str], citations: Dict[str, str]) -> str:
    """Join the report chunks and turn inline ``[doc_id]`` markers into Markdown links.

    Args:
        report_chunks: Draft report pieces, joined with blank lines.
        citations: Mapping of document id to URL.

    Returns:
        The stripped report text. A ``[doc_xxx]`` marker with a known URL
        becomes ``[doc_xxx](url)``. Markers with no known URL are left as-is,
        and so are markers that are already followed by ``(`` (i.e. already a
        Markdown link), which would otherwise get the URL appended twice.
    """
    report = "\n\n".join(report_chunks).strip()

    # Expand inline [doc_id] to markdown links
    def repl(m):
        doc_id = m.group(1)
        url = citations.get(doc_id)
        return f"[{doc_id}]({url})" if url else m.group(0)

    # The negative lookahead skips markers that already carry a link target.
    return re.sub(r"\[(doc_[a-zA-Z0-9]+)\](?!\()", repl, report)

def save_artifacts(question: str, scratchpad: Scratchpad, state: ResearchState, out_dir: str = "artifacts") -> Dict[str,str]:
    """Write the scratchpad, session state and rendered report into *out_dir*.

    Creates *out_dir* if needed, then writes ``scratchpad.json``,
    ``session.json`` and ``report.md`` (in that order, UTF-8 encoded).

    Returns:
        A dict mapping ``"scratchpad"``, ``"session"`` and ``"report"`` to the
        paths of the written files.
    """
    os.makedirs(out_dir, exist_ok=True)
    paths = {
        "scratchpad": os.path.join(out_dir, "scratchpad.json"),
        "session": os.path.join(out_dir, "session.json"),
        "report": os.path.join(out_dir, "report.md"),
    }

    with open(paths["scratchpad"], "w", encoding="utf-8") as fh:
        scratch_payload = {
            "token_limit": scratchpad.token_limit,
            "total_tokens": scratchpad.total_tokens,
            "entries": scratchpad.entries,
        }
        json.dump(scratch_payload, fh, ensure_ascii=False, indent=2)

    with open(paths["session"], "w", encoding="utf-8") as fh:
        session_payload = {
            "question": question,
            "depth": state.depth,
            "docs": {doc_id: doc.model_dump() for doc_id, doc in state.docs.items()},
            "citations": state.citations,
            "report_chunks": state.report_chunks,
        }
        json.dump(session_payload, fh, ensure_ascii=False, indent=2)

    rendered = render_report(state.report_chunks, state.citations)
    with open(paths["report"], "w", encoding="utf-8") as fh:
        fh.write(f"# Research Report\n\n**Question:** {question}\n\n{rendered}\n")

    return paths

def _coerce_text(value: Any) -> str:
    """Turn a model-supplied field (str, list of bullets, None, ...) into stripped text."""
    if value is None:
        return ""
    if isinstance(value, (list, tuple)):
        return "\n".join(str(item) for item in value if item is not None).strip()
    return str(value).strip()


def _coerce_flag(value: Any) -> bool:
    """Interpret a model-supplied boolean, including string forms such as "false"."""
    if isinstance(value, str):
        return value.strip().lower() in {"true", "yes", "1"}
    return bool(value)


def deep_research(question: str,
                  max_depth: int = MAX_DEPTH,
                  target_report_words: int = TARGET_REPORT_WORDS,
                  scratchpad_token_limit: int = SCRATCHPAD_TOKEN_LIMIT) -> Dict[str, Any]:
    """Run the iterative search -> reason -> refine loop for *question*.

    After an initial search round, each iteration:
      1. asks the model to reason over the collected documents and scratchpad,
      2. stores its notes (compacting the scratchpad if over budget),
      3. appends its draft chunk to the report,
      4. runs the proposed follow-up searches, unless the loop is about to
         stop anyway.

    The loop stops when the model asks to stop, ``max_depth`` rounds have run,
    the report reaches ``target_report_words`` words, two consecutive rounds
    bring in no new documents, or a reasoning call fails. Artifacts are saved
    in every case.

    Args:
        question: The research question.
        max_depth: Maximum number of reasoning rounds.
        target_report_words: Report length (in words) at which to stop.
        scratchpad_token_limit: Token budget for the scratchpad.

    Returns:
        ``{"report_md": <rendered Markdown report>, "artifacts": <dict of file paths>}``.

    Raises:
        RuntimeError: If the initial query-planning LLM call fails.
    """
    console.rule(f"[bold cyan]Deep Research: {question}")
    scratchpad = Scratchpad(token_limit=scratchpad_token_limit)
    state = ResearchState(question=question)

    # 0) Initial top-down queries
    init_queries = propose_initial_queries(question)
    console.print(Panel.fit("\n".join(f"• {q}" for q in init_queries), title="Initial queries", border_style="cyan"))
    added = run_search_round(init_queries, state)
    console.print(pretty_results_table(state))
    if added == 0:
        console.print("[yellow]No documents found in the first round. You may want to try rephrasing the question.")

    # Main loop
    while True:
        state.depth += 1
        console.rule(f"Depth {state.depth}")

        # 1) Reason over current evidence
        try:
            step = reason_once(question, state.docs, scratchpad)
        except Exception as e:
            console.print(f"[red]Reasoning failed:[/red] {e}\n{traceback.format_exc()}")
            break
        if not isinstance(step, dict):
            # A non-object reply is kept as free-form notes.
            step = {"notes": step}

        # The model may return null or a list of bullets instead of a string.
        notes = _coerce_text(step.get("notes"))
        if notes:
            # Extract inline [doc_id] refs to keep citations
            cites = re.findall(r"\[(doc_[a-zA-Z0-9]+)\]", notes)
            scratchpad.add(state.depth, notes, citations=cites)
            recursively_summarize_if_needed(scratchpad)

        # 2) Update report
        chunk = _coerce_text(step.get("final_report_chunk"))
        if chunk:
            state.report_chunks.append(chunk)

        # 3) Stop conditions that do not depend on this round's search.
        # Words are counted on the raw chunks: the rendered report expands
        # citations into links, and URL fragments would inflate the count.
        model_stop = _coerce_flag(step.get("should_stop"))
        words_so_far = len(re.findall(r"\w+", "\n\n".join(state.report_chunks)))
        stop_before_search = (
            model_stop
            or state.depth >= max_depth
            or words_so_far >= target_report_words
        )

        # 4) Next queries. A bare string is one query, not a sequence of characters.
        raw_queries = step.get("next_queries")
        if isinstance(raw_queries, str):
            raw_queries = [raw_queries]
        elif not isinstance(raw_queries, (list, tuple)):
            raw_queries = []
        queries = [q for q in raw_queries if q and isinstance(q, str)]

        new_docs = 0
        if stop_before_search:
            # Nothing would ever read the fetched pages; skip the network work.
            pass
        elif queries:
            new_docs = run_search_round(queries, state)
            if new_docs == 0:
                state.no_new_queries_rounds += 1
            else:
                state.no_new_queries_rounds = 0
        else:
            state.no_new_queries_rounds += 1

        # 5) Termination checks
        should_stop = stop_before_search or state.no_new_queries_rounds >= 2

        console.print(Panel.fit(
            f"Report words: {words_so_far} / {target_report_words}\nScratchpad tokens: {scratchpad.total_tokens} / {scratchpad.token_limit}\nNew docs this round: {new_docs}\nNo-new-query rounds: {state.no_new_queries_rounds}\nShould stop (model): {model_stop}",
            title="Progress", border_style="magenta"))

        if should_stop:
            console.print("[green]Stopping criteria met.")
            break

    # Save artifacts
    paths = save_artifacts(question, scratchpad, state, out_dir="artifacts")
    console.print(Panel.fit("Saved artifacts:\n- " + "\n- ".join(f"{k}: {v}" for k, v in paths.items()),
                            title="Artifacts", border_style="green"))
    return {
        "report_md": render_report(state.report_chunks, state.citations),
        "artifacts": paths
    }


In [None]:

# Research question to investigate; edit this string to study something else.
QUESTION = "What are the most promising approaches to decarbonizing long-haul aviation by 2040, and what are the key technical and economic constraints?"

# Run the full research loop with the module-level settings. This performs
# web searches, page fetches and LLM calls, and writes files under "artifacts".
result = deep_research(QUESTION, max_depth=MAX_DEPTH, target_report_words=TARGET_REPORT_WORDS, scratchpad_token_limit=SCRATCHPAD_TOKEN_LIMIT)

# Show the rendered Markdown report.
print("\n\n===== FINAL REPORT (Markdown) =====\n")
print(result["report_md"])

# List where each artifact was written.
print("\n\nFiles saved:")
for k, v in result["artifacts"].items():
    print(f"- {k}: {v}")