<a href="https://colab.research.google.com/github/nischay1100/OpenDeepResearcher/blob/main/Milestone1_ScopePhase.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Cell 1 — Install required libraries

In [1]:
!pip install --upgrade langgraph google-generativeai tavily-python --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.3/153.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.9/43.9 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.1/56.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m216.7/216.7 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h

Imports and API key configuration

In [2]:
from getpass import getpass
import json, re, traceback, time
from typing import TypedDict

# LangGraph StateGraph
from langgraph.graph import StateGraph

# Google Gemini client
import google.generativeai as genai

# Tavily client
from tavily import TavilyClient

# Enter API keys
GENAI_API_KEY = getpass("Enter Google Gemini API Key: ")
TAVILY_API_KEY = getpass("Enter Tavily API Key: ")

# Configure clients
genai.configure(api_key=GENAI_API_KEY)
tavily = TavilyClient(api_key=TAVILY_API_KEY)


Enter Google Gemini API Key: ··········
Enter Tavily API Key: ··········


Cell 3 — State type definition

In [3]:
# Define the state structure for LangGraph nodes
class ResearchState(TypedDict, total=False):
    user_input: str
    clarification: str
    query: str
    summary: str
    pipeline: str


Cell 4 — Utility: safe response extractor

In [4]:
# Utility to safely extract text from Google generative responses
def safe_extract_genai_text(response):
    """
    Given a genai response object, try several known access patterns and return plain text.
    """
    try:
        # preferred: genai response may have .text
        if hasattr(response, "text") and response.text:
            return response.text.strip()
    except Exception:
        pass

    try:
        # older interface: candidates --> content --> parts
        cand = response.candidates[0]
        # try direct text
        if hasattr(cand, "content"):
            # some libs put text in content.parts[0].text
            content = getattr(cand, "content", None)
            if content and hasattr(content, "parts"):
                parts = content.parts
                if parts and len(parts) > 0 and getattr(parts[0], "text", None):
                    return parts[0].text.strip()
        # fallback to candidate.text
        if hasattr(cand, "text") and cand.text:
            return cand.text.strip()
    except Exception:
        pass

    # Last resort: stringify
    try:
        return str(response)
    except:
        return ""


Cell 5 — Clarification agent

In [5]:
# Clarification agent
def clarification_agent(state: ResearchState) -> ResearchState:
    user_input = state.get("user_input", "").strip()
    if not user_input:
        state["clarification"] = "Could you type your question?"
        return state

    # Fast rule-based checks
    if re.search(r"\bmy name is\b", user_input.lower()):
        state["clarification"] = "This request is clear"
        return state

    if re.search(r"\b(previous|last)\s*(que|question|query|sawal)\b", user_input.lower()):
        state["clarification"] = "This request is clear"
        return state

    # Ask Gemini to categorize the clarity
    prompt = f"""
You are a system that assesses clarity of user research questions.

Question: "{user_input}"

Classify the question into one of:
- clear
- vague_guessable
- too_vague

If you return "vague_guessable", provide a short "refined_question" that is a reasonable interpretation.
Respond ONLY in JSON with keys: "status" and "refined_question" (string or empty).
Example:
{{"status":"vague_guessable", "refined_question":"..."}}
"""

    try:
        model = genai.GenerativeModel("gemini-1.5-flash")
        response = model.generate_content(prompt)
        text_out = safe_extract_genai_text(response)
        # attempt to parse JSON from the response
        parsed = {}
        try:
            parsed = json.loads(text_out)
        except Exception:
            # try to find a JSON substring
            m = re.search(r"\{.*\}", text_out, flags=re.DOTALL)
            if m:
                try:
                    parsed = json.loads(m.group(0))
                except:
                    parsed = {}
        status = parsed.get("status", "").lower()
        refined = parsed.get("refined_question", "").strip()
    except Exception:
        traceback.print_exc()
        status, refined = "clear", ""

    if status == "clear":
        state["clarification"] = "This request is clear"
    elif status == "vague_guessable" and refined:
        state["clarification"] = refined
    else:
        state["clarification"] = "Could you provide more details about your question?"

    return state


Cell 6 — Query generator node

In [6]:
# Query generator
def query_generator(state: ResearchState) -> ResearchState:
    clarification = state.get("clarification", "")
    user_input = state.get("user_input", "")

    if clarification == "This request is clear":
        state["query"] = user_input
    elif clarification.startswith("Could you provide"):
        state["query"] = f"{user_input} (needs clarification: {clarification})"
    else:
        # If we have a refined phrasing, use it
        state["query"] = clarification or user_input

    return state


Cell 7 — Search decision, Tavily wrapper, and research pipeline

In [7]:
# decide_search + tavily wrapper + research pipeline
def decide_search(query: str) -> bool:
    """Ask Gemini whether a web search is required. Returns True if search is needed."""
    try:
        prompt = f"""
You are a decision module. Given a research question, answer whether it requires real-time web search
or can be answered from general knowledge (no web search). Return JSON: {{"need_search": true/false}}.

Question: "{query}"
"""
        model = genai.GenerativeModel("gemini-1.5-flash")
        response = model.generate_content(prompt)
        text_out = safe_extract_genai_text(response)
        # parse JSON
        parsed = {}
        try:
            parsed = json.loads(text_out)
        except:
            m = re.search(r"\{.*\}", text_out, flags=re.DOTALL)
            if m:
                try:
                    parsed = json.loads(m.group(0))
                except:
                    parsed = {}
        return bool(parsed.get("need_search", True))
    except Exception:
        return True  # conservative default

def tavily_search(query: str, max_results: int = 5):
    """
    Wrapper for TavilyClient search.
    Returns a list of result dicts or a string description if the client isn't available.
    """
    try:
        # The tavily client API may differ between versions; try a few common names.
        if hasattr(tavily, "search"):
            results = tavily.search(query, max_results=max_results)
            return results
        elif hasattr(tavily, "query"):
            results = tavily.query(query, max_results=max_results)
            return results
        else:
            # fallback: call a generic 'run' if present
            if hasattr(tavily, "run"):
                return tavily.run(query)
            # if none of the above, return a placeholder
            return f"[Tavily client available but no known search method; query='{query}']"
    except Exception as e:
        return f"[Tavily search error: {str(e)}]"

def research_pipeline(state: ResearchState) -> ResearchState:
    """
    Main research node - decides memory shortcuts, whether to use web search,
    and produces a short summary (or placeholder).
    """
    global memory
    query = (state.get("query") or "").strip()
    if not query:
        state["pipeline"] = "No query provided."
        state["summary"] = "No summary available."
        return state

    # Memory shortcuts
    if re.search(r"\b(previous|last)\s*(que|question|query|sawal)\b", query.lower()):
        if memory["history"]:
            last_q = memory["history"][-1]["Q"]
            state["pipeline"] = "Retrieved from memory (history)."
            state["summary"] = f"Your previous question was: '{last_q}'"
        else:
            state["pipeline"] = "Memory empty."
            state["summary"] = "There is no previous question in memory."
        return state

    if "my name" in query.lower():
        name = memory["facts"].get("name", "I don’t know yet.")
        state["pipeline"] = "Retrieved from memory (facts)."
        state["summary"] = f"Your name is {name}."
        return state

    # Direct fact lookup
    facts = memory.get("facts", {})
    if query.lower() in (k.lower() for k in facts.keys()):
        matched = next((v for k, v in facts.items() if k.lower() == query.lower()), None)
        state["pipeline"] = f"Retrieved from memory: {matched}"
        state["summary"] = matched
        return state

    # Decide whether to use web search
    need_search = decide_search(query)

    model = genai.GenerativeModel("gemini-1.5-flash")
    try:
        if need_search:
            # Get web data
            tavily_results = tavily_search(query, max_results=5)
            # Combine into a short context for Gemini to summarize
            combined_info = f"Query: {query}\n\nWeb results (short): {json.dumps(tavily_results, default=str)[:4000]}\n\nSummarize the key findings in 3-5 bullet points."
            response = model.generate_content(combined_info)
            text_out = safe_extract_genai_text(response)
            state["pipeline"] = "Tavily + Gemini"
            state["summary"] = text_out
        else:
            # Use Gemini directly
            prompt = f"Query: {query}\nProvide a concise answer or short summary (3-5 lines)."
            response = model.generate_content(prompt)
            text_out = safe_extract_genai_text(response)
            state["pipeline"] = "Gemini Only"
            state["summary"] = text_out
    except Exception as e:
        traceback.print_exc()
        state["pipeline"] = f"Error during research: {str(e)}"
        state["summary"] = "An error occurred while fetching results."

    return state


Cell 8 — Build LangGraph StateGraph and compile

In [8]:
# Build the StateGraph
graph = StateGraph(ResearchState)

# Add nodes
graph.add_node("ClarificationAgent", clarification_agent)
graph.add_node("QueryGenerator", query_generator)
graph.add_node("ResearchPipeline", research_pipeline)

# Define flow
graph.set_entry_point("ClarificationAgent")
graph.add_edge("ClarificationAgent", "QueryGenerator")
graph.add_edge("QueryGenerator", "ResearchPipeline")

# Compile the graph into an app object
app = graph.compile()
print("✅ StateGraph compiled successfully.")


✅ StateGraph compiled successfully.


Cell 9 — Memory initialization and chat() function (single-turn)

In [9]:
# Initialize memory and provide a chat() function to process one input at a time
memory = {
    "facts": {},    # Persistent knowledge (like user name, facts)
    "history": []   # Conversation log
}

def extract_facts_with_gemini(text: str):
    """
    Use Gemini to extract personal facts in JSON-list format: [{"key":"...", "value":"..."}]
    Fallbacks are safe and non-fatal.
    """
    model = genai.GenerativeModel("gemini-1.5-flash")
    prompt = f"""
Extract any personal facts (name, age, location, role, company) from the following user sentence.
Return a JSON list of objects with "key" and "value". If none, return [].

Sentence: "{text}"
"""
    try:
        response = model.generate_content(prompt)
        text_out = safe_extract_genai_text(response)
        # try to parse
        facts = []
        try:
            facts = json.loads(text_out)
        except:
            # try to find JSON array substring
            m = re.search(r"\[.*\]", text_out, flags=re.DOTALL)
            if m:
                try:
                    facts = json.loads(m.group(0))
                except:
                    facts = []
        if not isinstance(facts, list):
            facts = []
        return facts
    except Exception:
        return []

def chat(user_input: str, remember_name_rule: bool = True):
    """
    Process a single user_input through the scope pipeline.
    Returns the final state dict.
    """
    global memory

    # 1) Quick custom name rule
    if remember_name_rule and re.search(r"\bmy name is\b", user_input.lower()):
        # extract the phrase after "my name is"
        try:
            name = user_input.lower().split("my name is", 1)[1].strip().split()[0]
            memory["facts"]["name"] = name.capitalize()
            print(f"✅ Stored name='{memory['facts']['name']}' in memory.")
        except Exception:
            pass
    else:
        # 1b) Use Gemini to detect facts
        try:
            facts_list = extract_facts_with_gemini(user_input)
            for f in facts_list:
                key = f.get("key", "").lower().strip()
                value = f.get("value", "").strip()
                if key and value:
                    memory["facts"][key] = value
                    print(f"✅ I'll remember your {key} = {value}")
        except Exception:
            pass

    # 2) Handle simple recall commands locally
    if user_input.lower().startswith("what is my"):
        key = user_input.lower().replace("what is my", "").strip()
        val = memory["facts"].get(key, "I don’t know yet.")
        print(f"Memory: {val}")
        return {"user_input": user_input, "clarification": "", "query": "", "pipeline": "recall", "summary": val}

    # 3) Create state and invoke the pipeline (LangGraph app)
    state: ResearchState = {
        "user_input": user_input,
        "clarification": "",
        "query": "",
        "summary": "",
        "pipeline": ""
    }

    try:
        state = app.invoke(state)
    except Exception as e:
        traceback.print_exc()
        state["pipeline"] = f"Graph invocation error: {str(e)}"
        state["summary"] = ""

    # 4) Show outputs
    print("\n### 🟢 Clarification Agent")
    print(state.get("clarification", ""))
    print("\n### 📌 Final Research Query")
    print(state.get("query", ""))
    print("\n### 🔎 Research Pipeline")
    print(state.get("pipeline", ""))
    print("\n### ✅ Final Summary")
    print(state.get("summary", ""))

    # 5) Save to memory/history
    memory["history"].append({
        "timestamp": time.time(),
        "Q": user_input,
        "clarification": state.get("clarification", ""),
        "query": state.get("query", ""),
        "pipeline": state.get("pipeline", ""),
        "A": state.get("summary", "")
    })

    return state


Cell 10 — continuous chatbot loop

In [11]:
# Continuous chatbot loop
print("🟢 OpenDeepResearcher Chatbot (type 'quit' or 'exit' to stop)\n")

while True:
    try:
        user_input = input("You: ").strip()
        if user_input.lower() in ["quit", "exit", "bye"]:
            print("👋 Goodbye! Session ended.")
            break

        # Process the user input through our chat() pipeline
        state = chat(user_input)

    except KeyboardInterrupt:
        print("\n👋 Interrupted. Goodbye!")
        break
    except Exception as e:
        print(f"⚠️ Error: {e}")


🟢 OpenDeepResearcher Chatbot (type 'quit' or 'exit' to stop)

You: what is my name
Memory: Priya,
You: my name is Nischay
✅ Stored name='Nischay' in memory.

### 🟢 Clarification Agent
This request is clear

### 📌 Final Research Query
my name is Nischay

### 🔎 Research Pipeline
Retrieved from memory (facts).

### ✅ Final Summary
Your name is Nischay.
You: where i am from

### 🟢 Clarification Agent
Could you provide more details about your question?

### 📌 Final Research Query
where i am from (needs clarification: Could you provide more details about your question?)

### 🔎 Research Pipeline
Gemini Only

### ✅ Final Summary
To answer "Where I am from," I need more context.  Are you asking about my origin as a large language model? My location? Or perhaps the location of the person asking the question?  Please specify your query for a relevant response.
You: where is my location

### 🟢 Clarification Agent
Could you provide more details about your question?

### 📌 Final Research Query
where

Traceback (most recent call last):
  File "/tmp/ipython-input-332683201.py", line 98, in research_pipeline
    response = model.generate_content(combined_info)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/google/generativeai/generative_models.py", line 331, in generate_content
    response = self._client.generate_content(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/google/ai/generativelanguage_v1beta/services/generative_service/client.py", line 835, in generate_content
    response = rpc(
               ^^^^
  File "/usr/local/lib/python3.12/dist-packages/google/api_core/gapic_v1/method.py", line 131, in __call__
    return wrapped_func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/google/api_core/retry/retry_unary.py", line 294, in retry_wrapped_func
    return retry_target(
           ^^^^^^^^^^^^^
  File "/usr/local/lib


### 🟢 Clarification Agent
Could you provide more details about your question?

### 📌 Final Research Query
what is ai (needs clarification: Could you provide more details about your question?)

### 🔎 Research Pipeline
Error during research: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 50
Please retry in 39.222301847s.

### ✅ Final Summary
An error occurred while fetching results.
You: what is ai


Traceback (most recent call last):
  File "/tmp/ipython-input-218545909.py", line 36, in clarification_agent
    response = model.generate_content(prompt)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/google/generativeai/generative_models.py", line 331, in generate_content
    response = self._client.generate_content(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/google/ai/generativelanguage_v1beta/services/generative_service/client.py", line 835, in generate_content
    response = rpc(
               ^^^^
  File "/usr/local/lib/python3.12/dist-packages/google/api_core/gapic_v1/method.py", line 131, in __call__
    return wrapped_func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/google/api_core/retry/retry_unary.py", line 294, in retry_wrapped_func
    return retry_target(
           ^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/


### 🟢 Clarification Agent
This request is clear

### 📌 Final Research Query
what is ai

### 🔎 Research Pipeline
Error during research: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 50
Please retry in 31.469971691s.

### ✅ Final Summary
An error occurred while fetching results.
You: quit
👋 Goodbye! Session ended.
