In [None]:
!pip install -q exa-py langchain langchain-openai python-dotenv

In [None]:
# Mount Google Drive at /content/drive; the env file loaded in the next cell
# is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from dotenv import load_dotenv
load_dotenv('/content/drive/MyDrive/env.txt')

import os, re
from typing import List, Dict, Any, Tuple

from exa_py import Exa

from langchain_core.tools import tool
from langchain_openai import ChatOpenAI
from langchain.agents import AgentExecutor, create_openai_functions_agent
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers.string import StrOutputParser

In [None]:
# Module-level Exa client used by the tool functions below.
# Indexing os.environ raises KeyError if EXA_API_KEY is not set.
exa = Exa(api_key=os.environ["EXA_API_KEY"])

# NOTE(review): the docstring below is presumably what `@tool` exposes to the
# LLM as the tool description, so treat it as prompt text rather than plain
# documentation — confirm against the LangChain docs before rewording it.
@tool
def search_and_contents(query: str):
    """Search for webpages based on the query and retrieve their contents."""
    # This combines two API endpoints: search and contents retrieval.
    # Requests 5 results with both `text` and `highlights` enabled; the Exa
    # response object is returned to the agent unmodified.
    return exa.search_and_contents(
        query, use_autoprompt=True, num_results=5, text=True, highlights=True
    )

# NOTE(review): as with the other tool, the docstring is presumably surfaced to
# the LLM by `@tool`, so it is prompt text — confirm before rewording it.
# This tool is defined here but is not included in `researcher_tools` below.
@tool
def find_similar_and_contents(url: str):
    """Search for webpages similar to a given URL and retrieve their contents.
    The url passed in should be a URL returned from `search_and_contents`.
    """
    # This combines two API endpoints: find similar and contents retrieval.
    # Requests 5 results with both `text` and `highlights` enabled.
    return exa.find_similar_and_contents(url, num_results=5, text=True, highlights=True)

# Initialize the LLMs
# `llm` (temperature 0) backs the researcher agent, the evaluator/refiner chain
# and the writer chain; `planner_llm` is used only by the planning chain.
llm = ChatOpenAI(model="gpt-4.1", temperature=0)
planner_llm = ChatOpenAI(model="o3-mini")

# --- 2. Define Tools ---
# Only `search_and_contents` is handed to the researcher agent;
# `find_similar_and_contents` is defined above but not registered here.
researcher_tools = [search_and_contents]

In [None]:
# --- 3. Create Planning Agent ---
# The planner is asked for a numbered list because `parse_research_plan` looks
# for numbered items first when splitting the plan into research steps.
# NOTE: adjacent string literals are concatenated with nothing in between, so
# every fragment must carry its own trailing space or newline.
planner_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert research planner. Create a detailed, step-by-step research plan for the given topic, suitable for an academic paper. "
            "Output the plan as a numbered list of specific, concise research questions or objectives.\n\n"
            "Example:\nTopic: Impact of AI on Healthcare\nPlan:\n1. Current AI applications in medical diagnosis?\n2. AI's role in drug discovery?\n3. Ethical considerations of AI in healthcare?\n4. Future trends of AI in healthcare?\n5. AI's impact on patient outcomes and costs."
        ),
        (
            "human",
            "Create a research plan for the topic: {topic}\n\nResearch Plan:"
        ),
    ]
)
# Use the designated planner LLM; StrOutputParser reduces the reply to plain text.
planner_chain = planner_prompt | planner_llm | StrOutputParser()

# --- 4. Create Researcher Agent ---
# The researcher answers ONE research question per invocation and must end its
# answer with 'Source URL: ...' lines, which `extract_content_and_urls` later
# separates from the synthesized text.
# NOTE: adjacent string literals are concatenated with nothing in between, so
# every fragment must carry its own trailing space or newline.
researcher_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a diligent research assistant. Your goal is to use the search tool to find comprehensive information on the *specific research question* provided. "
            "Focus on factual information, key arguments, and evidence relevant *only* to this question. "
            "Synthesize the findings concisely.\n"
            "IMPORTANT: After synthesizing the findings, list the URLs of the primary sources you consulted to gather this information. Each URL should be on a new line, prefixed with 'Source URL: '. "
            "Example:\n\nSynthesized findings go here...\n\nSource URL: https://example.com/article1\nSource URL: https://anothersource.org/data\n"
            "Do not add introductions like 'Here's what I found...'. Just provide the synthesis and the source list."
        ),
        ("human", "Research Question: {research_question}\nProvide detailed search results and list source URLs."),
        # Slot where the agent executor injects intermediate tool calls/results.
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)
# Use the main LLM for research synthesis
researcher_agent_runnable = create_openai_functions_agent(llm, researcher_tools, researcher_prompt)
researcher_agent_executor = AgentExecutor(agent=researcher_agent_runnable, tools=researcher_tools, verbose=True)

# --- 5. Create Evaluator & Refiner Agent ---
# Besides assessing the aggregated findings, this prompt asks for up to three
# follow-up research questions. The 'Evaluation:' and 'Further Research Needed:'
# headings it requests are the markers that
# `parse_evaluation_and_refinement_tasks` splits the reply on.
evaluator_refiner_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert academic evaluator. Your tasks are:\n"
            "1. Critically assess the provided aggregated research findings based on quality, relevance, depth, and coherence for the original topic. Focus on the synthesized text content, ignoring 'Source URL:' lines for evaluation.\n"
            "2. Identify overall strengths, weaknesses, sufficiency, and specific gaps.\n"
            "3. **If significant gaps or weaknesses are identified** that require more focused research, formulate 1-3 specific, new research questions to address *only* these weaknesses. These questions should guide a targeted follow-up search.\n"
            "4. Structure your output clearly:\n"
            "   - Start with 'Evaluation:' followed by your assessment (strengths, weaknesses, gaps).\n"
            "   - Then, add a section 'Further Research Needed:'. List the 1-3 new research questions here (numbered), or state 'None' if the initial research is sufficient.\n\n"
            "Example Output (if refinement needed):\n"
            "Evaluation:\n[Your detailed evaluation text here, discussing strengths, weaknesses like lack of detail on topic X, insufficient evidence for claim Y]...\n\n"
            "Further Research Needed:\n1. What specific quantitative data exists on the impact of X on Y?\n2. Are there recent case studies challenging the findings on Z?\n\n"
            "Example Output (if sufficient):\n"
            "Evaluation:\n[Your detailed evaluation text, concluding the research is comprehensive and addresses the core areas adequately]...\n\n"
            "Further Research Needed:\nNone"
        ),
        (
            "human",
            "Original Research Topic: {topic}\n\n"
            "Aggregated Research Findings:\n```\n{research_results}\n```\n\n"
            "Your Evaluation and Refinement Plan:"
        ),
    ]
)
# Use the main LLM for evaluation and refinement planning; StrOutputParser
# reduces the reply to plain text for the parsing helper.
evaluator_refiner_chain = evaluator_refiner_prompt | llm | StrOutputParser()

# --- 6. Create Writer Agent ---
# The writer receives the evaluator's assessment, the combined (initial plus
# refinement) research text and the list of collected source URLs, and is
# asked for a Markdown draft ending in a 'References' section.
# NOTE: adjacent string literals are concatenated with nothing in between, so
# every fragment must carry its own trailing space or newline.
writer_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an academic writer. Synthesize the **final, potentially expanded, research findings** into a coherent draft section for an academic paper (e.g., literature review, key findings) based on the original topic and the provided evaluation. "
            "The research findings may include initial results and potentially follow-up research addressing identified weaknesses.\n"
            "Use a formal, objective tone and structure the information logically in Markdown format.\n"
            "You have been provided with:\n"
            "   - An evaluation of the *initial* research.\n"
            "   - The *combined* research findings (initial + any refinement).\n"
            "   - A list of all source URLs consulted across all research stages.\n"
            "Your primary task is to synthesize the **combined research findings**, ensuring the final text addresses points raised in the evaluation where possible.\n"
            "Attempt to incorporate references where appropriate (e.g., '[citation needed]' or '(Source: [URL snippet])').\n"
            "Conclude the report with a 'References' section listing ALL unique source URLs provided.\n"
            "Format the 'References' section clearly (e.g., a bulleted list).\n"
            "Start the output directly with the report section."
        ),
        (
            "human",
            "Original Research Topic: {topic}\n\n"
            "Evaluation of Initial Research:\n```\n{evaluation_text}\n```\n\n"
            "Combined Research Findings to Synthesize:\n```\n{combined_research_findings}\n```\n\n"
            "List of All Source URLs Consulted:\n{source_urls}\n\n"
            "Draft Academic Paper Section (in Markdown, including References section):"
        ),
    ]
)
# Use the main LLM for writing; StrOutputParser reduces the reply to plain text.
writer_chain = writer_prompt | llm | StrOutputParser()


# --- Helper Function to Parse Plan ---
def parse_research_plan(plan_string: str) -> list[str]:
    """Split an LLM-produced plan into a list of individual research steps.

    Formats are tried in order, and the first one that yields items wins:

    1. Numbered lines, ``1. item`` or ``1) item``.
    2. Bulleted lines starting with ``*``, ``-``, ``+`` or ``•``.
    3. Plain non-empty lines longer than 5 characters.
    4. The whole text as a single step (a warning is printed).

    Args:
        plan_string: Raw plan text.

    Returns:
        The step texts with list markers and surrounding whitespace removed;
        an empty list when the input is empty or whitespace-only.
    """
    text = plan_string.strip()
    if not text:
        return []

    # Markers and their text must share a line ([ \t], not \s) so an empty
    # item such as "1. " cannot swallow the following line, and `\S` rejects
    # items that have no content at all.
    list_patterns = (
        r"^[ \t]*\d+[.)][ \t]+(\S.*)$",      # 1. item / 1) item
        r"^[ \t]*[*\-+•][ \t]+(\S.*)$",      # * item / - item / + item / • item
    )
    for pattern in list_patterns:
        items = [item.strip() for item in re.findall(pattern, text, re.MULTILINE)]
        if items:
            return items

    # No list markup: fall back to one step per reasonably long line.
    items = [line.strip() for line in text.split("\n") if len(line.strip()) > 5]
    if items:
        return items

    # Still nothing usable, but the input was not empty: keep it as one step.
    print("Warning: Could not parse plan into distinct steps. Treating the whole output as one step.")
    return [text]

# --- Helper Function to Extract URLs and Content ---
def extract_content_and_urls(text: str) -> Tuple[str, List[str]]:
    """Separate synthesized content from its 'Source URL:' lines.

    A line counts as a source line when, after optional leading whitespace,
    list bullets (``-``, ``*``, ``+``, ``•``), quote markers or bold
    asterisks, it starts with ``Source URL:`` (case-insensitive). Source
    lines are removed from the content; those with nothing after the label
    are dropped without adding an empty URL.

    Args:
        text: Raw researcher output.

    Returns:
        A ``(content, urls)`` tuple: the remaining lines joined with newlines
        and stripped, and the URLs in order of appearance (duplicates kept).
    """
    # The leading class tolerates Markdown decoration such as
    # "- Source URL: ..." or "**Source URL:** ..."; `\**` eats a closing bold.
    source_line = re.compile(r"^[\s*+•>\-]*Source URL:\**\s*(.*)", re.IGNORECASE)

    urls: List[str] = []
    content_lines: List[str] = []
    for line in text.splitlines():
        found = source_line.match(line)
        if found is None:
            content_lines.append(line)
            continue
        url = found.group(1).strip()
        if url:  # a bare "Source URL:" label carries no information
            urls.append(url)

    return "\n".join(content_lines).strip(), urls


# --- Helper Function to Parse Evaluation and Refinement Tasks ---
def parse_evaluation_and_refinement_tasks(eval_output: str) -> Tuple[str, List[str]]:
    """Split the evaluator's reply into evaluation text and follow-up questions.

    The reply is expected to contain an ``Evaluation:`` heading followed by a
    ``Further Research Needed:`` heading (matched case-insensitively; Markdown
    bold/heading decoration around them is tolerated).

    Args:
        eval_output: Raw text returned by the evaluator/refiner chain.

    Returns:
        ``(evaluation_text, refinement_tasks)``. ``evaluation_text`` is the
        text between the two headings, or everything after ``Evaluation:`` if
        the second heading is missing, or the whole reply if neither heading
        is found. ``refinement_tasks`` is empty when the follow-up section is
        missing, empty, or declares "None" (including variants such as
        "None." or "None needed").
    """
    flags = re.DOTALL | re.IGNORECASE
    tasks_heading = r"Further Research Needed:"

    # `\*{0,2}` swallows the closing "**" of a bold heading; `[*#\s]*` swallows
    # the decoration that opens the next heading so it does not end up in the
    # evaluation text.
    eval_match = re.search(
        r"Evaluation:\*{0,2}(.*?)[*#\s]*" + tasks_heading, eval_output, flags
    )
    if eval_match is None:
        # "Further Research Needed:" is missing: take everything after "Evaluation:".
        eval_match = re.search(r"Evaluation:\*{0,2}(.*)", eval_output, flags)
    evaluation_text = eval_match.group(1).strip() if eval_match else eval_output.strip()
    if not evaluation_text and eval_output:
        evaluation_text = eval_output  # never return an empty evaluation for a non-empty reply

    refinement_tasks: List[str] = []
    tasks_match = re.search(tasks_heading + r"\*{0,2}(.*)", eval_output, flags)
    tasks_section = tasks_match.group(1).strip() if tasks_match else ""
    if not tasks_section:
        return evaluation_text, refinement_tasks

    # The section means "no follow-up" when its first line starts with "none"
    # and no list items follow; an exact comparison with "none" would turn
    # "None." into a research task.
    first_line = tasks_section.splitlines()[0]
    starts_with_none = re.match(r"[\W_]*none\b", first_line, re.IGNORECASE) is not None
    has_list_items = re.search(
        r"^\s*(?:\d+[.)]|[*\-+•])\s+\S", tasks_section, re.MULTILINE
    ) is not None
    if starts_with_none and not has_list_items:
        return evaluation_text, refinement_tasks

    # Reuse the plan parser so numbered/bulleted questions are split the same way.
    refinement_tasks = parse_research_plan(tasks_section)
    if not refinement_tasks:
        # Simple line split fallback if the plan parser yields nothing.
        refinement_tasks = [line.strip() for line in tasks_section.splitlines() if line.strip()]

    # Drop list items that merely say "None" (e.g. "1. None" or "- None").
    none_only = re.compile(r"[\W_]*none[\W_]*", re.IGNORECASE)
    refinement_tasks = [task for task in refinement_tasks if not none_only.fullmatch(task)]

    return evaluation_text, refinement_tasks


# --- Helper Function to Sanitize Filename ---
def sanitize_filename(topic: str) -> str:
    """Turn a topic string into a Markdown filename.

    Surrounding whitespace is trimmed, spaces become underscores, the
    characters ``\\ / * ? : " < > |`` are removed and the stem is capped at
    100 characters before ``.md`` is appended. An empty stem yields
    ``research_report.md``.
    """
    stem_limit = 100
    drop_forbidden = str.maketrans("", "", '\\/*?:"<>|')

    stem = topic.strip().replace(" ", "_").translate(drop_forbidden)
    stem = stem[:stem_limit]  # slicing is a no-op for shorter stems

    if not stem:
        return "research_report.md"
    return f"{stem}.md"

# --- Function to Write Output to Markdown File ---
def write_to_markdown(filename: str, topic: str, evaluation_text: str, report_with_refs: str):
    """Save the evaluation and the final report to a UTF-8 Markdown file.

    The file holds a title with the topic, the stripped evaluation inside a
    fenced block, a horizontal rule and then the stripped report. Failures
    are reported with ``print`` rather than raised; nothing is returned.
    """
    try:
        with open(filename, "w", encoding="utf-8") as out:
            emit = out.write
            # Title, then the evaluator's assessment inside a code fence.
            emit(f"# Research Report: {topic}\n\n")
            emit("## Evaluation of Initial Research\n\n" "```\n")
            emit(evaluation_text.strip())
            emit("\n```\n\n" "---\n\n")
            # The writer's draft, which already carries its References section.
            emit("## Final Report Draft\n\n")
            emit(report_with_refs.strip())
        print(f"\nEvaluation and Report saved to: {filename}")
    except IOError as io_err:
        print(f"\nError writing to file {filename}: {io_err}")
    except Exception as other_err:
        print(f"\nAn unexpected error occurred during file writing: {other_err}")

# --- 7. Orchestrate the Pipeline ---
def _run_research_batch(questions, *, unit, short_unit, section, error_unit):
    """Run the researcher agent on each question and collect its write-ups.

    Args:
        questions: Research questions, one agent invocation each.
        unit: Label used in the progress line (e.g. "Step").
        short_unit: Label used in the content/sources lines (e.g. "Task").
        section: Prefix of the section header stored with each write-up
            (e.g. "Initial" -> "--- Initial Research on: ... ---").
        error_unit: Wording used in the error line (e.g. "research step").

    Returns:
        ``(sections, urls, failures)``: one text section per question (failed
        questions get a "FAILED: ..." section), the set of source URLs found,
        and the number of questions whose research raised an exception.
    """
    sections = []
    urls_found = set()
    failures = 0
    total = len(questions)

    for number, question in enumerate(questions, start=1):
        print(f"\n--- Researching {unit} {number}/{total}: {question} ---")
        try:
            result = researcher_agent_executor.invoke({"research_question": question})
            content, urls = extract_content_and_urls(result['output'])
        except Exception as e:
            # Best effort: record the failure and carry on with the next question.
            print(f"Error during {error_unit} {number} ('{question}'): {e}")
            sections.append(f"--- {section} Research on: {question} ---\nFAILED: {e}\n")
            failures += 1
            continue

        print(f"\n--- Synthesized Content for {short_unit} {number} ---:\n{content}")
        print(f"--- Sources for {short_unit} {number} ---:")
        if urls:
            for url in urls:
                print(f"- {url}")
            urls_found.update(urls)
        else:
            print("- No URLs listed.")

        # Store content with a header so the evaluator/writer can tell sections apart.
        sections.append(f"--- {section} Research on: {question} ---\n{content}\n")

    return sections, urls_found, failures


def run_research_pipeline(topic: str) -> str:
    """Run the full pipeline for one topic.

    Stages: Plan -> Research -> Evaluate & Refine -> [optional refinement
    research] -> Write -> Save to a Markdown file named after the topic.

    Stage outcomes are tracked with explicit flags rather than by searching
    the generated text for words like "failed:" or "skipped", so a report or
    evaluation that happens to contain those words is still written and
    returned.

    Args:
        topic: The research topic.

    Returns:
        The final report on success; otherwise a short message naming the
        stage that failed (planning, initial research, evaluation) or a
        generic error message.
    """
    print(f"--- Starting Research for Topic: {topic} ---")

    all_source_urls = set()  # unique URLs across every research stage

    # == Step 1: Planning Agent ==
    print("\n--- Running Planning Agent ---")
    try:
        research_plan_str = planner_chain.invoke({"topic": topic})
        print(f"\n--- Generated Research Plan ---:\n{research_plan_str}")
        plan_steps = parse_research_plan(research_plan_str)
    except Exception as e:
        print(f"Error during planning phase: {e}")
        return f"Planning phase failed: {e}"
    if not plan_steps:
        print("Error: Planning agent failed to generate a usable plan.")
        return "Planning phase failed."
    print(f"\n--- Parsed Plan Steps ({len(plan_steps)}) ---:")
    for number, step in enumerate(plan_steps, start=1):
        print(f"{number}. {step}")

    # == Step 2: Initial Researcher Agent Execution ==
    print("\n--- Running Initial Research Phase ---")
    research_sections, urls, failed_steps = _run_research_batch(
        plan_steps,
        unit="Step",
        short_unit="Step",
        section="Initial",
        error_unit="research step",
    )
    all_source_urls.update(urls)

    # Failed steps still leave a "FAILED" section behind, so an emptiness check
    # on the aggregated text can never fire; count successes instead.
    if failed_steps == len(plan_steps):
        print("Error: No initial research content was gathered.")
        return "Initial research phase failed (no content)."
    if failed_steps:
        print("Warning: Some initial research steps failed.")

    initial_research_text = "\n".join(research_sections)
    print("\n--- Initial Research Content Aggregated ---")
    print(f"--- Total Unique Source URLs after initial phase: {len(all_source_urls)} ---")

    # == Step 3: Evaluator & Refiner Agent Execution ==
    print("\n--- Running Evaluator & Refiner Agent ---")
    evaluation_failed = False
    try:
        eval_refine_output = evaluator_refiner_chain.invoke({
            "topic": topic,
            "research_results": initial_research_text,  # initial content only
        })
        initial_evaluation_text, refinement_tasks = parse_evaluation_and_refinement_tasks(eval_refine_output)
    except Exception as e:
        print(f"Error during evaluation/refinement phase: {e}")
        # Keep the error as the evaluation text and skip refinement.
        initial_evaluation_text = f"Evaluation phase failed: {e}"
        refinement_tasks = []
        evaluation_failed = True
    else:
        print(f"\n--- Evaluator Output ---:\n{initial_evaluation_text}")
        if refinement_tasks:
            print("\n--- Refinement Tasks Identified ---:")
            for number, task in enumerate(refinement_tasks, start=1):
                print(f"{number}. {task}")
        else:
            print("\n--- No Refinement Tasks Identified ---")

    # == Step 4: Optional Refinement Research Execution ==
    if refinement_tasks:
        print("\n--- Running Refinement Research Phase ---")
        refined_sections, urls, failed_tasks = _run_research_batch(
            refinement_tasks,
            unit="Refinement Task",
            short_unit="Task",
            section="Refined",
            error_unit="refinement research task",
        )
        all_source_urls.update(urls)
        if failed_tasks:
            print("Warning: Some refinement research steps failed.")

        # Append the refined results after the initial ones for the writer.
        research_sections.extend(refined_sections)
        print("\n--- Refinement Research Content Aggregated ---")
        print(f"--- Total Unique Source URLs after refinement: {len(all_source_urls)} ---")

    # Combine all research content for the writer
    final_combined_research = "\n".join(research_sections)

    # == Step 5: Writer Agent Execution ==
    evaluation_ok = bool(initial_evaluation_text) and not evaluation_failed
    final_report = None
    report_ready = False
    if evaluation_ok:
        print("\n--- Running Writer Agent ---")
        url_list_str = "\n".join(f"- {url}" for url in sorted(all_source_urls))
        if not url_list_str:
            url_list_str = "No source URLs were collected."

        try:
            final_report = writer_chain.invoke({
                "topic": topic,
                "evaluation_text": initial_evaluation_text,  # the evaluator's assessment
                "combined_research_findings": final_combined_research,  # all content
                "source_urls": url_list_str,
            })
        except Exception as e:
            print(f"Error during writing phase: {e}")
        else:
            print(f"\n--- Writer Output (Final Report Draft) ---:\n{final_report}")
            report_ready = bool(final_report)  # an empty reply is not a report
    else:
        print("\n--- Skipping Writer Agent due to Evaluation Failure ---")

    # == Step 6: Write to Markdown File ==
    if report_ready:
        md_filename = sanitize_filename(topic)
        write_to_markdown(md_filename, topic, initial_evaluation_text, final_report)
    else:
        print("\n--- Skipping file writing due to errors or skipped steps ---")

    print("\n--- Research Pipeline Complete ---")
    # Return the final report content or the most relevant error message.
    if report_ready:
        return final_report
    if evaluation_failed:
        return initial_evaluation_text
    return "Pipeline encountered an error or was skipped."

In [None]:
# Run the whole pipeline end to end for a sample topic. Progress is printed as
# it goes; the call returns either the final report or an error message.
run_research_pipeline("Compare the recent marketing strategy between Coca Cola and Pepsi")