In [9]:
import os
import json
import re
from typing import Type, List
from pydantic import BaseModel, Field
from langchain.chat_models import ChatOpenAI
from langchain.agents import initialize_agent, AgentType
from langchain.tools import BaseTool, WikipediaQueryRun
from langchain.utilities import DuckDuckGoSearchAPIWrapper, WikipediaAPIWrapper
from langchain.document_loaders import WebBaseLoader
from langchain.schema import SystemMessage

# LLM for the agent (requires OPENAI_API_KEY in environment)
llm = ChatOpenAI(temperature=0.1, model_name="gpt-3.5-turbo-1106")

# Tool: Wikipedia search
wiki_tool = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper(top_k_results=3, doc_content_chars_max=4000))

# Tool: DuckDuckGo search that returns top links as JSON
class DuckDuckGoLinksArgs(BaseModel):
    query: str = Field(description="Search query to find relevant web pages")

class DuckDuckGoLinksTool(BaseTool):
    name = "DuckDuckGoLinksTool"
    description = "Search DuckDuckGo and return a JSON list of top result URLs."
    args_schema: Type[DuckDuckGoLinksArgs] = DuckDuckGoLinksArgs

    def _run(self, query: str) -> str:
        ddg = DuckDuckGoSearchAPIWrapper()
        urls: List[str] = []
        try:
            if hasattr(ddg, "results"):
                results = ddg.results(query, num_results=5)
                for r in (results or []):
                    if isinstance(r, dict) and r.get("link"):
                        urls.append(r["link"])
                    elif isinstance(r, str) and r.startswith("http"):
                        urls.append(r)
            else:
                text = ddg.run(query)
                urls = re.findall(r"https?://\S+", text)[:5]
        except Exception:
            # Fallback: try .run() and regex-extract URLs
            try:
                text = ddg.run(query)
                urls = re.findall(r"https?://\S+", text)[:5]
            except Exception:
                urls = []
        return json.dumps(urls[:5])

# Tool: Web scraping via WebBaseLoader
class WebScrapeArgs(BaseModel):
    urls: List[str] = Field(description="List of URLs to scrape text content from")

class WebPageScrapeTool(BaseTool):
    name = "WebPageScrapeTool"
    description = "Scrape and extract readable text content from webpages using provided URLs."
    args_schema: Type[WebScrapeArgs] = WebScrapeArgs

    def _run(self, urls: List[str]) -> str:
        contents: List[str] = []
        for url in urls:
            try:
                loader = WebBaseLoader([url])
                docs = loader.load()
                contents.append("\n\n".join(doc.page_content for doc in docs))
            except Exception as e:
                contents.append(f"[Skipped {url} due to error: {str(e)[:200]}]")
        return "\n\n".join(contents)

# Tool: Save results to a .txt file
class SaveTextArgs(BaseModel):
    text: str = Field(description="The text content to write into the file")
    filename: str = Field(description="Destination .txt filepath", default="files/research_result.txt")

class SaveTextTool(BaseTool):
    name = "SaveTextTool"
    description = "Save research results into a .txt file. Always call this after finishing research."
    args_schema: Type[SaveTextArgs] = SaveTextArgs

    def _run(self, text: str, filename: str = "files/research_result.txt") -> str:
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, "w", encoding="utf-8") as f:
            f.write(text)
        return f"Saved research to {filename}"

# Compose tools list
tools = [
    wiki_tool,
    DuckDuckGoLinksTool(),
    WebPageScrapeTool(),
    SaveTextTool(),
]

# System instructions to orchestrate the workflow
system_prompt: str = (
    "You are a research assistant. Given a user query, you should: "
    "1) Search Wikipedia and DuckDuckGo. "
    "2) From DuckDuckGoLinksTool, parse the JSON list of URLs and choose 1-3 relevant links. "
    "3) Use WebPageScrapeTool on those links to extract content. "
    "4) Summarize the findings concisely. "
    "5) Save the final summary to a .txt file via SaveTextTool with a descriptive filename like files/xz_backdoor_research.txt. "
    "Return the final answer and the saved path."
)

agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.OPENAI_FUNCTIONS,
    verbose=True,
    handle_parsing_errors=True,
    agent_kwargs={"system_message": SystemMessage(content=system_prompt)},
)

# Run the agent with the required query
query = "Research about the XZ backdoor"
result = agent.invoke(query)
result



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `Wikipedia` with `XZ backdoor`


[0m[36;1m[1;3mPage: XZ Utils backdoor
Summary: In February 2024, a malicious backdoor was introduced to the Linux build of the xz utility within the liblzma library in versions 5.6.0 and 5.6.1 by an account using the name "Jia Tan". The backdoor gives an attacker who possesses a specific Ed448 private key remote code execution through OpenSSH on the affected Linux system. The issue has been given the Common Vulnerabilities and Exposures number CVE-2024-3094 and has been assigned a CVSS score of 10.0, the highest possible score.
While xz is commonly present in most Linux distributions, at the time of discovery the backdoored version had not yet been widely deployed to production systems, but was present in development versions of major distributions. The backdoor was discovered by the software developer Andres Freund, who announced his findings on 29 March 2024.

Page: XZ Utils
Su

{'input': 'Research about the XZ backdoor',
 'output': 'I have saved the information about the XZ backdoor to a .txt file. You can download the file from the link below:\n[Download xz_backdoor_research.txt](sandbox:/files/xz_backdoor_research.txt)'}

In [10]:
# Persist the result to a specific file as required
output_text = None
if isinstance(result, dict) and "output" in result:
    output_text = result["output"]
else:
    output_text = str(result)

SaveTextTool()._run(text=output_text, filename="files/xz_backdoor_research.txt")


'Saved research to files/xz_backdoor_research.txt'