In [1]:
import os
from datetime import datetime
from typing import Type
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

from langchain.tools import BaseTool
from langchain.agents import initialize_agent, AgentType
from langchain.chat_models import ChatOpenAI
from pydantic import BaseModel, Field
import wikipedia
from duckduckgo_search import DDGS

In [2]:
class WikipediaTool(BaseTool):
    """LangChain tool that looks a topic up on Wikipedia and returns a short text digest.

    The digest contains the title, URL and a three-sentence summary of the
    best usable search hit, followed by the titles of the remaining hits.
    """

    # Field overrides carry explicit annotations: pydantic-v2-based LangChain
    # releases refuse un-annotated overrides of BaseTool fields, and annotated
    # overrides work on older releases as well.
    name: str = "wikipedia_search"
    description: str = "Search Wikipedia for information on a given topic. Input should be a search query string."

    def _run(self, query: str) -> str:
        """Search Wikipedia for ``query`` and format the best hit as text.

        Args:
            query: Free-text search string.

        Returns:
            A formatted digest on success, a "No Wikipedia results found"
            message when the search is empty, or an "Error searching
            Wikipedia" message if the lookup fails. Never raises.
        """
        try:
            # Search for pages
            search_results = wikipedia.search(query, results=3)
            if not search_results:
                return f"No Wikipedia results found for: {query}"

            last_error = None
            for index, title in enumerate(search_results):
                try:
                    # The title comes straight from the search API, so
                    # auto-suggestion is disabled: leaving it on lets the
                    # library swap the title for a "suggested" one and
                    # resolve to a different (or non-existent) page.
                    summary = wikipedia.summary(title, sentences=3, auto_suggest=False)
                    page = wikipedia.page(title, auto_suggest=False)
                except (
                    wikipedia.exceptions.DisambiguationError,
                    wikipedia.exceptions.PageError,
                ) as e:
                    # This hit is a disambiguation page or cannot be loaded;
                    # fall through to the next search hit instead of failing.
                    last_error = e
                    continue

                # Every hit other than the one being reported.
                related = search_results[:index] + search_results[index + 1:]

                result = f"Wikipedia Result for '{query}':\n"
                result += f"Title: {page.title}\n"
                result += f"URL: {page.url}\n\n"
                result += f"Summary: {summary}\n\n"
                result += f"Other related pages: {', '.join(related)}"

                return result

            # None of the hits could be resolved to a concrete article.
            return f"Error searching Wikipedia: {str(last_error)}"
        except Exception as e:
            # Tools report failures as text so the agent can react to them.
            return f"Error searching Wikipedia: {str(e)}"

    async def _arun(self, query: str) -> str:
        """Async entry point; runs the synchronous implementation directly."""
        return self._run(query)

wikipedia_tool = WikipediaTool()

In [3]:
class DuckDuckGoTool(BaseTool):
    """LangChain tool that runs a DuckDuckGo text search and lists the top hits."""

    # Field overrides carry explicit annotations: pydantic-v2-based LangChain
    # releases refuse un-annotated overrides of BaseTool fields, and annotated
    # overrides work on older releases as well.
    name: str = "duckduckgo_search"
    description: str = "Search the web using DuckDuckGo for current information. Input should be a search query string."

    def _run(self, query: str) -> str:
        """Search DuckDuckGo for ``query`` and format up to five hits as text.

        Args:
            query: Free-text search string.

        Returns:
            A numbered list of hits (title, URL, snippet), a "No DuckDuckGo
            results found" message when the search is empty, or an "Error
            searching DuckDuckGo" message if the search fails. Never raises.
        """
        try:
            with DDGS() as ddgs:
                # `or []` guards against a client that hands back None.
                results = list(ddgs.text(query, max_results=5) or [])

            if not results:
                return f"No DuckDuckGo results found for: {query}"

            # .get() keeps one hit with a missing field from discarding the
            # whole result list via the broad except below.
            entries = [
                f"{i}. {item.get('title', '')}\n"
                f"   URL: {item.get('href', '')}\n"
                f"   Snippet: {item.get('body', '')}\n\n"
                for i, item in enumerate(results, 1)
            ]

            return f"DuckDuckGo Search Results for '{query}':\n\n" + "".join(entries)
        except Exception as e:
            # Tools report failures as text so the agent can react to them.
            return f"Error searching DuckDuckGo: {str(e)}"

    async def _arun(self, query: str) -> str:
        """Async entry point; runs the synchronous implementation directly."""
        return self._run(query)

duckduckgo_tool = DuckDuckGoTool()

In [4]:
class WebScrapingTool(BaseTool):
    """LangChain tool that downloads a web page and returns its visible text.

    Script and style elements are dropped, whitespace is collapsed, and the
    text is cut to ``max_chars`` characters.
    """

    # Field overrides carry explicit annotations: pydantic-v2-based LangChain
    # releases refuse un-annotated overrides of BaseTool fields, and annotated
    # overrides work on older releases as well.
    name: str = "web_scraper"
    description: str = "Scrapes content from a website URL and extracts text. Input should be a valid URL string."
    # Upper bound on returned text, kept small so one page cannot flood the
    # agent's context. Overridable per instance: WebScrapingTool(max_chars=...).
    max_chars: int = 2000

    def _run(self, url: str) -> str:
        """Fetch ``url`` and return its text content.

        Args:
            url: Address of the page to fetch.

        Returns:
            "Content from <url>:" followed by the extracted text (with "..."
            appended when truncated), or an "Error scraping <url>" message if
            the request or parsing fails. Never raises.
        """
        try:
            # Agent-produced input frequently carries stray whitespace.
            url = url.strip()

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove script and style elements
            for script in soup(["script", "style"]):
                script.decompose()

            # Get text content
            text = soup.get_text()

            # Clean up text: trim each line, split on double spaces, and join
            # the non-empty pieces with single spaces.
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = ' '.join(chunk for chunk in chunks if chunk)

            # Truncate if too long
            if len(text) > self.max_chars:
                text = text[:self.max_chars] + "..."

            return f"Content from {url}:\n\n{text}"

        except Exception as e:
            # Tools report failures as text so the agent can react to them.
            return f"Error scraping {url}: {str(e)}"

    async def _arun(self, url: str) -> str:
        """Async entry point; runs the synchronous implementation directly."""
        return self._run(url)

web_scraper = WebScrapingTool()

In [5]:
# Argument schema for the file-saving tool (used as FileSaveTool.args_schema).
# Plain comments are used instead of a class docstring so the generated
# pydantic schema stays exactly as it was.
class FileSaveInput(BaseModel):
    # Target file name; the description asks for a .txt suffix.
    filename: str = Field(description="Name of the file to save (should end with .txt)")
    # Text body to be written to the file.
    content: str = Field(description="Content to save to the file")

class FileSaveTool(BaseTool):
    """LangChain tool that writes research text to a .txt file.

    Every file starts with a "Research Report" header carrying the generation
    timestamp. Writes are confined to the current working directory tree.
    """

    # Field overrides carry explicit annotations: pydantic-v2-based LangChain
    # releases refuse un-annotated overrides of BaseTool fields, and annotated
    # overrides work on older releases as well.
    name: str = "file_saver"
    description: str = "Saves research content to a .txt file"
    args_schema: Type[BaseModel] = FileSaveInput

    def _run(self, filename: str, content: str) -> str:
        """Write ``content`` (prefixed with a timestamped header) to ``filename``.

        Args:
            filename: File name, or a path relative to the working directory.
                ".txt" is appended when the name has no .txt suffix.
            content: Text to store after the header.

        Returns:
            A success message naming the file, or an "Error saving file"
            message if the path is rejected or the write fails. Never raises.
        """
        try:
            # Ensure filename ends with .txt (case-insensitively, so that
            # "REPORT.TXT" does not become "REPORT.TXT.txt").
            if not filename.lower().endswith('.txt'):
                filename += '.txt'

            # The file name is chosen by the LLM, which may be steered by
            # scraped web content. Refuse anything that resolves outside the
            # working directory (absolute paths, "../" traversal, symlinks).
            base_dir = os.path.realpath(os.getcwd())
            target = os.path.realpath(os.path.join(base_dir, filename))
            if os.path.commonpath([base_dir, target]) != base_dir:
                return f"Error saving file {filename}: path is outside the working directory"

            # Add timestamp to content
            timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            full_content = f"Research Report - Generated on {timestamp}\n"
            full_content += "=" * 50 + "\n\n"
            full_content += content

            with open(target, 'w', encoding='utf-8') as f:
                f.write(full_content)

            return f"Successfully saved research to {filename}"
        except Exception as e:
            # Tools report failures as text so the agent can react to them.
            return f"Error saving file {filename}: {str(e)}"

    async def _arun(self, filename: str, content: str) -> str:
        """Async entry point; runs the synchronous implementation directly."""
        return self._run(filename, content)

file_saver = FileSaveTool()

In [7]:
# Tools the agent may call, in the order they are presented to the model.
tools = [wikipedia_tool, duckduckgo_tool, web_scraper, file_saver]

# Load variables from a local .env file into os.environ. Without this call the
# key stored in .env is never seen on a fresh kernel (Restart & Run All).
# Variables already set in the environment are left untouched.
load_dotenv()

openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    # Fail here with an actionable message rather than deep inside the client.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment or add it to a .env file."
    )

# Low temperature keeps the tool-calling output close to deterministic.
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0.1,
    api_key=openai_api_key,
)

# Structured-chat agent: required because file_saver takes two arguments.
# max_iterations bounds the number of think/act rounds per request.
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    max_iterations=10
)

In [8]:
# Research brief handed to the agent: which sources to consult, in what order,
# and where to store the findings.
prompt = (
    "Research about the XZ backdoor. Start with Wikipedia, then search DuckDuckGo "
    "for recent information, scrape relevant websites, and save all findings to a "
    "file called 'xz_backdoor_research.txt'"
)

# Run the agent once on the brief and keep whatever it returns.
result = agent.invoke(prompt)

print(f"result: {result}")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I will start by searching for information about the XZ backdoor on Wikipedia. After that, I will look for recent information on DuckDuckGo, scrape relevant websites for more details, and finally save all the findings to a file.

Action:
```
{
  "action": "wikipedia_search",
  "action_input": {"query": "XZ backdoor"}
}
```[0m
Observation: [36;1m[1;3mWikipedia Result for 'XZ backdoor':
Title: XZ Utils backdoor
URL: https://en.wikipedia.org/wiki/XZ_Utils_backdoor

Summary: In February 2024, a malicious backdoor was introduced to the Linux build of the xz utility within the liblzma library in versions 5.6.0 and 5.6.1 by an account using the name "Jia Tan". The backdoor gives an attacker who possesses a specific Ed448 private key remote code execution through OpenSSH on the affected Linux system. The issue has been given the Common Vulnerabilities and Exposures number CVE-2024-3094 and has been assigned a CVSS score of

  with DDGS() as ddgs:


[32;1m[1;3mAction:
```
{
  "action": "file_saver",
  "action_input": {
    "filename": "xz_backdoor_research.txt",
    "content": "Wikipedia Result for 'XZ backdoor':\n\nIn February 2024, a malicious backdoor was introduced to the Linux build of the xz utility within the liblzma library in versions 5.6.0 and 5.6.1 by an account using the name 'Jia Tan'. The backdoor gives an attacker who possesses a specific Ed448 private key remote code execution through OpenSSH on the affected Linux system. The issue has been given the Common Vulnerabilities and Exposures number CVE-2024-3094 and has been assigned a CVSS score of 10.0, the highest possible score.\n\nDuckDuckGo Search Results for 'XZ backdoor recent information':\n\n1. 知名压缩软件 xz 被发现有后门，影响有多大？如何应对？ - 知乎\n   URL: https://www.zhihu.com/question/650826484\n   Snippet: 2. xz-utils项目的原维护者Lasse Collin (L arhzu),有着定期进行internetbreaks的习惯，而且最近正在进行，导致这些变动他并没有review的机会，即使到现在也没能联系上他本人。 这可能也是 …\n\n2. xz后缀压缩包如何打开? - 百度知道\n   URL: https://zhidao.bai