In [1]:

import dotenv
from bs4 import BeautifulSoup
from langchain.agents import initialize_agent, AgentType
from langchain.document_loaders import WikipediaLoader
from langchain.schema import SystemMessage
from langchain.tools import BaseTool
from pydantic import Field
from pydantic.main import BaseModel

dotenv.load_dotenv()

True

In [2]:
from duckduckgo_search import DDGS
assignment_query = "Research about the XZ backdoor"
# ddg = DDGS()
# try:
#     result = ddg.text(query)
#     print(result)
# except Exception as e:
#     print("Error", e)

In [3]:
from langchain.tools import BaseTool
class WikipediaSearchToolArgsSchema(BaseModel):
    query: str = Field("The query for information")

class WikipediaSearchTool(BaseTool):
    name:str = "WikipediaSearchTool"
    description:str = """
    Search Wikipedia, get 3 important keywords from it, and returns as python array of keywords
    """
    def _run(self, query, **kwargs):
        wiki = WikipediaLoader(
            query=query,
        )
        return wiki

In [4]:
from typing import Type, TypedDict


class DuckDuckGoToolArgsSchema(BaseModel):
    queries: list[str] = Field(description="The queries search urls")

class DuckDuckGoSearchTool(BaseTool):
    name:str = "DuckDuckGoSearchTool"
    description:str = """
    This tool send queries to Duck Duck Go Search engine to get link associated with the query.
    """
    args_schema:Type[DuckDuckGoToolArgsSchema] = DuckDuckGoToolArgsSchema
    def _run(self, queries: list[str], **kwargs):
        ddgs = DDGS()
        result =  [ddgs.text(query, region='ko-KR', max_results=2)for query in queries]
        return result

In [5]:
from typing import TypedDict
import requests
class Info(TypedDict):
    title: str
    href: str
class ScrapingWebsitesToolArgsSchema(BaseModel):
    infos: list[Info] = Field(description="Arrays of information that has title and href.")
class ScrapingWebsitesTool(BaseTool):
    name: str = "ScrapingWebsitesTool"
    description:str = "DuckDuckGoSearchTool will give href to this tool and then this tools will scape the website. "
    args_schema:Type[ScrapingWebsitesToolArgsSchema] = ScrapingWebsitesToolArgsSchema
    def _run(self, infos:list[Info], **kwargs):
        results = []
        for info in infos:
            res = requests.get(info['href'])
            soup = BeautifulSoup(res.content, "html.parser")
            results.append({
                'title': info['title'],
                'href': info['href'],
                'text': soup.get_text(strip=True).replace("\n\n", "\n"),})
        return results

In [6]:
class SavingInfo(TypedDict):
    title: str
    href: str
    text: str
class SavingToFileToolArgsSchema(BaseModel):
    infos: list[SavingInfo] = Field(description="Arrays of dictionary that has title, href, and text as key. These text will be saved 'information.txt' as csv form")
class SavingToFileTool(BaseTool):
    name:str = "SavingToFileTool"
    description:str = "This tool will save all information from Scarping tool information"
    def _run(self, infos:list[SavingInfo], **kwargs):
        with open("./information.txt", "w") as f:
            f.write('title,href,text')
            for info in infos:
                print(info)
                # f.write(f'{info["title"]},{info["href"]},{info["text"]}')

In [7]:
from langchain.chat_models import ChatOpenAI
from langchain.agents import AgentType
llm = ChatOpenAI(
    model="gpt-4o-mini",
)
agent = initialize_agent(
    llm=llm,
    agent=AgentType.OPENAI_FUNCTIONS,
    tools=[
        WikipediaSearchTool(),
        DuckDuckGoSearchTool(),
        ScrapingWebsitesTool(),
        SavingToFileTool(),
    ],
    verbose=True,
    handle_parsing_errors=True,
    agent_kwargs={
        "system_message": SystemMessage(content="""
        You are a good assistance for collection information about query.
        You will collect information from Wikipedia. Summary it and make array for searching web href from DuckDuckGo.
        Next, you will scrape information from those urls above hrefs using ScrapingWebsitesTool.
        And ScarpingWebsitesTool's result will be send to SavingToFileTool for saving file information.
        Beware that before send information from ScrapingWebsiteTool to SavingFileTool you must summary the text that each array of dictionary has. Good luck.
        """)
    }
)
result = agent.invoke(assignment_query)
print(result)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `WikipediaSearchTool` with `XZ backdoor`


[0m[36;1m[1;3m<langchain.document_loaders.wikipedia.WikipediaLoader object at 0x131081a20>[0m[32;1m[1;3m
Invoking: `DuckDuckGoSearchTool` with `{'queries': ['XZ backdoor', 'XZ backdoor malware', 'XZ backdoor security']}`


[0m[33;1m[1;3m[[{'title': 'XZ Utils backdoor - Wikipedia', 'href': 'https://en.wikipedia.org/wiki/XZ_Utils_backdoor', 'body': 'In February 2024, a malicious backdoor was introduced to the Linux build of the xz utility within the liblzma library in versions 5.6.0 and 5.6.1 by an account using the name "Jia Tan". [b][4] The backdoor gives an attacker who possesses a specific Ed448 private key remote code execution capabilities on the affected Linux system.'}, {'title': 'FAQ on the xz-utils backdoor (CVE-2024-3094) - GitHub Gist', 'href': 'https://gist.github.com/thesamesam/223949d5a074ebc3dce9ee78baad9e27', 'body': 'On March 29th, 2024, a backdoor

KeyboardInterrupt: 