In [73]:
from typing import Any, Type
from langchain.chat_models import ChatOpenAI
from langchain.tools import BaseTool
from pydantic import BaseModel, Field
from langchain.agents import initialize_agent, AgentType
from langchain_community.tools import DuckDuckGoSearchResults
from langchain_community.document_loaders import WebBaseLoader
from bs4 import BeautifulSoup
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
import re
import os

# Boilerplate fragments stripped from downloaded articles, applied in order.
# Every pattern is compiled with DOTALL, so a trailing ".*" removes everything
# from the marker to the end of the text.
_BOILERPLATE_PATTERNS = tuple(
    re.compile(pattern, flags=re.DOTALL)
    for pattern in (
        r"Skip to main content.*?Menu",  # top navigation menu
        r"Save this storySave.*?Learn more\.",  # ad / subscription text
        r"This story originally appeared on Ars Technica.*",  # source note
        r"You Might Also Like ….*",  # recommended articles
        r"More From WIRED.*",  # other WIRED sections
        r"SubscribeNewslettersFAQ.*",  # WIRED footer
        r"© \d{4} Condé Nast.*",  # copyright notice (any four-digit year)
        r"Skip to content",
    )
)


def _strip_boilerplate(text):
    """Remove the known boilerplate fragments and collapse runs of blank lines."""
    for pattern in _BOILERPLATE_PATTERNS:
        text = pattern.sub("", text)
    return re.sub(r"\n{2,}", "\n\n", text).strip()


def _wrap_text(text, width=80):
    """Greedily re-wrap each line of *text* to at most *width* characters.

    Words are never split, so a single word longer than *width* occupies a
    line of its own. Empty input lines are kept as empty output lines.
    """
    wrapped = []
    for paragraph in text.split("\n"):
        current = ""
        for word in paragraph.split():
            if not current:
                # First word of a line: place it even when it alone exceeds
                # the width, rather than emitting an empty line before it.
                current = word
            elif len(current) + 1 + len(word) > width:
                wrapped.append(current)
                current = word
            else:
                current = f"{current} {word}"
        wrapped.append(current)
    return "\n".join(wrapped)


def save_link(title, link):
    """Download *link* and store a raw and a cleaned text copy under ``.cache/``.

    Two files are written:

    * ``.cache/{title}.txt`` -- the ``page_content`` of every loaded document,
      each followed by a blank line.
    * ``.cache/{title}_formatted.txt`` -- the same text with the boilerplate
      patterns removed, blank-line runs collapsed and lines wrapped at 80
      characters.

    Args:
        title: File-name stem. It is inserted into the path verbatim, so the
            caller is responsible for passing a filesystem-safe value.
        link: URL handed to ``WebBaseLoader``.
    """
    documents = WebBaseLoader(link).load()

    # Make the function usable on its own, without relying on the caller to
    # have created the output directory.
    os.makedirs(".cache", exist_ok=True)
    file_path = f".cache/{title}.txt"
    formatted_file_path = f".cache/{title}_formatted.txt"

    raw_text = "".join(doc.page_content + "\n\n" for doc in documents)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(raw_text)

    print(f"페이지 내용을 {file_path} 파일에 저장 완료!")

    # Reading the file back in text mode would turn "\r\n" and "\r" into "\n";
    # do that normalisation directly instead of re-reading the file.
    normalized = raw_text.replace("\r\n", "\n").replace("\r", "\n")
    formatted_content = _wrap_text(_strip_boilerplate(normalized), width=80)

    with open(formatted_file_path, "w", encoding="utf-8") as f:
        f.write(formatted_content)



# Chat model used by the agents in this file; created with temperature 0.1.
llm = ChatOpenAI(temperature=0.1)


class BackdoorSearchToolArgsSchema(BaseModel):
    # Argument schema used by the search tools in this file: a single
    # free-text ``query`` string.
    query: str = Field(
        description=(
            "The query you will search for. "
            "Example: Information for Dark Comet"
        ),
    )

class DuckDuckGoSearchTool(BaseTool):
    # Agent tool that passes the query to DuckDuckGoSearchResults.
    name: str = "DuckDuckGoSearchTool"
    description: str = """
    Use this tool to find software utility and cybersecurity-related information.
    """
    args_schema: Type[BackdoorSearchToolArgsSchema] = BackdoorSearchToolArgsSchema

    def _run(self, query):
        """Search DuckDuckGo for *query* and return what the search tool yields.

        A fresh ``DuckDuckGoSearchResults`` (``output_format="json"``) is
        created for every call.
        """
        return DuckDuckGoSearchResults(output_format="json").run(query)

class WikipediaSearchTool(BaseTool):
    # Agent tool that passes the query to WikipediaQueryRun.
    name: str = "WikipediaSearchTool"
    description: str = """
    Use this tool to search for well-documented and reliable information about various topics.
    """

    args_schema: Type[BackdoorSearchToolArgsSchema] = BackdoorSearchToolArgsSchema

    def _run(self, query):
        """Look *query* up on Wikipedia and return what the query tool yields.

        A fresh ``WikipediaQueryRun`` backed by a default
        ``WikipediaAPIWrapper`` is created for every call.
        """
        api_wrapper = WikipediaAPIWrapper()
        return WikipediaQueryRun(api_wrapper=api_wrapper).run(query)

def _build_agent(tool):
    """Wrap a single tool in an OpenAI-functions agent that uses the shared ``llm``."""
    return initialize_agent(
        tools=[tool],
        llm=llm,
        agent=AgentType.OPENAI_FUNCTIONS,
        verbose=True,
        handle_parsing_errors=True,
    )


# One single-tool agent per search backend, configured identically.
wiki_agent = _build_agent(WikipediaSearchTool())
# NOTE: the spelling "ducduckgo" is kept because later code refers to this name.
ducduckgo_agent = _build_agent(DuckDuckGoSearchTool())

# Research about the XZ backdoor
prompt = "Give me information about the XZ backdoor."

result = wiki_agent.invoke(prompt)
print(result)

duckduckgo_result = ducduckgo_agent.invoke(prompt)
print(duckduckgo_result)

# Text stored under the "output" key of the DuckDuckGo agent's result
# (empty string when the key is absent).
output_text = duckduckgo_result.get("output", "")
print(f"output_text={output_text}==output_text end")

# Collect Markdown-style [title](link) pairs from that text.
matches = re.findall(r"\[(.*?)\]\((https?://[^\s\)]+)\)", output_text)
print(f"matches={matches}")

# Create the .cache directory if it does not exist yet; the saved pages go there.
os.makedirs(".cache", exist_ok=True)

# File-name stems already handed out in this run, so that two links sharing a
# title do not overwrite each other's files.
used_titles = set()

# Print every match and save the linked page as text files.
for title, link in matches:
    print(f"Title: {title}\nLink: {link}\n")

    # Replace characters that are not allowed in file names; fall back to a
    # placeholder when nothing usable is left (e.g. "[](https://...)").
    safe_title = re.sub(r'[<>:"/\\|?*]', '_', title).strip() or "untitled"

    unique_title = safe_title
    suffix = 2
    while unique_title in used_titles:
        unique_title = f"{safe_title}_{suffix}"
        suffix += 1
    used_titles.add(unique_title)

    # Download and store the page. This is the script's top-level boundary:
    # report a failing link and carry on with the remaining ones instead of
    # aborting the whole run.
    try:
        save_link(unique_title, link)
    except Exception as exc:
        print(f"Failed to save {link}: {exc}")




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `WikipediaSearchTool` with `{'query': 'XZ backdoor'}`


[0m[36;1m[1;3mPage: XZ Utils backdoor
Summary: In February 2024, a malicious backdoor was introduced to the Linux build of the xz utility within the liblzma library in versions 5.6.0 and 5.6.1 by an account using the name "Jia Tan". The backdoor gives an attacker who possesses a specific Ed448 private key remote code execution through OpenSSH on the affected Linux system. The issue has been given the Common Vulnerabilities and Exposures number CVE-2024-3094 and has been assigned a CVSS score of 10.0, the highest possible score.
While xz is commonly present in most Linux distributions, at the time of discovery the backdoored version had not yet been widely deployed to production systems, but was present in development versions of major distributions. The backdoor was discovered by the software developer Andres Freund, who announced his findings on 29 March 2