In [1]:
from pathlib import Path

from langchain.agents import create_agent
from langchain_classic import hub
from langchain_classic.tools.retriever import create_retriever_tool
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_core.tools import tool
from langchain_core.vectorstores import VectorStoreRetriever
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langgraph.checkpoint.memory import MemorySaver

In [2]:
# Embedding model: BAAI/bge-m3 loaded through HuggingFace, placed on the CUDA
# device and encoding 8 texts per batch.
embeddings = HuggingFaceEmbeddings(
    model='BAAI/bge-m3',
    model_kwargs={'device': 'cuda'},
    encode_kwargs={'batch_size': 8},
)

# Two OpenAI chat models, both requested with temperature 0.
# Presumably `llm_eval` is for evaluation/grading and `llm_gen` for answer
# generation -- their usage is not visible in this part of the notebook.
# NOTE(review): confirm that the gpt-5 family honours a non-default
# temperature with the installed langchain_openai version.
llm_eval = ChatOpenAI(
    model='gpt-5-nano',
    temperature=0,
)
llm_gen = ChatOpenAI(
    model='gpt-5-mini',
    temperature=0,
)

In [3]:
def create_pdf_retriever(
        pdf_path: str,
        persist_directory: str,
        embedding_model,
) -> 'VectorStoreRetriever':
    """Return a retriever over a persisted Chroma index of a single PDF.

    If ``persist_directory`` already contains a populated Chroma collection,
    that collection is reopened and returned as-is: the PDF is not parsed and
    nothing is embedded. Otherwise the PDF is loaded with ``PyMuPDFLoader``,
    its pages are joined into one text, the text is split with
    ``SemanticChunker``, and the chunks are embedded and written to
    ``persist_directory``.

    A reopened index is assumed to have been built from the same PDF with the
    same embedding model; delete the directory to force a rebuild.

    Args:
        pdf_path: Path of the PDF to index. Also stored as the ``source``
            metadata of every chunk.
        persist_directory: Directory in which the Chroma collection is stored.
        embedding_model: Embeddings object used for semantic chunking and for
            the vector store.

    Returns:
        The value of ``as_retriever()`` called with no arguments on the
        vector store.
    """
    # Chroma.from_documents adds to whatever collection already lives in
    # persist_directory, so calling it again on a re-run would store every
    # chunk a second time (and redo the expensive embedding). Reuse a
    # populated store instead.
    db_dir = Path(persist_directory)
    if db_dir.is_dir() and any(db_dir.iterdir()):
        existing_store = Chroma(
            persist_directory=persist_directory,
            embedding_function=embedding_model,
        )
        # An empty collection (e.g. left by an interrupted build) falls
        # through to the build path below.
        if existing_store.get(limit=1)['ids']:
            return existing_store.as_retriever()

    pages = PyMuPDFLoader(pdf_path).load()
    # Join pages first so that semantic chunk boundaries are not forced to
    # coincide with page breaks.
    full_text = '\n\n'.join(page.page_content for page in pages)

    text_splitter = SemanticChunker(embeddings=embedding_model)
    chunks = text_splitter.create_documents([full_text])
    for chunk in chunks:
        chunk.metadata['source'] = pdf_path

    vectorstore = Chroma.from_documents(
        persist_directory=persist_directory,
        documents=chunks,
        embedding=embedding_model,
    )
    return vectorstore.as_retriever()

In [5]:
# Source PDFs (2024 ICT reports for Japan and the USA).
japan_path, usa_path = (
    './data/ict_japan_2024.pdf',
    './data/ict_usa_2024.pdf',
)

# Persist directories of the corresponding Chroma vector stores.
japan_db_path, usa_db_path = (
    './data/db_ict_policy_japan_2024_2',
    './data/db_ict_policy_usa_2024_2',
)

In [6]:
# Build one retriever per country report, each backed by its own Chroma
# persist directory and sharing the same embedding model.
# NOTE: `retreiver_japan` is misspelled ("retriever"), but the name is kept
# because other cells reference it under this spelling.
retreiver_japan = create_pdf_retriever(japan_path, japan_db_path, embeddings)
retriever_usa = create_pdf_retriever(usa_path, usa_db_path, embeddings)

In [7]:
# Wrap each country's retriever as a tool. The description is the text the
# agent reads when deciding which tool to call, so it must name the right
# country.
jp_engine = create_retriever_tool(
    retriever=retreiver_japan,
    name='japan_ict',
    description='일본의 ICT 시장 동향 정보를 제공합니다. 일본 ICT와 관련된 질문은 해당 도구를 사용하세요.'
)

# Fixed: this description previously told the agent to use the US tool for
# questions about Japan (copy-paste from the tool above).
us_engine = create_retriever_tool(
    retriever=retriever_usa,
    name='usa_ict',
    description='미국의 ICT 시장 동향 정보를 제공합니다. 미국 ICT와 관련된 질문은 해당 도구를 사용하세요.'
)

tools = [jp_engine, us_engine]

In [8]:
# Fetch the 'hwchase17/react' prompt from the hub and show its raw template,
# followed by an end marker so the template's extent is unambiguous.
prompt_react = hub.pull('hwchase17/react')
print(prompt_react.template, '--prompt end--', sep='\n')

Answer the following questions as best you can. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {input}
Thought:{agent_scratchpad}
--prompt end--
