# In [3]:
import os
import streamlit as st

from dotenv import load_dotenv
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain import hub
from langchain.chains import RetrievalQA
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate

# In [None]:
# Load variables from a local .env file into the process environment
# (PINECONE_API_KEY is read from the environment further down).
load_dotenv()

# The question sent through the retriever and both chains below.
# It must be defined before its first use; previously `query` was never
# assigned and the script failed with NameError. This is an example value --
# replace it with the question you actually want answered.
query = '연봉 5천만원인 직장인의 소득세는 얼마인가요?'

# Split the source document into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
)
loader = Docx2txtLoader('./tax.docx')
document_list = loader.load_and_split(text_splitter=text_splitter)
# Notebook-style inspection of the chunk count; has no effect when run as a script.
# NOTE: document_list is not passed to the vector store in this script -- the
# store below is attached to an index that is expected to exist already.
len(document_list)

embedding = OpenAIEmbeddings(model="text-embedding-3-large")

index_name = 'tax-index'
pinecone_api_key = os.getenv('PINECONE_API_KEY')
# NOTE(review): `pc` is not referenced again in this script; kept in case
# other cells rely on it.
pc = Pinecone(api_key=pinecone_api_key)

# Attach to the existing Pinecone index instead of creating/uploading a new one.
vectorstore = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding,
)

# Retrieve the 4 most similar chunks for each query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
# Retrieval smoke test; the returned documents are discarded.
retriever.invoke(query)

# RAG prompt pulled from the LangChain Hub.
prompt = hub.pull("rlm/rag-prompt")
# Notebook-style inspection of the prompt; has no effect when run as a script.
prompt

llm = ChatOpenAI(model='gpt-4o')

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt},
)

# Answer using the raw, unmodified question.
ai_message = qa_chain.invoke({"query": query})

# Dictionary prompt: rewrite the user's wording according to the dictionary
# before the question reaches the QA chain.
dictionary = ["사람을 나타내는 표현 -> 거주자"]

# The f-string interpolates `dictionary` immediately; the doubled braces leave
# `{question}` as a template variable to be filled in when the chain is invoked.
# NOTE: `prompt` is rebound here; qa_chain above was already built with the
# RAG prompt, so it is not affected.
prompt = ChatPromptTemplate.from_template(f"""
    사용자의 질문을 보고, 우리의 사전을 참고해서 사용자의 질문을 변경해주세요.
    만약 변경할 필요가 없다고 판단 된다면, 사용자의 질문을 변경하지 않아도 됩니다.
    그런 경우에는 질문만 리턴해주세요
    사전: {dictionary}
    질문: {{question}}
    """)

# The LLM decides whether to pass the question through unchanged or rewrite it
# using the dictionary; StrOutputParser turns the model reply into a plain
# string, which is the output of dictionary_chain.
dictionary_chain = prompt | llm | StrOutputParser()

# Feed the (possibly rewritten) question into the QA chain as its "query" input.
tax_chain = {"query": dictionary_chain} | qa_chain

# Answer using the dictionary-rewritten question; overwrites the earlier ai_message.
ai_message = tax_chain.invoke({"question": query})