In [None]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file, then read the OpenAI key from the
# process environment.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Do not echo the secret itself: cell output is stored inside the notebook
# file and would leak the key when the notebook is shared or committed.
# Report only whether a value was found.
print("OPENAI_API_KEY loaded:", bool(OPENAI_API_KEY))

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Build a loader for the tax PDF and read it; the cell displays how many
# items the loader returned.
loader = PyPDFLoader(file_path="data/tax_with_table.pdf")
pages = loader.load()
len(pages)

In [None]:
# Print the first element of `pages` as a whole object.
print(pages[0])

In [None]:
# Print only the text (`page_content`) of the first element of `pages`.
print(pages[0].page_content)

In [None]:
# Print only the `metadata` attribute of the first element of `pages`.
print(pages[0].metadata)

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configuration: target chunk length 500 with an overlap of 50
# between neighbouring chunks. Separators are tried in the listed order:
# blank line, newline, space, and finally the empty string.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_overlap=50,
    chunk_size=500,
)
text_splitter

In [None]:
# Break the loaded pages into chunks with the splitter configured earlier,
# then display how many chunks were produced.
splits = text_splitter.split_documents(documents=pages)
len(splits)


In [None]:
# Spot-check one chunk. Index 20 is an arbitrary sample; it raises IndexError
# if `splits` holds fewer than 21 items.
print(splits[20])

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Use the real OpenAI embedding endpoint.
embeddings_model = OpenAIEmbeddings(
    api_key=OPENAI_API_KEY,  # real OpenAI API key read from the environment
    model="text-embedding-3-small",  # or "text-embedding-3-large"
    # Number of texts sent per embeddings request. With the library default
    # this corpus was rejected with:
    #   'Requested 316267 tokens, max 300000 tokens per request'
    # A smaller batch keeps each request under that per-request token cap.
    chunk_size=100,
)
# Use a dedicated collection name so these vectors are not stored in the
# default collection that another store built in this kernel (possibly with a
# different embedding dimensionality) would also use.
db = Chroma.from_documents(
    splits,
    embeddings_model,
    collection_name="tax_openai_embeddings",
)

In [None]:

#from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Use a free Hugging Face embedding model.
embeddings_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Use a dedicated collection name so these vectors are not added to the
# default collection that a store built with a different embedding model in
# this kernel would also use (mixing embedding dimensionalities in one
# collection is rejected by the vector store).
db = Chroma.from_documents(
    splits,
    embeddings_model,
    collection_name="tax_minilm_embeddings",
)


In [None]:
# Question (in Korean) used to probe the vector store; the literal is split
# across two lines only for readability and concatenates to one string.
query = (
    "비과세소득에 해당하는 소득은 어떤 것들이 있나요? "
    "비과세소득에 대하여 자세히 설명해 주세요."
)
# Fetch the most similar chunks and show the text of the first hit.
docs = db.similarity_search(query)
print(docs[0].page_content)

In [None]:
# Print the `metadata` attribute of the first search hit.
print(docs[0].metadata)

In [None]:
# Show the number of hits on one line and the full hit list on the next.
print(len(docs), docs, sep="\n")

In [None]:
# Wrap the vector store in a retriever (default settings) for use by the
# retrieval chain, and print it for inspection.
retriever = db.as_retriever()
print(retriever)

In [None]:
# Prompt

from langchain_core.prompts import ChatPromptTemplate

# Prompt text with two placeholders: {context} for the retrieved passages and
# {input} for the user's question. Built from adjacent string literals that
# concatenate to a single template string.
template = (
    "Answer the question based only on the following context:\n"
    "<context>\n"
    "{context}\n"
    "</context>\n"
    "\n"
    "Question: {input}\n"
)

prompt = ChatPromptTemplate.from_template(template)


In [None]:
# Display the prompt template object as the cell's output.
prompt

In [None]:
from langchain_openai import ChatOpenAI
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

# Chat model used to generate the answer; temperature 0 is passed to the model.
model = ChatOpenAI(
    model='gpt-3.5-turbo-0125',
    temperature=0,
    api_key=OPENAI_API_KEY,
)

# Chain that feeds retrieved documents into `prompt` and calls the model.
document_chain = create_stuff_documents_chain(model, prompt)

# Chain that first runs the retriever, then passes the results to
# `document_chain`.
retrieval_chain = create_retrieval_chain(retriever, document_chain)

# Reuse the `query` defined for the similarity-search demo instead of a second
# copy of the same literal, so both steps always ask the same question.
response = retrieval_chain.invoke({"input": query})


In [None]:
# Display the full value returned by the retrieval chain as the cell's output.
response

In [None]:
# Print only the value stored under the 'answer' key of the chain response.
print(response['answer'])