In [1]:
import gc
import torch
import time

from pydantic import BaseModel, Field
from typing import List, Dict, Any, Tuple
from textwrap import dedent

from langchain_community.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_core.documents import Document
from langchain_core.output_parsers import JsonOutputParser
from langchain_openai import ChatOpenAI

In [2]:
# Page-wise PDF loader for the source document; nothing is read from disk
# until loader.load() is called in a later cell.
loader = PyPDFLoader(file_path='./data/투자설명서.pdf')

In [3]:
# BAAI/bge-m3 embedding model, shared by the semantic chunker and the FAISS
# index below. Use the GPU when one is visible and fall back to CPU otherwise,
# so the notebook still runs (more slowly) on a machine without CUDA instead
# of failing at model load. The small encode batch size keeps peak memory low.
embeddings = HuggingFaceEmbeddings(
    model='BAAI/bge-m3',
    model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
    encode_kwargs={'batch_size': 8},
)

In [4]:
# Read the PDF page by page, then merge every page's text into one string so
# the whole document is chunked as a single text.
docs = loader.load()
full_text = '\n\n'.join([page.page_content for page in docs])

# Semantic chunking driven by the embedding model (SemanticChunker defaults).
# From here on `docs` holds the chunks, not the raw pages.
text_splitter = SemanticChunker(embeddings=embeddings)
docs = text_splitter.create_documents(texts=[full_text])

# Tag each chunk with the file it came from.
for doc in docs:
    doc.metadata.update({'source': '투자설명서.pdf'})
print(len(docs))

243


In [5]:
# Free memory left over from the chunking step before building the index.
# Order matters: collect unreachable Python objects first so their tensors are
# released, then ask PyTorch to hand its unused cached GPU blocks back.
gc.collect()
torch.cuda.empty_cache()
# NOTE(review): fixed 3 s pause, presumably to let the memory release settle
# before the next GPU-heavy cell — confirm it is actually needed.
time.sleep(3)

In [6]:
# Embed every chunk into an in-memory FAISS index, then persist the index to
# disk so it can be reloaded without re-embedding.
persist_dir = './data/faiss_index_dense'
faiss_store = FAISS.from_documents(documents=docs, embedding=embeddings)
faiss_store.save_local(folder_path=persist_dir)

In [7]:
# Reload the persisted index from disk.
# allow_dangerous_deserialization=True permits pickle-based loading; that is
# acceptable here only because the files were written by this notebook. Never
# point persist_dir at an index obtained from an untrusted source.
vectorstore = FAISS.load_local(
    folder_path=persist_dir,
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)
vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x2a682e16e90>

In [8]:
# Free memory left over from building/loading the FAISS index.
# Order matters: collect unreachable Python objects first so their tensors are
# released, then ask PyTorch to hand its unused cached GPU blocks back.
gc.collect()
torch.cuda.empty_cache()
# NOTE(review): fixed 3 s pause, presumably to let the memory release settle
# before the following cells — confirm it is actually needed.
time.sleep(3)

In [10]:
# Structured-output schema for a single document's relevance judgement.
# reranking_documents hands this model to JsonOutputParser, so its JSON schema
# (field name, type and description) is embedded in the prompt sent to the LLM.
# NOTE: documented with `#` comments on purpose — pydantic copies a class
# docstring into the schema `description`, which would alter that prompt.
class RelevanceScore(BaseModel):
    # Description (Korean): "Score indicating how relevant the document is to the query."
    relevance_score: float = Field(description='문서가 쿼리와 얼마나 관련이 있는지를 나타내는 점수.')

In [13]:
def reranking_documents(query: str, docs: List[Document], top_n: int=3) -> List[Document]:
    """Re-rank documents by LLM-judged relevance to ``query`` and keep the best.

    Every document is scored on its own: the LLM is asked (prompt in Korean)
    to rate, from 1 to 10, how relevant the document is to the query and to
    answer as JSON matching ``RelevanceScore``. The documents are then sorted
    by that score, highest first.

    Args:
        query: The question the documents are judged against.
        docs: Candidate documents; only ``page_content`` is sent to the LLM.
        top_n: Maximum number of documents to return.

    Returns:
        At most ``top_n`` documents in descending score order. Documents with
        equal scores keep their relative input order (``sorted`` is stable).
        An empty list when ``docs`` is empty or ``top_n`` is not positive.

    Notes:
        One LLM request is made per document, sequentially. A failed request
        or an unparsable answer never raises: the error is printed and the
        document receives a neutral default score instead.
    """
    # Nothing to rank or nothing requested: return before creating the OpenAI
    # client (which needs credentials) and before paying for any LLM call.
    # This also stops a negative top_n from being read as "slice from the
    # end" (e.g. [:-1] would return everything except the worst document).
    # A top_n of None keeps its original meaning of "no limit".
    if not docs or (top_n is not None and top_n <= 0):
        return []

    parser = JsonOutputParser(pydantic_object=RelevanceScore)
    human_msg_prompt = PromptTemplate(
        template='''
1점부터 10점까지 점수를 매겨, 다음 문서가 질문과 얼마나 관련이 있는지 평가해주세요.
단순히 키워드가 일치하는 것이 아니라 쿼리의 구체적인 맥락과 의도를 고려하세요.
{format_instruction}
question: {query}
document: {doc}
relevance_score''',
        input_variables=['query', 'doc'],
        partial_variables={'format_instruction':parser.get_format_instructions()}
    )
    # NOTE(review): GPT-5-family models may reject a non-default temperature
    # and may spend part of max_completion_tokens on hidden reasoning,
    # depending on the installed langchain-openai / API version. Any such
    # failure is caught below and turns into the default score, so verify
    # that real scores are actually being produced.
    llm = ChatOpenAI(model='gpt-5-nano', temperature=0, max_completion_tokens=1024)
    chain = human_msg_prompt | llm | parser

    # Neutral mid-scale fallback used when a document cannot be scored.
    default_score = 5

    score_docs = []
    for doc in docs:
        try:
            result = chain.invoke({'query': query, 'doc': doc.page_content})
            score = float(result['relevance_score'])
        except Exception as e:
            # Deliberate best-effort: one bad response must not abort the
            # whole rerank, so report it and fall back to the default.
            print(f'오류 발생: {e}')
            print(f'기본 점수 {default_score}점을 사용합니다.')
            score = default_score
        score_docs.append((doc, score))

    reranked_docs = sorted(score_docs, key=lambda pair: pair[1], reverse=True)

    return [doc for doc, _ in reranked_docs[:top_n]]