In [1]:
import os
# Set before torch / langchain_huggingface are imported below so the setting is
# already in the environment when those libraries load.
# NOTE(review): presumably this silences the Hugging Face tokenizers
# fork-parallelism warning — confirm that is the intent.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import torch
import gc

from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain_openai import OpenAIEmbeddings
from langchain_classic.storage import InMemoryStore
from langchain_chroma import Chroma

from konlpy.tag import Okt

In [2]:
# Free memory left over from earlier runs in this kernel before the embedding
# model is loaded. Python garbage is collected first, then PyTorch is asked to
# release its cached CUDA memory.
# NOTE(review): the order looks deliberate (collect, then empty the cache) —
# keep it unless confirmed otherwise.
gc.collect()
torch.cuda.empty_cache()

In [3]:
# PDF loader for the source document; `loader.load()` is called in the next cell.
loader = PyPDFLoader(file_path='./data/투자설명서.pdf')

In [4]:
# Embedding model used by the semantic chunker. Fall back to CPU when no CUDA
# device is visible so the notebook still runs (more slowly) without a GPU
# instead of failing at model load.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
embeddings = HuggingFaceEmbeddings(model='BAAI/bge-m3', model_kwargs={'device': device})

# Raw per-page documents get their own name so that `docs` only ever refers to
# the final chunks consumed by later cells.
pages = loader.load()

# Merge all pages into one string (blank line between pages) and chunk the
# whole text in a single pass.
full_text = '\n\n'.join(page.page_content for page in pages)
text_splitter = SemanticChunker(embeddings=embeddings)
docs = text_splitter.create_documents([full_text])

# create_documents() is given plain text only (no metadatas), so the source
# file name is attached to every chunk by hand.
for doc in docs:
    doc.metadata['source'] = '투자설명서.pdf'
print(len(docs))

243


In [5]:
# Module-level Okt analyzer, created once and reused by every okt_tokenize() call.
okt = Okt()


def okt_tokenize(text):
    """Split ``text`` into morphemes using the shared ``okt`` analyzer.

    Delegates to ``okt.morphs`` with ``norm=True`` and ``stem=True`` (per the
    KoNLPy documentation these normalize and stem the tokens).

    Args:
        text: The string to tokenize.

    Returns:
        The value returned by ``okt.morphs`` for ``text`` (a list of morpheme
        strings according to the KoNLPy documentation).
    """
    morphemes = okt.morphs(text, stem=True, norm=True)
    return morphemes

In [6]:
# Keyword (BM25) retriever over the chunked documents, tokenized with
# okt_tokenize. `k=3` is forwarded by the factory to the retriever's `k` field
# rather than being assigned after construction.
bm25_retriever = BM25Retriever.from_documents(
    docs,
    preprocess_func=okt_tokenize,
    k=3,
)