In [1]:
import hashlib
import os

from dotenv import load_dotenv

# Load variables from a .env file into the process environment
# (verbose mode is switched on for the loader).
load_dotenv(verbose=True)

# Look up the OpenAI API key in the environment; `key` is None when unset.
key = os.environ.get('OPENAI_API_KEY')

In [3]:
from langchain_community.document_loaders import TextLoader
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_chroma import Chroma

In [4]:
# Step 1: Load Documents
# Point a PyMuPDF-based loader at the source PDF, then read it into `docs`.
loader = PyMuPDFLoader(file_path="./data/SPRI_AI_Brief_2023년12월호_F.pdf")
docs = loader.load()

In [6]:
# Step 2: Split Documents
# Configure the splitter: chunks of up to 1000 units with a 50-unit overlap
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
)

# Break the loaded documents into chunks for embedding.
split_documents = text_splitter.split_documents(docs)

In [7]:
# Step 3: Create the Embedding model
# OpenAI embedding client configured for the "text-embedding-3-small" model.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

In [8]:
# Step 4: Create the vector store (Create DB) and persist it
DB_PATH = "./chroma_db"     # directory the Chroma collection is persisted to

# Give every chunk a deterministic ID derived from its position and text.
# Without explicit IDs the store assigns fresh random ones on every call, so
# re-running this cell against the same persist directory would insert the
# same chunks again and fill the collection with duplicates. With stable IDs,
# an unchanged chunk maps to the same entry on every run.
# The position is part of the hash so that two chunks with identical text
# still get distinct IDs within one batch.
# NOTE(review): if the source PDF or the splitter settings change, entries
# written under the old IDs stay in the collection — delete DB_PATH (or the
# collection) before rebuilding in that case.
chunk_ids = [
    hashlib.sha256(f"{position}:{chunk.page_content}".encode("utf-8")).hexdigest()
    for position, chunk in enumerate(split_documents)
]

# Build (or update) the collection from the split documents.
db = Chroma.from_documents(
    documents=split_documents,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory=DB_PATH,
    collection_name='my_db'
)