In [None]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.chains import LLMChain

# The API key is read from the environment and never written in the notebook,
# so that a real key already exported by the user is not overwritten and no
# secret ends up committed with the file.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it before starting Jupyter "
        "(or set it in a private, uncommitted cell) so the OpenAI clients can authenticate."
    )

# Module-level embedding client using the 'text-embedding-3-small' model.
openai_embedding = OpenAIEmbeddings(model="text-embedding-3-small")

# test 123s

In [3]:
# Professor-keyed PDF loader (each loaded document is also tagged with its title)
def load_pdfs_with_metadata(root_dir):
    """Load every PDF found one level below ``root_dir`` and tag the results.

    Expected layout: ``root_dir/<professor_name>/<paper title>.pdf``. Entries
    directly under ``root_dir`` that are not directories are ignored, as are
    files whose name does not end in ``.pdf``. The extension is compared
    case-insensitively, so ``PAPER.PDF`` is picked up as well.

    Args:
        root_dir: Directory containing one sub-directory per professor.

    Returns:
        A flat list of the documents returned by ``PyPDFLoader(path).load()``
        for every PDF that loaded successfully (presumably one document per
        page; confirm against the LangChain docs). Each document's
        ``metadata`` gains ``"professor"`` (the sub-directory name) and
        ``"title"`` (the file name without its 4-character extension).
        Directories and files are visited in sorted name order so the result
        is reproducible across runs and machines.

    Raises:
        OSError: If ``root_dir`` or one of its sub-directories cannot be listed.

    A PDF that fails to load is reported with ``print`` and skipped, so a
    single corrupt file does not abort the whole import.
    """
    pdf_suffix = ".pdf"
    all_docs = []
    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for professor_name in sorted(os.listdir(root_dir)):
        prof_dir = os.path.join(root_dir, professor_name)
        if not os.path.isdir(prof_dir):
            continue
        for filename in sorted(os.listdir(prof_dir)):
            # Case-insensitive so that ".PDF" / ".Pdf" files are not silently skipped.
            if not filename.lower().endswith(pdf_suffix):
                continue
            abs_path = os.path.abspath(os.path.join(prof_dir, filename))
            # Skips anything that is not a regular file (e.g. a directory named "x.pdf").
            if not os.path.isfile(abs_path):
                print(f"[❌ 존재하지 않음] {abs_path}")
                continue
            title = filename[: -len(pdf_suffix)]  # strip the extension
            try:
                pages = PyPDFLoader(abs_path).load()
                for page in pages:
                    page.metadata["professor"] = professor_name
                    page.metadata["title"] = title
            except Exception as e:
                # Deliberate best-effort: report the failure and move on.
                print(f"[⚠️ 로딩 실패] {abs_path} → {e}")
            else:
                all_docs.extend(pages)
    return all_docs


In [4]:
# Store the documents in a Chroma DB
def create_vector_store(
    docs,
    persist_directory="./chroma_db",
    chunk_size=1000,
    chunk_overlap=100,
    embedding=None,
):
    """Split ``docs`` into chunks, embed them and store them in Chroma.

    Args:
        docs: Documents to index; handed to
            ``RecursiveCharacterTextSplitter.split_documents``.
        persist_directory: Forwarded to ``Chroma.from_documents`` as
            ``persist_directory``.
        chunk_size: ``chunk_size`` given to the text splitter.
        chunk_overlap: ``chunk_overlap`` given to the text splitter.
        embedding: Embedding object forwarded to ``Chroma.from_documents``.
            When ``None`` a new
            ``OpenAIEmbeddings(model="text-embedding-3-small")`` is created,
            which matches the previous hard-coded behaviour.

    Returns:
        Whatever ``Chroma.from_documents`` returns for the split documents.

    NOTE(review): presumably calling this again with the same
    ``persist_directory`` adds to the existing collection instead of replacing
    it, which would duplicate chunks on a notebook re-run. Confirm against the
    Chroma documentation.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    split_docs = text_splitter.split_documents(docs)

    # Let callers reuse an existing embedding client; build the default lazily.
    if embedding is None:
        embedding = OpenAIEmbeddings(model="text-embedding-3-small")

    return Chroma.from_documents(
        split_docs,
        embedding=embedding,
        persist_directory=persist_directory,
    )

In [5]:
# Run the pipeline
root_pdf_folder = "../../data/교수님들 논문"  # or point this at an absolute path
documents = load_pdfs_with_metadata(root_pdf_folder)

# The loader reports unreadable PDFs and skips them instead of raising, so it
# can return an empty list (no professor folders, no PDFs, or every file failed
# to load). Stop here with a clear message rather than indexing nothing.
if not documents:
    raise ValueError(
        f"No PDF documents were loaded from {os.path.abspath(root_pdf_folder)!r}"
    )

vectorstore = create_vector_store(documents)