In [1]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment before any
# API client is created. Presumably this supplies OPENAI_API_KEY for the
# OpenAIEmbeddings client built in the next cell — confirm against the .env.
load_dotenv()

True

In [14]:
import os
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader

# Source PDFs and the folder that receives the saved FAISS indexes.
pdf_folder = 'C:/Workspace/DA36_mini4/min/pdfs'
vectorstore_folder = 'C:/Workspace/DA36_mini4/min/vectorstores'

# One embedding client, created once and reused for every vector store.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

print("Processing all files for vs_all")
# os.listdir() returns entries in arbitrary order, so sort them to get a
# reproducible index. The extension check is case-insensitive so that files
# named e.g. 'REPORT.PDF' are not silently skipped.
all_files = [
    os.path.join(pdf_folder, file)
    for file in sorted(os.listdir(pdf_folder))
    if file.lower().endswith('.pdf')
]
if not all_files:
    raise FileNotFoundError(f"No PDF files found in {pdf_folder}")

# Each loader.load() call returns a list of documents for one PDF; collect
# them all into a single flat list.
all_documents = []
for pdf_file in all_files:
    loader = PyPDFLoader(pdf_file)
    all_documents.extend(loader.load())

# Stop with a clear message rather than handing an empty list to FAISS,
# where the failure would be much harder to interpret.
if not all_documents:
    raise ValueError(f"No pages could be loaded from the PDFs in {pdf_folder}")

all_vector_store = FAISS.from_documents(all_documents, embeddings)

all_vectorstore_path = os.path.join(vectorstore_folder, 'vs_all')
os.makedirs(all_vectorstore_path, exist_ok=True)  # create the target directory
all_vector_store.save_local(all_vectorstore_path)

print("vs_all 벡터스토어 생성 완료!")

# Single table describing every category:
#   (category name, English abbreviation used in the store name, PDF files)
_category_table = (
    ("경마방법", "guide", ["race_guide.pdf", "odds.pdf", "winning_horse.pdf"]),
    ("경기스케쥴", "schedule", ["202412_seoul.pdf", "day_info.pdf"]),
    ("우승마기록", "winners", ["top50_2024.pdf"]),
    ("2024시행계획", "plan2024", ["race_plan2024.pdf"]),
)

# Category name -> PDF file names (relative to the PDF folder).
category_to_files = {name: pdfs for name, _, pdfs in _category_table}

# Category name -> English abbreviation.
category_to_abbreviation = {name: abbr for name, abbr, _ in _category_table}

# Build one FAISS vector store per category and save it under
# vectorstore_folder as "vs_<abbreviation>".
# Maps each store name already claimed to the category that claimed it, so a
# second category resolving to the same name is caught before it overwrites
# the first one's saved index.
used_vectorstore_names = {}

for category, files in category_to_files.items():
    print(f"Processing category: {category}")

    # Store name; categories without an abbreviation fall back to 'other'.
    abbreviation = category_to_abbreviation.get(category, 'other')
    vectorstore_name = f"vs_{abbreviation}"
    if vectorstore_name in used_vectorstore_names:
        raise ValueError(
            f"Categories {used_vectorstore_names[vectorstore_name]!r} and "
            f"{category!r} both map to {vectorstore_name!r}; the second "
            f"would overwrite the first"
        )
    used_vectorstore_names[vectorstore_name] = category

    pdf_files = [os.path.join(pdf_folder, file) for file in files]

    # Check up front that every listed PDF exists, so a typo in the mapping
    # is reported with all missing paths before any embedding call is made.
    missing_files = [path for path in pdf_files if not os.path.isfile(path)]
    if missing_files:
        raise FileNotFoundError(
            f"Missing PDF files for category {category!r}: {missing_files}"
        )

    # Load the documents of every PDF in this category.
    documents = []
    for pdf_file in pdf_files:
        loader = PyPDFLoader(pdf_file)
        documents.extend(loader.load())  # append this PDF's documents

    # Stop with a clear message rather than handing an empty list to FAISS.
    if not documents:
        raise ValueError(f"No pages could be loaded for category {category!r}")

    # Create the vector store.
    vector_store = FAISS.from_documents(documents, embeddings)

    # Save location of this category's vector store.
    category_path = os.path.join(vectorstore_folder, vectorstore_name)
    os.makedirs(category_path, exist_ok=True)
    vector_store.save_local(category_path)

print("카테고리별 벡터스토어 생성 완료!")

Processing all files for vs_all
vs_all 벡터스토어 생성 완료!
Processing category: 경마방법
Processing category: 경기스케쥴
Processing category: 우승마기록
Processing category: 2024시행계획
카테고리별 벡터스토어 생성 완료!
