## **문서 벡터 저장소, Vector Stores**

In [None]:
# !pip install langchain langchain-community langchain-openai langchain-chroma tiktoken pypdf sentence_transformers langchain-text-splitters

### **Langchain-Chroma 문서 저장 및 유사 문서 검색**

In [10]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
# SECURITY: never hard-code API keys in a notebook or source file. The key is
# read from the environment; if it is missing, it is requested interactively so
# it never ends up in the saved file. Any key that was previously committed
# here must be treated as leaked and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Embedding model used to vectorize every chunk.
openai_embedding = OpenAIEmbeddings(model='text-embedding-3-small')

# Load the course-registration PDF.
loader = PyPDFLoader(r"C:\Users\82103\Downloads\수강신청_자료집_전체(2025-1)v4.pdf")
pages = loader.load_and_split()

# Split the loaded documents again with chunk_size=500 and no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
docs = text_splitter.split_documents(pages)

# Embed the chunks and store them in a Chroma vector store (no
# persist_directory is given here).
db = Chroma.from_documents(docs, openai_embedding)

In [11]:
query = "김현경 교수님 수업 추천해줘?"
# Similar-document search. The hits are bound to `result` (the same name the
# persistence example uses) instead of `docs`, so the list of chunks that the
# vector store was built from is not overwritten by search results.
result = db.similarity_search(query)
# Show the first hit (page content plus metadata).
print(result[0])

page_content='·········································································································· 982. 개설교과목 및 강의시간 ····························································································································· 99   가. 교양과목 안내······································································································································· 100   나. 타학과 전공인정' metadata={'creationdate': '2025-02-21T09:22:25+09:00', 'creator': 'Hwp 2022 12.0.0.3146', 'moddate': '2025-02-21T09:22:25+09:00', 'page': 4, 'page_label': '5', 'pdfversion': '1.4', 'producer': 'Hancom PDF 1.3.0.546', 'source': 'C:\\Users\\82103\\Downloads\\수강신청_자료집_전체(2025-1)v4.pdf', 'total_pages': 116}


In [12]:
# Similar-document search that also returns a score alongside each document;
# the value of this expression is displayed as the cell output.
db.similarity_search_with_score(query)

[(Document(metadata={'creationdate': '2025-02-21T09:22:25+09:00', 'creator': 'Hwp 2022 12.0.0.3146', 'moddate': '2025-02-21T09:22:25+09:00', 'page': 4, 'page_label': '5', 'pdfversion': '1.4', 'producer': 'Hancom PDF 1.3.0.546', 'source': 'C:\\Users\\82103\\Downloads\\수강신청_자료집_전체(2025-1)v4.pdf', 'total_pages': 116}, page_content='·········································································································· 982. 개설교과목 및 강의시간 ····························································································································· 99   가. 교양과목 안내······································································································································· 100   나. 타학과 전공인정'),
  1.1502704620361328),
 (Document(metadata={'creationdate': '2025-02-21T09:22:25+09:00', 'creator': 'Hwp 2022 12.0.0.3146', 'moddate': '2025-02-21T09:22:25+09:00', 'page': 45, 'page_label': '46', 'pdfversion': '1.4', 'producer': 'Hancom PDF 1.3.0

**[벡터DB를 로컬 디스크에 저장하고 로드하기]**

In [13]:
# NOTE(review): builds a Chroma store with default settings and immediately
# deletes its collection — presumably to clear the collection created by the
# earlier in-memory example before the persistence example runs; confirm that
# this targets the intended collection.
Chroma().delete_collection()

In [14]:
from langchain.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# PyPDFLoader is imported here explicitly so this cell does not depend on an
# earlier cell having been executed (the cell's own imports only bring in
# TextLoader).
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader(r"C:\Users\82103\Downloads\수강신청_자료집_전체(2025-1)v4.pdf")
pages = loader.load_and_split()

# Split the loaded documents with chunk_size=500 and no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
docs = text_splitter.split_documents(pages)


# Load an open-source embedding model through HuggingFaceEmbeddings.
model_name = "jhgan/ko-sroberta-multitask"
ko_embedding = HuggingFaceEmbeddings(
    model_name=model_name
)


# Save to disk. Deterministic per-chunk ids are supplied so that re-running
# this cell is expected to overwrite the same records in ./chroma_db instead of
# appending a second copy of every chunk under fresh random ids.
chunk_ids = [f"chunk-{index}" for index in range(len(docs))]
db2 = Chroma.from_documents(
    docs,
    ko_embedding,
    ids=chunk_ids,
    persist_directory="./chroma_db",
)

# Load from disk: reopen the persisted store with the same embedding model.
db3 = Chroma(persist_directory="./chroma_db", embedding_function=ko_embedding)

query = "김현경교수님 수업 추천해줘"
result = db3.similarity_search(query)
# Print only the text of the first hit.
print(result[0].page_content)

  ko_embedding= HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


KeyboardInterrupt: 

### **Chroma DB API를 활용한 문서 관리**

**Collection 객체 생성과 문서 저장**

In [None]:
import chromadb
# Directory in which the client persists its collections.
client = chromadb.PersistentClient(path="collection_example")
# Check that the client is reachable; the returned value is displayed as the
# cell output.
client.heartbeat()

**Collection 생성**

In [None]:
from chromadb.utils import embedding_functions
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
# Pick exactly one Chroma-native embedding function for the collection.
# The previous code first bound a LangChain `OpenAIEmbeddings` object (which is
# not a Chroma embedding function) and then immediately overwrote it, so the
# OpenAI option was never usable. The flag below makes the choice explicit; the
# default keeps the open-source model that was effectively in use.
USE_OPENAI_EMBEDDING = False

if USE_OPENAI_EMBEDDING:
    import os

    # OpenAI embedding model, via Chroma's own wrapper. The key is read from
    # the environment rather than written into the notebook.
    embedding_function = OpenAIEmbeddingFunction(
        api_key=os.environ["OPENAI_API_KEY"],
        model_name="text-embedding-3-small",
    )
else:
    # Hugging Face open-source embedding model.
    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="jhgan/ko-sbert-nli"
    )

# get_or_create_collection lets this cell be re-run against the persistent
# client without failing because "korean_law" already exists on disk.
collection = client.get_or_create_collection(
    name="korean_law",
    embedding_function=embedding_function,
)

**Collection에 문서 임베딩 저장**

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader


# Read the constitution PDF through the loader.
loader = PyPDFLoader(r"../data/대한민국헌법(헌법)(제00010호)(19880225).pdf")
pages = loader.load_and_split()

# Re-split the loaded documents (chunk_size=500, chunk_overlap=0).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=500)
docs = text_splitter.split_documents(pages)

# Prepare the three parallel lists the collection expects: a string id taken
# from each chunk's position, the chunk text, and the chunk metadata.
chunk_ids = list(map(str, range(len(docs))))
chunk_texts = [chunk.page_content for chunk in docs]
chunk_metadatas = [chunk.metadata for chunk in docs]

collection.add(ids=chunk_ids, documents=chunk_texts, metadatas=chunk_metadatas)

**Collection 로드하기**

In [None]:
# `name` is the collection's name; `embedding_function` should be the same
# embedding model that was specified when the collection was stored.
collection = client.get_collection(name="korean_law", embedding_function=embedding_function)
# Display the collection object as the cell output.
collection

**Collection 내 문서 검색**

In [None]:
# Fetch the 3 chunks most similar to "직업 선택의 자유", restricted to chunks
# whose `page` metadata equals 1.
# NOTE(review): the loader's `page` metadata looks zero-based (the earlier
# output shows 'page': 4 next to 'page_label': '5') — confirm which page is meant.
page_filter = {"page": 1}
collection.query(query_texts=["직업 선택의 자유"], n_results=3, where=page_filter)

**조건부 문서 검색**

In [None]:
# Fetch 3 chunks related to "직업 선택의 자유" among chunks whose `page`
# metadata is 5 or greater.
# Comparison operators available in a `where` filter:
#   $eq  - equal to                 (string, int, float)
#   $ne  - not equal to             (string, int, float)
#   $gt  - greater than             (int, float)
#   $gte - greater than or equal to (int, float)
#   $lt  - less than                (int, float)
#   $lte - less than or equal to    (int, float)
from_page_five = {"page": {"$gte": 5}}
collection.query(query_texts=["직업 선택의 자유"], n_results=3, where=from_page_five)



In [None]:
# Combine a metadata filter (`page` equals 1) with a document-text filter
# ($contains "직업") and return the 3 closest chunks.
metadata_filter = {"page": 1}
text_filter = {"$contains": "직업"}
collection.query(
    query_texts=["직업 선택의 자유"],
    n_results=3,
    where=metadata_filter,
    where_document=text_filter,
)