## CacheBackedEmbeddings
* Embeddings는 재계산을 피하기 위해 저장되거나 일시적으로 캐시될 수 있습니다.


In [1]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key used by
# OpenAIEmbeddings later in the notebook — confirm.
load_dotenv()

True

In [2]:
from langchain.embeddings import CacheBackedEmbeddings

### VectorStore 에서 임베딩 사용
* 로컬 파일 시스템을 사용하여 임베딩을 저장하고 FAISS 벡터 스토어를 사용하여 검색하는 예제

In [3]:
# Install/upgrade the OpenAI integration package and the CPU build of FAISS
# that the following cells import.
%pip install --upgrade --quiet langchain-openai faiss-cpu

Note: you may need to restart the kernel to use updated packages.


In [6]:
from langchain.storage import LocalFileStore
from langchain_openai import OpenAIEmbeddings

# Underlying model that actually computes embeddings.
embedding = OpenAIEmbeddings(model="text-embedding-3-small")

# Byte store rooted at the local ./cache/ directory.
store = LocalFileStore("./cache/")

# Wrap the model with the cache. The model name is used as the namespace,
# so it is prepended to every cache key and vectors produced by different
# models do not collide inside the same store.
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=embedding,
    document_embedding_cache=store,
    namespace=embedding.model,
)

In [7]:
# Nothing has been embedded yet, so the store is expected to hold no keys.
[*store.yield_keys()]

[]

In [8]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter

raw_doc = TextLoader("./data/keywords.txt").load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_doc)

%time db = FAISS.from_documents(documents, cached_embedder)

CPU times: user 245 ms, sys: 18.7 ms, total: 264 ms
Wall time: 1.1 s


In [10]:
# Build a second FAISS index from the same documents, this time using the
# already-cached embeddings (compare the wall time with the first build).
%time db2 = FAISS.from_documents(documents, cached_embedder)

CPU times: user 4.14 ms, sys: 1.47 ms, total: 5.61 ms
Wall time: 4.56 ms


In [11]:
# Peek at the first five cache keys: each one is the namespace (the model
# name) immediately followed by a UUID-style identifier.
[*store.yield_keys()][:5]

['text-embedding-3-small74ae75af-9058-555e-aefa-082f0b4e0560',
 'text-embedding-3-small41e7391b-b68f-5e9f-bb07-3609bb83c3e2',
 'text-embedding-3-small0fd71f95-1342-512d-9d5b-3e3ab3c6bbe0',
 'text-embedding-3-small2112b0ec-6ade-59c9-b09c-755b33c3d32c',
 'text-embedding-3-smallcc824f84-d691-544f-9d9c-ca7e45470bb2']

### ByteStore 로 변경
* 비영구적인 InMemoryByteStore를 사용하여 동일한 캐시된 임베딩 객체를 생성하는 예시

In [17]:
from langchain.storage import InMemoryByteStore

# Replace the on-disk store with a non-persistent, in-memory byte store.
store = InMemoryByteStore()

# Same cache wrapper as before, now backed by the in-memory store; the model
# name is again used as the key namespace.
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=embedding,
    document_embedding_cache=store,
    namespace=embedding.model,
)

# A freshly created store should contain no keys.
[*store.yield_keys()]

[]

In [18]:
raw_doc = TextLoader("./data/keywords.txt").load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_doc)

%time db = FAISS.from_documents(documents, cached_embedder)

CPU times: user 76.5 ms, sys: 2.78 ms, total: 79.3 ms
Wall time: 949 ms


In [23]:
# Rebuild the index from the same documents; the embeddings are now expected
# to come from the in-memory cache (compare the wall time with the previous
# build).
%time db2 = FAISS.from_documents(documents, cached_embedder)

CPU times: user 3.67 ms, sys: 167 µs, total: 3.83 ms
Wall time: 3.77 ms


In [24]:
# Show the first five keys now held by the in-memory store; they carry the
# same model-name prefix as the keys produced with the file-based store.
[*store.yield_keys()][:5]

['text-embedding-3-small41e7391b-b68f-5e9f-bb07-3609bb83c3e2',
 'text-embedding-3-smallcc824f84-d691-544f-9d9c-ca7e45470bb2',
 'text-embedding-3-small0fd71f95-1342-512d-9d5b-3e3ab3c6bbe0',
 'text-embedding-3-small2112b0ec-6ade-59c9-b09c-755b33c3d32c',
 'text-embedding-3-small7494a7c8-3399-52a1-85ef-f4d0a563d31f']