### File Loader

- text loader
- pdf loader
- unstructured loader 등 다양함
  - unstructured loader의 경우 확장자에 상관없이 사용 가능
  - NLTK 다운로드 중 SSL 인증서 오류(CERTIFICATE_VERIFY_FAILED)가 발생하면 참고: https://stackoverflow.com/questions/38916452/nltk-download-ssl-certificate-verify-failed


### Text splitters

- loader로 불러온 텍스트를 분할하기 위해 사용
- 필요한 부분만을 잘라내서 탐색하므로 더 효율적임


In [None]:
from langchain.document_loaders import UnstructuredFileLoader
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

# Chat model handle; created here but not used by the loading/splitting
# steps in this cell.
chat = ChatOpenAI(temperature=0.1)

# Splitter settings:
#   chunk_size    - size of the pieces the document is cut into
#   chunk_overlap - trailing part of one chunk that is repeated at the start
#                   of the next, to compensate for sentences that would
#                   otherwise be cut off at a chunk boundary
# An alternative that was tried earlier and left disabled:
#   RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=600,
    separator="\n",
)

# Load the PDF and split it with the splitter configured above.
loader = UnstructuredFileLoader("./files/test.pdf")
docs = loader.load_and_split(text_splitter=splitter)


### Embedder

- 자료를 자료가 가진 속성에 따라 벡터로 표현


In [None]:
from langchain.embeddings import OpenAIEmbeddings

# Create the OpenAI embedding client with its default settings.
embedder = OpenAIEmbeddings()
# Embed a single query string. The call is left as the cell's final bare
# expression so the notebook displays the returned value.
embedder.embed_query("Hi")

In [None]:
from langchain.embeddings import OpenAIEmbeddings

# Embed several texts of different lengths in one call and print the length
# of each returned vector.
embedder = OpenAIEmbeddings()

sample_texts = [
    "Hi",
    "How",
    "are",
    "you",
    "You can embedd quiet long sentences",
]
vector = embedder.embed_documents(sample_texts)

# One line of output per input text: the dimensionality of its embedding.
for embedding in vector:
    print(len(embedding))


### Vector Store

- Embed한 자료를 저장
- CacheBackedEmbeddings를 통해 caching 가능


In [None]:
from langchain.document_loaders import UnstructuredFileLoader
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore

# Chat model handle; created here but not used by the indexing/search steps
# in this cell.
chat = ChatOpenAI(temperature=0.1)

# On-disk store that backs the embedding cache.
cache_dir = LocalFileStore("./.cache/")

# Same splitter configuration as the earlier loading cell.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=600,
    separator="\n",
)

# Load the text file and split it into chunks.
loader = UnstructuredFileLoader("./files/1984.txt")
docs = loader.load_and_split(text_splitter=splitter)

# Wrap the OpenAI embedder with the cache: requests that are not cached yet
# go to `embeddings`, and requests that are already cached are read back
# from `cache_dir`.
# NOTE(review): no namespace is passed to from_bytes_store -- confirm whether
# this cache directory is ever shared between different embedding models.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,
    cache_dir,
)

# Index the chunks in Chroma, then run a similarity search and print the hits.
vectorstore = Chroma.from_documents(docs, cached_embeddings)
query = "Where does winston live?"
results = vectorstore.similarity_search(query)
print(results)
