### File Loader

- text loader
- pdf loader
- unstructured loader 등 다양함
  - unstructured loader는 확장자에 상관없이 사용 가능
  - NLTK 다운로드 중 SSL 인증서 오류(certificate verify failed)가 나면 참고: https://stackoverflow.com/questions/38916452/nltk-download-ssl-certificate-verify-failed


### Text splitters

- loader로 불러온 텍스트를 분할하기 위해 사용
- 필요한 부분만을 잘라내서 탐색하므로 더 효율적임


In [None]:
from langchain.document_loaders import UnstructuredFileLoader
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

# Chat model handle; created here but not used by the rest of this cell.
chat = ChatOpenAI(temperature=0.1)

#? chunk_size: size of each piece the document is split into.
#? chunk_overlap: to compensate for chunks that cut a sentence in half, the
#?   tail of one section is carried over and attached to the next section.
# Alternative tried earlier:
#   RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
# (a `length_function=len` override was also left disabled here).
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=600,
    separator="\n",
)

# Load the PDF and split it with the splitter above in a single call.
loader = UnstructuredFileLoader("./files/test.pdf")
docs = loader.load_and_split(text_splitter=splitter)


### Embedder

- 자료를 자료가 가진 속성에 따라 벡터로 표현


In [None]:
from langchain.embeddings import OpenAIEmbeddings

# Build the OpenAI embedding client with its default settings.
embedder = OpenAIEmbeddings()
# Embed one query string. This is left as a bare trailing expression so the
# notebook shows the returned value as the cell output.
embedder.embed_query("Hi")

In [None]:
from langchain.embeddings import OpenAIEmbeddings

embedder = OpenAIEmbeddings()

# Embed several texts of different lengths in a single call.
vector = embedder.embed_documents(
    ["Hi", "How", "are", "you", "You can embedd quiet long sentences"]
)

# Print the length of every returned embedding, one per line.
for embedding in vector:
    print(len(embedding))


### Vector Store

- Embed한 자료를 저장
- CacheBackedEmbeddings를 통해 caching 가능


In [None]:
from langchain.document_loaders import UnstructuredFileLoader
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore

# Chat model handle; not used further in this cell.
chat = ChatOpenAI(temperature=0.1)

# File store under ./.cache/ that backs the embedding cache below.
cache_dir = LocalFileStore("./.cache/")

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=600,
    separator="\n",
)

# Load the novel and split it into chunks with the splitter above.
loader = UnstructuredFileLoader("./files/1984.txt")
docs = loader.load_and_split(text_splitter=splitter)

#? Requests that are not cached yet are sent to `embeddings`;
#? requests that are already cached are read back from the cache.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# Index the chunks in Chroma, then run one similarity query against them.
vectorstore = Chroma.from_documents(docs, cached_embeddings)
results = vectorstore.similarity_search("Where does winston live?")
print(results)

### RetrievalQA

- stuff: 모든 문서를 프롬프트에 채워넣는 방식
- refine(정제): 제공된 문서마다 답변을 생성하고 문서를 순회하며 답변을 개선 -> 더 비쌈
- map reduce: document를 개별적으로 요약해서 LLM에게 전달
- map re-rank: 각 doc에 대해 답변을 생성하고 점수를 매겨서 가장 높은 점수를 반환


In [None]:
from langchain.document_loaders import UnstructuredFileLoader
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA

# Chat model that answers the question inside the RetrievalQA chain.
chat = ChatOpenAI(temperature=0.1)

# File store under ./.cache/ that backs the embedding cache below.
cache_dir = LocalFileStore("./.cache/")

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=600,
    separator="\n",
)

# Load the novel and split it into chunks with the splitter above.
loader = UnstructuredFileLoader("./files/1984.txt")
docs = loader.load_and_split(text_splitter=splitter)

# Wrap the OpenAI embeddings with the file-backed cache.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# Index the chunks in FAISS.
vectorstore = FAISS.from_documents(docs, cached_embeddings)

# "stuff" chain type: the retrieved documents are all placed into one prompt.
chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    llm=chat,
    retriever=vectorstore.as_retriever(),
)

result = chain.run("Describe Victory Mansions")
print(result)

### Stuff LCEL Chain


In [None]:
from langchain.document_loaders import UnstructuredFileLoader
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough

# Chat model that produces the final answer.
chat = ChatOpenAI(temperature=0.1)

# File store under ./.cache/ that backs the embedding cache below.
cache_dir = LocalFileStore("./.cache/")

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)

# Load the novel and split it into chunks with the splitter above.
loader = UnstructuredFileLoader("./files/1984.txt")
docs = loader.load_and_split(text_splitter=splitter)

# Wrap the OpenAI embeddings with the file-backed cache.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings, cache_dir
)

vectorstore = FAISS.from_documents(docs, cached_embeddings)

retriever = vectorstore.as_retriever()


def _format_docs(documents):
    """Join the text of the retrieved documents into one plain-text context.

    Without this step the prompt would receive the ``repr`` of the document
    list (metadata and escaped newlines included) instead of readable text.

    Args:
        documents: Iterable of retrieved documents exposing ``page_content``.

    Returns:
        The documents' ``page_content`` values separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in documents)


prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant. Answer questions using only the following context. If you don't know the answer just say you don't know, don't make it up:\n\n{context}"),
    ("human", "{question}"),
])

# The question is sent to the retriever and also passed through unchanged.
# Piping the retriever into the plain function relies on LCEL wrapping
# callables as runnables.
chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | chat
)
result = chain.invoke("Describe Victory Mansions")
print(result)

### Map Reduce LCEL Chain


In [15]:
from langchain.document_loaders import UnstructuredFileLoader
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

# Chat model shared by the per-document map step and the final answer step.
chat = ChatOpenAI(temperature=0.1)

# File store under ./.cache/ that backs the embedding cache below.
cache_dir = LocalFileStore("./.cache/")

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=600,
    separator="\n",
)

# Load the novel and split it into chunks with the splitter above.
loader = UnstructuredFileLoader("./files/1984.txt")
docs = loader.load_and_split(text_splitter=splitter)

# Wrap the OpenAI embeddings with the file-backed cache.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

vectorstore = FAISS.from_documents(docs, cached_embeddings)
retriever = vectorstore.as_retriever()

# Prompt for the map step: applied to one document portion at a time.
# NOTE(review): "relavant" is misspelled in the prompt text; it is kept as-is
# here so the text sent to the model does not change.
map_doc_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
    Use the following portion of long documents to see if any of the text is relavant to answer the question.
    Return any relavant text verbatim.
    -----
    {context}
    """,
        ),
        ("human", "{question}"),
    ]
)

map_doc_chain = map_doc_prompt | chat

def map_docs(inputs):
    """Run the map step over every document and merge the responses.

    Args:
        inputs: Mapping with a "documents" entry (iterable of objects exposing
            ``page_content``) and a "question" entry.

    Returns:
        The ``content`` of each ``map_doc_chain`` response, in document order,
        joined with blank lines.
    """
    retrieved = inputs["documents"]
    query = inputs["question"]

    extracts = []
    for document in retrieved:
        # One chain call per document; only the response's content is kept.
        response = map_doc_chain.invoke(
            {"context": document.page_content, "question": query}
        )
        extracts.append(response.content)

    return "\n\n".join(extracts)


# Map stage: retrieve documents for the question, then hand the documents and
# the unchanged question to map_docs.
map_chain = (
    {"documents": retriever, "question": RunnablePassthrough()}
    | RunnableLambda(map_docs)
)

# Reduce stage prompt: answers from the text produced by the map stage.
final_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
  Given the following extracted parts of a long document and a question, create final answer.
  If you don't know the answer, just say that you don't know.
  Don't try to make up an answer.
  ------
  {context}
  """,
        ),
        ("human", "{question}"),
    ]
)

# Full pipeline: map stage output becomes the context of the final prompt.
chain = (
    {"context": map_chain, "question": RunnablePassthrough()}
    | final_prompt
    | chat
)
result = chain.invoke("Where does Winston go to work?")
print(result)

content='Winston goes to work at the Ministry of Truth.'
