## Load and Transform

In [1]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Recursive splitter: chunks of up to 200 with an overlap of 50 between neighbours.
spliter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)

In [2]:
# Load the .docx and split it with the current `spliter`; the bare last
# expression displays the resulting list of Document chunks.
loader = UnstructuredFileLoader("../files/example_1984.docx")

loader.load_and_split(text_splitter=spliter)

[Document(page_content='Part 1, Chapter 1\n\nPart One', metadata={'source': '../files/example_1984.docx'}),
 Document(page_content='1', metadata={'source': '../files/example_1984.docx'}),
 Document(page_content='It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors', metadata={'source': '../files/example_1984.docx'}),
 Document(page_content='wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.', metadata={'source': '../files/example_1984.docx'}),
 Document(page_content='The hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a', metadata={'source': '../files/example_1984.docx'}),
 Document(page_content=

In [3]:
from langchain.text_splitter import CharacterTextSplitter

# Split on newlines only. Text between two newlines that exceeds chunk_size is
# kept whole, which is what triggers "longer than the specified" warnings.
spliter = CharacterTextSplitter(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)

In [4]:
# Count the chunks produced by the newline-based splitter.
len(loader.load_and_split(text_splitter=spliter))

Created a chunk of size 963, which is longer than the specified 600
Created a chunk of size 774, which is longer than the specified 600
Created a chunk of size 954, which is longer than the specified 600
Created a chunk of size 922, which is longer than the specified 600
Created a chunk of size 1168, which is longer than the specified 600
Created a chunk of size 821, which is longer than the specified 600
Created a chunk of size 700, which is longer than the specified 600
Created a chunk of size 745, which is longer than the specified 600
Created a chunk of size 735, which is longer than the specified 600
Created a chunk of size 1110, which is longer than the specified 600
Created a chunk of size 991, which is longer than the specified 600
Created a chunk of size 990, which is longer than the specified 600
Created a chunk of size 1741, which is longer than the specified 600
Created a chunk of size 2001, which is longer than the specified 600
Created a chunk of size 1900, which is longe

39

## Embedding Using Cache

In [5]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader

from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore

In [6]:
# Same source file; the splitter is built via from_tiktoken_encoder this time.
loader = UnstructuredFileLoader("../files/example_1984.docx")
spliter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n", chunk_size=600, chunk_overlap=100
)

# Chunks that are embedded and indexed further down.
docs = loader.load_and_split(text_splitter=spliter)



In [7]:
# Embedding model client, constructed with no arguments (library defaults).
embeddings = OpenAIEmbeddings()


In [8]:
# File-backed store rooted at ../.cache/, used as the document-embedding cache.
cache_dir = LocalFileStore('../.cache/')

In [9]:
# Wrap the embedding model so document embeddings go through the cache_dir store.
cache_embeddings = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=embeddings,
    document_embedding_cache=cache_dir,
)

In [10]:
# Build the vector store from the chunks using the cache-backed embeddings.
vectorstore = Chroma.from_documents(docs, cache_embeddings) # Chroma / FAISS

In [11]:
# Query the vector store directly, without an LLM in the loop.
results = vectorstore.similarity_search("where does winston live")

In [12]:
# Inspect what the similarity search returned.
results

[Document(page_content="The Ministry of Love was the really frightening one. There were no windows in it at all. Winston had never been inside the Ministry of Love, nor within half a kilometre of it. It was a place impossible to enter except on official business, and then only by penetrating through a maze of barbed-wire entanglements, steel doors, and hidden machine-gun nests. Even the streets leading up to its outer barriers were roamed by gorilla-faced guards in black uniforms, armed with jointed truncheons.\nWinston turned round abruptly. He had set his features into the expression of quiet optimism which it was advisable to wear when facing the telescreen. He crossed the room into the tiny kitchen. By leaving the Ministry at this time of day he had sacrificed his lunch in the canteen, and he was aware that there was no food in the kitchen except a hunk of dark-coloured bread which had got to be saved for tomorrow's breakfast. He took down from the shelf a bottle of colourless liqu

## RetrievalQA

In [9]:
from langchain.chains import RetrievalQA

In [16]:
# Chat model for the RetrievalQA chain, with temperature set to 0.1.
llm = ChatOpenAI(temperature=0.1)

In [14]:
# chain_type is "refine" here; "stuff", "map_rerank" and "map_reduce" are the
# other values noted for this argument.
chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="refine", retriever=vectorstore.as_retriever()
)

In [12]:
# Ask the RetrievalQA chain a factual question about the text.
chain.run("Where does Winston live?")

'Winston lives in Victory Mansions, specifically on the seventh floor of the building.'

In [15]:
# A broader, descriptive question for the same chain.
chain.run("Describe Victory Mansions")

'Victory Mansions is a run-down apartment building where Winston Smith, the protagonist of George Orwell\'s novel "1984," lives. The building is located in London, chief city of Airstrip One, which is the third most populous province of Oceania. Victory Mansions is described as dilapidated, with broken elevators, flickering lights, and a general sense of decay. The apartments are small and cramped, lacking basic amenities. The atmosphere in Victory Mansions is bleak and oppressive, reflecting the overall dystopian society depicted in the novel. The building\'s conditions, along with the constant surveillance and propaganda, contribute to the sense of fear and control that permeates the residents\' lives in Victory Mansions. The building is overshadowed by the Ministry of Truth, a massive pyramidal structure of glittering white concrete, which is one of the four Ministries that govern Oceania. In this environment, Winston finds solace in a hidden alcove where he can briefly escape the w

## Retrieval Implementation

In [13]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough

In [17]:
# Chat model for the hand-built retrieval chain, with temperature set to 0.1.
llm = ChatOpenAI(temperature=0.1)

In [18]:
# Retriever over the vector store; its output fills {context} in the prompt.
retriever = vectorstore.as_retriever()

# The system message restricts the model to the retrieved context; the human
# message carries the question unchanged.
# Fixed the typo "It you don't know" -> "If you don't know" in the instruction.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant. Answer questions using only the following context. If you don't know the answer just say you don't know, don't make it up:\n\n{context}"),
    ("human", "{question}")
])

# The input string goes to the retriever (-> context) and is also passed
# through untouched (-> question).
chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | llm

chain.invoke("Describe Victory Mansions")

AIMessage(content='Victory Mansions is a building with glass doors that Winston Smith enters. The hallway smells of boiled cabbage and old rag mats. There is a large colored poster of an enormous face with the caption "BIG BROTHER IS WATCHING YOU" at one end of the hallway. The building has seven floors, and the flat Winston lives in is on the seventh floor. The building has a faulty lift and the electric current is cut off during daylight hours as part of an economy drive in preparation for Hate Week.')

### Map Reduce Retriever

#### Documents 수가 많을 때 사용할 수 있음

In [14]:
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

In [19]:
# Chat model shared by the map step and the final answer step.
llm = ChatOpenAI(temperature=0.1)

In [20]:
# Map-step prompt: for one document portion, ask the model to return verbatim
# any text relevant to the question.
map_doc_prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        """
        Use the following portion of a long document to see if any of the text is relevant to answer the question. Return any relevant text verbatim
        -----
        {context}
        """,
    ),
    ("human", "{question}"),
])

# Prompt piped into the chat model.
map_doc_chain = map_doc_prompt | llm


In [28]:
def map_docs(inputs):
    """Run the map step over every retrieved document and merge the outputs.

    Args:
        inputs: Mapping with "documents" (iterable of objects exposing
            `page_content`) and "question" (forwarded to the map prompt as-is).

    Returns:
        The `.content` of each `map_doc_chain` response, joined by blank lines.
    """
    documents = inputs['documents']
    question = inputs['question']
    return "\n\n".join(
        map_doc_chain.invoke(
            # The document text lives in `page_content`; the previous
            # `page_context` attribute does not exist and raised AttributeError.
            {"context": doc.page_content, "question": question}
        ).content
        for doc in documents
    )

In [None]:
def map_docs(inputs):
    """Extract question-relevant text from each document and merge it.

    Reads "documents" and "question" from `inputs`, invokes `map_doc_chain`
    once per document, and returns the response contents joined by blank lines.
    """
    documents = inputs['documents']
    question = inputs['question']
    extracted = [
        map_doc_chain.invoke(
            {"context": doc.page_content, "question": question}
        ).content
        for doc in documents
    ]
    return "\n\n".join(extracted)

In [29]:
# Retrieve documents for the question, then hand {"documents", "question"} to map_docs.
map_chain = {"documents": retriever, "question": RunnablePassthrough()} | RunnableLambda(map_docs)

In [30]:

# Reduce-step prompt: answer from the extracts produced by the map step.
# Fixed the misspelling "documnet" -> "document" in the instruction.
final_prompt = ChatPromptTemplate.from_messages([
    ("system", """
     Given the following extracted parts of a long document and a question, create a final answer. 
     If you don't know the answer, just say that you don't know. Don't try to make up an answer.
     -------
     {context}
    """),
    ("human", "{question}")
])

# The input string feeds map_chain (-> context) and is also passed through
# untouched (-> question).
chain = {"context": map_chain, "question": RunnablePassthrough()} | final_prompt | llm

In [24]:
# Run the hand-built map-reduce chain end to end.
chain.invoke( "Describe Victory Mansions")

{'documents': [Document(page_content="The Ministry of Love was the really frightening one. There were no windows in it at all. Winston had never been inside the Ministry of Love, nor within half a kilometre of it. It was a place impossible to enter except on official business, and then only by penetrating through a maze of barbed-wire entanglements, steel doors, and hidden machine-gun nests. Even the streets leading up to its outer barriers were roamed by gorilla-faced guards in black uniforms, armed with jointed truncheons.\nWinston turned round abruptly. He had set his features into the expression of quiet optimism which it was advisable to wear when facing the telescreen. He crossed the room into the tiny kitchen. By leaving the Ministry at this time of day he had sacrificed his lunch in the canteen, and he was aware that there was no food in the kitchen except a hunk of dark-coloured bread which had got to be saved for tomorrow's breakfast. He took down from the shelf a bottle of c

AIMessage(content="I don't know.")