In [1]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

# Chat model shared by the per-document "map" step and the final answer step.
llm = ChatOpenAI(
    temperature=0.1,
)

# On-disk byte store used to persist computed embeddings between runs.
cache_dir = LocalFileStore("./.cache/")

# Newline-separated chunks; chunk_size / chunk_overlap are given to the
# tiktoken-based length function created by from_tiktoken_encoder.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)
loader = UnstructuredFileLoader("./files/chapter_one.txt")

# Load the source file and split it into chunked documents in one step.
docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

# Namespace the cache by embedding model: without it, cache keys depend on the
# text alone, so vectors produced by one model would be reused for another
# model that embeds the same text.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,
    cache_dir,
    namespace=embeddings.model,
)

# Index the chunks in FAISS, embedding through the cache-backed wrapper.
vectorstore = FAISS.from_documents(docs, cached_embeddings)

retriever = vectorstore.as_retriever()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/newid/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [2]:

# "Map" step: the system message carries one document portion in {context} and
# asks for the relevant text verbatim; the human message is the raw question.
map_doc_system_message = (
    "system",
    """
            Use the following portion of a long document to see if any of the text is relevant to answer the question. Return any relevant text verbatim. If there is no relevant text, return : ''
            -------
            {context}
            """,
)
map_doc_human_message = ("human", "{question}")

map_doc_prompt = ChatPromptTemplate.from_messages(
    [map_doc_system_message, map_doc_human_message]
)

# Feed the filled-in prompt straight into the chat model.
map_doc_chain = map_doc_prompt | llm


def map_docs(inputs):
    """Run the map step over every retrieved document and merge the results.

    Args:
        inputs: Mapping with two keys:
            "documents": iterable of objects exposing ``page_content``.
            "question": the question passed to ``map_doc_chain`` with each
                document.

    Returns:
        str: The ``content`` of each ``map_doc_chain`` response, in the same
        order as the documents, joined by a blank line. Empty string when
        there are no documents.
    """
    documents = inputs["documents"]
    question = inputs["question"]
    # One batch() call instead of a blocking invoke() per document: the
    # per-document requests are independent, and batch() returns its outputs
    # in input order, so the joined text is unchanged.
    responses = map_doc_chain.batch(
        [
            {"context": doc.page_content, "question": question}
            for doc in documents
        ]
    )
    return "\n\n".join(response.content for response in responses)


# The incoming question goes to the retriever (filling "documents") and is
# also passed through unchanged (filling "question"); map_docs consumes both.
map_chain = (
    {"documents": retriever, "question": RunnablePassthrough()}
    | RunnableLambda(map_docs)
)

# "Reduce" step: the system message receives the text produced by map_chain in
# {context}; the human message is the original question.
final_system_message = (
    "system",
    """
            Given the following extracted parts of a long document and a question, create a final answer. 
            If you don't know the answer, just say that you don't know. Don't try to make up an answer.
            ------
            {context}
            """,
)
final_human_message = ("human", "{question}")

final_prompt = ChatPromptTemplate.from_messages(
    [final_system_message, final_human_message]
)

# Full pipeline: build the context with map_chain, keep the question as-is,
# fill the final prompt, and ask the chat model.
chain = (
    {"context": map_chain, "question": RunnablePassthrough()}
    | final_prompt
    | llm
)

In [3]:
# Counting question, sent through the full chain.
ministries_question = "How many ministries are mentioned"
chain.invoke(ministries_question)

AIMessage(content='Three ministries are mentioned.')

In [4]:
# Open-ended descriptive question, sent through the full chain.
victory_mansions_question = "Describe Victory Mansions"
chain.invoke(victory_mansions_question)

AIMessage(content='Victory Mansions is a dilapidated and run-down apartment complex where Winston resides in the novel "1984" by George Orwell. The building is in disrepair, with shabby living conditions and a general sense of neglect. It has glass doors, a hallway that smells of boiled cabbage and old rag mats, and a poster of an enormous face with the caption "BIG BROTHER IS WATCHING YOU." Winston\'s small and cramped apartment is seven flights up, with a faulty lift due to power cuts during daylight hours. The building also features a telescreen, a surveillance device that constantly broadcasts information. From Victory Mansions, one can see all four Ministries of Oceania, although the building is dwarfed by their grandeur. Winston\'s apartment has an unusual layout with the telescreen placed on the longer wall opposite the window, and an alcove where he can sit out of its sight. The presence of a unique old book in the room inspires Winston to take a particular action, despite Part

In [5]:
# Single-fact question, sent through the full chain.
workplace_question = "Where does Winston go to work?"
chain.invoke(workplace_question)

AIMessage(content='Winston goes to work at the Ministry of Truth, which is described as his place of work in the text.')