### 1. Install Libraries

In [None]:
# %pip install langchain langchain-openai tiktoken langchain-community faiss-cpu python-dotenv

### 2. Store Data Locally
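Store your data where the notebook can read it. In this example, the text file is saved in the same directory as the notebook, and the loader in step 4 reads `./Crime_and_Punishment_Dostoyevsky.txt`. You can use curl or wget (see the commented commands below), or download it using your browser. If you prefer to keep data in a subfolder such as `input_data`, update the output path of the download command and the path passed to the loader accordingly.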

In [32]:
# !curl "https://www.gutenberg.org/ebooks/2554.txt.utf-8" -o "Crime_and_Punishment_Dostoyevsky.txt"
# !wget -O "Crime_and_Punishment_Dostoyevsky.txt" "https://www.gutenberg.org/ebooks/2554.txt.utf-8"

### 3. Load OpenAI API Key
We will use OpenAI for embedding and text generation. If you don't have an API key, sign up and create a secret key on [this page](https://platform.openai.com/api-keys). Save it in a `.env` file as `OPENAI_API_KEY=<your key>`; the next cell loads that file into the environment.

In [1]:
from pathlib import Path

from dotenv import load_dotenv

# Read environment variables (e.g. the OpenAI API key) from a `.env` file in the
# current user's home directory. Building the path from Path.home() avoids
# hard-coding one machine's user folder, so the notebook runs unchanged elsewhere.
load_dotenv(Path.home() / ".env")

True

### 4. Load Data

In [2]:
# TextLoader is provided by the langchain-community package; the
# `langchain.document_loaders` path is only a deprecated alias for it.
from langchain_community.document_loaders import TextLoader

In [4]:
# Read the whole novel from the working directory as UTF-8 text.
loader = TextLoader(
    file_path="./Crime_and_Punishment_Dostoyevsky.txt",
    encoding="utf-8",
)
document = loader.load()

### 5. Split Data into Chunks

In [5]:
# The splitters live in the standalone langchain-text-splitters package (installed
# as a dependency of langchain); `langchain.text_splitter` is a legacy alias.
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [6]:
# Cut the text into chunks of at most 1000 characters; consecutive chunks share up
# to 500 characters so passages that straddle a boundary remain retrievable.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=500,
    chunk_size=1000,
)
text_chunks = text_splitter.split_documents(documents=document)

In [7]:
# Number of chunks produced by the splitter.
len(text_chunks)

1945

### 6. Store Data in a Vector Database

In [8]:
# Embedding client used to turn text chunks into vectors; default settings.
from langchain_openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings()

In [9]:
# The FAISS wrapper is provided by the langchain-community package; the
# `langchain.vectorstores` path is only a deprecated alias for it.
from langchain_community.vectorstores import FAISS

In [10]:
# Embed every chunk and build a FAISS index over the resulting vectors.
vector_store = FAISS.from_documents(
    documents=text_chunks,
    embedding=embedding,
)

### 7. Create RAG App

In [11]:
# Expose the vector store through the retriever interface, using its default search settings.
retriever = vector_store.as_retriever()

In [12]:
# Prompt, runnable and parser primitives are defined in langchain-core; the
# `langchain.prompts` / `langchain.schema.*` paths are legacy re-exports.
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai.chat_models import ChatOpenAI

In [13]:
# Prompt text with two placeholders, {question} and {context}, that are filled in
# when the chain runs. It tells the model to answer only from the supplied context.
template="""As an assistant for answering questions related to the documents, 
use the following pieces of retrieved context to answer the question.
If the question cannot be answered based on the context, just say that you don't know.
Use ten sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""

In [14]:
# Building blocks of the chain: prompt template -> chat model -> plain-string parser.
parser = StrOutputParser()
chat_model = ChatOpenAI(model="gpt-3.5-turbo")
prompt = ChatPromptTemplate.from_template(template=template)

### Alternative Way

In [17]:
# Query the retriever directly to inspect which chunks it returns for a question.
retrieved_docs = retriever.invoke("Where does Raskolnikov live?")
print(len(retrieved_docs), retrieved_docs, sep="\n")

4
[Document(id='1c435b09-2834-471c-b623-785aa881ed96', metadata={'source': './input_data/Crime_and_Punishment_Dostoyevsky.txt'}, page_content='Raskolnikov had long ceased to listen. Reaching the house where he\nlived, he nodded to Lebeziatnikov and went in at the gate. Lebeziatnikov\nwoke up with a start, looked about him and hurried on.\n\nRaskolnikov went into his little room and stood still in the middle\nof it. Why had he come back here? He looked at the yellow and tattered\npaper, at the dust, at his sofa.... From the yard came a loud continuous\nknocking; someone seemed to be hammering... He went to the window, rose\non tiptoe and looked out into the yard for a long time with an air of\nabsorbed attention. But the yard was empty and he could not see who was\nhammering. In the house on the left he saw some open windows; on the\nwindow-sills were pots of sickly-looking geraniums. Linen was hung out\nof the windows... He knew it all by heart. He turned away and sat down\non the sofa

In [16]:
# Example chat-style message list made of (role, content) tuples.
# The system text is assembled from adjacent string literals with explicit "\n":
# a plain "..." literal cannot span several source lines (that is a SyntaxError).
messages = [
    (
        "system",
        "As an assistant for answering questions related to the documents, \n"
        "use the following pieces of retrieved context to answer the question.\n"
        "If the question cannot be answered based on the context, "
        "just say that you don't know.",
    ),
    ("human", "I love programming."),
]

4

In [19]:
# Show the compiled prompt object, including its input variables and message template.
print(prompt)

input_variables=['context', 'question'] input_types={} partial_variables={} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="As an assistant for answering questions related to the documents, \nuse the following pieces of retrieved context to answer the question.\nIf the question cannot be answered based on the context, just say that you don't know.\nUse ten sentences maximum and keep the answer concise.\nQuestion: {question}\nContext: {context}\nAnswer:\n"), additional_kwargs={})]


In [15]:
def format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Args:
        docs: Iterable of retrieved documents exposing a ``page_content`` attribute.

    Returns:
        The documents' text, separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The retriever yields Document objects. Interpolating that list straight into the
# prompt pastes each object's repr (id, metadata, escaped newlines) into {context}.
# Piping the retriever through format_docs sends only the passage text to the model.
# The question itself is passed through unchanged.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | chat_model
    | parser
)

### 8. Testing

In [16]:
# Ask a question about the novel; the chain returns the answer as a plain string.
rag_chain.invoke("Where does Raskolnikov live?")

"Raskolnikov lives in flat Number 14 in Shil's house, not far from where he interacts with Pyotr Petrovitch and Razumihin. The lodging for the family he interacts with is in Bakaleyev's house in Voskresensky, which is nearby. The area is described as 'disgusting', 'filthy', and 'stinking'. Raskolnikov is stressed and anxious, feeling fearfully alone in his little room. He is known by the porter who works in the building where he lives. Raskolnikov is also seen looking intently into the darkening street, showing some disorientation or unease. There are queer people living near Raskolnikov's residence. Despite the challenging environment, Raskolnikov seems associated with the police station. The separation from Raskolnikov causes emotional distress and weeping among them at the thought of parting."

In [23]:
# Ask something the novel cannot answer, to check the prompt's "say that you don't know" instruction.
rag_chain.invoke("When was MLK born?")

'Sorry, based on the provided context, I cannot determine when MLK was born.'