## RAG application with PDF, using FAISS as vector store

In [1]:
# [REQUEST]
#   Create basic RAG App with PDF locally and FAISS vector store, save and load vector store?
# [RESPONSE]
#   I'll provide a comprehensive example of creating a RAG application with a PDF,
#   using FAISS as the vector store, and demonstrating how to save and load the vector store.
#   Here's a step-by-step implementation:

# [Install required packages]
# pip install langchain-community langchain-openai faiss-cpu pypdf langchain-text-splitters python-dotenv

In [2]:
import os
import getpass
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

### Set up OpenAI API Key

In [3]:
# Set up OpenAI API Key
# if "OPENAI_API_KEY" not in os.environ:
#     os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter OpenAI API Key: ")

In [4]:
from dotenv import load_dotenv, find_dotenv

# Load environment variables (notably OPENAI_API_KEY) from the .env file
# located by find_dotenv(), if there is one.
_ = load_dotenv(find_dotenv())

# Fall back to an interactive prompt when no key was provided via the
# environment or .env, so the key is never hard-coded in the notebook and the
# failure does not surface later when the OpenAI clients are constructed.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter OpenAI API Key: ")

# Chat model name used by create_rag_chain().
MODEL_GPT="gpt-4o-mini"

### Functions

In [5]:
# 1. Load the PDF
def load_pdf(file_path):
    """Load a PDF through ``PyPDFLoader`` and return what the loader yields.

    Args:
        file_path: Location of the PDF; handed to ``PyPDFLoader`` unchanged.

    Returns:
        The result of ``PyPDFLoader(file_path).load()``.
    """
    return PyPDFLoader(file_path).load()

In [6]:
# 2. Split the documents
def split_documents(pages, chunk_size=1000, chunk_overlap=200):
    """Split loaded documents into smaller chunks.

    Args:
        pages: Documents to split (e.g. the output of ``load_pdf``).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 1000, the value previously hard-coded.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 200, the value previously hard-coded.

    Returns:
        The result of ``split_documents(pages)`` on the configured splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(pages)

In [7]:
# 3. Create Vector Store
def create_vector_store(splits, embeddings):
    """Build a FAISS vector store from document chunks.

    Args:
        splits: Document chunks to index (e.g. the output of
            ``split_documents``). Must not be empty.
        embeddings: Embeddings object passed to ``FAISS.from_documents``.

    Returns:
        The store returned by ``FAISS.from_documents(splits, embeddings)``.

    Raises:
        ValueError: If ``splits`` is empty.
    """
    # An empty chunk list (e.g. a PDF with no extractable text) would otherwise
    # fail inside FAISS index construction with an unhelpful error, so reject
    # it here with a message that points at the likely cause.
    if not splits:
        raise ValueError(
            "No document chunks to index: `splits` is empty. "
            "Check that the source document contains extractable text."
        )
    return FAISS.from_documents(splits, embeddings)

In [8]:
# 4. Save Vector Store
def save_vector_store(vector_store, path):
    """Persist ``vector_store`` at ``path`` and print a confirmation.

    Args:
        vector_store: Object exposing ``save_local(path)``.
        path: Destination passed to ``save_local`` unchanged.

    Returns:
        None.
    """
    vector_store.save_local(path)
    confirmation = f"Vector store saved to {path}"
    print(confirmation)

In [9]:
# 5. Load Vector Store
def load_vector_store(path, embeddings):
    """Load a previously saved FAISS vector store from ``path``.

    Args:
        path: Location the store was saved to (see ``save_vector_store``).
        embeddings: Embeddings object passed to ``FAISS.load_local``.

    Returns:
        The store returned by ``FAISS.load_local``.
    """
    # SECURITY: allow_dangerous_deserialization=True opts in to loading that
    # LangChain itself marks as dangerous. Only load indexes that you created
    # yourself; never point this at files from an untrusted source.
    return FAISS.load_local(
        path,
        embeddings,
        allow_dangerous_deserialization=True,
    )

In [10]:
# 6. Create RAG Chain
def create_rag_chain(vector_store, k=3, model=None):
    """Build a retrieval-augmented question-answering chain.

    The chain takes a question string, retrieves ``k`` documents from
    ``vector_store``, joins their text into the prompt's context, asks the
    chat model, and returns the answer as a plain string.

    Args:
        vector_store: Store exposing ``as_retriever(search_kwargs=...)``.
        k: Number of documents to retrieve per question. Defaults to 3, the
            value previously hard-coded.
        model: Chat model name. Defaults to the module-level ``MODEL_GPT``,
            looked up at call time.

    Returns:
        A runnable chain; call ``.invoke(question)`` to get the answer.
    """

    def format_docs(docs):
        # Pass only the chunk text to the prompt. Without this step the prompt
        # receives the repr of a list of Document objects, metadata included.
        return "\n\n".join(doc.page_content for doc in docs)

    # Retriever
    retriever = vector_store.as_retriever(search_kwargs={"k": k})

    # Prompt Template
    template = """Answer the question based only on the following context:
    {context}
    
    Question: {question}
    """
    prompt = ChatPromptTemplate.from_template(template)

    # LLM (temperature=0 keeps answers as repeatable as the model allows)
    llm = ChatOpenAI(model=model if model is not None else MODEL_GPT, temperature=0)

    # RAG Chain: the input question feeds both the retriever (for context)
    # and the prompt's {question} slot.
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

    return rag_chain

### Path to your PDF

In [11]:
# Relative path: resolved against the notebook's current working directory.
pdf_path = "../../data/Be_Good.pdf"

### Embeddings

In [12]:
# Created once with default settings; the same object is passed both when
# building the vector store and when loading it back from disk below.
embeddings = OpenAIEmbeddings()

### Load and split PDF

In [13]:
# Read the PDF, then break the loaded documents into chunks for indexing.
pages = load_pdf(pdf_path)
splits = split_documents(pages)

In [14]:
# Sanity check: show the container types produced by loading and splitting.
print(type(pages))
print(type(splits))

<class 'list'>
<class 'list'>


### Create Vector Store

In [15]:
# Build the FAISS store from the chunks using the shared embeddings object.
vector_store = create_vector_store(splits, embeddings)

In [16]:
# Sanity check: confirm the type of the store that was just built.
print(type(vector_store))

<class 'langchain_community.vectorstores.faiss.FAISS'>


### Save Vector Store

In [17]:
# Persist the store under the relative path "faiss_index" so it can be reloaded.
save_vector_store(vector_store, "faiss_index")

Vector store saved to faiss_index


### Load Vector Store (simulating a new session)

In [18]:
# Reload from the same path used when saving, with the same embeddings object.
loaded_vector_store = load_vector_store("faiss_index", embeddings)

In [19]:
# Sanity check: the reloaded object should be the same kind of store.
print(type(loaded_vector_store))

<class 'langchain_community.vectorstores.faiss.FAISS'>


### Create RAG Chain

In [20]:
# Build the chain on the reloaded store (not the in-memory one) so the
# save/load round trip is actually exercised.
rag_chain = create_rag_chain(loaded_vector_store)

In [21]:
# Sanity check: show the type of the composed chain.
print(type(rag_chain))

<class 'langchain_core.runnables.base.RunnableSequence'>


### Example Query

In [22]:
# Run one question through the chain; the type prints are a sanity check
# that both the input and the parsed output are plain strings.
query = "What is the main topic of this document?"
response = rag_chain.invoke(query)
print(type(query))
print(type(response))

<class 'str'>
<class 'str'>


In [23]:
# Show the question next to the model's answer.
print("\nQuery:", query)
print("\nResponse:", response)


Query: What is the main topic of this document?

Response: The main topic of the document is about the principles of creating successful startups, emphasizing the importance of making something that people want and prioritizing user satisfaction over immediate business models or profits. It discusses the idea that focusing on users can guide decision-making and lead to success.


## Standalone script version (commented out): [chatbot-rag-pdf-faiss.py]

In [None]:
# import os
# import getpass
# from langchain_community.document_loaders import PyPDFLoader
# from langchain_text_splitters import RecursiveCharacterTextSplitter
# from langchain_openai import OpenAIEmbeddings, ChatOpenAI
# from langchain_community.vectorstores import FAISS
# from langchain_core.prompts import ChatPromptTemplate
# from langchain_core.runnables import RunnablePassthrough
# from langchain_core.output_parsers import StrOutputParser

# # Set up OpenAI API Key
# if "OPENAI_API_KEY" not in os.environ:
#     os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter OpenAI API Key: ")

# # 1. Load the PDF
# def load_pdf(file_path):
#     loader = PyPDFLoader(file_path)
#     pages = loader.load()
#     return pages

# # 2. Split the documents
# def split_documents(pages):
#     text_splitter = RecursiveCharacterTextSplitter(
#         chunk_size=1000, 
#         chunk_overlap=200
#     )
#     splits = text_splitter.split_documents(pages)
#     return splits

# # 3. Create Vector Store
# def create_vector_store(splits, embeddings):
#     vector_store = FAISS.from_documents(splits, embeddings)
#     return vector_store

# # 4. Save Vector Store
# def save_vector_store(vector_store, path):
#     vector_store.save_local(path)
#     print(f"Vector store saved to {path}")

# # 5. Load Vector Store
# def load_vector_store(path, embeddings):
#     loaded_store = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
#     return loaded_store

# # 6. Create RAG Chain
# def create_rag_chain(vector_store):
#     # Retriever
#     retriever = vector_store.as_retriever(search_kwargs={"k": 3})
    
#     # Prompt Template
#     template = """Answer the question based only on the following context:
#     {context}
    
#     Question: {question}
#     """
#     prompt = ChatPromptTemplate.from_template(template)
    
#     # LLM
#     llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
    
#     # RAG Chain
#     rag_chain = (
#         {"context": retriever, "question": RunnablePassthrough()}
#         | prompt
#         | llm
#         | StrOutputParser()
#     )
    
#     return rag_chain

# # Main Execution
# def main():
#     # Path to your PDF
#     pdf_path = "path/to/your/document.pdf"
    
#     # Embeddings
#     embeddings = OpenAIEmbeddings()
    
#     # Load and split PDF
#     pages = load_pdf(pdf_path)
#     splits = split_documents(pages)
    
#     # Create Vector Store
#     vector_store = create_vector_store(splits, embeddings)
    
#     # Save Vector Store
#     save_vector_store(vector_store, "faiss_index")
    
#     # Load Vector Store (simulating a new session)
#     loaded_vector_store = load_vector_store("faiss_index", embeddings)
    
#     # Create RAG Chain
#     rag_chain = create_rag_chain(loaded_vector_store)
    
#     # Example Query
#     query = "What is the main topic of this document?"
#     response = rag_chain.invoke(query)
#     print("\nQuery:", query)
#     print("\nResponse:", response)

# # Run the main function
# if __name__ == "__main__":
#     main()
