# Document Q/A RAG System

In [None]:
from google.colab import userdata
import os

# Copy each Colab secret into an environment variable of the same name.
# NOTE(review): no visible cell reads HUGGINGFACEHUB_ACCESS_TOKEN; confirm a
# library actually consumes this variable name (huggingface_hub documents HF_TOKEN).
for secret_name in ("GOOGLE_API_KEY", "HUGGINGFACEHUB_ACCESS_TOKEN"):
    os.environ[secret_name] = userdata.get(secret_name)

In [None]:
!pip -q install langchain langchain-google-genai langchain-community faiss-cpu tiktoken python-dotenv pypdf langchain-huggingface

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings,ChatGoogleGenerativeAI,GoogleGenerativeAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings

# Testing

In [None]:
# Smoke test: build a Gemini chat client and send one short message to confirm
# that a model call succeeds before the rest of the pipeline is built.
chat_model = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
chat_model.invoke("HI")

AIMessage(content='Hi there! How can I help you today?', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-1.5-flash', 'safety_ratings': []}, id='run--ad9b1598-224a-4a68-9283-960d3d64d9cb-0', usage_metadata={'input_tokens': 1, 'output_tokens': 11, 'total_tokens': 12, 'input_token_details': {'cache_read': 0}})

# Step 1a - Indexing (Document Ingestion)

In [None]:
# Load the source PDF into Document objects.
# The absolute /content/... path is presumably the Colab working directory
# (google.colab is imported at the top) — the file must exist there before this runs.
loader = PyPDFLoader("/content/Docker Deep Dive.pdf")
docs = loader.load()

In [None]:
# Number of Document objects returned by loader.load().
len(docs)

280

In [None]:
# Inspect the first loaded Document (metadata plus page_content).
# In the recorded output this first page has an empty page_content.
docs[0]

Document(metadata={'producer': 'XeTeX 0.99998', 'creator': 'LaTeX with hyperref package', 'creationdate': '2024-05-21T09:09:33+00:00', 'title': 'Docker Deep Dive', 'author': 'Nigel Poulton', 'source': '/content/Docker Deep Dive.pdf', 'total_pages': 280, 'page': 0, 'page_label': 'i'}, page_content='')

# Step 1b - Indexing (Text Splitting)

In [None]:
# Split the loaded documents into smaller chunks; these chunks are what gets
# embedded and indexed further down.
# NOTE(review): chunk_size / chunk_overlap are presumably measured in characters
# (the splitter's default length function) — confirm.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = splitter.split_documents(docs)

In [None]:
# Number of chunks produced by the splitter.
len(chunks)

589

In [None]:
# Spot-check one chunk (index 200) to see what the split text and its metadata look like.
chunks[200]

Document(metadata={'producer': 'XeTeX 0.99998', 'creator': 'LaTeX with hyperref package', 'creationdate': '2024-05-21T09:09:33+00:00', 'title': 'Docker Deep Dive', 'author': 'Nigel Poulton', 'source': '/content/Docker Deep Dive.pdf', 'total_pages': 280, 'page': 93, 'page_label': '87'}, page_content="package listed onsearch.nixos.org.\nRun the following command to install thebind package (which includes thenslookup\ntool), and then run thenslookup command again.\ndocker > install bind\nTip: You can install any package available at: https://search.nixos.org/packages.\ninstalling 'bind-9.18.19'\n<Snip>\ndocker > nslookup nigelpoulton.com\nServer: 192.168.65.7\nAddress: 192.168.65.7#53\nNon-authoritative answer:\nName: nigelpoulton.com\nAddress: 192.124.249.126\nThe command worked, andnslookup is now installed in yourtoolbox and will be\navailable in future Docker Debug sessions.\nCongratulations, you’ve used Docker Debug to attach to a running container and run\ntroubleshooting commands t

In [None]:
# Embedding generation + vector store (still part of the indexing stage):
# embed every chunk with a sentence-transformers model and index the vectors in FAISS.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
vector_store = FAISS.from_documents(documents=chunks, embedding=embeddings)

# Step 2 - Retrieval

In [None]:
# Wrap the FAISS index as a similarity-search retriever that is asked for k=4 chunks per query.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)

In [None]:
# Display the retriever's configuration.
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7a371f318a70>, search_kwargs={'k': 4})

In [None]:
# Sanity-check retrieval with an on-topic query; the result is a list of Document objects.
retriever.invoke("What is docker")

[Document(id='6a2a7961-c9d4-4dfd-95be-0d6b8453678c', metadata={'producer': 'XeTeX 0.99998', 'creator': 'LaTeX with hyperref package', 'creationdate': '2024-05-21T09:09:33+00:00', 'title': 'Docker Deep Dive', 'author': 'Nigel Poulton', 'source': '/content/Docker Deep Dive.pdf', 'total_pages': 280, 'page': 16, 'page_label': '10'}, page_content='Docker, Inc.\nDocker, Inc. is a technology company based out of Palo Alto and founded by French-\nborn American developer and entrepreneur Solomon Hykes. Solomon is no longer at\nthe company.\nThe company started as aplatform as a service (PaaS)provider calleddotCloud. Behind the\nscenes, dotCloud delivered their services on top of containers and had an in-house to\nhelp them deploy and manage those containers. They called this in-house toolDocker.\nThe wordDocker is a British expression meaningdock work____er____ that refers to a\nperson who loads and unloads cargo from ships.'),
 Document(id='2336e975-4819-46a0-98f7-7554529ced0b', metadata={'pro

# Step 3 - Augmentation

In [None]:
# Gemini LLM client used for the generation step; the recorded answer further
# down is a plain string rather than a message object.
# NOTE(review): the model id here carries a "models/" prefix while the chat client
# created earlier uses "gemini-1.5-flash" — confirm both spellings resolve to the same model.
llm = GoogleGenerativeAI(model="models/gemini-1.5-flash")

In [None]:
# Grounded-answer prompt: the model is told to use only the retrieved context and
# to say it doesn't know when that context is insufficient.
# The wording says "document" because the source here is a PDF, not a transcript.
prompt = PromptTemplate(
    template = """
    You are a helpful assistant.
    Answer ONLY from the provided document context.
    If the context is insufficient, just say you don't know.

    {context}

    Question: {question}
    """,
    input_variables=["context","question"]
)

In [None]:
# Off-topic question (the source is a Docker book), which exercises the
# "say you don't know" instruction in the prompt.
question = "Is the topic of aliens discussed in this document? If yes, what was discussed?"
retrieved_docs = retriever.invoke(question)

In [None]:
# Inspect the chunks retrieved for the question.
retrieved_docs

[Document(id='d1c5b3c6-1dd1-4f11-a45f-3697b85b3cca', metadata={'producer': 'XeTeX 0.99998', 'creator': 'LaTeX with hyperref package', 'creationdate': '2024-05-21T09:09:33+00:00', 'title': 'Docker Deep Dive', 'author': 'Nigel Poulton', 'source': '/content/Docker Deep Dive.pdf', 'total_pages': 280, 'page': 10, 'page_label': '4'}, page_content='Part 1: The big picture stuff'),
 Document(id='370b39d2-4c70-4cdd-8ac9-df2f96b748a7', metadata={'producer': 'XeTeX 0.99998', 'creator': 'LaTeX with hyperref package', 'creationdate': '2024-05-21T09:09:33+00:00', 'title': 'Docker Deep Dive', 'author': 'Nigel Poulton', 'source': '/content/Docker Deep Dive.pdf', 'total_pages': 280, 'page': 36, 'page_label': '30'}, page_content='Part 2: The technical stuff'),
 Document(id='0f2d769e-f256-44f0-8f00-73afd4e9b2bb', metadata={'producer': 'XeTeX 0.99998', 'creator': 'LaTeX with hyperref package', 'creationdate': '2024-05-21T09:09:33+00:00', 'title': 'Docker Deep Dive', 'author': 'Nigel Poulton', 'source': '/

In [None]:
# Build the prompt context by joining the text of the retrieved chunks, separated
# by blank lines. This assignment is required: without it the cell raises
# NameError on a fresh kernel because context_text is not defined anywhere earlier.
context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
context_text

'Part 1: The big picture stuff\n\nPart 2: The technical stuff\n\nContents\n0: About the book. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .1\nPart 1: The big picture stuff. . . . . . . . . . . . . . . . . . . . .4\n1: Containers from 30,000 feet. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .5\nThe bad old days. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 5\nHello VMware! . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 5\nVMwarts . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 5\nHello Containers!. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 6\nLinux containers . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 6\nHello Docker! . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 7\nDocker and Windows. . . . . . . . . . . . . . . . . . . . . . . . . .

In [None]:
# Fill the template's {context} and {question} slots with the retrieved text and the question.
final_prompt = prompt.invoke({"context":context_text,"question":question})

In [None]:
# Inspect the fully rendered prompt that is passed to the model in the next step.
final_prompt

StringPromptValue(text="\n    You are a helpful assistant.\n    Answer ONLY from the provided transcript context.\n    If the context is insufficient, just say you don't know.\n\n    Part 1: The big picture stuff\n\nPart 2: The technical stuff\n\nContents\n0: About the book. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .1\nPart 1: The big picture stuff. . . . . . . . . . . . . . . . . . . . .4\n1: Containers from 30,000 feet. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .5\nThe bad old days. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 5\nHello VMware! . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 5\nVMwarts . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 5\nHello Containers!. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 6\nLinux containers . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

# Step 4 - Generation

In [None]:
# Send the rendered prompt to the Gemini LLM and keep the generated answer.
answer = llm.invoke(final_prompt)

In [None]:
# Show the generated answer.
answer

"I don't know.  The provided text is a table of contents and some concluding remarks about a book; it does not mention aliens."

# Building a Chain

In [None]:
from langchain_core.runnables import RunnableParallel,RunnablePassthrough,RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [None]:
def format_docs(retrieved_docs):
  """Merge the text of several documents into a single context string.

  Args:
    retrieved_docs: Iterable of objects exposing a ``page_content`` string.

  Returns:
    One string containing every ``page_content`` in input order, with a blank
    line between consecutive entries (an empty string for empty input).
  """
  page_texts = [doc.page_content for doc in retrieved_docs]
  return "\n\n".join(page_texts)

In [None]:
# Fan a single input out to two branches:
#   "context"  -> retrieve chunks for the input and format them into one string
#   "question" -> forward the input unchanged
parallel_chain = RunnableParallel(
    context=retriever | RunnableLambda(format_docs),
    question=RunnablePassthrough(),
)

In [None]:
# Run both branches for a sample query; the result is a dict with "context" and "question" keys.
parallel_chain.invoke('How to manage a app in a container')

{'context': '8: Containerizing an app\nDocker makes it easy to package applications as images and run them as containers.\nWe call this processcontainerization, and this chapter will walk you through the entire\nprocess.\nI’ve divided the chapter as follows:\n• Containerizing an app – The TLDR\n• Containerize a single-container app\n• Moving to production with multi-stage-builds\n• Buildx, BuildKit, drivers, and Build Cloud\n• Multi-architecture builds\n• A few good practices\nContainerizing an app – The TLDR\nDocker aims to make it easy tobuild, ship,and run applications. We call thiscontaineriza-\ntion and the process looks like this:\n1. Write your applications and create the list of dependencies\n2. Create aDockerfile that tells Docker how to build and run the app\n3. Build the app into an image\n4. Push the image to a registry (optional)\n5. Run a container from the image\nYou can see these five steps in Figure 8.1.\n\nrunning the app defined in the image.\nHow containers start ap