# Document Q&A RAG System

In [None]:
import os

from google.colab import userdata

# Copy each Colab secret into the process environment under the same name.
# NOTE(review): confirm HUGGINGFACEHUB_ACCESS_TOKEN is the variable name the
# downstream Hugging Face tooling actually reads.
for secret_name in ("GOOGLE_API_KEY", "HUGGINGFACEHUB_ACCESS_TOKEN"):
    os.environ[secret_name] = userdata.get(secret_name)

In [None]:
!pip -q install langchain langchain-google-genai langchain-community faiss-cpu tiktoken python-dotenv pypdf langchain-huggingface

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings,ChatGoogleGenerativeAI,GoogleGenerativeAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings

# Testing the Chat Model

In [None]:
# Smoke test: build the Gemini chat model and send a trivial prompt to confirm
# that the model can be invoked before building the rest of the pipeline.
gemini_model_name = "gemini-1.5-flash"
chat_model = ChatGoogleGenerativeAI(model=gemini_model_name)

greeting_reply = chat_model.invoke("HI")
greeting_reply

AIMessage(content='Hi there! How can I help you today?', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-1.5-flash', 'safety_ratings': []}, id='run--ad9b1598-224a-4a68-9283-960d3d64d9cb-0', usage_metadata={'input_tokens': 1, 'output_tokens': 11, 'total_tokens': 12, 'input_token_details': {'cache_read': 0}})

# Step 1a - Indexing (Document Ingestion)

In [None]:
# Read the source PDF into a list of Document objects.
pdf_path = "/content/Docker Deep Dive.pdf"

loader = PyPDFLoader(file_path=pdf_path)
docs = loader.load()

In [None]:
# Number of Document objects returned by the loader.
len(docs)

280

In [None]:
# Inspect the first loaded Document (its metadata and page_content).
docs[0]

Document(metadata={'producer': 'XeTeX 0.99998', 'creator': 'LaTeX with hyperref package', 'creationdate': '2024-05-21T09:09:33+00:00', 'title': 'Docker Deep Dive', 'author': 'Nigel Poulton', 'source': '/content/Docker Deep Dive.pdf', 'total_pages': 280, 'page': 0, 'page_label': 'i'}, page_content='')

# Step 1b - Indexing (Text Splitting)

In [None]:
# Break the loaded documents into overlapping chunks; consecutive chunks share
# up to `chunk_overlap` characters.
splitter_settings = {"chunk_size": 1000, "chunk_overlap": 200}

splitter = RecursiveCharacterTextSplitter(**splitter_settings)
chunks = splitter.split_documents(docs)

In [None]:
# Number of chunks produced by the splitter.
len(chunks)

589

In [None]:
# Spot-check a single chunk (index 200) to see what the split text looks like.
chunks[200]

Document(metadata={'producer': 'XeTeX 0.99998', 'creator': 'LaTeX with hyperref package', 'creationdate': '2024-05-21T09:09:33+00:00', 'title': 'Docker Deep Dive', 'author': 'Nigel Poulton', 'source': '/content/Docker Deep Dive.pdf', 'total_pages': 280, 'page': 93, 'page_label': '87'}, page_content="package listed onsearch.nixos.org.\nRun the following command to install thebind package (which includes thenslookup\ntool), and then run thenslookup command again.\ndocker > install bind\nTip: You can install any package available at: https://search.nixos.org/packages.\ninstalling 'bind-9.18.19'\n<Snip>\ndocker > nslookup nigelpoulton.com\nServer: 192.168.65.7\nAddress: 192.168.65.7#53\nNon-authoritative answer:\nName: nigelpoulton.com\nAddress: 192.124.249.126\nThe command worked, andnslookup is now installed in yourtoolbox and will be\navailable in future Docker Debug sessions.\nCongratulations, you’ve used Docker Debug to attach to a running container and run\ntroubleshooting commands t

In [None]:
# Embed every chunk with a sentence-transformers model and index the resulting
# vectors in a FAISS store.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

vector_store = FAISS.from_documents(documents=chunks, embedding=embeddings)

# Step 2 - Retrieval

In [None]:
# Expose the vector store as a retriever using similarity search with k=4.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)

In [None]:
# Display the retriever's repr to confirm its configuration.
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7a371f318a70>, search_kwargs={'k': 4})