# Process Flow for RAG application

1. Load the file (PDF, HTML, etc.)
2. Transform the data (converting into chunks)
3. Create Embeddings
4. Store embeddings in vector database
5. Use a chain and a retriever


In [1]:
import os

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.embeddings import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.llms import Ollama


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load the pdf

# The default below is the original, machine-specific location and is kept for
# backward compatibility. Set the RESUME_PDF_PATH environment variable to run
# the notebook against a different file without editing this cell.
DEFAULT_PDF_PATH = "C:\\Users\\omkar\\OneDrive\\Desktop\\Omkar\\Resume\\Omkar_Firame_Resume.pdf"
pdf_path = os.environ.get("RESUME_PDF_PATH", DEFAULT_PDF_PATH)

pdf_loader = PyPDFLoader(pdf_path)
docs = pdf_loader.load()  # list of Document objects (see the cell output)
docs

[Document(page_content="Omkar\nFirame\n+919404350615\n|\nE-Mail\n|\nLinkedin\n|\nGitHub\nSUMMAR Y:\nExperienced\nData\nScientist\nadept\nat\ndeveloping\nadvanced\nmachine\nlearning\nmodels\nand\nharnessing\ndata-driven\nstrategies\nto\ntackle\nintricate\nbusiness\nproblems\neffectively .\nTECHNICAL\nSKILLS:\nPython,\nR,\nMachine\nLearning,\nDeep\nLearning,\nPyTorch,\nIguazio,\nKubeFlow\nPipeline,\nAzure\nDatabricks,\nDask,\nWeb\nScraping\n(Selenium),\nGit,\nLangchain,\nGenerative\nAI\nEXPERIENCE:\nData\nScientist\n-\nWynum\nAutomation\nServices\nPvt.\nLtd\n|\nApril\n2023\n-\nPresent\n●\nManaged\na\nproject\nimplementing\nan\ninformation\nretrieval\nsystem\nfor\nextracting\nrelevant\npress\nreleases\nfrom\nonline\nsources.\n●\nApplied\nadvanced\ndata\npreprocessing\ntechniques\nto\ncleanse\nand\nstructure\ncontent,\nenhancing\ncategorization\nand\nanalytical\ncapabilities.\n●\nDeveloped\ncustom\nweb\nscrapers\nusing\nPython\n(Beautiful\nSoup,\nRequests)\nfor\nprecise\nand\ntargeted\ncon

In [3]:
# Transform the data
#
# chunk_size    - max number of characters in one chunk
# chunk_overlap - max number of characters that can be overlapped between 2 chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
split = text_splitter.split_documents(docs)
split

[Document(page_content='Omkar\nFirame\n+919404350615\n|\nE-Mail\n|\nLinkedin\n|\nGitHub\nSUMMAR Y:\nExperienced\nData\nScientist\nadept\nat\ndeveloping\nadvanced\nmachine\nlearning\nmodels\nand\nharnessing\ndata-driven\nstrategies\nto\ntackle\nintricate\nbusiness\nproblems\neffectively .\nTECHNICAL\nSKILLS:\nPython,\nR,\nMachine\nLearning,\nDeep\nLearning,\nPyTorch,\nIguazio,\nKubeFlow\nPipeline,\nAzure\nDatabricks,\nDask,\nWeb\nScraping\n(Selenium),\nGit,\nLangchain,\nGenerative\nAI\nEXPERIENCE:\nData\nScientist\n-\nWynum\nAutomation\nServices\nPvt.\nLtd\n|\nApril\n2023\n-\nPresent\n●\nManaged\na\nproject\nimplementing\nan\ninformation\nretrieval\nsystem\nfor\nextracting\nrelevant\npress\nreleases\nfrom\nonline\nsources.\n●\nApplied\nadvanced\ndata\npreprocessing\ntechniques\nto\ncleanse\nand\nstructure\ncontent,\nenhancing\ncategorization\nand\nanalytical\ncapabilities.\n●\nDeveloped\ncustom\nweb\nscrapers\nusing\nPython\n(Beautiful\nSoup,\nRequests)\nfor\nprecise\nand\ntargeted\ncon

In [4]:
# Create embeddings and store in vector database
# The embedding object is built first (with its default settings), then handed
# to FAISS together with the chunks produced by the splitter.
embedding_model = OllamaEmbeddings()
vectorstore = FAISS.from_documents(documents=split, embedding=embedding_model)


In [5]:
# model
# "llama2" is the model name handed to the Ollama LLM wrapper.
llm_name = "llama2"
model = Ollama(model=llm_name)
model

Ollama()

In [6]:
# Prompt

from langchain_core.prompts import ChatPromptTemplate

# Template text with two placeholders, {context} and {input}; the question is
# supplied under the "input" key when the chain is invoked.
qa_template = """
Answer the following question based on the provided context. Think step by step before providing the answer. 
NOTE: Do not make up the answer. Provide the answer as text only.
Context: {context}
Question: {input}
Answer:
"""
prompt = ChatPromptTemplate.from_template(qa_template)

In [7]:
# Chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Combine the LLM and the prompt template into a single documents chain.
chain = create_stuff_documents_chain(llm=model, prompt=prompt)

In [8]:
# Retriever
# Wrap the FAISS vector store in a retriever object. No arguments are passed,
# so whatever defaults `as_retriever` applies are used.
# NOTE(review): the variable name `retriver` is a misspelling of "retriever";
# it is kept because a later cell references it by this name.
retriver = vectorstore.as_retriever()
retriver


VectorStoreRetriever(tags=['FAISS', 'OllamaEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001F17BA05340>)

In [9]:
# Retrieval chain
from langchain.chains import create_retrieval_chain

# Tie the retriever to the documents chain built earlier.
retriver_chain = create_retrieval_chain(
    retriever=retriver,
    combine_docs_chain=chain,
)

In [10]:
# Run the retrieval chain; the question is passed under the "input" key.
question = "Name on the resume"
response = retriver_chain.invoke({"input": question})

In [11]:
# Display only the "answer" entry of the chain's response.
response["answer"]

'The name of the person listed on the resume is Omkar Firame.'