In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from langchain_groq import ChatGroq
llm = ChatGroq(model="deepseek-r1-distill-llama-70b")

In [3]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [4]:
query_embed = embedding_model.embed_query("what is the capital of India?")
query_embed[0:10]

[0.042393896728754044,
 -0.03852992132306099,
 -0.04543685540556908,
 -0.010400405153632164,
 0.057711824774742126,
 0.018246879801154137,
 0.010995607823133469,
 -0.03159891068935394,
 -0.0061469534412026405,
 0.08103373646736145]

## Data Ingestion

In [5]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

In [6]:
file_path = os.path.join(os.getcwd(),"data","sample.pdf")

In [7]:
loader = PyPDFLoader(file_path)
documents = loader.load()
len(documents)  #each page represents one document

77

## Splitting the data : forming chunks

In [8]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 200,
    length_function = len   #splits on the basis of characters, i.e. each chunk will be made up of 500 characters
)
splitted_documents = splitter.split_documents(documents)
len(splitted_documents)

838

In [9]:
print(splitted_documents[0],"\n")   #page_label = 1 --> actual page number is 1, page_index = 0
print(splitted_documents[1].page_content)

page_content='Llama 2: Open Foundation and Fine-Tuned Chat Models
Hugo Touvron∗ Louis Martin† Kevin Stone†
Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra
Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen
Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller
Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou
Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev' metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-07-20T00:30:36+00:00', 'author': '', 'keywords': '', 'moddate': '2023-07-20T00:30:36+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'c:\\LLM_Course_Projects\\document_portal\\notebook\\data\\sample.pdf', 'total_pages': 77, 'page': 0, 'page_label': '1'} 

Cynt

In [10]:
splitted_documents[-1]

Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-07-20T00:30:36+00:00', 'author': '', 'keywords': '', 'moddate': '2023-07-20T00:30:36+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'c:\\LLM_Course_Projects\\document_portal\\notebook\\data\\sample.pdf', 'total_pages': 77, 'page': 76, 'page_label': '77'}, page_content='applications ofLlama 2, developers should perform safety testing and tuning tailored to their\nspecific applications of the model. Please see the Responsible Use Guide available available at\nhttps://ai.meta.com/llama/responsible-user-guide\nTable 52: Model card forLlama 2.\n77')

## Embedding the chunks and storing it into vector DB

In [11]:
from langchain.vectorstores import FAISS
vector_store = FAISS.from_documents(splitted_documents,embedding_model)

#token->words
# chunk--> it is collection of words(token)[characters]

In [12]:
ans = vector_store.similarity_search("What is llama2?",k=2)  #By default it returns top 4 documents
ans[1].page_content

'Llama 2openly to encourage responsible AI innovation. Based on our experience, an open approach draws\nuponthecollectivewisdom,diversity,andingenuityoftheAI-practitionercommunitytorealizethebenefitsof\nthis technology. Collaboration will make these models better and safer. The entire AI community—academic\nresearchers, civil society, policymakers, and industry—must work together to rigorously analyze and expose'

### Retriever

In [13]:
retriever = vector_store.as_retriever(search_kwargs={"k":7})
ans2 = retriever.invoke("What is llama2?")
ans2[1].page_content

'Llama 2openly to encourage responsible AI innovation. Based on our experience, an open approach draws\nuponthecollectivewisdom,diversity,andingenuityoftheAI-practitionercommunitytorealizethebenefitsof\nthis technology. Collaboration will make these models better and safer. The entire AI community—academic\nresearchers, civil society, policymakers, and industry—must work together to rigorously analyze and expose'

### Based on user question answering

In [14]:
from langchain.prompts import PromptTemplate
prompt_temp = """
You're an expert in providing the answer's to the user question based on the context given to you.
If you don't find the context prompt user with : 
"I do not have the relevant information, please repharse or ask a different question."

Context : {context}
Question : {question}

Answer : 
"""

In [15]:
prompt=PromptTemplate(
    template=prompt_temp,
    input_variables=["context", "question"]
)

In [16]:
from langchain_core.output_parsers import StrOutputParser
parse = StrOutputParser()

In [17]:
from langchain_core.runnables import RunnablePassthrough

In [18]:
def format_docs(docs):
    return "\n\n".join([doc.page_content for doc in docs])

In [19]:
rag_chain = (
    {"context":retriever| format_docs, "question":RunnablePassthrough()}
    | prompt 
    | llm 
    | StrOutputParser()
)

In [21]:
rag_chain.invoke("what's llama2 bench marking experiments?")

"<think>\nOkay, so the user is asking about Llama 2's benchmarking experiments. I need to figure out how to answer this based on the provided context. Let me go through the context step by step.\n\nFirst, I see there are two main tables: Table 20 and Table 3. Table 20 shows performance on standard benchmarks for both Llama 1 and Llama 2, with different model sizes (7B, 13B, etc.). Each model has a series of numbers which likely represent different benchmark metrics. For example, Llama 2 70B has scores like 85.0, 82.8, etc. \n\nThen, Table 3 seems to focus on overall performance across grouped academic benchmarks, again comparing Llama 1 and Llama 2. The numbers here are also in a similar format, so they probably represent different metrics within academic settings.\n\nThere's also a mention of Human-Eval and MBPP code generation benchmarks. The context provides specific numbers for Llama 2 models of various sizes in these areas. For instance, the 70B model has a pass@1 of 29.9 and pass