In [None]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS 
from langchain.embeddings import OpenAIEmbeddings # Embedding model for converting text to vectors
from langchain_groq import ChatGroq
from dotenv import load_dotenv
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [28]:
# Load variables from a local .env file into the process environment.
# Presumably this supplies the API keys for the Groq and Google clients used below — confirm.
load_dotenv()

True

In [None]:
# Chat model used for answer generation, served through Groq.
# Change GROQ_MODEL_NAME to switch to a different model.
GROQ_MODEL_NAME = "deepseek-r1-distill-llama-70b"
llm = ChatGroq(model=GROQ_MODEL_NAME)

## 1. Data Ingestion

#### Data Loading

In [None]:
# Locate the sample PDF under ./data, relative to the current working directory.
data_dir = os.path.join(os.getcwd(), "data")
file_path = os.path.join(data_dir, "sample.pdf")

# Each PDF page is loaded as one document.
loader = PyPDFLoader(file_path)
documents = loader.load()
len(documents)  # number of pages loaded; this PDF file has 77 pages

#### Chunking

In [None]:
# Split the page-level documents into smaller, overlapping chunks.
# Why chunk: documents can be too large to process in one go and may exceed the
# LLM's token limit; smaller chunks also let retrieval pass the LLM only the
# passages that are relevant to a question.
CHUNK_SIZE = 500     # number of characters in each chunk
CHUNK_OVERLAP = 150  # number of characters shared between neighbouring chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,  # built-in len, so text length is counted in characters
)
docs = text_splitter.split_documents(documents)
len(docs)  # total number of chunks created (765 noted for the sample PDF)

Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-07-20T00:30:36+00:00', 'author': '', 'keywords': '', 'moddate': '2023-07-20T00:30:36+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'd:\\llmops\\document_portal\\notebook\\data\\sample.pdf', 'total_pages': 77, 'page': 0, 'page_label': '1'}, page_content='Llama 2: Open Foundation and Fine-Tuned Chat Models\nHugo Touvron∗ Louis Martin† Kevin Stone†\nPeter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra\nPrajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen\nGuillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller\nCynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou\nHakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev')

In [20]:
# Inspect the text of the first chunk as a sanity check of the splitter output.
docs[0].page_content  # Content of the first chunk

'Llama 2: Open Foundation and Fine-Tuned Chat Models\nHugo Touvron∗ Louis Martin† Kevin Stone†\nPeter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra\nPrajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen\nGuillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller\nCynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou\nHakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev'

In [21]:
# Each chunk carries the metadata of the PDF it came from.
docs[0].metadata  # Metadata of the first chunk, includes page number and source file

{'producer': 'pdfTeX-1.40.25',
 'creator': 'LaTeX with hyperref',
 'creationdate': '2023-07-20T00:30:36+00:00',
 'author': '',
 'keywords': '',
 'moddate': '2023-07-20T00:30:36+00:00',
 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5',
 'subject': '',
 'title': '',
 'trapped': '/False',
 'source': 'd:\\llmops\\document_portal\\notebook\\data\\sample.pdf',
 'total_pages': 77,
 'page': 0,
 'page_label': '1'}

#### Embedding / Storing this data in Vector Database

In [29]:
# Google Generative AI embedding model used to turn chunk text into vectors.
EMBEDDING_MODEL_NAME = "models/embedding-001"
embedding_model = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL_NAME)

In [32]:
# Sanity check: embed the first chunk and report the dimensionality of its vector.
# embed_documents() takes a list of texts and returns one vector per text, so the
# chunk is wrapped in a list. A bare string is not a valid `texts` argument and
# would not be embedded as a single document.
first_chunk_vector = embedding_model.embed_documents([docs[0].page_content])[0]
len(first_chunk_vector)  # Length of the embedding vector for the first chunk

768

In [34]:
# Build an in-memory FAISS vector store from the chunks.
# FAISS is a library for efficient similarity search and clustering of dense vectors.
# The store holds the embedding of every chunk and is what the similarity searches
# below run against; it can also be persisted to disk and loaded again later.
# It runs locally inside this process, which suits local development and testing.
vectorstore = FAISS.from_documents(documents=docs, embedding=embedding_model)

## 2. Data Retrieval

In [43]:
# Retrieval step: run a similarity search against the vector store to find the
# chunks most similar to the query.
# k=4 is the default; it is spelled out so the number of returned chunks is explicit.
search_query = "llama2 finetuning benchmark results."
relevant_doc = vectorstore.similarity_search(search_query, k=4)

In [44]:
# Display the retrieved Document objects (content plus metadata) for inspection.
relevant_doc

[Document(id='c34f962c-69fe-45ac-9d0b-e9225a0f7099', metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-07-20T00:30:36+00:00', 'author': '', 'keywords': '', 'moddate': '2023-07-20T00:30:36+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'd:\\llmops\\document_portal\\notebook\\data\\sample.pdf', 'total_pages': 77, 'page': 73, 'page_label': '74'}, page_content='Llama 2\n7B 0.28 0.25 0.29 0.50 0.36 0.37 0.21 0.34 0.32 0.50 0.28 0.19 0.26 0.32 0.44 0.51 0.30 0.2513B 0.24 0.25 0.35 0.50 0.41 0.36 0.24 0.39 0.35 0.48 0.31 0.18 0.27 0.34 0.46 0.66 0.35 0.2834B 0.27 0.24 0.33 0.56 0.41 0.36 0.26 0.32 0.36 0.53 0.33 0.07 0.26 0.30 0.45 0.56 0.26 0.3570B 0.31 0.29 0.35 0.51 0.41 0.45 0.27 0.34 0.40 0.52 0.36 0.12 0.28 0.31 0.45 0.65 0.33 0.20\nFine-tuned'),
 Document(id='fc23c5fb-fb85-4fc0-8e91-f9c72b55e982', metadata={'produ

In [None]:
# print(relevant_doc[0].page_content)  # Content of the first (most similar) chunk
# print(relevant_doc[1].page_content)  # Content of the second most similar chunk
# print(relevant_doc[2].page_content)  # Content of the third most similar chunk
# print(relevant_doc[3].page_content)  # Content of the fourth most similar chunk

65B 14.27 31.59 21.90 14.89 23.51 22.27 17.16 18.91 28.40 19.32 28.71 22.00 20.03
Llama 2
7B 16.53 31.15 22.63 15.74 26.87 19.95 15.79 19.55 25.03 18.92 21.53 22.34 20.20
13B 21.29 37.25 22.81 17.77 32.65 24.13 21.05 20.19 35.40 27.69 26.99 28.26 23.84
34B 16.76 29.63 23.36 14.38 27.43 19.49 18.54 17.31 26.38 18.73 22.78 21.66 19.04
70B 21.29 32.90 25.91 16.92 30.60 21.35 16.93 21.47 30.42 20.12 31.05 28.43 22.35
Fine-tuned
ChatGPT 0.23 0.22 0.18 0 0.19 0 0.46 0 0.13 0 0.47 0 0.66


In [64]:
# Expose the vector store as a retriever for the RAG chain.
# RETRIEVER_TOP_K is the "k" search argument, i.e. how many chunks are fetched per question.
RETRIEVER_TOP_K = 10
retriever = vectorstore.as_retriever(search_kwargs={"k": RETRIEVER_TOP_K})

## 3. Data Generation

In [77]:
# Raw prompt text that is sent to the LLM for every question.
# The user only supplies a question; before it reaches the LLM, {context} is filled
# with the content of the chunks retrieved from the vector store and {question}
# with the question itself.
# The instructions tell the model to answer from the context only and to reply with a
# fixed sentence when the context is insufficient.
prompt_template = """
        Answer the question based on the context provided below. 
        If the context does not contain sufficient information, respond with: 
        "I do not have enough information about this."

        Context: {context}

        Question: {question}"""

In [78]:
# Wrap the raw template string in a PromptTemplate so the chain can fill in the
# {question} and {context} placeholders before the text is sent to the LLM.
prompt = PromptTemplate(input_variables=["question", "context"], template=prompt_template)

In [79]:
def format_docs(docs):
    """Concatenate the text of the given documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values joined with a blank line between them;
        an empty string when ``docs`` is empty.
    """
    chunk_texts = (doc.page_content for doc in docs)
    return "\n\n".join(chunk_texts)

# Output parser that turns the LLM's response into a plain string.
parser = StrOutputParser()

In [80]:
from langchain_core.runnables.passthrough import RunnablePassthrough

In [81]:
# Main RAG chain, composed with LCEL (runnables piped together and executed in sequence).
#   1. "context": the question goes to the retriever and the retrieved documents are
#      joined into one string by format_docs.
#   2. "question": RunnablePassthrough forwards the question without modification.
#   3. The prompt is filled with both values, sent to the LLM, and the output parser
#      returns the answer as a string.
# NOTE(review): the sample run prints the model's <think> reasoning as part of the
# answer — consider stripping it if only the final answer is wanted.
context_step = retriever | format_docs
chain_inputs = {"context": context_step, "question": RunnablePassthrough()}
rag_chain = chain_inputs | prompt | llm | parser

In [83]:
# Ask a sample question and print the generated answer.
answer = rag_chain.invoke("What is llama")
print(answer)

<think>
Okay, so I need to figure out what "llama" refers to based on the given context. Let me read through the context carefully. 

First, I see mentions of "Llama 1" and "Llama 2." These seem to be models of some sort, possibly AI models. There are numbers like 7B, 13B, etc., which might refer to the size of the models in terms of parameters. The context also talks about pretraining, fine-tuning, and mentions things like GPU hours and carbon footprints, which are common in AI training discussions.

Looking further, there's a section about "Llama 2: Open Foundation and Fine-Tuned Chat Models," which suggests that Llama 2 is an AI model, specifically a chat model. The context mentions "training libraries," "Meta’s Research Super Cluster," and "production clusters," which are terms related to computing resources used for training large AI models.

Another part discusses the limitations of Llama 2-Chat, such as knowledge cessation post-pretraining and potential for hallucinations. These