In [None]:
import os

if "COLAB_GPU" in os.environ:
  print("[INFO] Running in Google Colab, installing requirements.")
  !pip install PyMuPDF
  !pip install tqdm

  !pip install accelerate
  !pip install bitsandbytes
  !pip install flash-attn --no-build-isolation

In [2]:
import os
import requests
pdf_path="human-nutrition-text.pdf"

if not os.path.exists(pdf_path):
  print("File doesn't exist, downloading...")

  url="https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

  filename=pdf_path

  response=requests.get(url)

  if response.status_code==200:
    with open(filename,"wb") as file:
      file.write(response.content)
    print(f"The file has been downloaded and saved as {filename}")
  else:
    print(f"Failed to download the file. Status code: {response.status_code}")
else:
  print(f"File {pdf_path} exists.")

File doesn't exist, downloading...
The file has been downloaded and saved as human-nutrition-text.pdf


In [3]:
'''
Iterate over pdf doc.
create a dictinary where each entry has info about a page from doc
'''

import fitz
from tqdm.auto import tqdm

def text_formatter(text:str)->str:
  cleaned_text=text.replace("\n"," ").strip()
  return cleaned_text

def open_and_read_pdf(pdf_path:str)->list[dict]:
  doc=fitz.open(pdf_path)
  pages_and_texts=[]
  for page_number, page in tqdm(enumerate(doc)):
    text=page.get_text()
    text=text_formatter(text)
    pages_and_texts.append({"page_number": page_number-41,
                            "page_char_count":len(text),
                            "page_word_count":len(text.split(" ")),
                            "page_sentence_count_raw":len(text.split(". ")),
                            "page_token_count":len(text)/4,
                            "text":text
                            })
  return pages_and_texts

pages_and_texts=open_and_read_pdf(pdf_path=pdf_path)
print(len(pages_and_texts))
pages_and_texts[:2]

0it [00:00, ?it/s]

1208


[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [4]:
import random

random.sample(pages_and_texts,k=3)

[{'page_number': 495,
  'page_char_count': 1841,
  'page_word_count': 311,
  'page_sentence_count_raw': 13,
  'page_token_count': 460.25,
  'text': 'Societal Influence  In the United States, many societal factors influence the number of  calories burned in a day. Escalators, moving walkways, and elevators  (not to mention cars!) are common modes of transportation that  reduce average daily energy expenditure. Office work, high-stress  jobs, and occupations requiring extended working hours are all  societal pressures that reduce the time allotted for exercise of large  populations of Americans. Even the remote controls that many have  for various electronic devices in their homes contribute to the US  society being less active. More obesogenic factors were discussed in  the weight management section of this chapter.  Socioeconomic status has been found to be inversely proportional  to weight gain. One reason for this relationship is that inhabitants  of low-income neighborhoods have red

In [5]:
import pandas as pd

df=pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [6]:
# average token count is ~287

df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0
std,348.86,560.38,95.76,6.19,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,4.0,190.5
50%,562.5,1231.5,214.5,10.0,307.88
75%,864.25,1603.5,271.0,14.0,400.88
max,1166.0,2308.0,429.0,32.0,577.0


In [7]:


# from spacy.lang.en import English

# nlp=English()

# nlp.add_pipe("sentencizer")

# doc=nlp("This is a sentence. This another sentence.")
# assert len(list(doc.sents))==2

# list(doc.sents)

In [8]:
from spacy.lang.en import English
'''
split page text to sentences since its easier to handle larger pages of text.
use Spacy to break text into sentences.
this will add another entry in dictionary 'sentences'.
'''

nlp=English()

nlp.add_pipe("sentencizer")

for item in tqdm(pages_and_texts):
  item['sentences']=list(nlp(item['text']).sents)

  item['sentences']=[str(sentences) for sentences in item['sentences']]

  item['age_sentence_count_spacy']=len(item['sentences'])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [9]:
random.sample(pages_and_texts,k=1)

[{'page_number': -16,
  'page_char_count': 399,
  'page_word_count': 70,
  'page_sentence_count_raw': 2,
  'page_token_count': 99.75,
  'text': 'About the Contributors  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  This open access textbook was made possible through the  collaboration of faculty, students and staff at the University of  Hawai‘i at Mānoa demonstrating the value of working together,  ho‘okahi ka ‘ilau like ana.  Faculty  Jennifer Draper  xxvi  |  About the Contributors',
  'sentences': ['About the Contributors  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  This open access textbook was made possible through the  collaboration of faculty, students and staff at the University of  Hawai‘i at Mānoa demonstrating the value of working together,  ho‘okahi ka ‘ilau like ana.',
   ' Faculty  Jennifer Draper  xxvi  |  About the Contributors'],
  'age_sentence_count_spacy

In [10]:
# average sentences per page is ~10 and average token count per page is 287.
# so we can create chunk of 10 sentences per page
# so on average a group of 10 sentences will also be ~287 tokens long.
# which gives plenty of room for embedding model all-mpnet-base-v2.
df=pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,age_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0,10.32
std,348.86,560.38,95.76,6.19,140.1,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0
75%,864.25,1603.5,271.0,14.0,400.88,15.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0


In [11]:
# split our group of sentences into chunks of 10 or less.
# now dictionary will have a sentence_chunk for each page.

num_sentence_chunk_size=10

def split_list(input_list:list,slice_size:int)->list[list[str]]:
  return [input_list[i:i+slice_size] for i in range(0,len(input_list),slice_size)]


for item in tqdm(pages_and_texts):
  item['sentence_chunks']=split_list(input_list=item['sentences'],
                                     slice_size=num_sentence_chunk_size)
  item['num_chunks']=len(item['sentence_chunks'])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [12]:
# convert each chunk into its own item
# here each element of chunk_dict will be a chunk having 10 sentences corresponding to that chunk.

import re

pages_and_chunks=[]
for item in tqdm(pages_and_texts):
  for sentence_chunk in item['sentence_chunks']:
    chunk_dict={}
    chunk_dict['page_number']=item['page_number']

    joined_sentence_chunk="".join(sentence_chunk).replace(" "," ").strip()
    joined_sentence_chunk=re.sub(r'\.([A-Z])',r'.\1',joined_sentence_chunk)
    chunk_dict['sentence_chunk']=joined_sentence_chunk

    chunk_dict['chunk_char_count']=len(joined_sentence_chunk)
    chunk_dict['chunk_word_count']=len([word for word in joined_sentence_chunk.split(" ")])
    chunk_dict['chunk_token_count']=len(joined_sentence_chunk)/4
    pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [13]:
random.sample(pages_and_chunks,k=1)

[{'page_number': 654,
  'sentence_chunk': 'Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome. \xa0 An interactive or media element has been  excluded from this version of the text.You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=378  \xa0 654  |  Introduction',
  'chunk_char_count': 419,
  'chunk_word_count': 69,
  'chunk_token_count': 104.75}]

In [14]:
# average tokens in each chunk is ~186

df=pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,746.69,124.58,186.67
std,347.79,453.83,78.22,113.46
min,-41.0,14.0,4.0,3.5
25%,280.5,319.5,49.0,79.88
50%,586.0,761.0,128.0,190.25
75%,890.0,1131.0,189.0,282.75
max,1166.0,1863.0,409.0,465.75


In [15]:
# min_token_length=30
# for row in df[df['chunk_token_count']<=min_token_length].sample(5).iterrows():
#   print(f"Chunk token count: {row[1]['chunk_token_count']} | Text: {row[1]['sentence_chunk']}")

In [16]:
# remove chunks having token length less that 30
min_token_length=30
pages_and_chunks_over_min_token_len=df[df['chunk_token_count']>min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020  Edition  UNIVERSITY OF HAWAI‘I AT MĀNOA  FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM  ALAN TITCHENAL, SKYLAR HARA,  NOEMI ARCEO CAACBAY, WILLIAM  MEINKE-LAU, YA-YUN YANG, MARIE  KAINOA FIALKOWSKI REVILLA,  JENNIFER DRAPER, GEMADY  LANGFELDER, CHERYL GIBBY, CHYNA  NICOLE CHUN, AND ALLISON  CALABRESE',
  'chunk_char_count': 320,
  'chunk_word_count': 54,
  'chunk_token_count': 80.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program is licensed under a Creative Commons Attribution 4.0  International License, except where otherwise noted.',
  'chunk_char_count': 212,
  'chunk_word_count': 32,
  'chunk_token_count': 53.0}]

In [None]:
from sentence_transformers import SentenceTransformer
embedding_model=SentenceTransformer(model_name_or_path="all-mpnet-base-v2",device="cpu")
sentences=[
    "The Sentences Transformers library provides an easy and open source way to create embeddings.",
    "Sentences can be embedded one by one or as a list of strings.",
    "Embeddings are one of the most powerful concepts in machine learning!",
    "Learn to use embeddings well and you'll be well on your wauy to being AI Engineer."
]

embeddings=embedding_model.encode(sentences)
embeddings_dict=dict(zip(sentences,embeddings))


for sentence, embedding in embeddings_dict.items():
  print(f"Sentence: {sentence}")
  print(f"Embedding: {embedding}")
  print("")

In [18]:
# create embeddings for our chunks
# each chunk will have an emdedding of size 786(according to model used).

%%time

embedding_model.to("cpu")

for item in tqdm(pages_and_chunks_over_min_token_len):
  item["embedding"]=embedding_model.encode(item['sentence_chunk'])

  0%|          | 0/1684 [00:00<?, ?it/s]

CPU times: user 12min 3s, sys: 2.64 s, total: 12min 6s
Wall time: 12min 23s


In [19]:
# save embeddings in a csv file

text_chunks_and_embeddings_df=pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path="text_chunks_and_embeddings.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path,index=False)

In [20]:
text_chunks_and_embeddings_df_load=pd.read_csv(embeddings_df_save_path)
text_chunks_and_embeddings_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF ...,320,54,80.0,[ 6.74242452e-02 9.02281180e-02 -5.09548141e-...
1,-38,Human Nutrition: 2020 Edition by University of...,212,32,53.0,[ 5.52156493e-02 5.92139289e-02 -1.66167356e-...
2,-37,Contents Preface University of Hawai‘i at Mā...,796,144,199.0,[ 2.79801767e-02 3.39813679e-02 -2.06426792e-...
3,-36,Lifestyles and Nutrition University of Hawai‘...,975,176,243.75,[ 6.82566836e-02 3.81274410e-02 -8.46854039e-...
4,-35,The Cardiovascular System University of Hawai...,1037,191,259.25,[ 3.30264606e-02 -8.49765539e-03 9.57160164e-...


# **Which embedding model to use?**
Depends on :
size of input: if you need embeddings for long sequences, choose model with large input capacity (gemini, qwen).
size of embedding vector: larger generally means better representation but require more compute/storage.
size of model: larger model generally gives better embeddings but require more computation power.
open or closed: open model lets u run them on ur hardware, closed models can be easily setup but require an API call to get embeddings.


# Where to store embeddings?
if you have small dataset, under 100,000 records, np.array or torch.tensor works fine, but if we have production systems we work with 100,000+ embeddings then we look into vector databases.

# Vector Databases
Vector databases provide faster access to closest record from database for a query.
it uses indexing techniques such as IVFFlat or HNSW.
IVFFlat - cluster vectors into lists and whichever cluster or query is nearest to we select that.
HNSW - creates a multi layered graph structure.

we have multiple vector database services such as pinecone, qdrant...
If you already have data in postgres then we can use pgvector.

In [21]:
# convert our embeddings into tensors.

import random
import torch
import numpy as np
import pandas as pd

device="cuda" if torch.cuda.is_available() else "cpu"

text_chunks_and_embedding_df=pd.read_csv("text_chunks_and_embeddings.csv")

text_chunks_and_embedding_df['embedding']=text_chunks_and_embedding_df['embedding'].apply(lambda x: np.fromstring(x.strip("[]"),sep=" "))

pages_and_chunks=text_chunks_and_embedding_df.to_dict(orient='records')

embeddings=torch.tensor(np.array(text_chunks_and_embedding_df['embedding'].tolist()),dtype=torch.float32).to(device)
embeddings.shape

torch.Size([1684, 768])

In [22]:
text_chunks_and_embedding_df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF ...,320,54,80.0,"[0.0674242452, 0.090228118, -0.00509548141, -0..."
1,-38,Human Nutrition: 2020 Edition by University of...,212,32,53.0,"[0.0552156493, 0.0592139289, -0.0166167356, -0..."
2,-37,Contents Preface University of Hawai‘i at Mā...,796,144,199.0,"[0.0279801767, 0.0339813679, -0.0206426792, 0...."
3,-36,Lifestyles and Nutrition University of Hawai‘...,975,176,243.75,"[0.0682566836, 0.038127441, -0.00846854039, -0..."
4,-35,The Cardiovascular System University of Hawai...,1037,191,259.25,"[0.0330264606, -0.00849765539, 0.00957160164, ..."


In [23]:
from sentence_transformers import util,SentenceTransformer

embedding_model=SentenceTransformer(model_name_or_path="all-mpnet-base-v2",device=device)


In [24]:
# define a query .
# turn that query into embedding with same model as above.
# perform cosine similarity to get top k records from embeddings
query="macronutrients functions"
print(f"Query:{query}")

query_embedding=embedding_model.encode(query, convert_to_tensor=True)

from time import perf_counter as timer

start_time=timer()
dot_score=util.dot_score(a=query_embedding,b=embeddings)[0]
end_time=timer()

print(f"Time take to get socres on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

top_results_dot_product=torch.topk(dot_score,k=5)
top_results_dot_product

Query:macronutrients functions
Time take to get socres on 1684 embeddings: 0.02368 seconds.


torch.return_types.topk(
values=tensor([0.6926, 0.6738, 0.6646, 0.6536, 0.6473], device='cuda:0'),
indices=tensor([42, 47, 41, 51, 46], device='cuda:0'))

In [25]:

larger_embeddings=torch.randn(100*embeddings.shape[0],768).to(device)
print(f"Enbeddings shape: {larger_embeddings.shape}")

start_time=timer()
dot_score=util.dot_score(a=query_embedding,b=larger_embeddings)[0]
end_timer=timer()

print(f"Time take to get score on {len(larger_embeddings)} embeddings: {end_time-start_time:.5f} second.")

Enbeddings shape: torch.Size([168400, 768])
Time take to get score on 168400 embeddings: -1.46893 second.


In [26]:
# print the top k records matching our query from embeddings

import textwrap

def print_wrapped(text,wrap_length=80):
  wrapped_text=textwrap.fill(text,wrap_length)
  print(wrapped_text)

In [27]:
print(f"Query: '{query}'\n")
print("Results:")

for score,idx in zip(top_results_dot_product[0],top_results_dot_product[1]):
  print(f"Score: {score:.4f}")
  print("Text:")
  print_wrapped(pages_and_chunks[idx]['sentence_chunk'])
  print(f"Page number: {pages_and_chunks[idx]['page_number']}")
  print("\n")

Query: 'macronutrients functions'

Results:
Score: 0.6926
Text:
Macronutrients  Nutrients  that  are  needed  in  large  amounts  are  called
macronutrients.There are three classes of macronutrients:  carbohydrates,
lipids, and proteins.These can be metabolically  processed into cellular
energy.The energy from macronutrients  comes from their chemical bonds.This
chemical energy is  converted into cellular energy that is then utilized to
perform work,  allowing our bodies to conduct their basic functions.A unit of
measurement of food energy is the calorie.On nutrition food labels  the amount
given for “calories” is actually equivalent to each calorie  multiplied by one
thousand.A kilocalorie (one thousand calories,  denoted with a small “c”) is
synonymous with the “Calorie” (with a  capital “C”) on nutrition food
labels.Water is also a macronutrient in  the sense that you require a large
amount of it, but unlike the other  macronutrients, it does not yield calories.
Carbohydrates  Carbo

In [28]:
def retrieve_relevant_resources(query:str,
                                embeddings:torch.tensor,
                                model:SentenceTransformer=embedding_model,
                                n_resources_to_return:int=5,
                                print_time:bool=True):
  query_embedding=model.encode(query,convert_to_tensor=True)
  start_time=timer()
  dot_score=util.dot_score(query_embedding,embeddings)[0]
  end_time=timer()

  if print_time:
    print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")
  scores,indices=torch.topk(input=dot_score,k=n_resources_to_return)
  return score,indices


def print_top_results_and_scores(query:str,
                                 embedding:torch.tensor,
                                 pages_and_chunks:list[dict]=pages_and_chunks,
                                 n_resources_to_return:int=5):
  scores,indices=retrieve_relevant_resources(query=query,
                                             embedding=embedding,
                                             n_resources_to_return=n_resources_to_return)
  print(f"Query: '{query}'\n")
  print("Results:")

  for score,index in zip(scores,indices):
    print(f"Score: {score:.4f}")
    print("Text:")
    print_wrapped(pages_and_chunks[index]['sentence_chunk'])
    print(f"Page number: {pages_and_chunks[index]['page_number']}")
    print("\n")

In [29]:
import torch
gpu_memory_bytes=torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb=round(gpu_memory_bytes/(2**30))
print(f"Available GPu memory: {gpu_memory_gb} GB")

Available GPu memory: 15 GB


In [30]:
if gpu_memory_gb<5.1:
  print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run gemma LLM directly without quantization.")
elif gpu_memory_gb<8.1:
  print(f"GPU memory: {gpu_memory_gb} | Recommended model : Gemma 2B in 4 bit precision")
  use_quantization_config=True
  mode_id="google/gemma-2b-it"
elif gpu_memory_gb<19:
  print(f"GPU memory: {gpu_memory_gb} | Recommended model : Gemma 2B in float16 or Gemma 7B in 4 bit precsion")
  use_quantization_config=False
  mode_id="google/gemma-2b-it"
elif gpu_memory_gb>19:
  print(f"GPU memory: {gpu_memory_gb} | Recommended model : Gemma 7B in 4 bit precsion")
  use_quantization_config=False
  mode_id="google/gemm-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {mode_id}")


GPU memory: 15 | Recommended model : Gemma 2B in float16 or Gemma 7B in 4 bit precsion
use_quantization_config set to: False
model_id set to: google/gemma-2b-it


In [None]:
from huggingface_hub import login

login(token="YOUR-HUGGING-FACE-TOKEN")

In [32]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available

from transformers import BitsAndBytesConfig
quantization_config=BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

attn_implementation="sdpa"

print(f"[INFO] Using attention implementation: {attn_implementation}")

model_id=mode_id
print(f"[INFO] Using model_id: {model_id}")

tokenizer=AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

llm_model=AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                               torch_dtype=torch.float16,
                                               quantization_config=quantization_config if use_quantization_config else None,
                                               low_cpu_mem_usage=False,
                                               attn_implementation=attn_implementation)

if not use_quantization_config:
  llm_model.to("cuda")

[INFO] Using attention implementation: sdpa
[INFO] Using model_id: google/gemma-2b-it


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [33]:
llm_model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
      )
    )
    (norm): GemmaRMSNorm((2048,), 

In [34]:
def get_model_num_params(model:torch.nn.Module):
  return sum([param.numel() for param in model.parameters()])

get_model_num_params(llm_model)

2506172416

In [35]:
def get_model_mem_size(model:torch.nn.Module):
  mem_params=sum([param.nelement() * param.element_size() for param in model.parameters()])
  mem_buffers=sum([buf.nelement()*buf.element_size() for buf in model.buffers()])

  model_mem_bytes=mem_params+mem_buffers
  model_mem_mb=model_mem_bytes/(1024**2)
  model_mem_gb=model_mem_bytes/(1024**3)

  return {"model_mem_bytes": model_mem_bytes,
          "model_mem_mb": round(model_mem_mb,2),
          "model_mem_gb": round(model_mem_gb,2)}

get_model_mem_size(llm_model)

{'model_mem_bytes': 5012345344, 'model_mem_mb': 4780.15, 'model_mem_gb': 4.67}

In [36]:
input_text="what are the macronutrients, and what roles do they play in human body?"
print(f"Input text: {input_text}")

dialogue_template=[
    {
        "role":"user",
        "content":input_text
    }
]

prompt=tokenizer.apply_chat_template(conversation=dialogue_template,
                                     tokenize=False,
                                     add_generation_prompt=True)

print(f"Prompt: (formatted):\n{prompt}")

Input text: what are the macronutrients, and what roles do they play in human body?
Prompt: (formatted):
<bos><start_of_turn>user
what are the macronutrients, and what roles do they play in human body?<end_of_turn>
<start_of_turn>model



In [37]:
%%time

input_ids=tokenizer(prompt,return_tensors="pt").to("cuda")
print(f"model input (tokenized):\n{input_ids}\n")

outputs=llm_model.generate(**input_ids, max_new_tokens=256)
print(f"Model output (tokens): \n {outputs[0]} \n")

model input (tokenized):
{'input_ids': tensor([[     2,      2,    106,   1645,    108,   5049,    708,    573, 186809,
         184592, 235269,    578,   1212,  16065,    749,    984,   1554,    575,
           3515,   2971, 235336,    107,    108,    106,   2516,    108]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]], device='cuda:0')}

Model output (tokens): 
 tensor([     2,      2,    106,   1645,    108,   5049,    708,    573, 186809,
        184592, 235269,    578,   1212,  16065,    749,    984,   1554,    575,
          3515,   2971, 235336,    107,    108,    106,   2516,    108,  21404,
        235269,   1517, 235303, 235256,    476,  25497,    576,    573, 186809,
        184592,    578,   1024,  16065,    575,    573,   3515,   2971, 235292,
           109,    688,  12298,   1695, 184592,  66058,    109, 235287,   5231,
        156615,  56227,  66058,    108,    141, 235287,  34

In [38]:
output_decoded=tokenizer.decode(outputs[0])
print(f"Model Output (decoded): \n{output_decoded}\n")

Model Output (decoded): 
<bos><bos><start_of_turn>user
what are the macronutrients, and what roles do they play in human body?<end_of_turn>
<start_of_turn>model
Sure, here's a breakdown of the macronutrients and their roles in the human body:

**Macronutrients:**

* **Carbohydrates:**
    * Provide energy for the body's cells and tissues.
    * Carbohydrates are the primary source of energy for most cells.
    * Complex carbohydrates are those that take longer to digest, such as whole grains, fruits, and vegetables.
    * Simple carbohydrates are those that are quickly digested, such as sugar, starch, and lactose.

* **Proteins:**
    * Build and repair tissues, enzymes, and hormones.
    * Proteins are essential for immune function, hormone production, and tissue repair.
    * There are different types of proteins, each with specific functions.

* **Fats:**
    * Provide energy, insulation, and help absorb vitamins.
    * Healthy fats include olive oil, avocado, nuts, and seeds.
    *

In [39]:
gpt4_questions=[
    "What are the macrnutrients, and what roles do they play in human body",
    "How do vitamins and minerals differ in their roles and importance for health?",
    "Describe the process of digestion and absorption of nutrients in human body.",
    "What role does fibre plays in digestion? name five fibre containing foods.",
    "Explain the concept of energy balance and its importance in weight management."
]

manual_questions=[
    "How often should infants be breastfeeded?",
    "What are symptoms of pellagra",
    "How does saliva help with digestion",
    "What is the RDI for protein per day?",
    "Water soluble vitamins"
]

query_list=gpt4_questions+manual_questions

Retreival

In [40]:
import random
query=random.choice(query_list)

print(f"Query: {query}")

scores,indices=retrieve_relevant_resources(query=query, embeddings=embeddings)
scores,indices

Query: What are the macrnutrients, and what roles do they play in human body
[INFO] Time taken to get scores on 1684 embeddings: 0.00011 seconds.


(tensor(0.6473, device='cuda:0'),
 tensor([ 47,  52,  50,  41, 149], device='cuda:0'))

Augmentation

In [41]:
def prompt_formatter(query:str, context_items:list[dict])->str:
  """
  Augment query with text based context from context_items.
  """

  context="- "+"\n- ".join([items['sentence_chunk'] for items in context_items])

  base_prompt="""
    Based on the following context items, please answer the query.
    Give Yourself room to think by extracting relevant passages from the context before answering the query.
    Don't return the thinking, only return the answer.
    Make sure your answers are as explanatory as possible.
    Use the following examples as reference for the ideal answer style.
    \nExample 1:
    Query: What are fat-soluble vitamins?
    Answer: Fat-soluble vitamins are vitamins that dissolve in fat and are absorbed along with dietary fats in your intestine. Instead of being flushed out quickly, they’re stored in the liver and fatty tissues, so your body can use them later.
            The fat-soluble vitamins (remember: A, D, E, K)
            1. Vitamin A
            Role: Vision (especially night vision), immune system, skin health
            Sources: Carrots, sweet potatoes, spinach, milk, eggs, liver
            2. Vitamin D
            Role: Calcium absorption, strong bones & teeth, immune support
            Sources: Sunlight , fish, egg yolk, fortified milk
            3. Vitamin E
            Role: Antioxidant (protects cells from damage)
            Sources: Nuts, seeds, vegetable oils, green leafy vegetables
            4. Vitamin K
            Role: Blood clotting and bone health
            Sources: Green leafy vegetables (spinach, kale), broccoli
            Key characteristics (high-yield points)
            Absorbed with fats
            Stored in liver & adipose tissue
            Deficiency develops slowly
            Overdose is possible if taken in excess (unlike water-soluble vitamins)
    \nExmample 2:
    Query: What are cause of type 2 diabetes?
    Answer: Type 2 diabetes happens when the body can’t use insulin properly (insulin resistance) and, over time, doesn’t make enough insulin to keep blood sugar normal.
          Main causes of Type 2 Diabetes
          1. Insulin resistance (core problem)
          Body cells (muscle, fat, liver) stop responding well to insulin
          Glucose stays in the blood instead of entering cells
          Pancreas tries to compensate → eventually gets exhausted
          2. Genetic predisposition
          Strong family history
          Certain populations (including South Asians) have higher risk even at lower BMI
          3. Obesity (especially abdominal fat)
          Excess visceral fat releases inflammatory chemicals
          These interfere with insulin signaling
          Waist fat > overall weight is more dangerous
          4. Physical inactivity
          Muscles are major glucose users
          Less movement → more insulin resistance
          5. Unhealthy diet
          High refined carbs & sugar
          Processed foods, sugary drinks
          Low fiber intake
          6. Age
          Risk increases after 35–40 years
          But now rising in younger people due to lifestyle factors
          7. Chronic stress & poor sleep
          Stress hormones (cortisol) raise blood sugar
          Sleep deprivation worsens insulin sensitivity
          8. Hormonal & medical conditions
          PCOS
          Cushing’s syndrome
          Fatty liver disease
          Long-term steroid use
          9. Smoking & alcohol (excess)
          Smoking increases insulin resistance
          Heavy alcohol damages pancreas

      Now use the following context items to answer the user query:
      {context}. The context contains the answer. Look carefully and extract relevant information.
      \n Relevant passages: <extract relevant passages from the context here?
      User Query: {query}
      Answer: """

  print("dvfdgf")
  base_prompt=base_prompt.format(context=context, query=query)
  print("fdf")
  print(base_prompt)

  dialogue_template=[
      {
          "role":"user",
          "content":base_prompt
      }
  ]

  prompt=tokenizer.apply_chat_template(conversation=dialogue_template,
                                      tokenize=False,
                                      ad_generation_prompt=True)
  return prompt

In [42]:
query=random.choice(query_list)
print(f"Query: {query}")

scores,indices=retrieve_relevant_resources(query=query,embeddings=embeddings)

context_items=[pages_and_chunks[i] for i in indices]

prompt=prompt_formatter(query=query,context_items=context_items)
print(f"Prompt: \n{prompt}")

Query: Describe the process of digestion and absorption of nutrients in human body.
[INFO] Time taken to get scores on 1684 embeddings: 0.00011 seconds.
dvfdgf
fdf

    Based on the following context items, please answer the query.
    Give Yourself room to think by extracting relevant passages from the context before answering the query.
    Don't return the thinking, only return the answer.
    Make sure your answers are as explanatory as possible.
    Use the following examples as reference for the ideal answer style.
    
Example 1:
    Query: What are fat-soluble vitamins?
    Answer: Fat-soluble vitamins are vitamins that dissolve in fat and are absorbed along with dietary fats in your intestine. Instead of being flushed out quickly, they’re stored in the liver and fatty tissues, so your body can use them later.
            The fat-soluble vitamins (remember: A, D, E, K)
            1. Vitamin A
            Role: Vision (especially night vision), immune system, skin health
      

In [43]:
%%time

input_ids=tokenizer(prompt,return_tensors="pt").to("cuda")

outputs=llm_model.generate(**input_ids,
                           temperature=0.7,
                           do_sample=True,
                           max_new_tokens=256)

output_text=tokenizer.decode(outputs[0])

print(f"Query: {query}")
print(f"RAG answer:\n{output_text.replace(prompt,'')}")

Query: Describe the process of digestion and absorption of nutrients in human body.
RAG answer:
<bos>Sure, here are the relevant passages from the context:

- The process of digestion begins even before you put food into your mouth. When you feel hungry, your body sends a message to your brain that it is time to eat.

- The digestive system functions on two levels, mechanically to move and mix ingested food and chemically to break down large molecules.

- The small nutrient molecules can then be absorbed and processed by cells throughout the body for energy or used as building blocks for new cells.

- The digestive system is one of the eleven organ systems of the human body, and it is composed of several hollow tube-shaped organs including the mouth, pharynx, esophagus, stomach, small intestine, large intestine (colon), rectum, and anus.

- The mouth, where the second step of digestion starts, the mechanical and chemical breakdown of food begins.

- The chemical breakdown of food invol

In [44]:
def ask(query,
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True,
        return_answer_only=True):

  # get scores and indices of top related results
  scores, indices=retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)

  # create list of context items
  context_items=[pages_and_chunks[i] for i in indices]
  #print(context_items)

  # add score to context items
  # for i, item in enumerate(context_items):
  #   print(item)
  #   item["score"]=scores[i].cpu()

  # format prompt with context items
  prompt=prompt_formatter(query=query,context_items=context_items)

  # tokenize the prompt
  input_ids=tokenizer(prompt,return_tensors="pt").to("cuda")

  # generate an output of tokens
  outputs=llm_model.generate(**input_ids,temperature=temperature,do_sample=True,max_new_tokens=max_new_tokens)

  # turn the output tokens to text
  output_text=tokenizer.decode(outputs[0])

  if format_answer_text:
    output_text=output_text.replace(prompt,'').replace("<bos>"," ").replace("<eos>"," ").replace("Sure, here is the answer for the query:\n\n"," ")

  # only return the answer without context items
  if return_answer_only:
    return output_text

  return output_text,context_items

In [45]:
query=random.choice(query_list)
print(f"Query: {query}")

answer,context_items=ask(query=query, temperature=0.7,max_new_tokens=512,return_answer_only=False)

print(f"Answer:\n")
print_wrapped(answer)
print(f"Context items:")
context_items

Query: Describe the process of digestion and absorption of nutrients in human body.
[INFO] Time taken to get scores on 1684 embeddings: 0.00010 seconds.
dvfdgf
fdf

    Based on the following context items, please answer the query.
    Give Yourself room to think by extracting relevant passages from the context before answering the query.
    Don't return the thinking, only return the answer.
    Make sure your answers are as explanatory as possible.
    Use the following examples as reference for the ideal answer style.
    
Example 1:
    Query: What are fat-soluble vitamins?
    Answer: Fat-soluble vitamins are vitamins that dissolve in fat and are absorbed along with dietary fats in your intestine. Instead of being flushed out quickly, they’re stored in the liver and fatty tissues, so your body can use them later.
            The fat-soluble vitamins (remember: A, D, E, K)
            1. Vitamin A
            Role: Vision (especially night vision), immune system, skin health
      

[{'page_number': 60,
  'sentence_chunk': 'all other organ systems in the human body.We will learn the  process of nutrient digestion and absorption, which further  reiterates the importance of developing a healthy diet to maintain  a healthier you.The evidence abounds that food can indeed be “thy  medicine.” Learning Activities  Technology Note: The second edition of the Human  Nutrition Open Educational Resource (OER) textbook  features interactive learning activities.\xa0 These activities are  available in the web-based textbook and not available in the  downloadable versions (EPUB, Digital PDF, Print_PDF, or  Open Document). Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome. \xa0 An interactive or media element has been  excluded from this version of the text.You can  view it online here:  http://pressbooks.oer.

In [46]:
!pip install -q ragas datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/457.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/45.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.6/169.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.8/84.8 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [47]:
import pandas as pd
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    context_precision,
    context_recall,
    answer_relevancy,
    faithfulness
)

try:
  from ragas.metrics import context_entity_recall
except ImportError:
  context_entity_recall=None
  print("context_entity_recall not available in this version")

try:
  from ragas.metrics import noise_robustness
except ImportError:
  noise_robustness=None
  print("noise_robustness not available in this version")

noise_robustness not available in this version


  from ragas.metrics import (
  from ragas.metrics import (
  from ragas.metrics import (
  from ragas.metrics import (
  from ragas.metrics import context_entity_recall


In [48]:
eval_questions=[
    "How often should infants be breastfeeded?",
    "What are symptoms of pellagra",
    "How does saliva help with digestion?",
    "What are the recommended protein intake per day, based on your weight",
    "What are micronutrients?"
]

ground_truth_answers=[
    "Breastfed on demand, typically 8–12 times per day during the first months of life.",
    "The textbook defines pellagra as niacin (vitamin B3) deficiency and states that its symptoms include:Fatigue,Decreased appetite,Indigestion,Diarrhea,Dermatitis,Dementia"
    "Saliva aids digestion by lubricating food, forming a bolus for swallowing, and initiating carbohydrate digestion through the enzyme salivary amylase, which begins starch breakdown in the mouth."
    """
    0.8 grams of protein per kilogram of body weight per day
    How to calculate (as implied in the textbook)
    Daily protein intake (g/day)
    =
    0.8
    ×
    body weight (kg)
    Daily protein intake (g/day)=0.8×body weight (kg)
    Examples
    50 kg person → 40 g/day
    60 kg person → 48 g/day
    70 kg person → 56 g/day
    """,
    "Micronutrients are nutrients required by the body in very small amounts, but they are essential for normal growth, development, and the maintenance of health."
]



In [49]:
def generate_rag_answer(query):

  scores,indices=retrieve_relevant_resources(query=query,embeddings=embeddings)

  context_items=[pages_and_chunks[i] for i in indices]

  prompt=prompt_formatter(query=query,context_items=context_items)

  input_ids=tokenizer(prompt,return_tensors="pt").to("cuda")

  outputs=llm_model.generate(**input_ids,
                             temperature=0.7,
                             do_sample=True,
                             max_new_tokens=256)

  output_texts=tokenizer.decode(outputs[0])
  answer=output_texts.replace(prompt,'').strip()

  contexts=[item["sentence_chunk"] for item in context_items]

  return answer,contexts

In [50]:
evaluation_data=[]

print("Generating RAG answers for evaluation...")
for question,ground_truth in zip(eval_questions,ground_truth_answers):
  print(f"Processing: {question[:50]}...")

  rag_answer,contexts=generate_rag_answer(question)

  evaluation_data.append({
      "question":question,
      "answer":rag_answer,
      "contexts":contexts,
      "ground_truth":ground_truth
  })

Generating RAG answers for evaluation...
Processing: How often should infants be breastfeeded?...
[INFO] Time taken to get scores on 1684 embeddings: 0.00011 seconds.
dvfdgf
fdf

    Based on the following context items, please answer the query.
    Give Yourself room to think by extracting relevant passages from the context before answering the query.
    Don't return the thinking, only return the answer.
    Make sure your answers are as explanatory as possible.
    Use the following examples as reference for the ideal answer style.
    
Example 1:
    Query: What are fat-soluble vitamins?
    Answer: Fat-soluble vitamins are vitamins that dissolve in fat and are absorbed along with dietary fats in your intestine. Instead of being flushed out quickly, they’re stored in the liver and fatty tissues, so your body can use them later.
            The fat-soluble vitamins (remember: A, D, E, K)
            1. Vitamin A
            Role: Vision (especially night vision), immune system, skin

In [None]:
os.environ["OPENAI_API_KEY"]="YOUR-KEY"

In [52]:
eval_dataset=Dataset.from_pandas(pd.DataFrame(evaluation_data))

metrics=[
    context_precision,
    context_recall,

]

if context_entity_recall is not None:
  metrics.append(context_entity_recall)
if noise_robustness is not None:
  metrics.append(noise_robustness)

print("Running RAGAS evaluation...")
results=evaluate(dataset=eval_dataset,metrics=metrics)

results_df=results.to_pandas()

print("\n"+"="*80)
print("                        RAG Evaluation Results")
print("="*80)

Running RAGAS evaluation...


Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[0]: InstructorRetryException(<failed_attempts>

<generation number="1">
<exception>
    Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
</exception>
<completion>
    None
</completion>
</generation>

<generation number="2">
<exception>
    Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
</exception>
<completion>
    None
</completion>
</generation>

<generation number="3">
<exception>
    Error code: 429 - {'error': {'message


                        RAG Evaluation Results
