In [3]:
!pip install PyMuPDF
!pip install tqdm


Collecting PyMuPDF
  Downloading PyMuPDF-1.24.7-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.6 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.7 PyMuPDFb-1.24.6


In [4]:
pdf_path = "human-nutrition-text.pdf"

In [5]:
import fitz
from tqdm.auto import tqdm

def text_formatter(text:str) -> str:
  """Perform minor formatting on text."""
  cleaned_text = text.replace("\n", " ").strip()
  return cleaned_text
def open_and_read_pdf(pdf_path: str) -> list[dict]:
  """Open and read a PDF file."""
  doc = fitz.open(pdf_path)
  pages_and_texts = []
  for page_number, page in tqdm(enumerate(doc)):
    text = page.get_text()
    text = text_formatter(text=text)
    pages_and_texts.append({"page_number": page_number - 41,
                            "page_char_count": len(text),
                            "page_word_count": len(text.split(" ")),
                            "page_sentence_count": len(text.split(".")),
                            "page_token_count": len(text)/4,
                            "text": text,
                            })

  return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [6]:
import random
random.sample(pages_and_texts, k=4)

[{'page_number': 165,
  'page_char_count': 1656,
  'page_word_count': 285,
  'page_sentence_count': 13,
  'page_token_count': 414.0,
  'text': 'Percentage Food Item  90–99  Nonfat milk, cantaloupe, strawberries, watermelon, lettuce,  cabbage, celery, spinach, squash  80–89  Fruit juice, yogurt, apples, grapes, oranges, carrots,  broccoli, pears, pineapple  70–79  Bananas, avocados, cottage cheese, ricotta cheese, baked  potato, shrimp  60–69  Pasta, legumes, salmon, chicken breast  50–59  Ground beef, hot dogs, steak, feta cheese  40–49  Pizza  30–39  Cheddar cheese, bagels, bread  20–29  Pepperoni, cake, biscuits  10–19  Butter, margarine, raisins  1–9  Walnuts, dry-roasted peanuts, crackers, cereals, pretzels,  peanut butter  0  Oils, sugars  Source: National Nutrient Database for Standard Reference, Release  23. US Department of Agriculture, Agricultural Research Service.  http:/ /www.ars.usda.gov/ba/bhnrc/ndl. Updated 2010. Accessed  September 2017.  There is some debate over the a

### Text Processing

Splitting pages into sentences specifically 10 sentencess
# Using spaCy

In [7]:
!pip install spacy



In [8]:
from spacy.lang.en import English

nlp = English()

nlp.add_pipe('sentencizer')

doc = nlp("This is a sentence. this another sentence. I like Elephants")
assert len(list(doc.sents))==3

list(doc.sents)

[This is a sentence., this another sentence., I like Elephants]

In [9]:
for item in tqdm(pages_and_texts):
  item["sentences"] = list(nlp(item["text"]).sents)

  #check if all sentences are string or not
  item["sentences"] = [str(sentence) for sentence in item["sentences"]]

  item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [10]:
random.sample(pages_and_texts, k=1)

[{'page_number': 1163,
  'page_char_count': 1617,
  'page_word_count': 254,
  'page_sentence_count': 22,
  'page_token_count': 404.25,
  'text': 'Images\xa0/ Pixabay License; “Pumpkin Cartoon Orange” by  Clker-Free-Vector-Images\xa0/ Pixabay License; “Courgette  Zuchinni Curcubit” by Clker-Free-Vector-Images\xa0/ Pixabay  License; “Egg Hard Boiled Sliced” by Clker-Free-Vector- Images\xa0/ Pixabay License\xa0“Raisins Box Sweet” by jondometita\xa0/  Pixabay License  56. Exercise 12.1 reused “Socio-Ecological Model” by Allison  Calabrese  57. Exercise 12.2 reused “My Plate” by US Department of  Agriculture; “Healthy Eating in the Pacific” by Secretariat of the  Pacific Community (SPC) Guidelines; “Pacific Food Guide” by  Children’s Healthy Living Program  58. Exercise 13.1 reused “Baby” by Marie Kainoa Fialkowski Revilla;\xa0 “Birthday Boy Cake” by Kazuend” / Unsplash License  59. Exercise 14.1 reused Brytni K-aloha / CC BY 4.0; “Teenager  Playing American Football” by Riley McCullough / 

In [11]:
import pandas as pd

In [12]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,14.18,287.15,10.32
std,348.86,560.44,95.75,9.54,140.11,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.75,134.0,8.0,190.69,5.0
50%,562.5,1232.5,215.0,13.0,308.12,10.0
75%,864.25,1605.25,271.25,19.0,401.31,15.0
max,1166.0,2308.0,429.0,82.0,577.0,28.0


In [13]:
 #define split size to turn groups of sentence into chunks
 num_sentences_chunk_size = 10

 #creating a function to split lists of texts recursively into chunk size
 #e.g [30] -> [10,10,10]
 def split_list(input_list: list[str], slice_size: int = num_sentences_chunk_size) -> list[list[str]]:
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]
 test_list = list(range(25))
 split_list(test_list)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [14]:
for item in tqdm(pages_and_texts):
  item['sentence_chunks'] = split_list(input_list=item['sentences'],
                                             slice_size = num_sentences_chunk_size)
  item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [15]:
random.sample(pages_and_texts, k=1)

[{'page_number': 315,
  'page_char_count': 1382,
  'page_word_count': 241,
  'page_sentence_count': 13,
  'page_token_count': 345.5,
  'text': 'Image by  Allison  Calabrese /  CC BY 4.0  layer in cell membranes, thus effectively protecting the inside of the  cell from the outside environment while at the same time allowing  for transport of fat and water through the membrane.  Figure 5.7 The Structure of a Phospholipid  Phospholipids are ideal emulsifiers that can keep oil and water  mixed. Emulsions are mixtures of two liquids that do not mix.  Without emulsifiers, the fat and water content would be somewhat  separate within food. Lecithin (phosphatidylcholine), found in egg  yolk, honey, and mustard, is a popular food emulsifier. Mayonnaise  demonstrates lecithin’s ability to blend vinegar and oil to create the  stable, spreadable condiment that so many enjoy. Food emulsifiers  play an important role in making the appearance of food appetizing.  Adding emulsifiers to sauces and cream

In [16]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,14.18,287.15,10.32,1.53
std,348.86,560.44,95.75,9.54,140.11,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.75,134.0,8.0,190.69,5.0,1.0
50%,562.5,1232.5,215.0,13.0,308.12,10.0,1.0
75%,864.25,1605.25,271.25,19.0,401.31,15.0,2.0
max,1166.0,2308.0,429.0,82.0,577.0,28.0,3.0


# Spilitting each chunk into its own item
We will embed each chunk of sentences into its own numerical representation

In [17]:
import re

pages_and_chunks = []

for item in tqdm(pages_and_texts):
  for sentence_chunk in item["sentence_chunks"]:
    chunk_dict = {}
    chunk_dict["page_number"] = item["page_number"]

    #join the sentences together into a paragraph-like structure aka join list of sentences into one paragraph
    joined_sentence_chunk = "".join(sentence_chunk).replace(" ", " ").strip()
    joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)

    chunk_dict["sentence_chunk"] = joined_sentence_chunk

    #Get Some Stats on our chunks
    chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
    chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
    chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = -4 chars

    pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)



  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [18]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 743,
  'sentence_chunk': 'Nutrients?”visit, http:/ /www.gotnutrients.net. To receive  the “Daily Tips” by email, visit  http:/ /www.gotnutrients.net/email_alerts/subscribe.cfm  Learning Activities  Technology Note: The second edition of the Human  Nutrition Open Educational Resource (OER) textbook  features interactive learning activities.\xa0 These activities are  available in the web-based textbook and not available in the  downloadable versions (EPUB, Digital PDF, Print_PDF, or  Open Document). Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome. \xa0 An interactive or media element has been  excluded from this version of the text. You can  view it online here:  Building Healthy Eating Patterns  |  743',
  'chunk_char_count': 854,
  'chunk_word_count': 133,
  'chunk_token_count': 213.5}]

In [19]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,750.4,128.29,187.6
std,347.79,455.59,79.84,113.9
min,-41.0,14.0,4.0,3.5
25%,280.5,321.5,51.5,80.38
50%,586.0,764.0,131.0,191.0
75%,890.0,1138.0,194.0,284.5
max,1166.0,1870.0,412.0,467.5


In [20]:
min_token_length = 30
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020  Edition  UNIVERSITY OF HAWAI‘I AT MĀNOA  FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM  ALAN TITCHENAL, SKYLAR HARA,  NOEMI ARCEO CAACBAY, WILLIAM  MEINKE-LAU, YA-YUN YANG, MARIE  KAINOA FIALKOWSKI REVILLA,  JENNIFER DRAPER, GEMADY  LANGFELDER, CHERYL GIBBY, CHYNA  NICOLE CHUN, AND ALLISON  CALABRESE',
  'chunk_char_count': 320,
  'chunk_word_count': 54,
  'chunk_token_count': 80.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program is licensed under a Creative Commons Attribution 4.0  International License, except where otherwise noted.',
  'chunk_char_count': 212,
  'chunk_word_count': 32,
  'chunk_token_count': 53.0}]

In [21]:
## 1.56 has been done

# Embedding our texts chunks
Numerical representation of our texts

In [22]:
!pip install sentence-transformers # for embedding models

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

In [23]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device="cuda")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [24]:
#create a list of sentences
sentences = [
    "The Sentences Transformers library provides an easy and open-source way to create embeddings.",
    "Sentences can be embedded one by one or as a list of strings.",
    "Embeddings are one of the most powerful concepts in machine learning!",
    "Learn to use embeddings well and you'll be well on your way to being an AI engineer."
]

# Sentences are encoded/embedded by calling model.encode()
embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

# See the embeddings
for sentence, embedding in embeddings_dict.items():
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: The Sentences Transformers library provides an easy and open-source way to create embeddings.
Embedding: [-2.07981113e-02  3.03164795e-02 -2.01218147e-02  6.86483756e-02
 -2.55255271e-02 -8.47689621e-03 -2.07084100e-04 -6.32377341e-02
  2.81606186e-02 -3.33352946e-02  3.02635301e-02  5.30720539e-02
 -5.03526032e-02  2.62288153e-02  3.33314314e-02 -4.51578870e-02
  3.63044329e-02 -1.37113058e-03 -1.20171346e-02  1.14946300e-02
  5.04510924e-02  4.70857024e-02  2.11912952e-02  5.14607318e-02
 -2.03746632e-02 -3.58889513e-02 -6.67892222e-04 -2.94393133e-02
  4.95858938e-02 -1.05639584e-02 -1.52014289e-02 -1.31752621e-03
  4.48197350e-02  1.56022953e-02  8.60380283e-07 -1.21392391e-03
 -2.37978548e-02 -9.09427938e-04  7.34480796e-03 -2.53931968e-03
  5.23369759e-02 -4.68043573e-02  1.66214537e-02  4.71578874e-02
 -4.15599234e-02  9.01929627e-04  3.60278897e-02  3.42214443e-02
  9.68227461e-02  5.94828576e-02 -1.64984558e-02 -3.51249650e-02
  5.92514267e-03 -7.08006672e-04 -2.4103

In [25]:
%%time

# running embedding model on gpu
embedding_model.to("cuda")
for item in tqdm(pages_and_chunks_over_min_token_len):
  item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/1685 [00:00<?, ?it/s]

CPU times: user 35.4 s, sys: 250 ms, total: 35.7 s
Wall time: 48.3 s


In [26]:
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

In [27]:
%%time

#embed all texts in batches
text_chunk_embeddings = embedding_model.encode(text_chunks, batch_size=32, convert_to_tensor=True)
text_chunk_embeddings

CPU times: user 26.9 s, sys: 64.7 ms, total: 26.9 s
Wall time: 26.6 s


tensor([[ 0.0674,  0.0902, -0.0051,  ..., -0.0221, -0.0232,  0.0126],
        [ 0.0552,  0.0592, -0.0166,  ..., -0.0120, -0.0103,  0.0227],
        [ 0.0280,  0.0340, -0.0206,  ..., -0.0054,  0.0213,  0.0313],
        ...,
        [ 0.0771,  0.0098, -0.0122,  ..., -0.0409, -0.0752, -0.0241],
        [ 0.1030, -0.0165,  0.0083,  ..., -0.0574, -0.0283, -0.0295],
        [ 0.0864, -0.0125, -0.0113,  ..., -0.0522, -0.0337, -0.0299]],
       device='cuda:0')

# save embedding to a file

In [28]:
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [29]:
# Import saved file and view
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF ...,320,54,80.0,[ 6.74242675e-02 9.02281404e-02 -5.09548886e-...
1,-38,Human Nutrition: 2020 Edition by University of...,212,32,53.0,[ 5.52156419e-02 5.92139773e-02 -1.66167244e-...
2,-37,Contents Preface University of Hawai‘i at Mā...,797,145,199.25,[ 2.79801842e-02 3.39813754e-02 -2.06426680e-...
3,-36,Lifestyles and Nutrition University of Hawai‘...,976,177,244.0,[ 6.82566911e-02 3.81275006e-02 -8.46854132e-...
4,-35,The Cardiovascular System University of Hawai...,1037,191,259.25,[ 3.30264494e-02 -8.49763490e-03 9.57159605e-...


# RAG SEARCH AND ANSWERS

RAG Goal: Retrieve relevant passages based on query and use those passages

In [30]:
import random
import torch
import numpy as np
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"

#import texts and embedding df
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

#convert embedding from string to array
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

#convert to torch tensors
embeddings = torch.tensor(np.stack(text_chunks_and_embedding_df["embedding"].tolist(), axis=0), dtype=torch.float32).to("cuda")

#convert text and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

text_chunks_and_embeddings_df

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF ...,320,54,80.00,"[0.06742427, 0.09022814, -0.005095489, -0.0317..."
1,-38,Human Nutrition: 2020 Edition by University of...,212,32,53.00,"[0.05521564, 0.059213977, -0.016616724, -0.020..."
2,-37,Contents Preface University of Hawai‘i at Mā...,797,145,199.25,"[0.027980184, 0.033981375, -0.020642668, 0.001..."
3,-36,Lifestyles and Nutrition University of Hawai‘...,976,177,244.00,"[0.06825669, 0.0381275, -0.008468541, -0.01813..."
4,-35,The Cardiovascular System University of Hawai...,1037,191,259.25,"[0.03302645, -0.008497635, 0.009571596, -0.004..."
...,...,...,...,...,...,...
1680,1164,Flashcard Images Note: Most images in the fla...,1329,200,332.25,"[0.018562254, -0.016427767, -0.012704563, -0.0..."
1681,1164,Hazard Analysis Critical Control Points reused...,383,59,95.75,"[0.03347206, -0.057044085, 0.015148939, -0.010..."
1682,1165,ShareAlike 11. Organs reused “Pancreas Organ ...,1312,199,328.00,"[0.07705155, 0.009785576, -0.012181741, 0.0010..."
1683,1165,Sucrose reused “Figure 03 02 05” by OpenStax B...,420,69,105.00,"[0.10304516, -0.016470186, 0.008268461, 0.0377..."


In [31]:

embeddings

tensor([[ 0.0674,  0.0902, -0.0051,  ..., -0.0221, -0.0232,  0.0126],
        [ 0.0552,  0.0592, -0.0166,  ..., -0.0120, -0.0103,  0.0227],
        [ 0.0280,  0.0340, -0.0206,  ..., -0.0054,  0.0213,  0.0313],
        ...,
        [ 0.0771,  0.0098, -0.0122,  ..., -0.0409, -0.0752, -0.0241],
        [ 0.1030, -0.0165,  0.0083,  ..., -0.0574, -0.0283, -0.0295],
        [ 0.0864, -0.0125, -0.0113,  ..., -0.0522, -0.0337, -0.0299]],
       device='cuda:0')

In [32]:
embeddings.shape

torch.Size([1685, 768])

In [33]:
#create Model
# !Again
from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device=device)



# Embedding model Ready!

Lets Create a Semantic Pipeline
Means search for relavant text and it will find it in document  

In [34]:
# query

# query = "macronutrients functions"
# print(f"Query: {query}")

# query_embedding = embedding_model.encode(query, convert_to_tensor=True).to("cuda")

# # Get Similarity score with dot product
# from time import perf_counter as timer

# start_time = timer()
# dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
# end_time = timer()
# print(f"[Info] Time taken to get scores {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

# # top-k results
# top_results_dot_product = torch.topk(dot_scores, k=5)
# top_results_dot_product


In [35]:
#pages_and_chunks[42]

In [36]:
# Define helper function to print wrapped text
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [37]:

# print(f"Query: '{query}'\n")
# print("Results:")
# # Loop through zipped together scores and indicies from torch.topk
# for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
#     print(f"Score: {score:.4f}")
#     # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
#     print("Text:")
#     print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
#     # Print the page number too so we can reference the textbook further (and check the results)
#     print(f"Page number: {pages_and_chunks[idx]['page_number']}")
#     print("\n")


In [38]:

# import fitz

# # Open PDF and load target page
# pdf_path = "human-nutrition-text.pdf" # requires PDF to be downloaded
# doc = fitz.open(pdf_path)
# page = doc.load_page(5 + 41) # number of page (our doc starts page numbers on page 41)

# # Get the image of the page
# img = page.get_pixmap(dpi=300)

# # Optional: save the image
# #img.save("output_filename.png")
# doc.close()

# # Convert the Pixmap to a numpy array
# img_array = np.frombuffer(img.samples_mv,
#                           dtype=np.uint8).reshape((img.h, img.w, img.n))

# # Display the image using Matplotlib
# import matplotlib.pyplot as plt
# plt.figure(figsize=(13, 10))
# plt.imshow(img_array)
# plt.title(f"Query: '{query}' | Most relevant page:")
# plt.axis('off') # Turn off axis
# plt.show()

## similarity measures
#### dot_product or cosine similarity

In [39]:
# import torch

# def dot_product(vector1, vector2):
#   return torch.dot(vector1, vector2)

# def cosine_similarity(vector1, vector2):
#   dot = torch.dot(vector1, vector2)
#   norm1 = torch.sqrt(torch.sum(vector1**2))
#   norm2 = torch.sqrt(torch.sum(vector2**2))
#   return dot/ (norm1*norm2)


In [40]:
# # Example tensors
# vector1 = torch.tensor([1, 2, 3], dtype=torch.float32)
# vector2 = torch.tensor([1, 2, 3], dtype=torch.float32)
# vector3 = torch.tensor([4, 5, 6], dtype=torch.float32)
# vector4 = torch.tensor([-1, -2, -3], dtype=torch.float32)

# # Calculate dot product
# print("Dot product between vector1 and vector2:", dot_product(vector1, vector2))
# print("Dot product between vector1 and vector3:", dot_product(vector1, vector3))
# print("Dot product between vector1 and vector4:", dot_product(vector1, vector4))

# # Calculate cosine similarity
# print("Cosine similarity between vector1 and vector2:", cosine_similarity(vector1, vector2))
# print("Cosine similarity between vector1 and vector3:", cosine_similarity(vector1, vector3))
# print("Cosine similarity between vector1 and vector4:", cosine_similarity(vector1, vector4))

In [41]:
from time import perf_counter as timer


In [42]:
def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    """
    Embeds a query with model and returns top k scores and indices from embeddings.
    """

    # Embed the query
    query_embedding = model.encode(query,
                                   convert_to_tensor=True)

    # Get dot product scores on embeddings
    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

    scores, indices = torch.topk(input=dot_scores,
                                 k=n_resources_to_return)

    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict]=pages_and_chunks,
                                 n_resources_to_return: int=5):
    """
    Takes a query, retrieves most relevant resources and prints them out in descending order.

    Note: Requires pages_and_chunks to be formatted in a specific way (see above for reference).
    """

    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)

    print(f"Query: {query}\n")
    print("Results:")
    # Loop through zipped together scores and indicies
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
        print_wrapped(pages_and_chunks[index]["sentence_chunk"])
        # Print the page number too so we can reference the textbook further and check the results
        print(f"Page number: {pages_and_chunks[index]['page_number']}")
        print("\n")

In [43]:
# #query = "symptoms of pellagra"
#  Get just the scores and indices of top related results
# scores, indices = retrieve_relevant_resources(query=query,
#                                               embeddings=embeddings)
# scores, indices


In [44]:
# # Print out the texts of the top scores
# print_top_results_and_scores(query=query,
#                              embeddings=embeddings)

In [45]:
# # Getting an LLM for local developement
# !nvidia-smi

In [46]:
import torch
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"available memory: {gpu_memory_gb}")

available memory: 15


In [47]:
# Note: the following is Gemma focused, however, there are more and more LLMs of the 2B and 7B size appearing for local use.
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb > 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

GPU memory: 15 | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.
use_quantization_config set to: False
model_id set to: google/gemma-2b-it


# loading a LLM Locally
We can Load an LLM locally using Hugging face ```transformers```
using gemma-2b-it

1. Qunatization(Optional)
2. A model ID - this will tell which tokenizer to use
3. A tokenizer - this turns text into numbers ready for llm
4. An LLM model - this will be what we use to generate text based on an input

In [48]:
torch.cuda.get_device_capability(0)

(7, 5)

In [49]:
pip install --upgrade huggingface_hub



In [51]:
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [52]:
!git config --global credential.helper store

In [53]:
!pip install bitsandbytes accelerate
# don't have option for flash-attn on colab because of t4 gpu have cuda version less than 8.0
!pip install flash-attn

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bitsandbytes, accelerate
Successfully installed accelerate-0.31.0 bitsandbytes-0.43.1
Collecting flash-attn
  Downloading flash_attn-2.5.9.post1.tar.gz (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting einops (from flash-attn)
  Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m6.9 MB/s[0m eta [36

In [54]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available

# 1. Create quantization config for smaller model loading (optional)
# Requires !pip install bitsandbytes accelerate, see: https://github.com/TimDettmers/bitsandbytes, https://huggingface.co/docs/accelerate/
# For models that require 4-bit quantization (use this if you have low GPU memory available)
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

# Bonus: Setup Flash Attention 2 for faster inference, default to "sdpa" or "scaled dot product attention" if it's not available
# Flash Attention 2 requires NVIDIA GPU compute capability of 8.0 or above, see: https://developer.nvidia.com/cuda-gpus
# Requires !pip install flash-attn, see: https://github.com/Dao-AILab/flash-attention
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")

# 2. Pick a model we'd like to use (this will depend on how much GPU memory you have available)
model_id = "google/gemma-2b-it"
model_id = model_id # (we already set this above)
print(f"[INFO] Using model_id: {model_id}")

# 3. Instantiate tokenizer (tokenizer turns text into numbers ready for the model)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

# 4. Instantiate the model
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                                 torch_dtype=torch.float16, # datatype to use, we want float16
                                                 quantization_config=quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage=False, # use full memory
                                                 attn_implementation=attn_implementation) # which attention version to use

if not use_quantization_config: # quantization takes care of device setting automatically, so if it's not used, send model to GPU
    llm_model.to("cuda")

[INFO] Using attention implementation: sdpa
[INFO] Using model_id: google/gemma-2b-it


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [55]:
llm_model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
      )
    )
    (norm): GemmaR

In [56]:
def get_model_num_params(model: torch.nn.Module):
  return sum([param.numel() for param in model.parameters()])
get_model_num_params(llm_model)

2506172416

# Generating text from llm

In [57]:
# input_text = "What are MicroNutrient, and what roles do they play in human body"
# print(f"input text: \n {input_text} ")

# #create prompt template for instruction-tuned model

# dialogue_temp = [
#     {
#         "role": "user",
#         "content": input_text
#     }
# ]

# prompt = tokenizer.apply_chat_template(conversation=dialogue_temp, tokenize=False, add_generation_prompt=True)

# print(f"prompt: (formatted) \n {prompt}")

In [58]:
# %%time

# input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

# #Generate Outputs

# outputs = llm_model.generate(**input_ids,
#     max_new_tokens=256)
# print(f"model output (tokens):\n{outputs[0]}\n")

In [59]:
# #decode the output token to text
# output_decoded = tokenizer.decode(outputs[0])
# print(f"model output (text):\n{output_decoded}")

In [60]:
print("helo")

helo


In [61]:
# Nutrition-style questions generated with GPT4
gpt4_questions = [
    "What are the macronutrients, and what roles do they play in the human body?",
    "How do vitamins and minerals differ in their roles and importance for health?",
    "Describe the process of digestion and absorption of nutrients in the human body.",
    "What role does fibre play in digestion? Name five fibre containing foods.",
    "Explain the concept of energy balance and its importance in weight management."
]

# Manually created question list
manual_questions = [
    "How often should infants be breastfed?",
    "What are symptoms of pellagra?",
    "How does saliva help with digestion?",
    "What is the RDI for protein per day?",
    "water soluble vitamins"
]

query_list = gpt4_questions + manual_questions

In [62]:
import random
query = random.choice(query_list)

print(f"Query: {query}")

# Get just the scores and indices of top related results
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

Query: How often should infants be breastfed?
[INFO] Time taken to get scores on 1685 embeddings: 0.03225 seconds.


(tensor([0.6205, 0.6067, 0.5696, 0.5624, 0.5307], device='cuda:0'),
 tensor([1155, 1164, 1148, 1142, 1159], device='cuda:0'))

# augmenting our prompt with context items


In [78]:
def prompt_formatter(query: str,
                     context_items: list[dict]) -> str:
                     context = "-" + "\n- ".join([item["sentence_chunk"] for item in context_items])
                     base_prompt = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.
\nExample 1:
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
\nExample 2:
Query: What are the causes of type 2 diabetes?
Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
\nExample 3:
Query: What is the importance of hydration for physical performance?
Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""
                     base_prompt = base_prompt.format(context=context, query=query)
                     #create  prompt template
                     dialoge_template = [
                         {"role": "user",
                          "content": base_prompt}
                     ]
                     prompt = tokenizer.apply_chat_template(conversation=dialoge_template, tokenize=False, add_generation_prompt=True)
                     return prompt


query = random.choice(query_list)
print(f"Query: {query}")

#Get Relavant Scode
scores, indices = retrieve_relevant_resources(query=query, embeddings=embeddings)

#create a list of context items
context_items = [pages_and_chunks[i] for i in indices]

prompt = prompt_formatter(query=query, context_items=context_items)
print(prompt)


Query: What are the macronutrients, and what roles do they play in the human body?
[INFO] Time taken to get scores on 1685 embeddings: 0.00008 seconds.
<bos><start_of_turn>user
Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.

Example 1:
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin 

In [79]:
%%time

input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

#generate output of tokens
outputs = llm_model.generate(**input_ids,
                              temperature=0.7,
                             do_sample=True,
                             max_new_tokens=256
                             )
#turn the output tokens into text
output_text = tokenizer.decode(outputs[0])
print(f"Query: {query}")
print(f"Rag Answer: \m{output_text.replace(prompt, '')}")


Query: What are the macronutrients, and what roles do they play in the human body?
Rag Answer: \m<bos>Sure, here are the relevant passages from the context that answer the user's query:

**Macronutrients**

* Carbohydrates: Provide energy for cellular functions, serve as structural components for cells, and help regulate blood sugar levels.
* Lipids: Provide stored energy for the body, function as structural components of cells, and act as signaling molecules.
* Proteins: Are necessary for tissue formation, cell repair, and hormone and enzyme production.

These macronutrients contribute to the body's energy production, cellular structure, and overall health.<eos>
CPU times: user 4.1 s, sys: 6.12 ms, total: 4.11 s
Wall time: 4.11 s


In [88]:
def ask(query,
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True,
        return_answer_only=True):
    """
    Takes a query, finds relevant resources/context and generates an answer to the query based on the relevant resources.
    """

    # Get just the scores and indices of top related results
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings)

    # Create a list of context items
    context_items = [pages_and_chunks[i] for i in indices]

    # Add score to context item
    for i, item in enumerate(context_items):
        item["score"] = scores[i].cpu() # return score back to CPU

    # Format the prompt with context items
    prompt = prompt_formatter(query=query,
                              context_items=context_items)

    # Tokenize the prompt
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generate an output of tokens
    outputs = llm_model.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)

    # Turn the output tokens into text
    output_text = tokenizer.decode(outputs[0])

    if format_answer_text:
        # Replace special tokens and unnecessary help message
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "").replace("Sure, here is the answer to the user query:\n\n", "")

    # Only return the answer without the context items
    if return_answer_only:
        return output_text

    return output_text, context_items

In [90]:
query = random.choice(query_list)
print(f"Query: {query}")

# Answer query with context and return context
answer, context_items = ask(query=query,
                            temperature=0.7,
                            max_new_tokens=512,
                            return_answer_only=False)

print(f"Answer:\n")
print_wrapped(answer)



Query: What are the macronutrients, and what roles do they play in the human body?
[INFO] Time taken to get scores on 1685 embeddings: 0.00010 seconds.
Answer:

Sure, here are the relevant passages from the context:  **Macronutrients**  -
Carbohydrates are molecules composed of carbon, hydrogen, and oxygen. - Lipids
are molecules composed of carbon, hydrogen, and oxygen. - Proteins are
macromolecules composed of chains of subunits called amino acids.  **Water** -
Water is a macronutrient in the sense that you require a large amount of it, but
unlike the other macronutrients, it does not yield calories. - Water does not
contain carbon, but is composed of two hydrogen and one oxygen per molecule of
water. - More than 60 percent of your total body weight is water.
