In [1]:
# print("[INFO] Running in VSCODE, installing requirements.")
# !pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121
# !pip install PyMuPDF 
# !pip install tqdm
# !pip install sentence-transformers 
# !pip install accelerate 
# !pip install bitsandbytes 
# !pip install flash-attn --no-build-isolation 
# !pip install spacy
# !pip install pandas

In [2]:
# Requires !pip install PyMuPDF, see: https://github.com/pymupdf/pymupdf
import fitz # (pymupdf, found this is better than pypdf for our use case, note: licence is AGPL-3.0, keep that in mind if you want to use any code commercially)
from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm 

def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip() # note: this might be different for each doc (best to experiment)

    # Other potential text formatting functions can go here
    return cleaned_text

# Open PDF and get lines/pages
# Note: this only focuses on text, rather than images/figures etc
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: A list of dictionaries, each containing the page number
        (adjusted), character count, word count, sentence count, token count, and the extracted text
        for each page.
    """
    doc = fitz.open(pdf_path)  # open a document
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):  # iterate the document pages
        text = page.get_text()  # get plain text encoded as UTF-8
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number - 0,  # adjust page numbers since our PDF starts on page 0
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                "text": text})
    return pages_and_texts

pdf_path = "harrypotter.pdf"
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

  from .autonotebook import tqdm as notebook_tqdm
3623it [00:03, 1087.77it/s]


[{'page_number': 0,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': 1,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [3]:
# pages_and_texts[:][12]['text']

In [4]:
import random

random.sample(pages_and_texts, k=3)

[{'page_number': 1493,
  'page_char_count': 2276,
  'page_word_count': 457,
  'page_sentence_count_raw': 64,
  'page_token_count': 569.0,
  'text': 'Voldemort let the silence spiral horribly before continuing. “Only one power remained to me. I could possess the bodies of others. But I dared not go where other humans were plentiful, for I knew that the Aurors were still abroad and searching for me. I sometimes inhabited animals — snakes, of course, being my preference — but I was little better off inside them than as pure spirit, for their bodies were ill adapted to perform magic . . . and my possession of them shortened their lives; none of them lasted long. . . . “Then . . . four years ago . . . the means for my return seemed assured. A wizard — young, foolish, and gullible — wandered across my path in the forest I had made my home. Oh, he seemed the very chance I had been dreaming of . . . for he was a teacher at Dumbledore’s school . . . he was easy to bend to my will . . . he broug

In [5]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,0,0,1,1,0.0,
1,1,0,1,1,0.0,
2,2,0,1,1,0.0,
3,3,0,1,1,0.0,
4,4,0,1,1,0.0,


In [6]:
# Get stats
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,3623.0,3623.0,3623.0,3623.0,3623.0
mean,1811.0,1731.71,310.02,22.82,432.93
std,1046.01,393.21,71.15,10.75,98.3
min,0.0,0.0,1.0,1.0,0.0
25%,905.5,1635.5,290.5,16.0,408.88
50%,1811.0,1814.0,324.0,21.0,453.5
75%,2716.5,1965.0,351.0,27.0,491.25
max,3622.0,2432.0,463.0,90.0,608.0


In [7]:
from spacy.lang.en import English # see https://spacy.io/usage for install instructions

nlp = English()

# Add a sentencizer pipeline, see https://spacy.io/api/sentencizer/ 
nlp.add_pipe("sentencizer")

# Create a document instance as an example
doc = nlp("This is a sentence. This another sentence.")
assert len(list(doc.sents)) == 2

# Access the sentences of the document
list(doc.sents)

[This is a sentence., This another sentence.]

In [8]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    # Count the sentences 
    item["page_sentence_count_spacy"] = len(item["sentences"])

100%|██████████| 3623/3623 [00:04<00:00, 726.83it/s]


In [9]:
# Inspect an example
random.sample(pages_and_texts, k=1)

[{'page_number': 1816,
  'page_char_count': 2230,
  'page_word_count': 375,
  'page_sentence_count_raw': 23,
  'page_token_count': 557.5,
  'text': 'homework ever. It was the same, if not worse, in Transfiguration. “You cannot pass an O.W.L.,” said Professor McGonagall grimly, “without serious application, practice, and study. I see no reason why everybody in this class should not achieve an O.W.L. in Transfiguration as long as they put in the work.” Neville made a sad little disbelieving noise. “Yes, you too, Longbottom,” said Professor McGonagall. “There’s nothing wrong with your work except lack of confidence. So . . . today we are starting Vanishing Spells. These are easier than Conjuring Spells, which you would not usually attempt until N.E.W.T. level, but they are still among the most difficult magic you will be tested on in your O.W.L.” She was quite right; Harry found the Vanishing Spells horribly difficult. By the end of a double period, neither he nor Ron had managed to vanis

In [10]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,3623.0,3623.0,3623.0,3623.0,3623.0,3623.0
mean,1811.0,1731.71,310.02,22.82,432.93,24.12
std,1046.01,393.21,71.15,10.75,98.3,7.58
min,0.0,0.0,1.0,1.0,0.0,0.0
25%,905.5,1635.5,290.5,16.0,408.88,20.0
50%,1811.0,1814.0,324.0,21.0,453.5,25.0
75%,2716.5,1965.0,351.0,27.0,491.25,29.0
max,3622.0,2432.0,463.0,90.0,608.0,50.0


In [11]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10 

# Create a function that recursively splits a list into desired sizes
def split_list(input_list: list, 
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).

    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

100%|██████████| 3623/3623 [00:00<00:00, 278681.84it/s]


In [12]:
# Sample an example from the group (note: many samples have only 1 chunk as they have <=10 sentences total)
random.sample(pages_and_texts, k=1)

[{'page_number': 620,
  'page_char_count': 1810,
  'page_word_count': 317,
  'page_sentence_count_raw': 18,
  'page_token_count': 452.5,
  'text': 'Ron lifted Scabbers out of his inside pocket and placed him next to the cage of his fellow rats, who stopped their skipping tricks and scuffled to the wire for a better look. Like nearly everything Ron owned, Scabbers the rat was second-hand (he had once belonged to Ron’s brother Percy) and a bit battered. Next to the glossy rats in the cage, he looked especially woebegone. “Hm,” said the witch, picking up Scabbers. “How old is this rat?” “Dunno,” said Ron. “Quite old. He used to belong to my brother.” “What powers does he have?” said the witch, examining Scabbers closely. “Er —” The truth was that Scabbers had never shown the faintest trace of interesting powers. The witch’s eyes moved from Scabbers’s tattered left ear to his front paw, which had a toe missing, and tutted loudly. “He’s been through the mill, this one,” she said. “He was li

In [13]:
# Create a DataFrame to get stats
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,3623.0,3623.0,3623.0,3623.0,3623.0,3623.0,3623.0
mean,1811.0,1731.71,310.02,22.82,432.93,24.12,2.87
std,1046.01,393.21,71.15,10.75,98.3,7.58,0.79
min,0.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,905.5,1635.5,290.5,16.0,408.88,20.0,2.0
50%,1811.0,1814.0,324.0,21.0,453.5,25.0,3.0
75%,2716.5,1965.0,351.0,27.0,491.25,29.0,3.0
max,3622.0,2432.0,463.0,90.0,608.0,50.0,5.0


In [14]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

100%|██████████| 3623/3623 [00:00<00:00, 20675.37it/s]


10396

In [15]:
# View a random sample
random.sample(pages_and_chunks, k=1)

[{'page_number': 3557,
  'sentence_chunk': 'A frightened teenage boy is a danger to others as well as to himself. Offer him help and guidance, he ought to accept, he likes you —” “— much less since his father has lost favor. Draco blames me, he thinks I have usurped Lucius’s position.” “All the same, try. I am concerned less for myself than for accidental victims of whatever schemes might occur to the boy. Ultimately, of course, there is only one thing to be done if we are to save him from Lord',
  'chunk_char_count': 455,
  'chunk_word_count': 87,
  'chunk_token_count': 113.75}]

In [16]:
# Get stats about our chunks
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,10396.0,10396.0,10396.0,10396.0
mean,1793.17,601.16,106.35,150.29
std,1045.34,306.58,54.11,76.65
min,5.0,1.0,1.0,0.25
25%,887.0,400.0,71.0,100.0
50%,1759.5,579.0,103.0,144.75
75%,2699.0,779.0,138.0,194.75
max,3622.0,2336.0,399.0,584.0


In [17]:
# Show random chunks with under 30 tokens in length
min_token_length = 0
for row in df[df["chunk_token_count"] <= min_token_length].iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

In [18]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': 5,
  'sentence_chunk': 'CONTENTS Harry Potter and the Sorcerer’s Stone Harry Potter and the Chamber of Secrets Harry Potter and the Prisoner of Azkaban Harry Potter and the Goblet of Fire Harry Potter and the Order of the Phoenix Harry Potter and the Half-Blood Prince Harry Potter and the Deathly Hallows',
  'chunk_char_count': 281,
  'chunk_word_count': 48,
  'chunk_token_count': 70.25},
 {'page_number': 8,
  'sentence_chunk': 'FOR JESSICA, WHO LOVES STORIES, FOR ANNE, WHO LOVED THEM TOO; AND FOR DI, WHO HEARD THIS ONE FIRST.',
  'chunk_char_count': 99,
  'chunk_word_count': 19,
  'chunk_token_count': 24.75}]

In [19]:
# !pip install sentence-transformers
import torch
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", 
                                      device="cuda") # choose the device to load the model to (note: GPU will often be *much* faster than CPU)
# Check if CUDA is available and set the device accordingly
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Print the device
print(f"Using device: {device}")



Using device: cuda


In [20]:
%%time

# Send the model to the GPU
embedding_model.to("cuda") # requires a GPU installed, for reference on my local machine, I'm using a NVIDIA RTX 4090

# Create embeddings one by one on the GPU
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

100%|██████████| 10396/10396 [03:58<00:00, 43.65it/s]

CPU times: total: 11min 51s
Wall time: 3min 58s





In [21]:
# Save embeddings to file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)
# , escapechar='\\'

In [22]:
len(text_chunks_and_embeddings_df)

10396

In [23]:
# Import saved file and view
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,5,CONTENTS Harry Potter and the Sorcerer’s Stone...,281,48,70.25,[ 4.88150194e-02 3.13613489e-02 -1.05375061e-...
1,8,"FOR JESSICA, WHO LOVES STORIES, FOR ANNE, WHO ...",99,19,24.75,[ 1.68481730e-02 1.10594528e-02 -3.13987210e-...
2,9,CONTENTS ONE The Boy Who Lived TWO The Vanishi...,292,50,73.0,[ 4.38591167e-02 2.93046013e-02 -1.53083112e-...
3,10,The Mirror of Erised THIRTEEN Nicolas Flamel F...,176,26,44.0,[ 6.01046197e-02 5.54131642e-02 2.76092021e-...
4,11,M CHAPTER ONE THE BOY WHO LIVED r. and Mrs. D...,1289,230,322.25,[ 1.21132750e-02 1.85709447e-02 -2.47814637e-...


In [24]:
len(text_chunks_and_embedding_df_load['embedding'][0])

12480

In [25]:
import random

import torch
import numpy as np 
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# Import texts and embedding df
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# Convert texts and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Convert embeddings to torch tensor and send to device (note: NumPy arrays are float64, torch tensors are float32 by default)
embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

cuda


torch.Size([10396, 768])

In [26]:
text_chunks_and_embedding_df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,5,CONTENTS Harry Potter and the Sorcerer’s Stone...,281,48,70.25,"[0.0488150194, 0.0313613489, -0.0105375061, 0...."
1,8,"FOR JESSICA, WHO LOVES STORIES, FOR ANNE, WHO ...",99,19,24.75,"[0.016848173, 0.0110594528, -0.031398721, 0.00..."
2,9,CONTENTS ONE The Boy Who Lived TWO The Vanishi...,292,50,73.0,"[0.0438591167, 0.0293046013, -0.0153083112, 0...."
3,10,The Mirror of Erised THIRTEEN Nicolas Flamel F...,176,26,44.0,"[0.0601046197, 0.0554131642, 0.00276092021, 0...."
4,11,M CHAPTER ONE THE BOY WHO LIVED r. and Mrs. D...,1289,230,322.25,"[0.012113275, 0.0185709447, -0.0247814637, 0.0..."


In [27]:
embeddings[0]

tensor([ 4.8815e-02,  3.1361e-02, -1.0538e-02,  5.4614e-02, -3.0401e-02,
         6.9892e-02,  2.9673e-02, -3.7372e-03,  6.3727e-02, -1.1713e-02,
         1.8052e-02,  6.1999e-02,  5.3205e-02, -6.7199e-02,  8.7563e-02,
        -8.5908e-02,  2.5470e-02, -2.4550e-02, -8.8356e-02,  5.2592e-03,
        -3.6130e-02,  7.0244e-02, -2.2533e-02,  1.2999e-03, -5.0225e-03,
        -5.2841e-02, -2.7148e-02,  3.8182e-02, -3.0226e-03, -6.1530e-02,
         1.4622e-02, -6.0864e-02,  4.5739e-03,  5.1724e-02,  2.0775e-06,
        -2.4414e-02,  4.5324e-02, -1.4382e-03, -3.2586e-02, -2.8219e-02,
        -1.1079e-03,  4.9997e-02,  3.1503e-02, -6.7808e-02,  2.9233e-02,
        -2.5799e-02,  3.2630e-02,  5.9321e-02, -5.7071e-02,  1.9496e-02,
         1.6945e-02, -1.8168e-02, -4.3047e-02, -2.0095e-02,  9.7193e-02,
        -3.5849e-02, -1.8391e-02, -3.9925e-02,  2.7684e-02,  5.7206e-02,
         2.1256e-03,  4.3017e-03, -3.0411e-02, -3.7727e-02,  1.1235e-02,
         1.0682e-03, -2.7853e-03, -9.1708e-03, -5.7

In [28]:
from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", 
                                      device="cuda") # choose the device to load the model to



In [29]:
# 1. Define the query
# Note: This could be anything. But since we're working with a nutrition textbook, we'll stick with nutrition-based queries.
query = "Mrs. Figg"
print(f"Query: {query}")

# 2. Embed the query to the same numerical space as the text examples 
# Note: It's important to embed your query with the same model you embedded your examples with.
query_embedding = embedding_model.encode(query, convert_to_tensor=True)

# 3. Get similarity scores with the dot product (we'll time this for fun)
from time import perf_counter as timer

start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()

print(f"Time take to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

# 4. Get the top-k results (we'll keep this to 5)
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

Query: Mrs. Figg
Time take to get scores on 10396 embeddings: 0.00212 seconds.


torch.return_types.topk(
values=tensor([0.5111, 0.4965, 0.4642, 0.4500, 0.4498], device='cuda:0'),
indices=tensor([5041, 5040, 4721, 5046, 5048], device='cuda:0'))

In [30]:
# Define helper function to print wrapped text 
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [31]:
print(f"Query: '{query}'\n")
print("Results:")
# Loop through zipped together scores and indicies from torch.topk
for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
    print(f"Score: {score:.4f}")
    # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    # Print the page number too so we can reference the textbook further (and check the results)
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")

Query: 'Mrs. Figg'

Results:
Score: 0.5111
Text:
Full name?”said Fudge loudly, when Mrs. Figg had perched herself nervously on
the very edge of her seat. “Arabella Doreen Figg,” said Mrs. Figg in her quavery
voice. “And who exactly are you?”said Fudge, in a bored and lofty voice. “I’m a
resident of Little Whinging, close to where Harry Potter lives,” said Mrs. Figg.
“We have no record of any witch or wizard living in Little Whinging other than
Harry Potter,” said Madam Bones at once. “That situation has always been closely
monitored, given . . .given past events.” “I’m a Squib,” said Mrs. Figg. “
Page number: 1705


Score: 0.4965
Text:
“Oh, very well, very well,” snapped Fudge. “Where is this person?” “I brought
her with me,” said Dumbledore. “She’s just outside the door. Should I — ?” “No —
Weasley, you go,” Fudge barked at Percy, who got up at once, hurried down the
stone steps from the judge’s balcony, and hastened past Dumbledore and Harry
without glancing at them. A moment later, 

In [32]:
import torch

def dot_product(vector1, vector2):
    return torch.dot(vector1, vector2)

def cosine_similarity(vector1, vector2):
    dot_product = torch.dot(vector1, vector2)

    # Get Euclidean/L2 norm of each vector (removes the magnitude, keeps direction)
    norm_vector1 = torch.sqrt(torch.sum(vector1**2))
    norm_vector2 = torch.sqrt(torch.sum(vector2**2))

    return dot_product / (norm_vector1 * norm_vector2)

In [82]:
def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=10,
                                print_time: bool=True):
    """
    Embeds a query with model and returns top k scores and indices from embeddings.
    """

    # Embed the query
    query_embedding = model.encode(query, 
                                   convert_to_tensor=True) 

    # Get dot product scores on embeddings
    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

    scores, indices = torch.topk(input=dot_scores, 
                                 k=n_resources_to_return)

    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict]=pages_and_chunks,
                                 n_resources_to_return: int=10):
    """
    Takes a query, retrieves most relevant resources and prints them out in descending order.

    Note: Requires pages_and_chunks to be formatted in a specific way (see above for reference).
    """
    
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)
    
    print(f"Query: {query}\n")
    print("Results:")
    # Loop through zipped together scores and indicies
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
        print_wrapped(pages_and_chunks[index]["sentence_chunk"])
        # Print the page number too so we can reference the textbook further and check the results
        print(f"Page number: {pages_and_chunks[index]['page_number']}")
        print("\n")

In [34]:
query = "Ministry of Magic"

# Get just the scores and indices of top related results
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

[INFO] Time taken to get scores on 10396 embeddings: 0.00008 seconds.


(tensor([0.4826, 0.4691, 0.4648, 0.4644, 0.4465], device='cuda:0'),
 tensor([4655, 9121, 6704, 7087, 3597], device='cuda:0'))

In [35]:
# Print out the texts of the top scores
print_top_results_and_scores(query=query,
                             embeddings=embeddings)

[INFO] Time taken to get scores on 10396 embeddings: 0.00007 seconds.
Query: Ministry of Magic

Results:
Score: 0.4826
CONTENTS   ONE Dudley Demented  TWO A Peck of Owls  THREE The Advance Guard
FOUR Number Twelve, Grimmauld Place  FIVE The Order of the Phoenix  SIX The
Noble and Most Ancient House of Black  SEVEN The Ministry of Magic
Page number: 1564


Score: 0.4691
standing in water, his shoes, feet, and robes remained quite dry. He reached up,
pulled the chain, and next moment had zoomed down a short chute, emerging out of
a fireplace into the Ministry of Magic. He got up clumsily; there was a lot more
of his body than he was accustomed to. The great Atrium seemed darker than Harry
remembered it. Previously a golden fountain had filled the center of the hall,
casting shimmering spots of light over the polished wooden floor and walls. Now
a gigantic statue of black stone dominated the scene. It was rather frightening,
this vast sculpture of a witch and a wizard sitting on ornately 

In [36]:
# Get GPU available memory
import torch
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"Available GPU memory: {gpu_memory_gb} GB")

Available GPU memory: 4 GB


In [37]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available 
from transformers import BitsAndBytesConfig
# Suppress symlink warning by setting environment variable
import os
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

use_quantization_config = True
model_id = "google/gemma-2b-it"
# 1. Create quantization config for smaller model loading (optional)
# Requires !pip install bitsandbytes accelerate, see: https://github.com/TimDettmers/bitsandbytes, https://huggingface.co/docs/accelerate/
# For models that require 4-bit quantization (use this if you have low GPU memory available)
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_use_double_quant=True,
                                         bnb_4bit_quant_type="nf4",
                                         llm_int8_enable_fp32_cpu_offload =True,
                                         bnb_4bit_compute_dtype=torch.float16)

# Bonus: Setup Flash Attention 2 for faster inference, default to "sdpa" or "scaled dot product attention" if it's not available
# Flash Attention 2 requires NVIDIA GPU compute capability of 8.0 or above, see: https://developer.nvidia.com/cuda-gpus
# Requires !pip install flash-attn, see: https://github.com/Dao-AILab/flash-attention 
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")

# 2. Pick a model we'd like to use (this will depend on how much GPU memory you have available)
#model_id = "google/gemma-7b-it"
model_id = model_id # (we already set this above)
print(f"[INFO] Using model_id: {model_id}")

# 3. Instantiate tokenizer (tokenizer turns text into numbers ready for the model) 
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

# 4. Instantiate the model
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id, 
                                                #  torch_dtype=torch.float16, # datatype to use, we want float16
                                                 device_map={"":0},
                                                 low_cpu_mem_usage = True,
                                                 trust_remote_code=True,
                                                #  load_in_8bit_fp32_cpu_offload=True,
                                                 quantization_config=quantization_config if use_quantization_config else None,
                                                #  low_cpu_mem_usage=False, # use full memory 
                                                 attn_implementation=attn_implementation) # which attention version to use

if not use_quantization_config: # quantization takes care of device setting automatically, so if it's not used, send model to GPU 
    llm_model.to("cuda")

[INFO] Using attention implementation: sdpa
[INFO] Using model_id: google/gemma-2b-it


`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.59s/it]


In [38]:
llm_model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): G

In [39]:
def get_model_num_params(model: torch.nn.Module):
    return sum([param.numel() for param in model.parameters()])

get_model_num_params(llm_model)

1515268096

In [40]:
def get_model_mem_size(model: torch.nn.Module):
    """
    Get how much memory a PyTorch model takes up.

    See: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822
    """
    # Get model parameters and buffer sizes
    mem_params = sum([param.nelement() * param.element_size() for param in model.parameters()])
    mem_buffers = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])

    # Calculate various model sizes
    model_mem_bytes = mem_params + mem_buffers # in bytes
    model_mem_mb = model_mem_bytes / (1024**2) # in megabytes
    model_mem_gb = model_mem_bytes / (1024**3) # in gigabytes

    return {"model_mem_bytes": model_mem_bytes,
            "model_mem_mb": round(model_mem_mb, 2),
            "model_mem_gb": round(model_mem_gb, 2)}

get_model_mem_size(llm_model)

{'model_mem_bytes': 2039641088, 'model_mem_mb': 1945.15, 'model_mem_gb': 1.9}

In [41]:
input_text = "how does dumbledore die?"
print(f"Input text:\n{input_text}")

# Create prompt template for instruction-tuned model
dialogue_template = [
    {"role": "user",
     "content": input_text}
]

# Apply the chat template
prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                       tokenize=False, # keep as raw text (not tokenized)
                                       add_generation_prompt=True)
print(f"\nPrompt (formatted):\n{prompt}")

Input text:
how does dumbledore die?

Prompt (formatted):
<bos><start_of_turn>user
how does dumbledore die?<end_of_turn>
<start_of_turn>model



In [42]:
%%time

# Tokenize the input text (turn it into numbers) and send it to GPU
input_ids = tokenizer(prompt, return_tensors="pt")
print(f"Model input (tokenized):\n{input_ids}\n")
# llm_model.to("cuda")
# Generate outputs passed on the tokenized input
# See generate docs: https://huggingface.co/docs/transformers/v4.38.2/en/main_classes/text_generation#transformers.GenerationConfig 
outputs = llm_model.generate(**input_ids,
                             max_new_tokens=256) # define the maximum number of new tokens to create
print(f"Model output (tokens):\n{outputs[0]}\n")

Model input (tokenized):
{'input_ids': tensor([[     2,      2,    106,   1645,    108,   1139,   1721,    499, 129375,
           1303, 235336,    107,    108,    106,   2516,    108]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}



  attn_output = torch.nn.functional.scaled_dot_product_attention(


Model output (tokens):
tensor([     2,      2,    106,   1645,    108,   1139,   1721,    499, 129375,
          1303, 235336,    107,    108,    106,   2516,    108,   3493,    603,
           793,   5820,    689,   2113,    577,   2676,    573,   5035,    674,
        156202,   4734,    575,    573,  14140,  30961,   4100, 235265,      1])

CPU times: total: 5.69 s
Wall time: 1.92 s


In [43]:
# Decode the output tokens to text
outputs_decoded = tokenizer.decode(outputs[0])
print(f"Model output (decoded):\n{outputs_decoded}\n")

Model output (decoded):
<bos><bos><start_of_turn>user
how does dumbledore die?<end_of_turn>
<start_of_turn>model
There is no evidence or information to support the claim that Dumbledore dies in the Harry Potter series.<eos>



In [57]:
input_text = "who is the author of harry potter series?"
print(f"Input text:\n{input_text}")

# Create prompt template for instruction-tuned model
dialogue_template = [
    {"role": "user",
     "content": input_text}
]

# Apply the chat template
prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                       tokenize=False, # keep as raw text (not tokenized)
                                       add_generation_prompt=True)
print(f"\nPrompt (formatted):\n{prompt}")

Input text:
who is the author of harry potter series?

Prompt (formatted):
<bos><start_of_turn>user
who is the author of harry potter series?<end_of_turn>
<start_of_turn>model



In [58]:
%%time

# Tokenize the input text (turn it into numbers) and send it to GPU
input_ids = tokenizer(prompt, return_tensors="pt")
print(f"Model input (tokenized):\n{input_ids}\n")
# llm_model.to("cuda")
# Generate outputs passed on the tokenized input
# See generate docs: https://huggingface.co/docs/transformers/v4.38.2/en/main_classes/text_generation#transformers.GenerationConfig 
outputs = llm_model.generate(**input_ids,
                             max_new_tokens=256) # define the maximum number of new tokens to create
print(f"Model output (tokens):\n{outputs[0]}\n")

Model input (tokenized):
{'input_ids': tensor([[     2,      2,    106,   1645,    108,  10569,    603,    573,   3426,
            576,  46702,  43724,   4100, 235336,    107,    108,    106,   2516,
            108]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

Model output (tokens):
tensor([     2,      2,    106,   1645,    108,  10569,    603,    573,   3426,
           576,  46702,  43724,   4100, 235336,    107,    108,    106,   2516,
           108,    651,   3426,    576,    573,  14140,  30961,   4100,    603,
           713, 235265,    747, 235265, 150250, 235265,   2475,    729,   7565,
           575, 235248, 235274, 235315, 235318, 235308,    575, 170171, 235269,
          6879, 235265,   2475,    603,    476,   7149,   3426,    578, 200524,
        235265,   2475,    919,   5952,   6861,  40354,    578,   8469,   5678,
          3069, 235303, 235256,   6142,   1105,    573,   1913,    576,    476,
          3486,  50619,   86

In [59]:
# Decode the output tokens to text
outputs_decoded = tokenizer.decode(outputs[0])
print(f"Model output (decoded):\n{outputs_decoded}\n")

Model output (decoded):
<bos><bos><start_of_turn>user
who is the author of harry potter series?<end_of_turn>
<start_of_turn>model
The author of the Harry Potter series is J. K. Rowling. She was born in 1965 in Gloucestershire, England. She is a British author and philanthropist. She has written seven novels and eight related children's books about the life of a young wizard named Harry Potter. The series has been a global phenomenon, with over 500 million copies sold worldwide.<eos>



In [60]:
print(f"Input text: {input_text}\n")
print(f"Output text:\n{outputs_decoded.replace(prompt, '').replace('<bos>', '').replace('<eos>', '')}")

Input text: who is the author of harry potter series?

Output text:
The author of the Harry Potter series is J. K. Rowling. She was born in 1965 in Gloucestershire, England. She is a British author and philanthropist. She has written seven novels and eight related children's books about the life of a young wizard named Harry Potter. The series has been a global phenomenon, with over 500 million copies sold worldwide.


In [114]:
# Harry potter questions generated with Gemini
# gemini_questions = [
# "Who are the three Deathly Hallows and what are their powers?",
# "Describe the history and significance of the Philosopher's Stone.",
# "Compare and contrast the characters of Harry Potter and Draco Malfoy.",
# "Explain the concept of a Patronus and how it is created.",
# "What is the significance of the number seven in the Harry Potter series?"
# ]

# Manually created question list
manual_questions = [
    "Explain chapter one : THE BOY WHO LIVED in few words",
    # "How does Dumbledore die in the novel?",
    # "Is professor Snape lover of harry potter's mother?",
    # "Which train station do people usually go to in london to get a train to hogwarts?",
    # "What are the names of twin brothers who are friends of harry potter?"
]

query_list = manual_questions

In [115]:
import random
query = random.choice(query_list)

print(f"Query: {query}")

# Get just the scores and indices of top related results
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

Query: Explain chapter one : THE BOY WHO LIVED in few words
[INFO] Time taken to get scores on 10396 embeddings: 0.00008 seconds.


(tensor([0.6003, 0.5518, 0.5477, 0.5393, 0.5393, 0.5348, 0.5327, 0.5324, 0.5287,
         0.5284], device='cuda:0'),
 tensor([ 9359,   195,     4,  2787,  1546, 10235,  9360,  2790,  7481, 10269],
        device='cuda:0'))

In [116]:
def prompt_formatter(query: str, 
                     context_items: list[dict]) -> str:
    """
    Augments query with text-based context from context_items.
    """
    # Join context items into one dotted paragraph
    context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])

    # Create a base prompt with examples to help the model
    # Note: this is very customizable, I've chosen to use 3 examples of the answer style we'd like.
    # We could also write this in a txt file and import it in if we wanted.
    base_prompt = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory and comprehensive as possible, drawing on the provided context.
Use the following examples as reference for the ideal answer style.

\nExample 1:
Query: What is the significance of the Mirror of Erised?
Answer: The Mirror of Erised is a magical artifact that shows the deepest, most desperate desire of a person's heart. It does not produce real objects, but rather a reflection of the viewer's most profound longing. For Harry Potter, this was seeing his parents alive and well. The mirror's name is Erised backwards, meaning "I see red," indicating that it reveals desires, often connected to love and loss.

\nExample 2:
Query: How does a Patronus charm work?
Answer: A Patronus charm creates a positive, powerful force in the form of an animal to repel Dementors. These creatures feed on human happiness, causing feelings of despair and hopelessness. By conjuring a Patronus, a wizard or witch can shield themselves from a Dementor's influence. The form of the Patronus often reflects the individual's personality or experiences.

\nExample 3:
Query: What are the Hogwarts Houses and their characteristics?
Answer: Hogwarts School of Witchcraft and Wizardry is divided into four houses: Gryffindor, Hufflepuff, Ravenclaw, and Slytherin. Gryffindor values courage, chivalry, and determination. Hufflepuff emphasizes hard work, loyalty, and fair play. Ravenclaw prizes intelligence, wit, and a love of learning. Slytherin characteristics include ambition, cunning, and resourcefulness.

\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""

    # Update base prompt with context items and query   
    base_prompt = base_prompt.format(context=context, query=query)

    # Create prompt template for instruction-tuned model
    dialogue_template = [
        {"role": "user",
        "content": base_prompt}
    ]

    # Apply the chat template
    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                          tokenize=False,
                                          add_generation_prompt=True)
    return prompt

In [117]:
query = random.choice(query_list)
print(f"Query: {query}")

# Get relevant resources
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
    
# Create a list of context items
context_items = [pages_and_chunks[i] for i in indices]

# Format prompt with context items
prompt = prompt_formatter(query=query,
                          context_items=context_items)
print(prompt)

Query: Explain chapter one : THE BOY WHO LIVED in few words
[INFO] Time taken to get scores on 10396 embeddings: 0.00006 seconds.
<bos><start_of_turn>user
Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory and comprehensive as possible, drawing on the provided context.
Use the following examples as reference for the ideal answer style.


Example 1:
Query: What is the significance of the Mirror of Erised?
Answer: The Mirror of Erised is a magical artifact that shows the deepest, most desperate desire of a person's heart. It does not produce real objects, but rather a reflection of the viewer's most profound longing. For Harry Potter, this was seeing his parents alive and well. The mirror's name is Erised backwards, meaning "I see red," indicating that it reveals desires, often con

In [118]:
%%time

input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate an output of tokens
outputs = llm_model.generate(**input_ids,
                             temperature=1, # lower temperature = more deterministic outputs, higher temperature = more creative outputs
                             do_sample=True, # whether or not to use sampling, see https://huyenchip.com/2024/01/16/sampling.html for more
                             max_new_tokens=256) # how many new tokens to generate from prompt 

# Turn the output tokens into text
output_text = tokenizer.decode(outputs[0])

print(f"Query: {query}")
print(f"RAG answer:\n{output_text.replace(prompt, '')}")

Query: Explain chapter one : THE BOY WHO LIVED in few words
RAG answer:
<bos>The passage describes the emotional state of a young boy named Tom Riddle. He feels a strong desire to see and destroy something, but he is unable to do so due to its fate. He envies his parents' deaths and feels a fear that he will be like them. He also feels a deep sense of loss when he thinks about past glories that he has missed out on.<eos>
CPU times: total: 4.38 s
Wall time: 5.39 s
