In [1]:
# Download PDF file
import os
import requests

# Get PDF document
pdf_path = "human-nutrition-text.pdf"

# Download PDF if it doesn't already exist
if not os.path.exists(pdf_path):
  print("File doesn't exist, downloading...")

  # The URL of the PDF you want to download
  url = "https://sdgs.un.org/sites/default/files/2024-05/Lui_Renewable%20energy.pdf"

  # The local filename to save the downloaded file
  filename = pdf_path

  # Send a GET request to the URL
  response = requests.get(url)

  # Check if the request was successful
  if response.status_code == 200:
      # Open a file in binary write mode and save the content to it
      with open(filename, "wb") as file:
          file.write(response.content)
      print(f"The file has been downloaded and saved as {filename}")
  else:
      print(f"Failed to download the file. Status code: {response.status_code}")
else:
  print(f"File {pdf_path} exists.")

File doesn't exist, downloading...
The file has been downloaded and saved as human-nutrition-text.pdf


In [3]:
# Requires !pip install PyMuPDF, see: https://github.com/pymupdf/pymupdf
import fitz # (pymupdf, found this is better than pypdf for our use case, note: licence is AGPL-3.0, keep that in mind if you want to use any code commercially)
from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm 

def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip() # note: this might be different for each doc (best to experiment)

    # Other potential text formatting functions can go here
    return cleaned_text

# Open PDF and get lines/pages
# Note: this only focuses on text, rather than images/figures etc
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: A list of dictionaries, each containing the page number
        (adjusted), character count, word count, sentence count, token count, and the extracted text
        for each page.
    """
    doc = fitz.open(pdf_path)  # open a document
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):  # iterate the document pages
        text = page.get_text()  # get plain text encoded as UTF-8
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number - 41,  # adjust page numbers since our PDF starts on page 42
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

  from .autonotebook import tqdm as notebook_tqdm
5it [00:00, 55.82it/s]


[{'page_number': -41,
  'page_char_count': 4728,
  'page_word_count': 774,
  'page_sentence_count_raw': 25,
  'page_token_count': 1182.0,
  'text': "Case Study for the Multistakeholder Forum on Science, Technology and Innovation for the SDGs, May 2024    Renewable Energy: Emerging technologies and innovations to reduce climate change  Robyn Lui, Office of Innovation, UNICEF  Note: The findings, interpretations and conclusions expressed in this science-policy brief are those of the researchers and authors,  and do not necessarily reflect UNICEF policies or approaches.  Abstract  This science-policy brief explores emerging solar technologies and energy storage innovations to address climate  change and advance energy security. It also outlines how renewable energy technologies power up multiple SDG results,  examines the trade-offs and recommend actions to accelerate impact and manage trade-offs.  Access to energy is key to human development and wellbeing but the world is not on track to

In [4]:
import random

random.sample(pages_and_texts, k=3)

[{'page_number': -38,
  'page_char_count': 4329,
  'page_word_count': 711,
  'page_sentence_count_raw': 34,
  'page_token_count': 1082.25,
  'text': "Case Study for the Multistakeholder Forum on Science, Technology and Innovation for the SDGs, May 2024    4    Other trade-offs include:  •  Modern slavery. The global renewable energy  supply chains are susceptible to modern slavery  and forced labour.20 The risk is particularly high in  the polysilicon solar market where production and  procurement practices lack transparency and  accountability.21  •  Impact of critical mineral extraction. The  Business and Human Rights Resource Centre  monitors companies that mine commodities vital to  the clean energy transition.22 From 2010 to 2021, it  identified 495 allegations of human rights abuse  and attacks against Human Rights Defenders and  Indigenous peoples.23  •  Forced displacement. Large scale renewable  energy projects and mining critical minerals can  result in forced displacement. P

In [7]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,4728,774,25,1182.0,Case Study for the Multistakeholder Forum on S...
1,-40,1911,309,9,477.75,Case Study for the Multistakeholder Forum on S...
2,-39,2557,443,13,639.25,Case Study for the Multistakeholder Forum on S...
3,-38,4329,711,34,1082.25,Case Study for the Multistakeholder Forum on S...
4,-37,4749,695,133,1187.25,Case Study for the Multistakeholder Forum on S...


In [8]:
# Get stats
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,5.0,5.0,5.0,5.0,5.0
mean,-39.0,3654.8,586.4,42.8,913.7
std,1.58,1327.55,200.02,51.38,331.89
min,-41.0,1911.0,309.0,9.0,477.75
25%,-40.0,2557.0,443.0,13.0,639.25
50%,-39.0,4329.0,695.0,25.0,1082.25
75%,-38.0,4728.0,711.0,34.0,1182.0
max,-37.0,4749.0,774.0,133.0,1187.25


In [15]:
from spacy.lang.en import English # see https://spacy.io/usage for install instructions

nlp = English()

# Add a sentencizer pipeline, see https://spacy.io/api/sentencizer/ 
nlp.add_pipe("sentencizer")

# Create a document instance as an example
doc = nlp("This is a sentence. This another sentence.")
assert len(list(doc.sents)) == 2

# Access the sentences of the document
list(doc.sents)

[This is a sentence., This another sentence.]

In [16]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    # Count the sentences 
    item["page_sentence_count_spacy"] = len(item["sentences"])

100%|██████████| 5/5 [00:00<00:00, 36.18it/s]


In [17]:
# Inspect an example
random.sample(pages_and_texts, k=1)

[{'page_number': -39,
  'page_char_count': 2557,
  'page_word_count': 443,
  'page_sentence_count_raw': 13,
  'page_token_count': 639.25,
  'text': 'Case Study for the Multistakeholder Forum on Science, Technology and Innovation for the SDGs, May 2024    3      • Provide lighting, heating, cooling, and clean water.   • Reduce indoor air pollution due to cooking electrification.12  4: Quality education     • Increase school attendance, support remote learning and study outside of  daylight hours.   • Encourage teacher retention.13  5: Gender equality     • Increase school attendance and ability to study at home.   • Reduce effort for water and fuel collection, provide safety at night, and boost  livelihood opportunities.14  6: Clean water and  sanitation   • Ensure access to sustainable, affordable, and clean water services. Safe clean  water reduces children’s exposure to deadly waterborne diseases.15  • Improve food security with solar-powered water pumps.  7: Affordable and  clean en

In [18]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,-39.0,3654.8,586.4,42.8,913.7,40.8
std,1.58,1327.55,200.02,51.38,331.89,47.0
min,-41.0,1911.0,309.0,9.0,477.75,9.0
25%,-40.0,2557.0,443.0,13.0,639.25,13.0
50%,-39.0,4329.0,695.0,25.0,1082.25,25.0
75%,-38.0,4728.0,711.0,34.0,1182.0,34.0
max,-37.0,4749.0,774.0,133.0,1187.25,123.0


In [19]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10 

# Create a function that recursively splits a list into desired sizes
def split_list(input_list: list, 
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).

    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

100%|██████████| 5/5 [00:00<00:00, 3986.22it/s]


In [20]:
# Sample an example from the group (note: many samples have only 1 chunk as they have <=10 sentences total)
random.sample(pages_and_texts, k=1)

[{'page_number': -40,
  'page_char_count': 1911,
  'page_word_count': 309,
  'page_sentence_count_raw': 9,
  'page_token_count': 477.75,
  'text': 'Case Study for the Multistakeholder Forum on Science, Technology and Innovation for the SDGs, May 2024    2    Batteries are good for short-duration storage. But a lot  of batteries are needed to deliver 8-12 hours of  electricity. Hence, storage technologies like flow  batteries, pumped hydro, and thermal storage, both  commercial and under development, are gaining  traction for their long duration and large capacity  storage ability.  Thermal Energy Storage (TES) technologies present one  of the most promising innovations to convert electricity  into heat for storage until required – hours, days, even  months later – by factories, buildings, or towns. It can  help decouple heating and cooling demand from  immediate power generation and supply availability, to  balance seasonal demand, and to reduce the need for  costly grid reinforcements

In [21]:
# Create a DataFrame to get stats
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,-39.0,3654.8,586.4,42.8,913.7,40.8,4.6
std,1.58,1327.55,200.02,51.38,331.89,47.0,4.83
min,-41.0,1911.0,309.0,9.0,477.75,9.0,1.0
25%,-40.0,2557.0,443.0,13.0,639.25,13.0,2.0
50%,-39.0,4329.0,695.0,25.0,1082.25,25.0,3.0
75%,-38.0,4728.0,711.0,34.0,1182.0,34.0,4.0
max,-37.0,4749.0,774.0,133.0,1187.25,123.0,13.0


In [22]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

100%|██████████| 5/5 [00:00<00:00, 4377.27it/s]


23

In [23]:
# View a random sample
random.sample(pages_and_chunks, k=1)

[{'page_number': -37,
  'sentence_chunk': '20. Walk Free. (2023). The Global Slavery Index 2023, Minderoo Foundation. https://www.walkfree.org/global-slavery-index/ 21. Murphy, L. and Elimä, N., In Broad Daylight: Uyghur Forced Labour and Global Solar Supply Chains, Sheffield Hallam University, Helena Kennedy Centre for International Justice, Sheffield, UK, 2021. Around 95 percent of all solar modules are made with polysilicon. 22. Business and Human Rights Resource Centre. (May 2022).',
  'chunk_char_count': 447,
  'chunk_word_count': 61,
  'chunk_token_count': 111.75}]

In [24]:
# Get stats about our chunks
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,23.0,23.0,23.0,23.0
mean,-38.0,773.39,107.13,193.35
std,1.45,634.27,96.47,158.57
min,-41.0,136.0,13.0,34.0
25%,-38.5,358.5,44.0,89.62
50%,-37.0,447.0,61.0,111.75
75%,-37.0,1106.0,149.5,276.5
max,-37.0,2073.0,306.0,518.25


In [None]:
# Show random chunks with under 30 tokens in length
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

ValueError: a must be greater than 0 unless no samples are taken

In [27]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -41,
  'sentence_chunk': 'Case Study for the Multistakeholder Forum on Science, Technology and Innovation for the SDGs, May 2024  Renewable Energy: Emerging technologies and innovations to reduce climate change Robyn Lui, Office of Innovation, UNICEF Note: The findings, interpretations and conclusions expressed in this science-policy brief are those of the researchers and authors, and do not necessarily reflect UNICEF policies or approaches. Abstract This science-policy brief explores emerging solar technologies and energy storage innovations to address climate change and advance energy security. It also outlines how renewable energy technologies power up multiple SDG results, examines the trade-offs and recommend actions to accelerate impact and manage trade-offs. Access to energy is key to human development and wellbeing but the world is not on track to achieve SDG 7 - ensuring access to affordable, reliable, sustainable, and modern energy for all. Solar technologies

In [28]:
# Requires !pip install sentence-transformers
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", 
                                      device="cpu") # choose the device to load the model to (note: GPU will often be *much* faster than CPU)

# Create a list of sentences to turn into numbers
sentences = [
    "The Sentences Transformers library provides an easy and open-source way to create embeddings.",
    "Sentences can be embedded one by one or as a list of strings.",
    "Embeddings are one of the most powerful concepts in machine learning!",
    "Learn to use embeddings well and you'll be well on your way to being an AI engineer."
]

# Sentences are encoded/embedded by calling model.encode()
embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

# See the embeddings
for sentence, embedding in embeddings_dict.items():
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Sentence: The Sentences Transformers library provides an easy and open-source way to create embeddings.
Embedding: [-2.07982659e-02  3.03164814e-02 -2.01217812e-02  6.86484948e-02
 -2.55256258e-02 -8.47686827e-03 -2.07231977e-04 -6.32377416e-02
  2.81606596e-02 -3.33353728e-02  3.02633960e-02  5.30721396e-02
 -5.03526554e-02  2.62288544e-02  3.33313718e-02 -4.51577306e-02
  3.63045074e-02 -1.37121335e-03 -1.20171625e-02  1.14947166e-02
  5.04510924e-02  4.70856801e-02  2.11914051e-02  5.14606386e-02
 -2.03746390e-02 -3.58889215e-02 -6.67755026e-04 -2.94393897e-02
  4.95859198e-02 -1.05639463e-02 -1.52014066e-02 -1.31760491e-03
  4.48197499e-02  1.56023446e-02  8.60379259e-07 -1.21392065e-03
 -2.37978753e-02 -9.09372466e-04  7.34484568e-03 -2.53931386e-03
  5.23370616e-02 -4.68043797e-02  1.66214872e-02  4.71579544e-02
 -4.15599197e-02  9.01963329e-04  3.60278040e-02  3.42213996e-02
  9.68226939e-02  5.94829135e-02 -1.64984576e-02 -3.51249389e-02
  5.92516316e-03 -7.07909290e-04 -2.4103

In [29]:
single_sentence = "Yo! How cool are embeddings?"
single_embedding = embedding_model.encode(single_sentence)
print(f"Sentence: {single_sentence}")
print(f"Embedding:\n{single_embedding}")
print(f"Embedding size: {single_embedding.shape}")

Sentence: Yo! How cool are embeddings?
Embedding:
[-1.97448079e-02 -4.51076636e-03 -4.98487381e-03  6.55445009e-02
 -9.87673923e-03  2.72835921e-02  3.66426446e-02 -3.30219488e-03
  8.50078370e-03  8.24952591e-03 -2.28497572e-02  4.02430035e-02
 -5.75200468e-02  6.33692071e-02  4.43207324e-02 -4.49506305e-02
  1.25284633e-02 -2.52011809e-02 -3.55292968e-02  1.29559245e-02
  8.67021270e-03 -1.92917809e-02  3.55636817e-03  1.89505816e-02
 -1.47128142e-02 -9.39845107e-03  7.64174573e-03  9.62185301e-03
 -5.98922325e-03 -3.90168838e-02 -5.47824688e-02 -5.67454379e-03
  1.11644613e-02  4.08067293e-02  1.76319122e-06  9.15304385e-03
 -8.77259858e-03  2.39382703e-02 -2.32784487e-02  8.05000216e-02
  3.19177061e-02  5.12601668e-03 -1.47708217e-02 -1.62524972e-02
 -6.03212900e-02 -4.35689948e-02  4.51211371e-02 -1.79053862e-02
  2.63366625e-02 -3.47866677e-02 -8.89173150e-03 -5.47675341e-02
 -1.24373063e-02 -2.38606650e-02  8.33497196e-02  5.71241640e-02
  1.13328611e-02 -1.49594918e-02  9.2037

In [30]:
%%time

# Send the model to the GPU
embedding_model.to("cpu") # requires a GPU installed, for reference on my local machine, I'm using a NVIDIA RTX 4090

# Create embeddings one by one on the GPU
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

100%|██████████| 23/23 [00:08<00:00,  2.83it/s]

CPU times: total: 24.6 s
Wall time: 8.15 s





In [31]:
# Turn text chunks into a single list
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

In [32]:
%%time

# Embed all texts in batches
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32, # you can use different batch sizes here for speed/performance, I found 32 works well for this use case
                                               convert_to_tensor=True) # optional to return embeddings as tensor instead of array

text_chunk_embeddings

CPU times: total: 45.1 s
Wall time: 15.3 s


tensor([[ 0.0202, -0.0014, -0.0124,  ...,  0.0055, -0.0155,  0.0175],
        [ 0.0136, -0.0160, -0.0415,  ..., -0.0403, -0.0254,  0.0066],
        [ 0.0261, -0.0157, -0.0077,  ..., -0.0506,  0.0199,  0.0049],
        ...,
        [ 0.0322,  0.0309, -0.0222,  ...,  0.0133, -0.0359,  0.0548],
        [ 0.0265, -0.0224, -0.0196,  ..., -0.0229, -0.0157,  0.0124],
        [ 0.0015, -0.0109, -0.0135,  ..., -0.0251, -0.0392, -0.0059]])

In [33]:
# Save embeddings to file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [34]:
# Import saved file and view
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-41,Case Study for the Multistakeholder Forum on S...,2073,306,518.25,[ 2.01921165e-02 -1.36858027e-03 -1.23747298e-...
1,-41,Researchers from Stanford University analysed ...,1556,227,389.0,[ 1.36222430e-02 -1.60285626e-02 -4.15348895e-...
2,-41,Organic PVs are low-cost and environmentally f...,999,143,249.75,[ 2.61472929e-02 -1.56916305e-02 -7.71244476e-...
3,-40,Case Study for the Multistakeholder Forum on S...,1872,270,468.0,[-2.55917665e-02 5.90757513e-03 -2.37189308e-...
4,-39,Case Study for the Multistakeholder Forum on S...,2018,305,504.5,[-9.36057046e-03 2.76851524e-02 -2.43335385e-...


In [35]:
import random

import torch
import numpy as np 
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"

# Import texts and embedding df
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# Convert texts and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Convert embeddings to torch tensor and send to device (note: NumPy arrays are float64, torch tensors are float32 by default)
embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)

In [36]:
text_chunks_and_embedding_df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-41,Case Study for the Multistakeholder Forum on S...,2073,306,518.25,"[0.0201921165, -0.00136858027, -0.0123747298, ..."
1,-41,Researchers from Stanford University analysed ...,1556,227,389.0,"[0.013622243, -0.0160285626, -0.0415348895, 0...."
2,-41,Organic PVs are low-cost and environmentally f...,999,143,249.75,"[0.0261472929, -0.0156916305, -0.00771244476, ..."
3,-40,Case Study for the Multistakeholder Forum on S...,1872,270,468.0,"[-0.0255917665, 0.00590757513, -0.0237189308, ..."
4,-39,Case Study for the Multistakeholder Forum on S...,2018,305,504.5,"[-0.00936057046, 0.0276851524, -0.0243335385, ..."


In [37]:
embeddings[0]

tensor([ 2.0192e-02, -1.3686e-03, -1.2375e-02,  7.4323e-03, -2.0181e-03,
        -3.5463e-02,  4.0824e-02, -2.3779e-02,  4.4354e-02,  1.9439e-02,
         1.5064e-02,  9.5261e-02,  3.6612e-02,  4.1251e-02,  7.6210e-02,
        -2.0806e-02,  8.2277e-02,  1.5139e-02,  1.2323e-02,  2.0164e-02,
         5.8316e-03, -3.4704e-02,  3.4287e-02,  4.0648e-03, -1.0374e-02,
        -3.7577e-03, -1.3932e-02,  4.2626e-02,  1.1352e-02, -1.0684e-01,
         6.8026e-02,  7.8377e-03, -2.5849e-02,  1.5089e-04,  2.6157e-06,
        -5.8968e-02, -2.9156e-02,  2.0333e-03,  2.8752e-02,  2.6285e-02,
         6.9932e-02, -2.1903e-02, -3.8006e-02, -9.7807e-03,  1.8709e-02,
        -4.3146e-02,  4.0266e-02, -8.2084e-03, -1.1345e-02, -4.1535e-02,
        -1.7022e-02, -9.1704e-03,  1.2438e-02,  1.0670e-02, -5.6216e-02,
         5.2156e-02, -6.9829e-03,  3.4000e-02,  3.0435e-03, -2.3817e-02,
        -3.5650e-02,  8.4144e-02, -6.1810e-02, -1.6486e-02,  6.6228e-02,
         6.6821e-03, -4.7666e-02, -8.1185e-04,  1.0

In [38]:
from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", 
                                      device=device) # choose the device to load the model to

In [39]:
# 1. Define the query
# Note: This could be anything. But since we're working with a nutrition textbook, we'll stick with nutrition-based queries.
query = "functions"
print(f"Query: {query}")

# 2. Embed the query to the same numerical space as the text examples 
# Note: It's important to embed your query with the same model you embedded your examples with.
query_embedding = embedding_model.encode(query, convert_to_tensor=True)

# 3. Get similarity scores with the dot product (we'll time this for fun)
from time import perf_counter as timer

start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()

print(f"Time take to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

# 4. Get the top-k results (we'll keep this to 5)
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

Query: functions
Time take to get scores on 23 embeddings: 0.00033 seconds.


torch.return_types.topk(
values=tensor([0.1375, 0.1358, 0.1185, 0.1135, 0.1094]),
indices=tensor([17, 15, 16,  3,  9]))

In [40]:
larger_embeddings = torch.randn(100*embeddings.shape[0], 768).to(device)
print(f"Embeddings shape: {larger_embeddings.shape}")

# Perform dot product across 168,000 embeddings
start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=larger_embeddings)[0]
end_time = timer()

print(f"Time take to get scores on {len(larger_embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

Embeddings shape: torch.Size([2300, 768])
Time take to get scores on 2300 embeddings: 0.00276 seconds.


In [41]:
# Define helper function to print wrapped text 
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [42]:
print(f"Query: '{query}'\n")
print("Results:")
# Loop through zipped together scores and indicies from torch.topk
for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
    print(f"Score: {score:.4f}")
    # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    # Print the page number too so we can reference the textbook further (and check the results)
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")

Query: 'functions'

Results:
Score: 0.1375
Text:
16. United Nations. (n.d.). SDG7: Affordable and clean
energy.https://sdgs.un.org/goals/goal7. 17. United Nations. (n.d.). SDG8: Decent
work and economic growth.https://sdgs.un.org/goals/goal8.
Page number: -37


Score: 0.1358
Text:
12. United Nations. (n.d.). SDG3: Good health and well-
being.https://sdgs.un.org/goals/goal3. 13. United Nations. (n.d.). SDG4: Quality
education. https://sdgs.un.org/goals/goal4.
Page number: -37


Score: 0.1185
Text:
14. United Nations. (n.d.). SDG5: Gender equality.
https://sdgs.un.org/goals/goal5. 15. United Nations. (n.d.). SDG13: Climate
action. https://sdgs.un.org/goals/goal13.
Page number: -37


Score: 0.1135
Text:
Case Study for the Multistakeholder Forum on Science, Technology and Innovation
for the SDGs, May 2024  2  Batteries are good for short-duration storage. But a
lot of batteries are needed to deliver 8-12 hours of electricity. Hence, storage
technologies like flow batteries, pumped hydro, a

In [43]:
import fitz

# Open PDF and load target page
pdf_path = "human-nutrition-text.pdf" # requires PDF to be downloaded
doc = fitz.open(pdf_path)
page = doc.load_page(5 + 41) # number of page (our doc starts page numbers on page 41)

# Get the image of the page
img = page.get_pixmap(dpi=300)

# Optional: save the image
#img.save("output_filename.png")
doc.close()

# Convert the Pixmap to a numpy array
img_array = np.frombuffer(img.samples_mv, 
                          dtype=np.uint8).reshape((img.h, img.w, img.n))

# Display the image using Matplotlib
import matplotlib.pyplot as plt
plt.figure(figsize=(13, 10))
plt.imshow(img_array)
plt.title(f"Query: '{query}' | Most relevant page:")
plt.axis('off') # Turn off axis
plt.show()

ValueError: page not in document

In [44]:
import torch

def dot_product(vector1, vector2):
    return torch.dot(vector1, vector2)

def cosine_similarity(vector1, vector2):
    dot_product = torch.dot(vector1, vector2)

    # Get Euclidean/L2 norm of each vector (removes the magnitude, keeps direction)
    norm_vector1 = torch.sqrt(torch.sum(vector1**2))
    norm_vector2 = torch.sqrt(torch.sum(vector2**2))

    return dot_product / (norm_vector1 * norm_vector2)

# Example tensors
vector1 = torch.tensor([1, 2, 3], dtype=torch.float32)
vector2 = torch.tensor([1, 2, 3], dtype=torch.float32)
vector3 = torch.tensor([4, 5, 6], dtype=torch.float32)
vector4 = torch.tensor([-1, -2, -3], dtype=torch.float32)

# Calculate dot product
print("Dot product between vector1 and vector2:", dot_product(vector1, vector2))
print("Dot product between vector1 and vector3:", dot_product(vector1, vector3))
print("Dot product between vector1 and vector4:", dot_product(vector1, vector4))

# Calculate cosine similarity
print("Cosine similarity between vector1 and vector2:", cosine_similarity(vector1, vector2))
print("Cosine similarity between vector1 and vector3:", cosine_similarity(vector1, vector3))
print("Cosine similarity between vector1 and vector4:", cosine_similarity(vector1, vector4))

Dot product between vector1 and vector2: tensor(14.)
Dot product between vector1 and vector3: tensor(32.)
Dot product between vector1 and vector4: tensor(-14.)
Cosine similarity between vector1 and vector2: tensor(1.0000)
Cosine similarity between vector1 and vector3: tensor(0.9746)
Cosine similarity between vector1 and vector4: tensor(-1.0000)


In [45]:
def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    """
    Embeds a query with model and returns top k scores and indices from embeddings.
    """

    # Embed the query
    query_embedding = model.encode(query, 
                                   convert_to_tensor=True) 

    # Get dot product scores on embeddings
    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

    scores, indices = torch.topk(input=dot_scores, 
                                 k=n_resources_to_return)

    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict]=pages_and_chunks,
                                 n_resources_to_return: int=5):
    """
    Takes a query, retrieves most relevant resources and prints them out in descending order.

    Note: Requires pages_and_chunks to be formatted in a specific way (see above for reference).
    """
    
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)
    
    print(f"Query: {query}\n")
    print("Results:")
    # Loop through zipped together scores and indicies
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
        print_wrapped(pages_and_chunks[index]["sentence_chunk"])
        # Print the page number too so we can reference the textbook further and check the results
        print(f"Page number: {pages_and_chunks[index]['page_number']}")
        print("\n")

In [46]:
query = "symptoms of pellagra"

# Get just the scores and indices of top related results
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

[INFO] Time taken to get scores on 23 embeddings: 0.00008 seconds.


(tensor([-0.0061, -0.0129, -0.0464, -0.0486, -0.0502]),
 tensor([11,  5, 20,  7, 17]))

In [47]:
# Print out the texts of the top scores
print_top_results_and_scores(query=query,
                             embeddings=embeddings)

[INFO] Time taken to get scores on 23 embeddings: 0.00008 seconds.
Query: symptoms of pellagra

Results:
Score: -0.0061
p.40. 3. Vohra, K., Vodonos, A., Schwartz, J., Marais, E. A., Sulprizio, M. P.,
& Mickley, L. J. (2021). Global mortality from outdoor fine particle pollution
generated by fossil fuel combustion: Results from GEOS-Chem. Environmental
Research, vol.195, 110754.doi: 10.1016/j.envres.2021.110754. 4. Stanford
University. (18 July 2023).
Page number: -37


Score: -0.0129
Geothermal energy  • May emit carbon dioxide, silica, methane, ammonia, and
sulphur dioxide, and depending upon the depth and location of the reservoir,
some may contain lethal substances such as boron, mercury, and arsenic.
Bioenergy  • Energy created by burning biomass creates greenhouse gas emissions,
but at lower levels than burning fossil fuels. Ocean energy  • Equipment used to
capture this ocean energy can disrupt and destroy marine life and the ocean’s
ecosystems.
Page number: -39


Score: -0.0464
