## Import PDF

In [1]:
from pathlib import Path
import requests

import pymupdf
from tqdm.auto import tqdm
import random

import pandas as pd

import re

### Download PDF

In [2]:
def download_pdf(url, file_path):
    file_path = Path(file_path)
    if not file_path.exists():
        print(f'[INFO] File doesn\'t exist. Downloading: {url}')
        response = requests.get(url)
        if response.status_code == 200:
            with file_path.open('wb') as file:
                file.write(response.content)
            print(f'[INFO] The file has been downloaded: {file_path}')
        else:
            print(f'[INFO] Failed to download file. Status code: {response.status_code}')
    else:
        print(f'[INFO] File exists at {file_path.resolve()}')

In [3]:
file_path = 'human-nutrition-text.pdf'
url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
download_pdf(url, file_path)

[INFO] File exists at /Users/nc1514/Documents/GitHub/rag-from-scratch/human-nutrition-text.pdf


### Open PDF

In [4]:
def text_formatter(text: str) -> str:
    
    cleaned_text = text.replace('\n', ' ').strip()
    
    return cleaned_text

def open_and_read_pdf(file_path, starting_num = 1):
    pdf_document = pymupdf.open(file_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(pdf_document), total=len(pdf_document)):
        text = text_formatter(page.get_text())
        pages_and_texts.append({'page_number': page_number - starting_num,
                                'page_char_count': len(text),
                                'page_word_count': len(text.split(' ')),
                                'page_sentence_count_raw': len(text.split('. ')),
                                'page_token_count': len(text) / 4, # rough estimate of tokens
                                'text': text})
    return pages_and_texts

In [5]:
pages_and_texts = open_and_read_pdf(file_path, 41)

  0%|          | 0/1208 [00:00<?, ?it/s]

In [6]:
random.sample(pages_and_texts, 2)

[{'page_number': 1145,
  'page_char_count': 1954,
  'page_word_count': 291,
  'page_sentence_count_raw': 15,
  'page_token_count': 488.5,
  'text': 'nursing-care facilities. This requires creating meal plans and  providing nutritional guidance to help patients restore their health  or manage chronic conditions. Clinical dietitians also confer with  doctors and other health-care professionals to coordinate dietary  recommendations with medical needs. Nutrition jobs in the  community often involve working in public health clinics,  cooperative extension offices, and HMOs to prevent disease and  promote the health of the local community. Nutrition jobs in the  nonprofit world involve anti-hunger organizations, public health  organizations, and activist groups.  Nutritionists and dietitians can also find work in the private  sector. Increased public awareness of food, diet, and nutrition has  led to employment opportunities in advertising, marketing, and  food manufacturing. Dietitians wor

### Convert to Dataframe

In [7]:
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [8]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,9.97,287.15
std,348.86,560.44,95.75,6.19,140.11
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.75,134.0,4.0,190.69
50%,562.5,1232.5,215.0,10.0,308.12
75%,864.25,1605.25,271.25,14.0,401.31
max,1166.0,2308.0,429.0,32.0,577.0


## Preprocess Text Into Chunks

In [9]:
from spacy.lang.en import English

In [10]:
nlp = English()
nlp.add_pipe('sentencizer')
doc = nlp('This is a sentence. This is another sentence.')

In [11]:
assert len(list(doc.sents)) == 2
list(doc.sents)

[This is a sentence., This is another sentence.]

In [12]:
for item in tqdm(pages_and_texts):
    item['sentences'] = list(nlp(item['text']).sents)
    item['sentences'] = [str(sentence) for sentence in item['sentences']]
    item['page_sentence_count_spacy'] = len(item['sentences'])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [13]:
random.sample(pages_and_texts, 2)

[{'page_number': 573,
  'page_char_count': 1113,
  'page_word_count': 190,
  'page_sentence_count_raw': 11,
  'page_token_count': 278.25,
  'text': 'making it unavailable for absorption. A rare genetic disease-causing  malfunction of the biotinidase enzyme also results in biotin  deficiency. Symptoms of biotin deficiency are similar to those of  other B vitamins, but may also include hair loss when severe.  Dietary Reference Intakes  Because there is little information on the requirements for biotin,  the FNB has developed Adequate Intakes (AI) based on the observed  dietary intakes in healthy population groups. The AI for different age  groups for biotin are listed in Table 9.21 “Dietary Reference Intakes  for Biotin”.  Table 9.21 Dietary Reference Intakes for Biotin  Age Group  AI Males and Females mcg/day)  Infants (0–6 months)  5  Infants (7–12 months)  6  Children (1–3 years)  8  Children (4–8 years)  12  Children (9–13 years)  20  Adolescents (14–18 years) 25  Adults (> 19 years)

In [14]:
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text,sentences,page_sentence_count_spacy
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition,[Human Nutrition: 2020 Edition],1
1,-40,0,1,1,0.0,,[],0
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...,[Human Nutrition: 2020 Edition UNIVERSITY OF...,1
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...,[Human Nutrition: 2020 Edition by University o...,1
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...,[Contents Preface University of Hawai‘i at M...,2


### Chunking Sentences

In [15]:
num_sentence_chunk_size = 10
def split_list(input_list: list[str],
               slice_size: int) -> list[list[str]]:
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

for item in tqdm(pages_and_texts):
    item['sentence_chunks'] = split_list(input_list=item['sentences'],
                                         slice_size=num_sentence_chunk_size)
    item['num_chunks'] = len(item['sentence_chunks'])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [16]:
random.sample(pages_and_texts, k=1)

[{'page_number': 122,
  'page_char_count': 2140,
  'page_word_count': 379,
  'page_sentence_count_raw': 23,
  'page_token_count': 535.0,
  'text': 'Bone Anatomy and Structure  To optimize bone health through nutrition, it is important to  understand bone anatomy. The skeleton is composed of two main  parts, the axial and the appendicular parts. The axial skeleton  consists of the skull, vertebral column, and rib cage, and is  composed of eighty bones. The appendicular skeleton consists of  the shoulder girdle, pelvic girdle, and upper and lower extremities,  and is composed of 126 bones. Bones are also categorized by size  and shape. There are four types of bone: long bones, short bones,  flat bones, and irregular bones. The longest bone in your body is the  femur (thigh bone), which extends from your hip to your knee. It is a  long bone and functions to support your weight as you stand, walk,  or run. Your wrist is composed of eight irregular-shaped bones,  which allow for the intrica

In [17]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,9.97,287.15,10.32,1.53
std,348.86,560.44,95.75,6.19,140.11,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.75,134.0,4.0,190.69,5.0,1.0
50%,562.5,1232.5,215.0,10.0,308.12,10.0,1.0
75%,864.25,1605.25,271.25,14.0,401.31,15.0,2.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0,3.0


In [18]:
# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [19]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 6,
  'sentence_chunk': 'down digestible complex carbohydrates to simple sugars, mostly glucose. Glucose is then transported to all our cells where it is stored, used to make energy, or used to build macromolecules. Fiber is also a complex carbohydrate, but it cannot be broken down by digestive enzymes in the human intestine. As a result, it passes through the digestive tract undigested unless the bacteria that inhabit the colon or large intestine break it down. One gram of digestible carbohydrates yields four kilocalories of energy for the cells in the body to perform work. In addition to providing energy and serving as building blocks for bigger macromolecules, carbohydrates are essential for proper functioning of the nervous system, heart, and kidneys. As mentioned, glucose can be stored in the body for future use. In humans, the storage molecule of carbohydrates is called glycogen, and in plants, it is known as starch. Glycogen and starch are complex carbohydrates. 

In [20]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.83,112.72,183.71
std,347.79,447.43,71.07,111.86
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,45.0,78.75
50%,586.0,746.0,114.0,186.5
75%,890.0,1118.5,173.0,279.62
max,1166.0,1831.0,297.0,457.75


In [21]:
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

Chunk token count: 20.5 | Text: PART XVI CHAPTER 16. PERFORMANCE NUTRITION Chapter 16. Performance Nutrition | 931
Chunk token count: 13.0 | Text: US Department of Agriculture, 1136 | Food Insecurity
Chunk token count: 5.25 | Text: 754 | MyPlate Planner
Chunk token count: 12.75 | Text: https:/ /www.fda.gov/food/ 1022 | Food Preservation
Chunk token count: 4.5 | Text: 708 | Introduction


[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

In [23]:
random.sample(pages_and_chunks_over_min_token_len, k=1)

[{'page_number': 265,
  'sentence_chunk': 'Despite the claims these diets make, there is little scientific evidence to support that low-carbohydrate diets are significantly better than other diets in promoting long-term weight loss. A study in The Nutritional 5.\xa0Elliott SS, Keim NL, et al. (2002). Fructose, Weight Gain, and the Insulin Resistance Syndrome. American Journal of Clinical Nutrition,\xa076(5),911–22.http:/ /www.ajcn.org/ content/76/5/911.full. Accessed September 27, 2017. Health Consequences and Benefits of High-Carbohydrate Diets | 265',
  'chunk_char_count': 508,
  'chunk_word_count': 67,
  'chunk_token_count': 127.0}]

### Embedding Text Chunks

In [1]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cpu")



model.safetensors:  79%|#######9  | 346M/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
%%time # gpu faster to embed chunks
embedding_model.to('mps')
for item in tqdm(pages_and_chunks_over_min_token_len):
    item['embedding'] = embedding_model.encode(item['sentence_chunk'])

In [None]:
# Chunk embedding

text_chunks = [item['sentence_chunk'] for item in pages_and_chunks_over_min_token_len]

In [None]:
%%time

text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32,
                                               convert_to_tensor=True)