## Import PDF

In [22]:
from pathlib import Path
import requests

import pymupdf
from tqdm.auto import tqdm
import random

import pandas as pd

import re

### Download PDF

In [4]:
def download_pdf(url, file_path, timeout=30):
    """Download the file at `url` to `file_path` unless it already exists.

    Args:
        url: Address fetched with an HTTP GET request.
        file_path: Destination path (str or Path). An existing file is left untouched.
        timeout: Seconds passed to `requests.get` as its `timeout`, so an
            unresponsive server raises an exception instead of blocking forever.

    Returns:
        None. Progress and non-200 responses are reported through printed
        `[INFO]` messages.
    """
    file_path = Path(file_path)

    # Nothing to do when the file is already on disk
    if file_path.exists():
        print(f'[INFO] File exists at {file_path.resolve()}')
        return

    print(f'[INFO] File doesn\'t exist. Downloading: {url}')
    response = requests.get(url, timeout=timeout)

    # Only a 200 response is written to disk; anything else is just reported
    if response.status_code != 200:
        print(f'[INFO] Failed to download file. Status code: {response.status_code}')
        return

    with file_path.open('wb') as file:
        file.write(response.content)
    print(f'[INFO] The file has been downloaded: {file_path}')

In [5]:
# Remote source of the textbook PDF and the local path it is saved to
url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
file_path = 'human-nutrition-text.pdf'

# Fetch the PDF only if it is not already present locally
download_pdf(url=url, file_path=file_path)

[INFO] File exists at /Users/nc1514/Documents/GitHub/rag-from-scratch/human-nutrition-text.pdf


### Open PDF

In [6]:
def text_formatter(text: str) -> str:
    """Replace every newline in `text` with a space and strip surrounding whitespace."""
    return text.replace('\n', ' ').strip()

def open_and_read_pdf(file_path, starting_num = 1):
    """Read every page of a PDF and collect its text plus simple size statistics.

    Args:
        file_path: Path of the PDF handed to `pymupdf.open`.
        starting_num: Offset subtracted from each zero-based page index to
            produce the stored `page_number`.

    Returns:
        A list with one dict per page holding `page_number`, `page_char_count`,
        `page_word_count`, `page_sentence_count_raw`, `page_token_count` and `text`.
    """
    pages_and_texts = []
    # Open the document in a `with` block so it is closed once all pages are read
    # (previously the handle was never released)
    with pymupdf.open(file_path) as pdf_document:
        for page_number, page in tqdm(enumerate(pdf_document), total=len(pdf_document)):
            text = text_formatter(page.get_text())
            pages_and_texts.append({'page_number': page_number - starting_num,
                                    'page_char_count': len(text),
                                    # split(' ') also counts empty strings, so an empty page reports 1
                                    'page_word_count': len(text.split(' ')),
                                    'page_sentence_count_raw': len(text.split('. ')),
                                    'page_token_count': len(text) / 4, # rough estimate of tokens
                                    'text': text})
    return pages_and_texts

In [7]:
# Offset of 41 is subtracted from each page index, so PDF page index 41 becomes page_number 0
pages_and_texts = open_and_read_pdf(file_path=file_path, starting_num=41)

  0%|          | 0/1208 [00:00<?, ?it/s]

In [8]:
# Spot-check two randomly chosen page records
random.sample(pages_and_texts, 2)

[{'page_number': 183,
  'page_char_count': 165,
  'page_word_count': 37,
  'page_sentence_count_raw': 2,
  'page_token_count': 41.25,
  'text': 'Sodium  levels in  milligrams is  a required  listing on a  Nutrition  Facts label.  Sodium on the Nutrition Facts Panel  Figure 3.10 Nutrition Label  Sodium  |  183'},
 {'page_number': 341,
  'page_char_count': 1946,
  'page_word_count': 336,
  'page_sentence_count_raw': 17,
  'page_token_count': 486.5,
  'text': 'initially contain polyunsaturated fatty acids. When the process of  hydrogenation is not complete, for example, not all carbon double  bonds have been saturated the end result is a partially hydrogenated  oil. The resulting oil is not fully solid. Total hydrogenation makes  the oil very hard and virtually unusable. Some newer products are  now using fully hydrogenated oil combined with nonhydrogenated  vegetable oils to create a usable fat.  Manufacturers favor hydrogenation as a way to prevent oxidation  of oils and ensure longer s

### Convert to DataFrame

In [9]:
# Load the per-page records into a DataFrame and preview the first rows
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [10]:
# Summary statistics of the per-page columns, rounded to two decimals
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,9.97,287.15
std,348.86,560.44,95.75,6.19,140.11
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.75,134.0,4.0,190.69
50%,562.5,1232.5,215.0,10.0,308.12
75%,864.25,1605.25,271.25,14.0,401.31
max,1166.0,2308.0,429.0,32.0,577.0


## Preprocess Text Into Chunks

In [1]:
from spacy.lang.en import English

In [12]:
# Build an English pipeline and add the 'sentencizer' component to it
nlp = English()
nlp.add_pipe('sentencizer')
# Run a two-sentence sample through the pipeline; checked in the next cell
doc = nlp('This is a sentence. This is another sentence.')

In [13]:
# Sanity check: the sample text must split into exactly two sentences
assert len(list(doc.sents)) == 2
list(doc.sents)

[This is a sentence., This is another sentence.]

In [14]:
# Attach the sentence strings of each page, plus how many there are, to its record
for item in tqdm(pages_and_texts):
    # Sentence objects are converted to plain strings straight away
    item['sentences'] = [str(sentence) for sentence in nlp(item['text']).sents]
    item['page_sentence_count_spacy'] = len(item['sentences'])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [15]:
# Spot-check two random page records, which now also carry 'sentences' and 'page_sentence_count_spacy'
random.sample(pages_and_texts, 2)

[{'page_number': -34,
  'page_char_count': 1047,
  'page_word_count': 186,
  'page_sentence_count_raw': 2,
  'page_token_count': 261.75,
  'text': 'Indicators of Health: Body Mass Index, Body Fat  Content, and Fat Distribution  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  133  Part\xa0III.\xa0Chapter 3. Water and Electrolytes  Introduction  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  145  Overview of Fluid and Electrolyte Balance  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  150  Water’s Importance to Vitality  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  156  Regulation of Water Balance  University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program and Human Nutrition  Program  162  Electrolytes Important for Fluid Balance  University of Hawai‘i

In [16]:
# Rebuild the DataFrame so it includes the sentence columns added to each record
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text,sentences,page_sentence_count_spacy
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition,[Human Nutrition: 2020 Edition],1
1,-40,0,1,1,0.0,,[],0
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...,[Human Nutrition: 2020 Edition UNIVERSITY OF...,1
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...,[Human Nutrition: 2020 Edition by University o...,1
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...,[Contents Preface University of Hawai‘i at M...,2


### Chunking Sentences

In [17]:
# Maximum number of sentences grouped into a single chunk
num_sentence_chunk_size = 10
def split_list(input_list: list[str],
               slice_size: int) -> list[list[str]]:
    """Split `input_list` into consecutive sublists of at most `slice_size` items.

    The last sublist holds the remainder and may be shorter; an empty input
    gives an empty list.
    """
    slices = []
    for start in range(0, len(input_list), slice_size):
        slices.append(input_list[start:start + slice_size])
    return slices

# Group each page's sentences into chunks and record how many chunks the page produced
for item in tqdm(pages_and_texts):
    item['sentence_chunks'] = split_list(item['sentences'], num_sentence_chunk_size)
    item['num_chunks'] = len(item['sentence_chunks'])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [18]:
# Inspect one random page record, now including 'sentence_chunks' and 'num_chunks'
random.sample(pages_and_texts, k=1)

[{'page_number': 313,
  'page_char_count': 1887,
  'page_word_count': 321,
  'page_sentence_count_raw': 16,
  'page_token_count': 471.75,
  'text': 'Nonessential and Essential  Fatty Acids  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  Fatty acids are vital for the normal operation of all body systems.  The circulatory system, respiratory system, integumentary system,  immune system, brain, and other organs require fatty acids for  proper function. The body is capable of synthesizing most of the  fatty acids it needs from food. These fatty acids are known as  nonessential fatty acids. However, there are some fatty acids that  the body cannot synthesize and these are called essential fatty  acids. It is important to note that nonessential fatty acids doesn’t  mean unimportant; the classification is based solely on the ability of  the body to synthesize the fatty acid.  Essential fatty acids must be obtained from food. They fall int

In [19]:
# Rebuild the DataFrame to pick up the chunk columns, then summarise the numeric columns
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,9.97,287.15,10.32,1.53
std,348.86,560.44,95.75,6.19,140.11,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.75,134.0,4.0,190.69,5.0,1.0
50%,562.5,1232.5,215.0,10.0,308.12,10.0,1.0
75%,864.25,1605.25,271.25,14.0,401.31,15.0,2.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0,3.0


In [23]:
# Flatten the per-page chunk lists into one record per chunk
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        # Turn the chunk's sentences into a single paragraph-like string.
        # Sentences are concatenated with no separator and double spaces are collapsed in one pass;
        # the regex then restores a space only where a full stop is directly followed by a capital letter (".A" -> ". A").
        chunk_text = "".join(sentence_chunk).replace("  ", " ").strip()
        chunk_text = re.sub(r'\.([A-Z])', r'. \1', chunk_text)

        pages_and_chunks.append({
            "page_number": item["page_number"],
            "sentence_chunk": chunk_text,
            # Size statistics for the chunk
            "chunk_char_count": len(chunk_text),
            "chunk_word_count": len(chunk_text.split(" ")),
            "chunk_token_count": len(chunk_text) / 4,  # 1 token = ~4 characters
        })

# How many chunks do we have?
len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [24]:
# Inspect one random chunk record
random.sample(pages_and_chunks, k=1)

[{'page_number': 104,
  'sentence_chunk': 'cells, pick up oxygen to be transported to tissues throughout the body. The pulmonary artery carries deoxygenated blood to the lungs. The pulmonary artery branches multiple times as it follows the bronchi, and each branch becomes progressively smaller in diameter down to the tiny capillaries where the alveoli release carbon dioxide from blood into the lungs to be exhaled and take up oxygen from inhaled air to oxygenate the blood. Once the blood is oxygenated, it drains from the alveoli by way of multiple pulmonary veins that exit the lungs to carry oxygen to the rest of the body. Learning Activities Technology Note: The second edition of the Human Nutrition Open Educational Resource (OER) textbook features interactive learning activities.\xa0 These activities are available in the web-based textbook and not available in the downloadable versions (EPUB, Digital PDF, Print_PDF, or Open Document). Learning activities may be used across various mob

In [25]:
# Note: `df` is rebound here to the per-chunk records (one row per chunk, not per page)
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.83,112.72,183.71
std,347.79,447.43,71.07,111.86
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,45.0,78.75
50%,586.0,746.0,114.0,186.5
75%,890.0,1118.5,173.0,279.62
max,1166.0,1831.0,297.0,457.75


In [26]:
# Print five random chunks whose estimated token count is at or below the threshold
min_token_length = 30
short_chunks = df[df["chunk_token_count"] <= min_token_length]
for row in short_chunks.sample(5).iterrows():
    # iterrows() yields (index, Series) pairs, so row[1] is the row's data
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 15.75 | Text: PART IV CHAPTER 4. CARBOHYDRATES Chapter 4. Carbohydrates | 227
Chunk token count: 13.25 | Text: PART IX CHAPTER 9. VITAMINS Chapter 9. Vitamins | 513
Chunk token count: 22.0 | Text: Figure 6.10 Enzymes Role in Carbohydrate Digestion Protein’s Functions in the Body | 385
Chunk token count: 16.75 | Text: http:/ /www.aafp.org/afp/ 2002/1001/p1217.html. 862 | Toddler Years
Chunk token count: 7.0 | Text: Water-Soluble Vitamins | 553


In [27]:
# Keep only chunks strictly above the token threshold, as a list of plain dict records
above_min_tokens = df["chunk_token_count"] > min_token_length
pages_and_chunks_over_min_token_len = df[above_min_tokens].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]