In [6]:
import os
import requests

pdf_path = "human-nutrition-text.pdf"

if not os.path.exists(pdf_path):
    print(f"[INFO] File doesn't exist, downloading...")
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
    response = requests.get(url)

    if response.status_code == 200:
        with open(pdf_path, "wb") as f:
            f.write(response.content)
        print(f"[INFO] The file has been download and saved as {pdf_path}")
    else:
        print(f"[INFO] Failed to download the file. Status code: {response.status_code}")

else:
    print(f"[INFO] File {pdf_path} exists.")

[INFO] File doesn't exist, downloading...
[INFO] The file has been download and saved as human-nutrition-text.pdf


In [46]:
import fitz
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
    """Performs minor formatting on text"""

    cleaned_text = text.replace("\n", " ").strip()
    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []

    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text=text)

        pages_and_texts.append({
            "page_number": page_number - 41, # account for offset in the PDF
            "page_char_count": len(text),
            "page_word_count": len(text.split(" ")),
            "page_sentence_count_raw": len(text.split(". ")),
            "page_token_count": len(text) / 4,
            "text": text
        })

    return pages_and_texts
    
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

1208it [00:01, 722.94it/s]


[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [47]:
import random

random.sample(pages_and_texts, k=3)

[{'page_number': 398,
  'page_char_count': 1271,
  'page_word_count': 214,
  'page_sentence_count_raw': 8,
  'page_token_count': 317.75,
  'text': 'Japanese  nurse with  dependent  children  having  typical  appearance  of  malnutrition , New Bilibid  Prison,  September-O ctober 1945  by Unknown  / Public  Domain  word, meaning “starvation.” The syndrome affects more than fifty  million children under age five worldwide. It is characterized by  an extreme emaciated appearance, poor skin health, and growth  retardation. The symptoms are acute fatigue, hunger, and diarrhea.  Figure 6.16 Children With Marasmus  Kwashiorkor and marasmus often coexist as a combined syndrome  termed marasmic kwashiorkor. Children with the combined  syndrome  have  variable  amounts  of  edema  and  the  characterizations and symptoms of marasmus. Although organ  system function is compromised by undernutrition, the ultimate  cause of death is usually infection. Undernutrition is intricately  linked with supp

In [48]:
# 1 token : 4 chars = x tokens : 100 chars
# x = chars / 4


In [49]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,147,3,199.25,Contents Preface University of Hawai‘i at Mā...


In [50]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0
std,348.86,560.38,95.83,6.55,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,5.0,190.5
50%,562.5,1231.5,216.0,10.0,307.88
75%,864.25,1603.5,272.0,15.0,400.88
max,1166.0,2308.0,430.0,39.0,577.0


In [51]:
from spacy.lang.en import English

nlp = English()

# add sentencizer pipeline
nlp.add_pipe("sentencizer")

# create document instance as an example
doc = nlp("This is a sentence. This another sentence. I like elephants.")
len(list(doc.sents))

3

In [52]:
list(nlp(pages_and_texts[600]["text"]).sents)

[Image by  Allison  Calabrese /  CC BY 4.0  Korsakoff syndrome can cause similar symptoms as beriberi such  as confusion, loss of coordination, vision changes, hallucinations,  and may progress to coma and death.,
 This condition is specific  to alcoholics as diets high in alcohol can cause thiamin deficiency.,
  Other individuals at risk include individuals who also consume diets  typically low in micronutrients such as those with eating disorders,  elderly, and individuals who have gone through gastric bypass  surgery.5  Figure 9.10 The Role of Thiamin  Figure 9.11 Beriberi, Thiamin Deficiency  5.,
 Fact Sheets for Health Professionals: Thiamin.,
 National  Institute of Health, Office of Dietary Supplements.,
   https://ods.od.nih.gov/factsheets/Thiamin- HealthProfessional/. Updated Feburary 11, 2016.,
  Accessed October 22, 2017.,
  Water-Soluble Vitamins  |  559]

In [53]:
# loop over each page
for item in tqdm(pages_and_texts):
    # get senteces
    item["sentences"] = list(nlp(item["text"]).sents)

    # convert sentences into string
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    # count sentences
    item["page_sentence_count_spacy"] = len(item["sentences"])

100%|██████████| 1208/1208 [00:01<00:00, 766.13it/s]


In [54]:
pages_and_texts[600]

{'page_number': 559,
 'page_char_count': 863,
 'page_word_count': 138,
 'page_sentence_count_raw': 9,
 'page_token_count': 215.75,
 'text': 'Image by  Allison  Calabrese /  CC BY 4.0  Korsakoff syndrome can cause similar symptoms as beriberi such  as confusion, loss of coordination, vision changes, hallucinations,  and may progress to coma and death. This condition is specific  to alcoholics as diets high in alcohol can cause thiamin deficiency.  Other individuals at risk include individuals who also consume diets  typically low in micronutrients such as those with eating disorders,  elderly, and individuals who have gone through gastric bypass  surgery.5  Figure 9.10 The Role of Thiamin  Figure 9.11 Beriberi, Thiamin Deficiency  5. Fact Sheets for Health Professionals: Thiamin. National  Institute of Health, Office of Dietary Supplements.   https://ods.od.nih.gov/factsheets/Thiamin- HealthProfessional/. Updated Feburary 11, 2016.  Accessed October 22, 2017.  Water-Soluble Vitamins  | 

In [55]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,199.5,10.52,287.0,10.32
std,348.86,560.38,95.83,6.55,140.1,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0
50%,562.5,1231.5,216.0,10.0,307.88,10.0
75%,864.25,1603.5,272.0,15.0,400.88,15.0
max,1166.0,2308.0,430.0,39.0,577.0,28.0


### Chunking our sentences together

In [68]:
sentence_chunk_size = 10

def chunk_sentences(sentences: list[str], chunk_size: int) -> list[list[str]]:
    chunked_sentences = chunks_test = [sentences[i: i+chunk_size] for i in range(0, len(sentences), chunk_size)]

    return chunked_sentences


In [76]:
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = chunk_sentences(item["sentences"], chunk_size=sentence_chunk_size)
    item["chunk_count"] = len(item["sentence_chunks"])

100%|██████████| 1208/1208 [00:00<00:00, 365274.26it/s]


In [77]:
pages_and_texts[600]

{'page_number': 559,
 'page_char_count': 863,
 'page_word_count': 138,
 'page_sentence_count_raw': 9,
 'page_token_count': 215.75,
 'text': 'Image by  Allison  Calabrese /  CC BY 4.0  Korsakoff syndrome can cause similar symptoms as beriberi such  as confusion, loss of coordination, vision changes, hallucinations,  and may progress to coma and death. This condition is specific  to alcoholics as diets high in alcohol can cause thiamin deficiency.  Other individuals at risk include individuals who also consume diets  typically low in micronutrients such as those with eating disorders,  elderly, and individuals who have gone through gastric bypass  surgery.5  Figure 9.10 The Role of Thiamin  Figure 9.11 Beriberi, Thiamin Deficiency  5. Fact Sheets for Health Professionals: Thiamin. National  Institute of Health, Office of Dietary Supplements.   https://ods.od.nih.gov/factsheets/Thiamin- HealthProfessional/. Updated Feburary 11, 2016.  Accessed October 22, 2017.  Water-Soluble Vitamins  | 

In [86]:
import re

pages_and_chunks = []

# loop over each page
for item in tqdm(pages_and_texts):

    # loop over each sentence chunk
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # joined sentences in chunk to form a paragraph
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()

        # clean up text
        joined_sentence_chunk = re.sub(r"/.([A-Z])", r". \1", joined_sentence_chunk)
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # get some stats on our chunks
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len(joined_sentence_chunk.split(" "))
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4

        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

100%|██████████| 1208/1208 [00:00<00:00, 56016.80it/s]


1843

In [87]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 220,
  'sentence_chunk': "2000).Association of Coffee and Caffeine Intake with the Risk of Parkinson’s Disease.Journal of the American Medical Association, 283(20), 2674–79. http://jamanetwork.com/journals/jama/fullarticle/ 192731.Accessed September 22, 2017. 3.Costa J, et al. (2010).Caffeine Exposure and the Risk of Parkinson’s Disease: A Systematic Review and Meta- Analysis of Observational Studies.Journal of Alzheimer's disease, 20, S221–38.",
  'chunk_char_count': 422,
  'chunk_word_count': 50,
  'chunk_token_count': 105.5}]

In [88]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,730.55,109.22,182.64
std,347.79,445.59,69.33,111.4
min,-41.0,12.0,3.0,3.0
25%,280.5,313.0,43.0,78.25
50%,586.0,743.0,111.0,185.75
75%,890.0,1112.0,168.0,278.0
max,1166.0,1823.0,290.0,455.75


In [89]:
df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,-41,Human Nutrition: 2020 Edition,29,4,7.25
1,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0
2,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5
3,-37,Contents Preface University of Hawai‘i at Māno...,765,115,191.25
4,-36,Lifestyles and Nutrition University of Hawai‘i...,939,142,234.75


### Filter chunks of text for short chunks

In [91]:
min_token_length = 30

for _, row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'chunk token count: {row["chunk_token_count"]} | text: {row["sentence_chunk"]}')

chunk token count: 28.75 | text: Journal of Nutrition, 138(6), 1250S–4S. http://jn.nutrition.org/content/138/6/ 1250S.long The Digestive System | 71
chunk token count: 3.5 | text: 190 | Chloride
chunk token count: 20.5 | text: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=84  The Digestive System | 81
chunk token count: 7.25 | text: Human Nutrition: 2020 Edition
chunk token count: 9.5 | text: 742 | Building Healthy Eating Patterns


In [100]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

### Embedding our text chunks

In [101]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device="cpu")

In [103]:
%%time

# for item in tqdm(pages_and_chunks_over_min_token_len):
#     item["embedding"] = embedding_model.encode(item["sentence_chunk"])

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 3.81 µs


In [108]:
# embedding_model = embedding_model.to("mps")

# for item in tqdm(pages_and_chunks_over_min_token_len):
#     item["embedding"] = embedding_model.encode(item["sentence_chunk"])

In [111]:
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

In [112]:
%%time

text_chunk_embeddings = embedding_model.encode(text_chunks, batch_size=32, convert_to_tensor=True)

CPU times: user 1min 4s, sys: 40.5 s, total: 1min 44s
Wall time: 53.2 s


In [130]:
text_chunk_embeddings = text_chunk_embeddings.cpu().numpy()

In [131]:
for i, item in enumerate(pages_and_chunks_over_min_token_len):
    item["embedding"] = text_chunk_embeddings[i]

In [None]:
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(emebddings_df_save_path, index=False)