In [1]:
import os
import requests

pdf_path = "human_nutrition_text.pdf"

if not os.path.exists(pdf_path):
  print("File to be downloaded")
  url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
  filename = pdf_path
  response = requests.get(url)
  if response.status_code == 200:
    with open(filename, "wb") as file:
      file.write(response.content)
    print(f"File {filename} has been downloaded and saved")
  else:
    print(f"File could not be downloaded. Status code: {response.status_code}")
else:
  print(f"File {pdf_path} already exists")
  

File to be downloaded
File human_nutrition_text.pdf has been downloaded and saved


In [3]:
import fitz
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
  cleaned_text = text.replace("\n", " ").strip()
  return cleaned_text

def read_pdf(pdf: str) -> list[dict]:
  doc = fitz.open(pdf)
  pages_and_text = []
  for page_number, page in tqdm(enumerate(doc)):
    text = page.get_text()
    text = text_formatter(text=text)
    pages_and_text.append({"page_number": page_number - 43,
                            "page_char_count": len(text),
                            "page_word_count": len(text.split(" ")),
                            "page_sentence_count": len(text.split(". ")),
                            "page_token_count": len(text)/4, # 1 token = 4 characters (assumption)
                            "text": text}) 
    return pages_and_text

pages = read_pdf(pdf = pdf)
pages[:2]

0it [00:00, ?it/s]


[{'page_number': -43,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'}]