## Import PDF

In [35]:
from pathlib import Path
import requests

import pymupdf
from tqdm.auto import tqdm
import random

import pandas as pd

### Download PDF

In [28]:
def download_pdf(url, file_path):
    file_path = Path(file_path)
    if not file_path.exists():
        print(f'[INFO] File doesn\'t exist. Downloading: {url}')
        response = requests.get(url)
        if response.status_code == 200:
            with file_path.open('wb') as file:
                file.write(response.content)
            print(f'[INFO] The file has been downloaded: {file_path}')
        else:
            print(f'[INFO] Failed to download file. Status code: {response.status_code}')
    else:
        print(f'[INFO] File exists at {file_path.resolve()}')

In [29]:
file_path = 'human-nutrition-text.pdf'
url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
download_pdf(url, file_path)

[INFO] File exists at C:\Users\twoga\Documents\GitHub\Personal\rag-from-scratch\human-nutrition-text.pdf


### Open PDF

In [30]:
def text_formatter(text: str) -> str:
    
    cleaned_text = text.replace('\n', ' ').strip()
    
    return cleaned_text

def open_and_read_pdf(file_path, starting_num = 1):
    pdf_document = pymupdf.open(file_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(pdf_document), total=len(pdf_document)):
        text = text_formatter(page.get_text())
        pages_and_texts.append({'page_number': page_number - starting_num,
                                'page_char_count': len(text),
                                'page_word_count': len(text.split(' ')),
                                'page_sentence_count_raw': len(text.split('. ')),
                                'page_token_count': len(text) / 4, # rough estimate of tokens
                                'text': text})
    return pages_and_texts

In [31]:
pages_and_texts = open_and_read_pdf(file_path, 41)

  0%|          | 0/1208 [00:00<?, ?it/s]

In [33]:
random.sample(pages_and_texts, 2)

[{'page_number': 180,
  'page_char_count': 1072,
  'page_word_count': 195,
  'page_sentence_count_raw': 10,
  'page_token_count': 268.0,
  'text': 'Age Group  Adequate  Intake(mg/day)  Tolerable Upper Intake Level  (mg/day)  Infants (0–6  months)  120  ND  Infants (6–12  months)  370  ND  Children (1–3  years)  1,000  1,500  Children (4–8  years)  1,200  1,900  Children (9–13  years)  1,500  2,200  Adolescents (14–18  years)  1,500  2,300  Adults (19–50  years)  1,500  2,300  Adults (50–70  years)  1,300  2,300  Adults (> 70 years)  1,200  2,300  ND = not  determined  Source: Dietary Reference Intakes: Water, Potassium, Sodium,  Chloride, and Sulfate. Institute of Medicine. http:/ /www.iom.edu/ Reports/2004/Dietary-Reference-Intakes-Water-Potassium- Sodium-Chloride-and-Sulfate.aspx. Updated February 11, 2004.  Accessed September 22, 2017.  Food Sources for Sodium  Most sodium in the typical American diet comes from processed  and prepared foods. Manufacturers add salt to foods to impro

### Convert to Dataframe

In [36]:
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [37]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,9.97,287.15
std,348.86,560.44,95.75,6.19,140.11
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.75,134.0,4.0,190.69
50%,562.5,1232.5,215.0,10.0,308.12
75%,864.25,1605.25,271.25,14.0,401.31
max,1166.0,2308.0,429.0,32.0,577.0


## Preprocess Text Into Chunks