In [None]:
# Perform Google Colab installs (if running in Google Colab)
import os

if "COLAB_GPU" in os.environ:
    print("[INFO] Running in Google Colab, installing requirements.")
    !pip install -U torch # requires torch 2.1.1+ (for efficient sdpa implementation)
    !pip install PyMuPDF # for reading PDFs with Python
    !pip install tqdm # for progress bars
    !pip install sentence-transformers # for embedding models
    !pip install accelerate # for quantization model loading
    !pip install bitsandbytes # for quantizing models (less storage space)
    !pip install flash-attn --no-build-isolation # for faster attention mechanism = faster LLM inference

In [None]:
import os
import requests
# Local filename of the PDF; the download cell saves to it and the parsing cell reads from it.
pdf_path = "human-nutrition-text.pdf"

In [None]:
# Download the PDF only when it is not already present at `pdf_path`.
if not os.path.exists(pdf_path):
    print("File does not exist. Downloading...")

    # Remote location of the PDF.
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    # Destination on disk (same value as `pdf_path`).
    filename = pdf_path

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=60)

    if response.status_code == 200:
        # Binary mode: the payload is written exactly as received.
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"The file has been downloaded and saved as {pdf_path}")
    else:
        # Report the HTTP status so a failed download can be diagnosed.
        print(f"Failed to download the file. Status code is {response.status_code}")
else:
    print(f"File {pdf_path} Already Exists")

In [None]:
import fitz
from tqdm.auto import tqdm

def text_formatter(text : str) -> str:
    """Flatten `text` onto a single line.

    Every newline character is replaced by one space, then leading and
    trailing whitespace is stripped.

    Args:
        text: Raw text, e.g. as extracted from a PDF page.

    Returns:
        The single-line, trimmed text.
    """
    # Splitting on "\n" and re-joining with " " swaps each newline for a space.
    single_line = " ".join(text.split("\n"))
    return single_line.strip()

def open_and_read_pdf(pdf_path : str, page_offset: int = 41) -> list[dict]:
    """Read a PDF page by page and collect the text plus simple statistics.

    Args:
        pdf_path: Path of the PDF file to open.
        page_offset: Value subtracted from the zero-based page index to form
            the stored ``page_number``. The default of 41 reproduces the
            previously hard-coded behaviour (presumably it aligns the index
            with the book's printed page numbers; verify for other PDFs).

    Returns:
        One dict per page with the keys ``page_number``, ``page_char_count``,
        ``page_word_count``, ``page_sentence_raw_count``, ``page_token_count``
        and ``text``.
    """
    pages_and_texts = []

    # The context manager closes the document once every page has been read.
    with fitz.open(pdf_path) as doc:
        # `enumerate` has no length, so pass the total to give tqdm a real progress bar.
        for page_number, page in tqdm(enumerate(doc), total=len(doc)):
            text = text_formatter(page.get_text())
            pages_and_texts.append(
                {"page_number": page_number - page_offset,
                 "page_char_count": len(text),
                 # Naive split on single spaces; consecutive spaces yield empty "words".
                 "page_word_count": len(text.split(" ")),
                 # Raw count based on ". " separators only.
                 "page_sentence_raw_count": len(text.split(". ")),
                 # Rough estimate: character count divided by 4.
                 "page_token_count": len(text) / 4,
                 "text": text})
    return pages_and_texts

# Parse the PDF once; later cells add more keys to these per-page dicts.
pages_and_texts = open_and_read_pdf(pdf_path)
# Preview the record at list index 4 as a quick sanity check.
pages_and_texts[4]


In [None]:
import random

# Show two randomly chosen page records (no seed is set, so the output differs between runs).
random.sample(pages_and_texts,k=2)

In [None]:
import pandas as pd

# Tabular view of the per-page records: one row per page dict.
df = pd.DataFrame(pages_and_texts)
# Display the first rows only.
df.head()

In [None]:
from spacy.lang.en import English
# Create an English pipeline object from the language class (no trained model is loaded here).
nlp = English()
# Register the "sentencizer" component so that processed docs expose `.sents`.
nlp.add_pipe("sentencizer")

In [None]:
# Split every page's text into sentences with the spaCy pipeline and record how many were found.
for item in tqdm(pages_and_texts):
    # Convert each sentence span to a plain string as it is produced.
    item["sentences"] = [str(sentence) for sentence in nlp(item["text"]).sents]

    # Sentence count according to the pipeline's sentence boundaries.
    item["page_sentence_count_spacy"] = len(item["sentences"])

In [None]:
# Rebuild the frame so it picks up the keys added to each page dict since the last build.
df = pd.DataFrame(pages_and_texts)
# Display the first rows only.
df.head()

In [None]:
# Maximum number of sentences placed in one chunk (passed to split_test as the slice size).
num_sentence_chunk_size = 10

def split_test(input_list : list, slice_size:int) -> list[list[str]]:
    """Cut `input_list` into consecutive sub-lists of at most `slice_size` items.

    Args:
        input_list: The list to partition.
        slice_size: Number of items per sub-list; the final sub-list may be shorter.

    Returns:
        The sub-lists in original order. An empty input gives an empty list.
    """
    chunks = []
    # Walk over the start index of every slice.
    for start in range(0, len(input_list), slice_size):
        chunks.append(input_list[start:start + slice_size])
    return chunks


# Group each page's sentences into chunks and store the chunk count next to them.
for item in tqdm(pages_and_texts):
    item.update(sentence_chunks=split_test(item["sentences"], num_sentence_chunk_size))
    item.update(num_chunks=len(item["sentence_chunks"]))


In [None]:
import re

# Flatten the per-page chunks into one list holding a record per chunk.
pages_and_chunks = []

for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        # Join the sentences into one string. The replace is a single pass over
        # double spaces, so runs of three or more spaces are shortened, not fully collapsed.
        joined_sentence_chunk = " ".join(sentence_chunk)
        joined_sentence_chunk = joined_sentence_chunk.replace("  ", " ").strip()
        # Insert a space after a full stop that is directly followed by a capital letter.
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)

        chunk_dict = {
            "page_number": item["page_number"],
            "sentence_chunk": joined_sentence_chunk,
            "chunk_character_count": len(joined_sentence_chunk),
            # Words are counted by splitting on single spaces.
            "chunk_word_count": len(joined_sentence_chunk.split(" ")),
            # Rough estimate: character count divided by 4.
            "chunk_token_count": len(joined_sentence_chunk) / 4,
        }
        pages_and_chunks.append(chunk_dict)

In [None]:
# `df` is rebound here: from this point it holds one row per chunk, not one row per page.
df = pd.DataFrame(pages_and_chunks)
# Summary statistics of the numeric columns, rounded to two decimals.
df.describe().round(2)

In [None]:
# Threshold for the chunk filter: a chunk is kept only if its estimated token count is strictly greater than this.
min_token_length = 30

In [None]:
# Keep only chunks whose estimated token count exceeds the threshold, as a list of plain dicts.
pages_and_chunks_over_min_token_len = (
    df.loc[df["chunk_token_count"].gt(min_token_length)]
    .to_dict(orient="records")
)
# Preview the first two surviving records.
pages_and_chunks_over_min_token_len[:2]