In [29]:
import boto3
import tabula
import os
import logging
import base64
import faiss
import json
import fitz
import warnings
import pymupdf
import numpy as np
from tqdm import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyMuPDFLoader
from IPython.display import display

logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)

warnings.filterwarnings("ignore")

In [30]:
file_name = "71763-gale-encyclopedia-of-medicine.-vol.-1.-2nd-ed.pdf"
file_path = os.path.join("data", file_name)


In [31]:
def create_directories(base_dir):
    directories = ["image", 
                   "text", 
                   "tables", 
                   "page_images"
                   ]
    for dir in directories:
        os.makedirs(os.path.join(base_dir, dir), exist_ok=True)

def extract_tables(doc, page_num, base_dir, items):
    try:
        tables = tabula.read_pdf(doc, pages=page_num, multiple_tables=True)
        
        if not tables:
            return
        
        for idx, table in enumerate(tables):
            tables_text = "\n".join([" | ".join(map(str, row)) for row in table.values])
            table_file_name = f"{base_dir}/tables/{os.path.basename(file_path)}_table_{page_num}_{idx}.txt"
            
            with open(table_file_name, "w", encoding="utf-8") as f:
                f.write(tables_text)
            item = {
                "page": page_num,
                "type": "table",
                "text": tables_text,
                "path": table_file_name
            }
            items.append(item)
    except Exception as e:
        print(f"Error extracting tables from page {page_num}: {e}")

def extract_text_chunks(text, text_splitter, page_num, base_dir, items):
    chunks = text_splitter.split_text(text)
    
    for idx, chunk in enumerate(chunks):
        chunk_file_name = f"{base_dir}/text/{os.path.basename(file_path)}_text_{page_num}_{idx}.txt"
        
        with open(chunk_file_name, "w", encoding="utf-8") as f:
            f.write(chunk)
        item = {
            "page": page_num,
            "type": "text",
            "text": chunk,
            "path": chunk_file_name
        }
        items.append(item)
 
def extract_images(doc, page, page_num, base_dir, items):
    images = page.get_images()
    
    for idx, image in enumerate(images):
        xref = image[0]
        pix = pymupdf.Pixmap(doc, xref)  
        
        if pix.colorspace and (pix.n > 3 or pix.alpha):
            pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                
        image_name = f"{base_dir}/image/{os.path.basename(file_path)}_image_{page_num}_{idx}.png"    
        pix.save(image_name)
        
        with open(image_name, "rb") as f:
            embedding_image = base64.b64encode(f.read()).decode("utf-8")
        item = {
            "page": page_num,
            "type": "image",
            "path": image_name,
            "image": embedding_image
        }
        items.append(item)

def extract_page_images(page, page_num, base_dir, items):
    pix = page.get_pixmap()
    page_path = os.path.join(base_dir,f"page_images/page_{page_num:03d}.png")
    pix.save(page_path)
    
    with open(page_path, "rb") as f:
        embedding_image = base64.b64encode(f.read()).decode("utf-8")
    item = {
        "page": page_num,
        "type": "page_image",
        "path": page_path,
        "page_path": embedding_image
    }
    items.append(item)

In [None]:
doc = pymupdf.open(file_path)
num_pages = len(doc)
base_dir = "processed_data"

create_directories(base_dir)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size= 700,
    chunk_overlap= 200,
    length_function= len,
)
items = []

for page_num in tqdm(range(num_pages), desc="Processing PDF pages"):
    page = doc[page_num]
    text = page.get_text()
    extract_tables(file_path, page_num, base_dir, items)
    extract_text_chunks(text, text_splitter, page_num, base_dir, items)
    extract_images(doc, page, page_num, base_dir, items)
    extract_page_images(page, page_num, base_dir, items)


Processing PDF pages:   0%|          | 0/637 [00:00<?, ?it/s]'pages' argument isn't specified.Will extract only from page 1 by default.


Processing PDF pages: 100%|██████████| 637/637 [07:17<00:00,  1.46it/s]


In [34]:
[i for i in items if i["type"] == "tables"][:5]

[]

In [35]:
[i for i in items if i["type"] == "text"][:5]

[{'page': 1,
  'type': 'text',
  'text': 'The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION',
  'path': 'data/text/71763-gale-encyclopedia-of-medicine.-vol.-1.-2nd-ed.pdf_text_1_0.txt'},
 {'page': 2,
  'type': 'text',
  'text': 'The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION\nJ A C Q U E L I N E  L .  L O N G E ,  E D I T O R\nD E I R D R E  S .  B L A N C H F I E L D ,  A S S O C I AT E  E D I T O R\nV O L U M E\nA-B\n1',
  'path': 'data/text/71763-gale-encyclopedia-of-medicine.-vol.-1.-2nd-ed.pdf_text_2_0.txt'},
 {'page': 3,
  'type': 'text',
  'text': 'STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow, Manager, Imaging and Multimedia\nContent\nRobyn V. Young, Project Manager, I

In [36]:
[i for i in items if i["type"] == "image"][:5]

[{'page': 0,
  'type': 'image',
  'path': 'data/image/71763-gale-encyclopedia-of-medicine.-vol.-1.-2nd-ed.pdf_image_0_0.png',
  'image': 'iVBORw0KGgoAAAANSUhEUgAAAroAAAPBCAIAAACeDZ74AAAACXBIWXMAAA7EAAAOxAGVKw4bAAtsnUlEQVR4nOydh1sbV/b3Bbg7juPeK+69NzAd9S7AdopLYjuOHdtxN6BKt5Ns+ua3u9lNssluerKx44IBIQmwk/eveu+5Z+7VSJorGCIQZZ7n++gRWBbSzJ17PnOqTu/rzogM9eE0MnrDRl83kcnXbfaDLD5lwT8FIrZg1B4CkSfkR/JLayBChM+JTPTdQP60Yi8zecOoTH1ftTIKJDo+om+kp1/ZRI4DOTJU5ImJHpzBHJ8/rUqBRN9X9D78BCVppJ1HTZo0jR+Ve0GVXulHg3gn72e/Fe//ei8V+4u4f5rrukD1IBOzDsbMfS/R54lvvInfiJgYIl2m/nw/rED/vGnAuGClxEDETeAYxgXF46PhgoYLmjRpyq6QFRAXDEOCC11oJfWJexqCArLCGMQFCZFSxG1hgmuBGDbyqKTxhgui45MxXFC7rFVKL5CGC5o0aRrtqqDC55wV0u2uKnHBRICgvsuIYubSwHAh1Xhn6nupxQUzsSyZxIW0H4vfN1uoSbMyq5Yqk8zBwO0f+eWYxQXB8RktuJAxpf+cI+Y8atKkafwIfaXSLRDZV6nM6m+HRK/nEQc5HCTfMslen6nvpRoXqIYDF1JtYRpcMCa+mFu+MYwLisdHwwUNFzRp0pRdKbJCJnEhhRXS79uZ+l4jFBfkDnarLMqAzo1U8Sh+gtnzhscqLoiOT6ZwoR8n/5+WSSCDP6IoDRc0adI0asQ2LpNfslB4U5cxXPCGJbH7Rvn+mfp/M/W91OICpgpkDBdEKW/xZAWZLbSHopg6ka

In [37]:
[i for i in items if i["type"] == "page_images"][:5]

[]