In [1]:
import requests
import pdfplumber
import fitz  # PyMuPDF
import os

In [2]:
def download_pdf(url, output_path, timeout=30):
    """Download a PDF file from the given URL and save it to disk.

    Args:
        url: Address of the PDF to fetch.
        output_path: Local path the response body is written to (binary mode).
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled server would
            block the cell forever.

    Returns:
        None. The outcome is reported on stdout.

    Raises:
        requests.exceptions.RequestException: On connection errors or when the
            timeout elapses (propagated from ``requests.get``).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        # Only touch the output file once a complete, successful body is in hand.
        with open(output_path, 'wb') as file:
            file.write(response.content)
        print(f"PDF downloaded successfully to {output_path}")
    else:
        # Best-effort reporting: a non-200 status is printed, not raised.
        print(f"Failed to download PDF. Status code: {response.status_code}")

In [3]:
def extract_text_and_tables(pdf_path, output_txt_path):
    """Extract text and tables from a PDF file and save them into a text file.

    For every page the output file receives a ``Page N`` header, the page's
    text (or a placeholder when none is found), each table as tab-separated
    rows, and a separator line of 50 ``=`` characters.

    Args:
        pdf_path: Path of the PDF to read with pdfplumber.
        output_txt_path: Path of the UTF-8 text file to (over)write.

    Returns:
        None. A confirmation line is printed once the file is written.
    """
    with pdfplumber.open(pdf_path) as pdf, open(output_txt_path, 'w', encoding='utf-8') as output_file:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            tables = page.extract_tables()

            # Write extracted text
            output_file.write(f"Page {page_num}\n")
            output_file.write("Text:\n")
            output_file.write(text if text else "No text found on this page.\n")
            output_file.write("\n")

            # Write extracted tables
            if tables:
                output_file.write("Tables:\n")
                for table in tables:
                    for row in table:
                        # Empty cells come back as None; str.join would raise
                        # TypeError on them, so emit them as empty fields.
                        cells = ("" if cell is None else str(cell) for cell in row)
                        output_file.write("\t".join(cells) + "\n")
                    output_file.write("\n")
            else:
                output_file.write("No tables found on this page.\n")
            output_file.write("\n" + "="*50 + "\n")
    print(f"Text and tables saved to {output_txt_path}")

In [4]:
def extract_images(pdf_path, images_dir):
    """Extract images from a PDF file and save them to a directory.

    Each image referenced by a page is written as
    ``page_<page>_img_<index>.<ext>`` (both numbers 1-based) inside
    ``images_dir``, which is created if it does not exist.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        images_dir: Directory that receives the extracted image files.

    Returns:
        None. One line is printed per saved image, plus a final summary.
    """
    os.makedirs(images_dir, exist_ok=True)
    # Open as a context manager so the document handle is released even if
    # extraction fails part-way (the original never closed it).
    with fitz.open(pdf_path) as pdf_document:
        for page_num, page in enumerate(pdf_document, start=1):
            images = page.get_images(full=True)
            for img_index, img in enumerate(images, start=1):
                xref = img[0]  # first entry of the image tuple is its xref number
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                image_path = os.path.join(images_dir, f"page_{page_num}_img_{img_index}.{image_ext}")
                with open(image_path, 'wb') as image_file:
                    image_file.write(image_bytes)
                print(f"Saved image to {image_path}")
    print(f"All images extracted to {images_dir}")

In [5]:
# Main Script
# Inputs and outputs shared by the download and extraction cells below.
pdf_url = "https://www.vistajet.com/globalassets/documents/jettravelerreport.pdf"  # Remote PDF to fetch; change this to process a different document
pdf_path = "sample.pdf"  # Local file the download is written to and both extractors read from
output_txt_path = "pymupdf-test.txt"  # Text/table dump; NOTE(review): produced by the pdfplumber-based extractor despite the "pymupdf" name
images_dir = "images"  # Directory that receives the extracted image files

In [6]:
# Download PDF
# Fetch pdf_url into pdf_path. download_pdf only prints a message on a non-200
# response, so check the output below before running the extraction cells.
download_pdf(pdf_url, pdf_path)

PDF downloaded successfully to sample.pdf


In [7]:
# Extract text and tables
# Write a page-by-page dump of the PDF's text and tables to output_txt_path.
extract_text_and_tables(pdf_path, output_txt_path)

Text and tables saved to pymupdf-test.txt


In [8]:
# Extract images
# Save every image found in the PDF under images_dir as page_<N>_img_<M>.<ext>;
# one "Saved image" line is printed per file.
extract_images(pdf_path, images_dir)

Saved image to images\page_1_img_1.jpeg
Saved image to images\page_1_img_2.jpeg
Saved image to images\page_4_img_1.jpeg
Saved image to images\page_5_img_1.jpeg
Saved image to images\page_5_img_2.jpeg
Saved image to images\page_6_img_1.jpeg
Saved image to images\page_7_img_1.jpeg
Saved image to images\page_8_img_1.jpeg
Saved image to images\page_9_img_1.jpeg
Saved image to images\page_10_img_1.jpeg
Saved image to images\page_10_img_2.jpeg
Saved image to images\page_11_img_1.jpeg
Saved image to images\page_13_img_1.jpeg
Saved image to images\page_15_img_1.jpeg
Saved image to images\page_17_img_1.jpeg
Saved image to images\page_19_img_1.jpeg
Saved image to images\page_21_img_1.jpeg
Saved image to images\page_21_img_2.jpeg
Saved image to images\page_22_img_1.jpeg
Saved image to images\page_23_img_1.jpeg
Saved image to images\page_24_img_1.jpeg
Saved image to images\page_25_img_1.jpeg
Saved image to images\page_26_img_1.jpeg
Saved image to images\page_27_img_1.jpeg
Saved image to images\pag