# PyMuPDF
#### Pulled every page successfully - however the colors in the images are inverted

In [1]:
import os
import fitz  # PyMuPDF
import io
from PIL import Image

In [30]:
# Directory that receives the extracted images
output_dir = "extracted_imagesTrapept2"
# Image format used for every saved file
output_format = "png"    # TODO: should the other images be converted to png too, to keep the format consistent?
# Images narrower or shorter than these limits are skipped by the extraction loop
min_width = 100
min_height = 100
# Create the output directory. exist_ok=True makes this a no-op when the
# directory is already there and avoids the check-then-create race of a
# separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

In [31]:
# Path of the PDF to extract images from
file = "Trape & Mane 2006 pt 2.pdf"
# Open the PDF with PyMuPDF; the extraction loop iterates over this document object
pdf_file = fitz.open(file)

In [32]:
# Constant added to the zero-based page index when building output file names.
# NOTE(review): presumably continues the numbering of an earlier extraction
# run — confirm before changing it.
page_number_offset = 9200

# NOTE(review): extract_image() hands back the embedded image data; if the
# saved images look colour-inverted, PDF-level colour information may not be
# reflected in those bytes. Rendering via fitz.Pixmap(pdf_file, xref) might
# help — TODO confirm against the PyMuPDF documentation.

# Walk every page of the PDF together with its zero-based index
for page_index, page in enumerate(pdf_file):
    # Full image list for this page; the first item of each entry is the xref
    image_list = page.get_images(full=True)
    # Report how many images were found on this page
    if image_list:
        print(f"[+] Found a total of {len(image_list)} images in page {page_index}")
    else:
        print(f"[!] No images found on page {page_index}")
    # Images are numbered from 1 within each page
    for image_index, img in enumerate(image_list, start=1):
        # XREF of the image inside the PDF
        xref = img[0]
        # Pull the embedded image data and its native extension
        base_image = pdf_file.extract_image(xref)
        image_bytes = base_image["image"]
        image_ext = base_image["ext"]
        # Load the bytes into PIL so the dimensions can be checked and the
        # image re-encoded in the requested output format
        image = Image.open(io.BytesIO(image_bytes))
        # Guard clause: skip anything below the minimum dimensions
        if image.width < min_width or image.height < min_height:
            print(f"[-] Skipping image {image_index} on page {page_index} due to its small size.")
            continue
        out_path = os.path.join(
            output_dir,
            f"image{page_index + page_number_offset}_{image_index}.{output_format}",
        )
        # Pass the path rather than an open() handle: PIL then opens and
        # closes the file itself, so no file descriptor is leaked per image.
        image.save(out_path, format=output_format.upper())

In [None]:
# TODO: could the output names use the source PDF's file name instead of a numeric offset? It would save some trouble :)

In [14]:
print(image.mode)  # Sanity check: show the PIL mode of the most recently loaded image