[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pdf-tools/components-code-sample-hub/blob/main/jupyter/pdftools_toolbox/pdftools_toolbox_image_extraction.ipynb)

In [None]:
# Install the latest pdftools_toolbox package from its public download URL
%pip install https://pdftools-public-downloads-production.s3.eu-west-1.amazonaws.com/productkits/PDFSDKXT/latest/pdftools_toolbox-latest.tar.gz
# Install IPython into the same environment
%pip install ipython

# Extract all images and image masks from a PDF
Extract the embedded image data as JPEG or TIFF,
depending on the compression format used.

In [None]:
import io
import io
import os
from pdftools_toolbox.pdf import Document, Page
from pdftools_toolbox.pdf.content import ContentExtractor, ImageElement, ImageMaskElement, ImageType

In [None]:
# Download a file from a given URL and save it to the local system
def prepare_file(url: str, path: str, timeout: float = 30.0):
    """Download the resource at ``url`` and write its body to ``path``.

    Args:
        url: Address of the file to fetch.
        path: Local destination; opened in binary write mode, so an
            existing file at this location is overwritten.
        timeout: Seconds to wait for the server before giving up.
            requests applies no timeout by default, so without this a
            stalled connection would block the notebook indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    import requests

    response = requests.get(url, timeout=timeout)
    # Fail before touching the destination file if the download was not successful
    response.raise_for_status()

    with open(path, 'wb') as f:
        f.write(response.content)

In [None]:
# Remote sample PDF and the local path it is downloaded to
input_url = 'https://pdftools-public-downloads-production.s3.eu-west-1.amazonaws.com/samples/testfiles/ImageCollection.pdf'
input_file_path = 'ImageCollection.pdf'

# Folder that receives the extracted image files
output_dir = 'extracted_images'

# Fetch the sample so that the following cells can open it from disk
prepare_file(input_url, input_file_path)

In [None]:
def extract_image(image_element: ImageElement, output_path: str):
    """Write the image held by ``image_element`` to the file at ``output_path``.

    The target file is opened in binary read/write mode ("wb+"), which
    creates it or truncates an existing one, and is closed again once the
    extraction call returns or raises.
    """
    with open(output_path, "wb+") as target_file:
        image_element.image.extract(target_file)

In [None]:
def extract_image_mask(image_mask_element: ImageMaskElement, output_path: str):
    """Write the mask held by ``image_mask_element`` to the file at ``output_path``.

    The target file is opened in binary read/write mode ("wb+"), which
    creates it or truncates an existing one, and is closed again once the
    extraction call returns or raises.
    """
    with open(output_path, "wb+") as target_file:
        image_mask_element.image_mask.extract(target_file)

In [None]:
def process_page_content(page: Page, page_number: int, output_dir: str):
    """Extract every image and image mask found in the content of ``page``.

    Each element is written to its own file inside ``output_dir``. File
    names carry ``page_number`` and a per-page running index, counted
    separately for images and for masks, starting at 1.

    Args:
        page: Page whose content is scanned.
        page_number: Number used in the generated file names.
        output_dir: Directory the files are written to.
    """
    images_seen = 0
    masks_seen = 0

    for element in ContentExtractor(page.content):
        if isinstance(element, ImageElement):
            images_seen += 1

            # Images whose default type is JPEG get a .jpg name; all others get .tiff
            if element.image.default_image_type == ImageType.JPEG:
                suffix = ".jpg"
            else:
                suffix = ".tiff"

            file_name = f"image_page{page_number}_{images_seen}{suffix}"
            target = os.path.join(output_dir, file_name)
            extract_image(element, target)
            print(f"Extracted image: {target}")
            continue

        if isinstance(element, ImageMaskElement):
            masks_seen += 1

            # Masks are always named with a .tiff extension
            file_name = f"image_mask_page{page_number}_{masks_seen}.tiff"
            target = os.path.join(output_dir, file_name)
            extract_image_mask(element, target)
            print(f"Extracted image mask: {target}")

In [None]:
try:
    # Initialize the SDK with the license key first; an invalid key raises
    # an exception, which is reported by the handler at the bottom.
    from pdftools_toolbox.sdk import Sdk
    Sdk.initialize("<PDFSDK,V1,MGAASQD6L2JMQHL54PK08RQX8GG4SS0M8DAHVPH0VMP3NB8R9DUK>", None)

    # Create the destination folder if it is not there yet
    os.makedirs(output_dir, exist_ok=True)

    # Open the input PDF and hand each page, numbered from 1, to the extractor.
    # Both the file stream and the document are closed when the block exits.
    with io.FileIO(input_file_path, "rb") as in_stream, Document.open(in_stream, None) as in_doc:
        for page_number, page in enumerate(in_doc.pages, start=1):
            process_page_content(page, page_number, output_dir)

    print("Execution successful.")
except Exception as err:
    # Report any failure instead of letting the traceback end the cell
    print(f"An error occurred: {err}")