[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pdf-tools/components-code-sample-hub/blob/main/jupyter/pdftools_toolbox/pdftools_toolbox_file_extraction.ipynb)

In [None]:
%pip install pdftools_toolbox
%pip install ipython

# Extract embedded files from a PDF
Extract the embedded files contained in the PDF to the
file system.

In [None]:
import io
import os
from pdftools_toolbox.pdf import Document, FileReference

In [None]:
def prepare_file(url: str, path: str, timeout: float = 30.0):
    """Download a file from a given URL and save it to the local system.

    Args:
        url: Address to fetch with an HTTP GET request.
        path: Local file path the response body is written to (overwritten
            if it already exists).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on an unresponsive
            server, which would hang the notebook cell.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    import requests
    response = requests.get(url, timeout=timeout)
    # Fail before touching the file system so an error page is never saved
    response.raise_for_status()

    with open(path, 'wb') as f:
        f.write(response.content)

In [None]:
# Set input arguments
# URL of the sample PDF used by this notebook
input_url = 'https://pdftools-public-downloads-production.s3.eu-west-1.amazonaws.com/samples/testfiles/BlankFilesEmbedded.pdf'
# Local path the sample PDF is saved to; the extraction cell opens this file
input_file_path = 'BlankFilesEmbedded.pdf'
# Download the sample PDF to input_file_path
prepare_file(input_url, input_file_path)
# Directory handed to extract_file as the target for the extracted files
output_dir = 'extracted_files'

In [None]:
def copy_to_stream(data: io.IOBase, out_stream: io.IOBase, chunk_size: int = 4096):
    """Transfer the remaining contents of ``data`` into ``out_stream``.

    The source is read in pieces of at most ``chunk_size`` so the whole
    payload is never requested in a single read. Copying stops as soon as a
    read yields an empty (falsy) result.
    """
    while True:
        piece = data.read(chunk_size)
        if not piece:
            # Nothing more to read
            break
        out_stream.write(piece)

In [None]:
def extract_file(file_reference: FileReference, output_dir: str):
    """Write one embedded file into ``output_dir``.

    The file name is taken from the PDF and is therefore untrusted input.
    Only its final path component is used, so a name such as ``../../x`` or
    an absolute path cannot place the output outside ``output_dir``.

    Args:
        file_reference: Embedded file; its ``name`` and ``data`` are read.
        output_dir: Directory to write into; created if it does not exist.

    Raises:
        ValueError: If the name contains no usable file component, or if the
            data stream is missing or not readable.
    """
    # Remove null characters
    clean_file_name = file_reference.name.replace(chr(0), "")

    # Treat both "/" and "\" as separators regardless of the host platform,
    # then keep only the last component to prevent path traversal.
    safe_file_name = os.path.basename(clean_file_name.replace("\\", "/"))
    if safe_file_name in ("", ".", ".."):
        raise ValueError(
            f"The embedded file name {file_reference.name!r} is not a usable file name."
        )
    output_path = os.path.join(output_dir, safe_file_name)

    if file_reference.data is None:
        raise ValueError("The file_reference.data stream is None.")
    if not file_reference.data.readable():
        raise ValueError("The file_reference.data stream is not readable.")

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    with io.FileIO(output_path, "wb") as out_file:
        copy_to_stream(file_reference.data, out_file)

In [None]:
try:
    # The SDK needs a license key before use; an invalid key raises an exception.
    from pdftools_toolbox.sdk import Sdk
    Sdk.initialize("INSERT-LICENSE-KEY", None)

    # Open the input document and extract each embedded file it lists
    with io.FileIO(input_file_path, "rb") as in_stream, \
            Document.open(in_stream, None) as in_doc:
        for embedded_file in in_doc.all_embedded_files:
            extract_file(embedded_file, output_dir)

    print("Execution successful.")
except Exception as e:
    # Notebook-level boundary: report the failure instead of a traceback
    print(f"An error occurred: {e}")