[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pdf-tools/components-code-sample-hub/blob/main/jupyter/pdftools_sdk/pdftools_sdk_extract_text_layout.ipynb)

In [None]:
%pip install pdftools_sdk
%pip install ipython

# Extract text mimicking layout
Extract text from a PDF page by page into text files,
preserving the original layout by padding the monospaced
text output with whitespace.

In [None]:
import io
import os
from pdftools_sdk.pdf import Document
from pdftools_sdk.extraction import Extractor, TextOptions, TextExtractionFormat

In [None]:
# Download a file from a given URL and save it to the local system
def prepare_file(url: str, path: str, timeout: float = 60.0) -> None:
    """Download the resource at ``url`` and save it to ``path``.

    Args:
        url: Address fetched with an HTTP GET request.
        path: Local file to write. It is opened in binary write mode, so
            an existing file at this path is overwritten.
        timeout: Seconds to wait on the server before giving up.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    import requests

    # requests applies no timeout unless one is given, so an unresponsive
    # server would otherwise block this call indefinitely.
    response = requests.get(url, timeout=timeout)
    # Fail before touching the file system so a bad download never leaves
    # a partial or empty file behind.
    response.raise_for_status()

    with open(path, 'wb') as f:
        f.write(response.content)

In [None]:
# Set input arguments
# Remote sample PDF that is downloaded for this example.
input_url = 'https://pdftools-public-downloads-production.s3.eu-west-1.amazonaws.com/samples/testfiles/PdfPrimerWhitepaper.pdf'
# Local path the sample PDF is saved to and later read from.
input_path = 'PdfPrimerWhitepaper.pdf'
# Fetch the sample so the input file exists locally before extraction.
prepare_file(input_url, input_path)
# Directory that receives the extracted text files.
output_dir = 'extracted_text'

In [None]:
def extract_text(input_file_path: str, output_directory: str) -> None:
    """Extract the text of every page of a PDF into separate text files.

    The text of page ``n`` (counted from 1) is written to ``page<n>.txt``
    inside ``output_directory``, using the monospace extraction format.

    Args:
        input_file_path: Path of the PDF document to read.
        output_directory: Directory that receives the text files. It is
            created, including missing parents, if it does not exist.
    """
    # Open input document
    with open(input_file_path, 'rb') as in_stream, Document.open(in_stream) as in_doc:
        # Create the directory if it doesn't exist. exist_ok avoids the
        # window between a separate existence check and the creation.
        os.makedirs(output_directory, exist_ok=True)

        # Set extraction options
        options = TextOptions()
        options.extraction_format = TextExtractionFormat.MONOSPACE
        # NOTE(review): presumably the horizontal space one character
        # occupies in the monospace output - confirm against the SDK docs.
        options.advance_width = 9.2

        # Extract text page by page; the same page number is passed as both
        # trailing arguments so each call covers exactly one page.
        extractor = Extractor()
        for page_number in range(1, in_doc.page_count + 1):
            output_file = os.path.join(output_directory, f"page{page_number}.txt")
            with open(output_file, 'wb') as out_stream:
                extractor.extract_text(in_doc, out_stream, options, page_number, page_number)

In [None]:
# Run the extraction and report the outcome instead of showing a traceback.
try:
    # By default, a test license key is active. In this case, a watermark is added to the output.
    # If you have a license key, please uncomment the following call and set the license key.
    # from pdftools_sdk.sdk import Sdk
    # Sdk.initialize("INSERT-LICENSE-KEY")

    extract_text(input_path, output_dir)

    print(f"Successfully extracted page-wise text from PDF to {output_dir}")
except Exception as e:
    # Top-level boundary of the sample: any failure is reported as a message.
    print(f"An error occurred: {e}")