[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pdf-tools/components-code-sample-hub/blob/main/jupyter/pdftools_toolbox/pdftools_toolbox_text_extraction.ipynb)

In [None]:
%pip install https://pdftools-public-downloads-production.s3.eu-west-1.amazonaws.com/productkits/PDFSDKXT/latest/pdftools_toolbox-latest.tar.gz
%pip install ipython

# Extract all text from PDF
Write the text of a PDF to the console, page by page.
Heuristically determine whether two consecutive text
fragments belong to the same word.

In [None]:
import io
from pdftools_toolbox.pdf import Document
from pdftools_toolbox.pdf.content import ContentExtractor, Text, UngroupingSelection, TextElement

In [None]:
# Download a file from a given URL and save it to the local system
def prepare_file(url: str, path: str, timeout: float = 60.0) -> None:
    """Download ``url`` and write the response body to ``path``.

    Args:
        url: Address of the file to download.
        path: Local file path the downloaded bytes are written to. An
            existing file at that path is overwritten.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If ``path`` cannot be opened for writing.
    """
    import requests

    # Without an explicit timeout, requests waits indefinitely on a server
    # that accepts the connection but never answers, hanging the notebook.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    with open(path, 'wb') as f:
        f.write(response.content)

In [None]:
# Set input arguments
# URL the input PDF is downloaded from
input_url = 'https://pdftools-public-downloads-production.s3.eu-west-1.amazonaws.com/samples/testfiles/InvoiceNone.pdf'
# Local path the downloaded PDF is saved to
input_file_path = 'InvoiceNone.pdf'
# Download the PDF so that it is available locally at input_file_path
prepare_file(input_url, input_file_path)

In [None]:
def _starts_new_word(previous, current) -> bool:
    """Return True if a space should be inserted before ``current``.

    ``previous`` and ``current`` are two consecutive text fragments.
    """
    # Any change in these text-state values is treated as a word boundary.
    style_attributes = (
        "character_spacing",
        "font_size",
        "horizontal_scaling",
        "rise",
        "word_spacing",
    )
    for attribute in style_attributes:
        if getattr(current, attribute) != getattr(previous, attribute):
            return True

    # Same style: compare where the previous fragment ends with where the
    # current one starts, using each fragment's transformed bounding box.
    start = current.transform.transform_rectangle(current.bounding_box).bottom_left
    end = previous.transform.transform_rectangle(previous.bounding_box).bottom_right

    # Horizontal gap wider than 0.7 of the current font size.
    if end.x < start.x - 0.7 * current.font_size:
        return True
    # Vertical offset of more than 0.1 of the current font size, either way.
    if end.y < start.y - 0.1 * current.font_size:
        return True
    return start.y < end.y - 0.1 * current.font_size


def write_text(text: Text):
    """Print the fragments of ``text`` as one line, guessing word boundaries.

    The first fragment is emitted as is. Every later fragment is preceded by
    a single space when it differs from its predecessor in style or is
    positioned too far away from it; otherwise it is appended directly.
    """
    pieces = []
    previous = None

    for fragment in text:
        piece = fragment.text
        if previous is not None and _starts_new_word(previous, fragment):
            piece = f" {piece}"
        pieces.append(piece)
        previous = fragment

    print("".join(pieces))

In [None]:
# Extract the text of every page of the input PDF and print it to the console.
# Any failure (license, file access, PDF processing) is reported as a message
# instead of a traceback.
try:
    # Set and check license key. If the license key is not valid, an exception is thrown.
    from pdftools_toolbox.sdk import Sdk
    Sdk.initialize("<PDFSDK,V1,MGAASQD6L2JMQHL54PK08RQX8GG4SS0M8DAHVPH0VMP3NB8R9DUK>", None)

    # Open input document
    with open(input_file_path, "rb") as in_stream:
        with Document.open(in_stream, None) as in_doc:
            # Process each page; page numbers shown to the user start at 1
            for page_number, in_page in enumerate(in_doc.pages, start=1):
                # The banner must be a single-line literal with "\n" escapes:
                # a raw line break inside a "..." f-string is a syntax error.
                print(f"==========\nPage: {page_number}\n==========")

                extractor = ContentExtractor(in_page.content)
                extractor.ungrouping = UngroupingSelection.ALL

                # Iterate over all content elements and only process text elements
                for element in extractor:
                    if isinstance(element, TextElement):
                        write_text(element.text)

    print("Execution successful.")
except Exception as e:
    print(f"An error occurred: {e}")