# Experiments on PDF data extraction

In [1]:
import pdfquery as pq
import pandas as pd
import xml.etree.ElementTree as ET
from pprint import pprint

In [2]:
pdf = pq.PDFQuery("kud.pdf")
pdf.load()

pdf.tree.write('kud.xml', pretty_print=True)
pdf

<pdfquery.pdfquery.PDFQuery at 0x202b9e4b210>

In [3]:
tree = ET.parse('kud.xml')
root = tree.getroot()

lines = {}

pages = root.findall(".//LTPage")

for page in pages: 
    page_num = int(page.get("pageid"))
    
    for line in page.findall(".//LTTextLineHorizontal"):
        y0 = float(line.get("y0"))
        index = "P" + str(page_num) + "L" + str(y0)

        text_element = line.text or line.find(".//LTTextBoxHorizontal").text

        if index in lines: 
            lines[index].append(text_element)
        else: 
            lines[index] = [text_element]

In [4]:
for line_key, line in lines.items(): 
    print(f"{line_key}: {line}")

P1L722.88: ['Danske Bank ', 'NICOLAS MATTEAZZI ']
P1L711.6: ['Holmens Kanal Afdeling ']
P1L700.08: ['Holmens Kanal 2 ']
P1L688.8: ['1090 København K ']
P1L677.28: ['Telefon 45 12 60 00 ']
P1L666.0: ['SWIFT-BIC: DABADKKK ']
P1L654.48: ['www.danskebank.dk ']
P1L631.68: ['31. december 2022 ']
P1L608.88: ['Reg.nr. 4001 ']
P1L596.4: ['Konto 4002730456 ']
P1L584.88: ['Kontoen føres i danske kroner ']
P1L562.08: ['Side 1 af 17 ']
P1L503.007: ['Indestående ', 'Indsat ', 'Bogført ', 'Rente- ']
P1L492.207: ['Gæld ', 'Hævet ', 'dato ', 'dato ']
P1L468.447: ['494,42 - ', '3.142.391,68 ', '+ ', 'Gjensidige Forsikring, D ', '03.10 ', '03.10 ']
P1L457.887: ['517,00 - ', '3.141.874,68 ', '+ ', 'HK, Handels-og Kontorf. ', '03.10 ', '03.10 ']
P1L447.327: ['2.474,00 - ', '3.139.400,68 ', '+ ', 'Krogsgaard Finans ', '03.10 ', '03.10 ']
P1L437.007: ['661,85 - ', '3.138.738,83 ', '+ ', 'SuperB Solrød )))) 38962 ', '03.10 ', '03.10 ']
P1L426.447: ['4,95 - ', '3.138.733,88 ', '+ ', 'SuperB Solrød )))) 38965 '

In [8]:
import re

def process_array(input_array):
    # Join the array elements into a single string for easier pattern matching
    array_string = ' '.join(input_array)

    # Define regular expressions to extract amount, text, and date
    amount_match = re.search(r'([-+]?\d[\d,\.]+)\s*-\s*', array_string)
    text_match = re.search(r'\+\s*(.*?)\s*\\', array_string)
    date_match = re.search(r'(\d{2}\.\d{2}|\d{1,2}\.\d{2}\.\d{4})', array_string)

    # Check if all required information is present
    if amount_match and text_match and date_match:
        # Extract matched values
        amount = float(amount_match.group(1).replace(',', '').replace('.', ''))
        text = text_match.group(1).strip()
        date = date_match.group()

        # Check if the amount should be negated
        if array_string.startswith('-'):
            amount *= -1

        # Construct and return the result in JSON format
        result = {'amount': amount, 'text': text, 'date': date}
        return result

    return None  # Return None for invalid arrays


# Example arrays
arrays = [
    ['117,09 - ', '1.1113.051,66 ', '+ ', 'PAYPAL *UDEMY\\ \\35314369001 \\ ', '18.10 ', '18.10 '],
    ['+ ', '250,00 - ', '944,96 ', 'McDonalds Solroed, rest. \\ ', '08.11 ', '08.11 '],
    ['Udskriftsperiode: 01.10.2022 til 31.12.2022 '],
    ['KLARNA KLARNA '],
    ['Indestående ', 'Indsat ', 'Bogført ', 'Rente- '],
]

# Process each array and filter out invalid ones
result_list = [process_array(arr) for arr in arrays if process_array(arr) is not None]

# Print the result
for result in result_list:
    print(result)


{'amount': 11709.0, 'text': 'PAYPAL *UDEMY', 'date': '13.05'}
{'amount': 25000.0, 'text': '250,00 -  944,96  McDonalds Solroed, rest.', 'date': '08.11'}
