# Experiments on PDF data extraction

In [116]:
import pdfquery as pq
import pandas as pd
import xml.etree.ElementTree as ET
from pprint import pprint
import re
import nltk
from nltk.tokenize import RegexpTokenizer
nltk.download('punkt')

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


False

In [8]:
# Open the source PDF with pdfquery; load() is called before the tree is used
# (presumably it is what populates pdf.tree -- confirm against pdfquery docs).
pdf = pq.PDFQuery("kud.pdf")
pdf.load()

# Dump the parsed tree as indented XML to 'kud.xml', the same file name that is
# later passed to extract_lines().
pdf.tree.write('kud.xml', pretty_print=True)
# Bare expression: as the last statement of a notebook cell it displays the object's repr.
pdf

<pdfquery.pdfquery.PDFQuery at 0x11854b880>

In [11]:
def extract_lines(xmlFilename):
    '''Extract all lines of text present in the XML (representing a PDF)

    Parameters:
    xmlFilename (string): the file name of the XML to load. The XML is the PDF converted into XML.

    Returns:
    dict: a dictionary where the key is the identifier (page, line) (e.g. P1L670.11).
        Each value is the list of text tokens found on that page at that vertical position (y0),
        in document order. Line elements that carry no text at all are skipped.
        Example of entries:
        P1L503.007: ['Indestående ', 'Indsat ', 'Bogført ', 'Rente- ']
        P1L492.207: ['Gæld ', 'Hævet ', 'dato ', 'dato ']
        P1L468.447: ['494,42 - ', '3.291,68 ', '+ ', 'Gjensidige Forsikring, D ', '03.10 ', '03.10 ']

    '''
    # Parse the file named by the parameter (the body used to reference an undefined name).
    tree = ET.parse(xmlFilename)
    root = tree.getroot()

    lines = {}

    for page in root.findall(".//LTPage"):
        page_num = int(page.get("pageid"))

        for line in page.findall(".//LTTextLineHorizontal"):
            # Tokens sharing the same page and the same y0 belong to the same visual line.
            y0 = float(line.get("y0"))
            index = f"P{page_num}L{y0}"

            # Prefer the line's own text; otherwise fall back to a nested text box, if any.
            text_element = line.text
            if not text_element:
                box = line.find(".//LTTextBoxHorizontal")
                text_element = box.text if box is not None else None

            # Nothing to record for this element: skip it instead of storing None,
            # which the later cleaning steps cannot handle.
            if text_element is None:
                continue

            lines.setdefault(index, []).append(text_element)

    return lines

In [23]:
# Danish-formatted amount: dot-grouped thousands, optional ",dd" decimals,
# and an optional trailing sign (e.g. '3.291,68' or '494,42 -').
number_pattern = re.compile(r'^(\d{1,3}(?:\.\d{3})*(?:,\d{2})?)\s?([+-]?)$')

def clean_numbers(line):
    '''Cleans the numbers contained in a given line.
    Cleaning up numbers means moving from '494,42 - ' to a float -494.42

    Tokens that are already numbers are kept unchanged (like the other cleaning
    steps do), so the function can safely be applied to an already cleaned line.
    Text tokens that do not look like an amount are kept unchanged too.

    Parameters:
    line (list): the list of tokens of the line

    Returns:
    list: the cleaned line
    '''
    clean_line = []

    for token in line:

        # Already numeric: nothing to parse (and numbers have no .strip()).
        if isinstance(token, (int, float)):
            clean_line.append(token)
            continue

        match = number_pattern.match(token.strip())

        if match is None:
            clean_line.append(token)
            continue

        numeric_part, sign = match.groups()
        # Drop the thousands separators, then turn the decimal comma into a period.
        number = float(numeric_part.replace('.', '').replace(',', '.'))

        # The sign is printed after the amount in the source document.
        clean_line.append(-number if sign == '-' else number)

    return clean_line

In [97]:
# A token consisting of a lone '+' sign, possibly surrounded by whitespace.
useless_tokens_pattern = re.compile(r'^\s*\+\s*$')

def remove_useless_tokens(line):
    '''Drops the tokens that carry no information.
    Currently that means the text tokens made of a single '+' (e.g. ' + ').
    Numbers and every other text token are kept, in their original order.

    Parameters:
    line (list): the line as a list of tokens

    Returns:
    list: a new list without the useless tokens
    '''
    return [
        item
        for item in line
        # Numbers are never matched against the pattern; they are always kept.
        if isinstance(item, (int, float)) or not useless_tokens_pattern.match(item)
    ]

In [109]:
def clean_date(line):
    '''Trims every text token of the line (this is what tidies up the dates).
    Numeric tokens are passed through untouched.

    Parameters:
    line (list): the line as a list of tokens

    Returns:
    list: a new list with the surrounding whitespace removed from text tokens
    '''
    return [
        item if isinstance(item, (int, float)) else item.strip()
        for item in line
    ]

In [133]:
# Splits a text into its runs of word characters (punctuation is dropped).
tokenizer = RegexpTokenizer(r'\w+')
# A token made of 'dd.dd' followed only by optional whitespace.
date_pattern = re.compile(r'\d{2}\.\d{2}\s*$')

def clean_text(line):
    '''Reduces the free-text tokens of a line to their words.
    Numbers and tokens that look like a date are kept as they are; any other
    token is split into words, which are then joined back with single spaces.

    Parameters:
    line (list): the line as a list of tokens

    Returns:
    list: a new list with the cleaned tokens
    '''
    cleaned = []

    for item in line:
        # The date check only runs for non-numeric tokens (short-circuit).
        keep_as_is = isinstance(item, (int, float)) or date_pattern.match(item)

        if keep_as_is:
            cleaned.append(item)
        else:
            words = tokenizer.tokenize(item)
            cleaned.append(' '.join(words))

    return cleaned

In [134]:
def clean_lines(lines):
    '''Takes the lines and cleans up the numbers and the text
    Cleaning up numbers means moving from '494,42 - ' to a float -494.42
    Cleans up dates (e.g. trims)

    Each line goes through, in this order: clean_numbers, remove_useless_tokens,
    clean_date and clean_text.

    Parameters:
    lines (dict): the dictionary of lines (identifier -> list of tokens)

    Returns:
    dict: the same dictionary object, whose values have been replaced by the cleaned lines
    '''
    # Only existing keys are reassigned, so iterating the dict while updating it is safe.
    for key in lines:

        # Clean the numbers
        line = clean_numbers(lines[key])

        # Remove useless tokens
        line = remove_useless_tokens(line)

        # Cleans dates
        line = clean_date(line)

        # Cleans the text description
        lines[key] = clean_text(line)

    return lines

In [135]:
def filter_lines(lines):
    '''Keeps only the lines that correspond to payments.
    A line is considered a payment when it holds at least two numeric tokens.

    Parameters:
    lines (dict): the dictionary of lines

    Returns:
    dict: a new dictionary restricted to the payment lines
    '''
    def _is_payment(tokens):
        # Count the numeric tokens of the line
        numeric_tokens = [t for t in tokens if isinstance(t, (int, float))]
        return len(numeric_tokens) >= 2

    return {
        identifier: tokens
        for identifier, tokens in lines.items()
        if _is_payment(tokens)
    }

In [136]:
def keep_tokens(lines):
    '''Filter the tokens, keeping only the ones that are needed
    Removes the saldo: of all the numbers of a line only the smallest one is kept
    (appended at the end of the line)
    Removes duplicate text tokens (e.g. duplicate dates), keeping the first occurrence

    A line without any number gets no number appended.

    Parameters:
    lines (dict): the dictionary of lines

    Returns:
    dict: a new dictionary with the reduced lines
    '''
    kept_lines = {}

    for key, line in lines.items():

        numbers = []
        clean_line = []

        for token in line:

            if isinstance(token, (int, float)):
                numbers.append(token)
                continue

            # Check that the token is not already in the list (eliminate duplicates)
            if token not in clean_line:
                clean_line.append(token)

        # Use min() on the real values rather than a magic upper bound, so that
        # no placeholder value can ever end up in the output.
        if numbers:
            clean_line.append(min(numbers))

        kept_lines[key] = clean_line

    return kept_lines


In [140]:
# Full pipeline: extract the lines from the XML, clean them up, keep only the
# payment lines, then drop the tokens that are not needed.
lines = keep_tokens(filter_lines(clean_lines(extract_lines("kud.xml"))))

# Build the final list, printing each line as it is collected
data = []
for line in lines.values():
    data.append(line)
    print(line)


['Gjensidige Forsikring D', '03.10', -494.42]
['HK Handels og Kontorf', '03.10', -517.0]
['Krogsgaard Finans', '03.10', -2474.0]
['SuperB Solrød 38962', '03.10', -661.85]
['SuperB Solrød 38965', '03.10', -4.95]
['Q8 Service 8042 96109', '03.10', -620.44]
['Maxi Zoo Gentofte 31308', '03.10', -198.95]
['McDonalds Solroed rest', '04.10', -294.0]
['Google CLOUD LPWQM6 Milan', '04.10', -164.01]
['SuperB Solrød 06761', '04.10', -162.4]
['MobilePay Caroline Boje', '06.10', -328.0]
['7 Eleven B477 48210', '06.10', -81.0]
['SECURITAS A S', '07.10', -210.0]
['Fisketorvet Føtex 20767', '10.10', -868.6]
['Fisketorvet Føtex 21631', '10.10', -599.65]
['P huset Gl Mønt 48275', '10.10', -425.0]
['PAYPAL EASYPARK AS 35314369001', '11.10', -45.71]
['McDonalds Solroed rest', '11.10', -245.0]
['SuperB Solrød 45871', '11.10', -355.95]
['SuitSupply Copenhagen', '12.10', -3249.0]
['4011320147', '13.10', -306589.32]
['SuperB Solrød 47519', '13.10', -224.4]
['SuperB Solrød 50290', '17.10', -631.7]
['McDonalds 