In [7]:
import pandas as pd
import pdfplumber
import os
import numpy as np
import re
from tqdm.notebook import tqdm
from lxml import etree
from collections import Counter

# Preprocessing work

In [2]:
# Load the table that the cells below treat as the OCR text source; they read
# its `text`, `name` and `page` columns.
# NOTE(review): the file is a .tar.xz archive handed straight to read_csv, which
# infers the compression from the file extension — confirm the installed pandas
# version unpacks tar archives correctly (check the first and last rows).
ocrdata = pd.read_csv('../data/covid19wob_files_df.tar.xz')

In [22]:
def clean_split(text):
    """Split ``text`` into lowercase word tokens.

    The text is split on every run of non-word characters, the empty
    fragments that appear at the string boundaries are discarded, and the
    remaining tokens are lowercased.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        Lowercased tokens in order of appearance; an empty list when
        ``text`` contains no word characters.
    """
    # Raw string literal: a backslash-W inside a normal string literal is an
    # invalid escape sequence and triggers a warning on recent Python versions.
    splitted = re.split(r'\W+', text)

    # re.split yields '' when the text starts or ends with a separator.
    return [word.lower() for word in splitted if word]

def count_words(text):
    """Return the number of word tokens in ``text``.

    Values that are not a ``str`` yield 0; strings are tokenized with
    ``clean_split`` and the resulting tokens are counted.
    """
    return len(clean_split(text)) if isinstance(text, str) else 0

def count_characters(text):
    """Return the total length of all word tokens in ``text``.

    Separators are not counted: only the characters inside the tokens
    produced by ``clean_split`` contribute. Values that are not a ``str``
    yield 0.
    """
    if isinstance(text, str):
        return sum(len(token) for token in clean_split(text))

    return 0

def create_bow(text):
    """Build a bag of words (token -> frequency) for ``text``.

    Parameters
    ----------
    text : object
        Text to tokenize with ``clean_split``. Anything that is not a
        ``str`` is treated as having no words.

    Returns
    -------
    collections.Counter
        Token frequencies; an empty ``Counter`` for non-string input or
        text without word characters.
    """
    if not isinstance(text, str):
        # Return an empty Counter rather than a plain dict so that every bag
        # supports the Counter API (e.g. most_common). It still compares
        # equal to {} and has length 0.
        return Counter()

    return Counter(clean_split(text))

In [4]:
# Per-row word and character counts derived from the `text` column.
# The counting functions are handed to `map` directly; they return 0 for
# cells that are not strings.
ocrdata['nr_words'] = ocrdata['text'].map(count_words)
ocrdata['nr_chars'] = ocrdata['text'].map(count_characters)

In [None]:
# One bag of words per row of `ocrdata`, keyed by '<name>-<page>'.
# NOTE(review): the [:-1] slice leaves out the final row of `ocrdata`; the
# reason is not visible in this notebook — confirm it is intentional.
bows_ocr = {
    str(name) + '-' + str(int(page)): create_bow(text)
    for name, page, text in ocrdata[:-1][['name', 'page', 'text']].itertuples(index=False, name=None)
}

In [15]:
import csv

dirpath = '../data/XMLs//'

# One bag of words per <page> element of every XML file in `dirpath`, keyed by
# '<file name without extension>-<1-based position of the page in the file>'.
bows = {}
# sorted() makes the insertion order of `bows` reproducible; os.listdir
# returns entries in arbitrary order.
for xmlfile in sorted(os.listdir(dirpath)):

    # Use splitext instead of xmlfile.strip('.xml'): str.strip removes any
    # leading/trailing '.', 'x', 'm' or 'l' characters rather than the suffix,
    # so file names starting or ending with those letters got truncated keys.
    stem, extension = os.path.splitext(xmlfile)

    # Skip directory entries that are not XML files (hidden files, checkpoint
    # folders, ...); etree.parse would raise on them.
    if extension.lower() != '.xml':
        continue

    root = etree.parse(os.path.join(dirpath, xmlfile))
    pages = root.xpath('//page')

    for pagenumber, page in enumerate(pages, start=1):

        # Only text nodes directly inside <text> children of the page are joined.
        # NOTE(review): text nested in child elements of <text> is not
        # collected by this XPath — confirm that is intended.
        bows[stem + '-' + str(pagenumber)] = create_bow(' '.join(page.xpath('text/text()')))


In [55]:
# Keep only the pages present in both sources. After the first line `bows_ocr`
# holds the shared keys, so filtering `bows` against it gives the same key set.
bows_ocr = {page: bag for page, bag in bows_ocr.items() if page in bows}
bows = {page: bag for page, bag in bows.items() if page in bows_ocr}

# Let's compare!

In [56]:
# Sanity check: no key may exist only in the OCR bags, so this prints an empty set.
print(set(bows_ocr).difference(bows))

# Both dictionaries are expected to hold the same number of pages.
print(len(bows_ocr), len(bows))

set()
30881 30881


In [63]:
# A page counts as empty when its bag of words contains no tokens at all.
empty_bags = [page for page, bag in bows.items() if not bag]
empty_bags_ocr = [page for page, bag in bows_ocr.items() if not bag]

# Absolute number and share of empty pages per extraction method.
print(f'Pages with no found words (pdf to text): {len(empty_bags)} ({round(len(empty_bags) / len(bows) * 100, 2)}%)')
print(f'Pages with no found words (ocr): {len(empty_bags_ocr)} ({round(len(empty_bags_ocr) / len(bows_ocr) * 100, 2)}%)')

Pages with no found words (pdf to text): 7550 (24.45%)
Pages with no found words (ocr): 345 (1.12%)


## How many increases/decreases?

In [110]:
# Per page: number of tokens found by OCR minus number of tokens found by
# pdf-to-text. Positive values mean OCR found more tokens.
increases = {
    page: sum(bows_ocr[page].values()) - sum(pdf_bag.values())
    for page, pdf_bag in bows.items()
}

increases_ocr = pd.Series(increases)

In [93]:
# `increases_ocr` holds, per page, the OCR token count minus the pdf-to-text
# token count. Report how many pages lost tokens, gained tokens or stayed equal;
# the mean of each boolean mask gives the corresponding share of pages.
print(f"Decreases: {(increases_ocr < 0).sum()} ({round((increases_ocr < 0).mean() * 100, 2)}%)")
print(f"Increases: {(increases_ocr > 0).sum()} ({round((increases_ocr > 0).mean() * 100, 2)}%)")
print(f"Same amount: {(increases_ocr == 0).sum()} ({round((increases_ocr == 0).mean() * 100, 2)}%)")

# Extremes and average of the per-page difference.
print(f"Highest increase: {increases_ocr.max()}. Biggest decrease: {increases_ocr.min()}")
print(f"Mean increase: {increases_ocr.mean()}")

# Last expression of the cell: shows the differences sorted from the biggest
# decrease to the biggest increase.
increases_ocr.sort_values()

Decreases: 10559 (34.19%)
Increases: 16442 (53.24%)
Same amount: 3880 (12.56%)
Highest increase: 2021. Biggest decrease: -2310
Mean increase: 57.8344613192578


d8d9c5015c9ceb952052f29e1a27ed1f_inventarislijst-deel-1-1                                                           -2310
3fa52482438ed25b69b0be300baaf3c0_documenten-292                                                                     -1245
d8d9c5015c9ceb952052f29e1a27ed1f_inventarislijst-deel-2-1                                                           -1198
3fa52482438ed25b69b0be300baaf3c0_documenten-276                                                                      -936
3fa52482438ed25b69b0be300baaf3c0_documenten-313                                                                      -930
                                                                                                                     ... 
40f5564f839324b9af20c295dd261007_wob-documenten-79                                                                   1255
40f5564f839324b9af20c295dd261007_wob-documenten-78                                                                   1334
8a676a3415d986008b02572c

In [120]:
# Problem: the pdf-to-text extraction does not always go well... This explains many of the decreases.
# So a decrease is often actually a good sign...
# Compare the ten most frequent tokens of one page with a large decrease:
# first the pdf-to-text bag, then the OCR bag.
print(bows['3fa52482438ed25b69b0be300baaf3c0_documenten-276'].most_common(10))
print(bows_ocr['3fa52482438ed25b69b0be300baaf3c0_documenten-276'].most_common(10))

[('n', 165), ('e', 113), ('i', 96), ('l', 67), ('t', 60), ('m', 47), ('de', 45), ('d', 39), ('a', 35), ('k', 31)]
[('de', 31), ('van', 20), ('voor', 17), ('en', 12), ('in', 10), ('te', 10), ('dit', 9), ('het', 8), ('e', 7), ('een', 6)]


## Subsets

In [112]:
# For every page: is the pdf-to-text vocabulary fully contained in the OCR
# vocabulary? Only the distinct tokens matter here, not their frequencies.
# A page whose pdf-to-text bag is empty is trivially a subset.
subsets = pd.Series({
    page: set(pdf_bag).issubset(bows_ocr[page])
    for page, pdf_bag in bows.items()
})

In [113]:
# `subsets` is a boolean Series with one entry per page key ('<file>-<page>'):
# sum() counts the True entries, count() the number of entries.
print(f"{subsets.sum()} ({round(subsets.sum() / subsets.count() * 100, 2)}%) docs are a subset of the ocr-ed doc")

10705 (34.67%) docs are a subset of the ocr-ed doc
