In [1]:
import pandas as pd
import pdfplumber
import os
import numpy as np
import re
from tqdm.notebook import tqdm
from lxml import etree
from collections import Counter
import camelot
import PyPDF2

# Preprocessing work

In [2]:
# Table with the OCR output; later cells read its 'name', 'page' and 'text' columns.
OCR_DATA_PATH = '../data/covid19wob_files_df.tar.xz'
ocrdata = pd.read_csv(OCR_DATA_PATH)

In [3]:
def clean_split(text):
    """Split ``text`` into lower-cased word tokens.

    A token is a maximal run of word characters (``\\w``: letters, digits and
    underscore); everything else acts as a separator and is discarded.

    Parameters
    ----------
    text : str
        The raw text to tokenize.

    Returns
    -------
    list of str
        The tokens in order of appearance, lower-cased. Empty when ``text``
        contains no word characters.
    """
    # Raw string: '\W' / '\w' in a normal string literal is an invalid escape
    # sequence. Collecting runs of \w yields exactly the non-empty pieces
    # that splitting on \W+ would produce.
    return [token.lower() for token in re.findall(r'\w+', text)]

def count_words(text):
    """Return the number of word tokens found in ``text``.

    Tokenization is delegated to ``clean_split``. Any value that is not a
    ``str`` counts as zero words.
    """
    return len(clean_split(text)) if isinstance(text, str) else 0

def count_characters(text):
    """Return the summed length of all word tokens found in ``text``.

    Only the tokens returned by ``clean_split`` are measured, so the
    separators between them do not contribute. Any value that is not a
    ``str`` yields 0.
    """
    if isinstance(text, str):
        return sum(map(len, clean_split(text)))
    return 0

def create_bow(text):
    """Build a bag-of-words (token -> number of occurrences) for ``text``.

    Parameters
    ----------
    text : object
        The text to tokenize with ``clean_split``. Any value that is not a
        ``str`` is treated as having no words.

    Returns
    -------
    collections.Counter
        Token counts. An empty ``Counter`` for non-string input; it is a
        ``dict`` subclass and compares equal to ``{}``, so callers see one
        consistent type on every path.
    """
    if not isinstance(text, str):
        return Counter()

    return Counter(clean_split(text))

In [4]:
# Token and character counts per row of the OCR text (non-string cells give 0).
ocrdata['nr_words'] = ocrdata['text'].map(count_words)
ocrdata['nr_chars'] = ocrdata['text'].map(count_characters)

In [4]:
# One bag-of-words per OCR row, keyed as '<name>-<page>'.
# NOTE(review): the final row is skipped by the [:-1] slice; presumably it is
# an incomplete record. Confirm against the data.
bows_ocr = {
    str(row['name']) + '-' + str(int(row['page'])): create_bow(row['text'])
    for _, row in ocrdata[:-1].iterrows()
}

In [5]:
dirpath = '../data/txt/txt//'
TXT_SUFFIX = '.txt'

# One bag-of-words per text file in dirpath, keyed by the file name without
# its '.txt' suffix.
bows = {}
for txtfile in tqdm(os.listdir(dirpath)):

    # Context manager so each file handle is closed as soon as it is read.
    with open(dirpath + txtfile, encoding='utf-8') as handle:
        contents = handle.read()

    # str.strip('.txt') removes any of the characters '.', 't', 'x' from both
    # ends of the name (e.g. 'text-1.txt' -> 'e-1'), so cut off the
    # literal suffix instead.
    if txtfile.endswith(TXT_SUFFIX):
        page_key = txtfile[:-len(TXT_SUFFIX)]
    else:
        page_key = txtfile

    bows[page_key] = create_bow(contents)


HBox(children=(FloatProgress(value=0.0, max=33905.0), HTML(value='')))




In [6]:
# Keep only the pages present in both sources so they can be compared key by key.
shared_pages = bows_ocr.keys() & bows.keys()
bows_ocr = {page: bag for page, bag in bows_ocr.items() if page in shared_pages}
bows = {page: bag for page, bag in bows.items() if page in shared_pages}

# Let's compare!

In [7]:
# Sanity check: no page key may exist only on the OCR side,
# so this should be an empty set!
only_in_ocr = set(bows_ocr).difference(bows)
print(only_in_ocr)

# Both collections are expected to hold the same number of pages.
print(len(bows_ocr), len(bows))

set()
28331 28331


In [8]:
# Page keys whose bag-of-words holds no tokens at all, per extraction method.
empty_bags = [page for page, tokens in bows.items() if len(tokens) == 0]
empty_bags_ocr = [page for page, tokens in bows_ocr.items() if len(tokens) == 0]

print(f'Pages with no found words (pdf to text): {len(empty_bags)} ({round(len(empty_bags) / len(bows) * 100, 2)}%)')
print(f'Pages with no found words (ocr): {len(empty_bags_ocr)} ({round(len(empty_bags_ocr) / len(bows_ocr) * 100, 2)}%)')

Pages with no found words (pdf to text): 6586 (23.25%)
Pages with no found words (ocr): 121 (0.43%)


## How many increases/decreases?

In [9]:
# Increases
# Per page: total token count in the OCR bag minus total token count in the
# pdf-to-text bag, so a positive value means OCR produced more tokens.
increases = {
    page: sum(bows_ocr[page].values()) - sum(bows[page].values())
    for page in bows
}

increases_ocr = pd.Series(increases)

In [13]:
# increases_ocr holds (OCR token count - pdf-to-text token count) per page.
# Report how many pages lost tokens, gained tokens, or stayed the same,
# each with its share of all pages as a percentage.
print(f"Decreases: {(increases_ocr < 0).sum()} ({round((increases_ocr < 0).mean() * 100, 2)}%)")
print(f"Increases: {(increases_ocr > 0).sum()} ({round((increases_ocr > 0).mean() * 100, 2)}%)")
print(f"Same amount: {(increases_ocr == 0).sum()} ({round((increases_ocr == 0).mean() * 100, 2)}%)")

# Extremes and average of the per-page difference.
print(f"Highest increase: {increases_ocr.max()}. Biggest decrease: {increases_ocr.min()}")
print(f"Mean increase: {increases_ocr.mean()}")

# Bare last expression: displays the pages ordered from the biggest decrease
# to the biggest increase.
increases_ocr.sort_values()

Decreases: 10728 (37.87%)
Increases: 13798 (48.7%)
Same amount: 3805 (13.43%)
Highest increase: 2021. Biggest decrease: -1097
Mean increase: 47.5712470438742


e894904cd8de2892fa1143092d638832_bijlage-3-bij-besluit-op-wob-verzoek-over-contacten-bedrijfsleven-coronasteunmaatregelen-41   -1097
5d03dc8b8621ea5e53ad6fe7b44c8533_wob-verzoek-inz-coronamaatregelen-op-scholen-16-37                                            -1029
5d03dc8b8621ea5e53ad6fe7b44c8533_wob-verzoek-inz-coronamaatregelen-op-scholen-16-33                                            -1014
e894904cd8de2892fa1143092d638832_bijlage-2-bij-besluit-op-wob-verzoek-over-contacten-bedrijfsleven-coronasteunmaatregelen-1     -978
5d03dc8b8621ea5e53ad6fe7b44c8533_wob-verzoek-inz-coronamaatregelen-op-scholen-14-34                                             -946
                                                                                                                                ... 
40f5564f839324b9af20c295dd261007_wob-documenten-78                                                                              1334
8a676a3415d986008b02572cb9310894_besluit-covid-19-suriname-inclusief-

## Subsets

In [14]:
# For every page: is each distinct pdf-to-text token also present among the OCR tokens?
subsets = pd.Series({
    page: bows[page].keys() <= bows_ocr[page].keys()
    for page in bows
})

In [15]:
# Number (and percentage) of pages whose pdf-to-text vocabulary is fully contained in the OCR vocabulary.
print(f"{subsets.sum()} ({round(subsets.sum() / subsets.count() * 100, 2)}%) docs are a subset of the ocr-ed doc")

9204 (32.49%) docs are a subset of the ocr-ed doc


In [21]:
# Third-party Dutch word list; later cells use it to count how many extracted
# tokens are known Dutch words.
import dutch_words

# NOTE(review): get_ranked() presumably returns the words ordered by frequency.
# Confirm against the dutch_words package documentation.
words = dutch_words.get_ranked()

In [26]:
dutch_words_counter = {}

# Build the lookup once, outside the loop: `word in <list>` is a linear scan,
# and it runs for every distinct token of every page. A set gives
# constant-time membership with the same result.
# NOTE(review): `words` is presumably a list of strings; set() is harmless if
# it already is a set.
dutch_vocabulary = set(words)

# Per page: how many token occurrences are known Dutch words, for each
# extraction method.
for doc in tqdm(bows):

    ocr_dutch_words = sum(
        count for word, count in bows_ocr[doc].items() if word in dutch_vocabulary
    )
    pdftotext_dutch_words = sum(
        count for word, count in bows[doc].items() if word in dutch_vocabulary
    )

    dutch_words_counter[doc] = {'pdftotext': pdftotext_dutch_words, 'ocr': ocr_dutch_words}

# Outer dict keys (pages) become columns; 'pdftotext' and 'ocr' become the rows.
dutch_words_df = pd.DataFrame(dutch_words_counter)

HBox(children=(FloatProgress(value=0.0, max=28331.0), HTML(value='')))




In [27]:
# Preview the counts; pages are the columns, so the frame is very wide with only two rows.
dutch_words_df.head()

Unnamed: 0,0068ed0b40cca6270f857d2614cc63c0_besluit-1,0068ed0b40cca6270f857d2614cc63c0_besluit-2,0068ed0b40cca6270f857d2614cc63c0_besluit-3,0068ed0b40cca6270f857d2614cc63c0_document-1,0272cdb141e62321341591f0959794a2_derde-deelbesluit-aantal-aanwezigen-in-kerken-1,0272cdb141e62321341591f0959794a2_derde-deelbesluit-aantal-aanwezigen-in-kerken-10,0272cdb141e62321341591f0959794a2_derde-deelbesluit-aantal-aanwezigen-in-kerken-11,0272cdb141e62321341591f0959794a2_derde-deelbesluit-aantal-aanwezigen-in-kerken-2,0272cdb141e62321341591f0959794a2_derde-deelbesluit-aantal-aanwezigen-in-kerken-3,0272cdb141e62321341591f0959794a2_derde-deelbesluit-aantal-aanwezigen-in-kerken-4,...,ec59d18227ae89899be3f69f57879a60_samengevoegde-openbaar-te-maken-stukken-besluit-op-bezwaar-90,ec59d18227ae89899be3f69f57879a60_samengevoegde-openbaar-te-maken-stukken-besluit-op-bezwaar-91,ec59d18227ae89899be3f69f57879a60_samengevoegde-openbaar-te-maken-stukken-besluit-op-bezwaar-92,ec59d18227ae89899be3f69f57879a60_samengevoegde-openbaar-te-maken-stukken-besluit-op-bezwaar-93,ec59d18227ae89899be3f69f57879a60_samengevoegde-openbaar-te-maken-stukken-besluit-op-bezwaar-94,ec59d18227ae89899be3f69f57879a60_samengevoegde-openbaar-te-maken-stukken-besluit-op-bezwaar-95,ec59d18227ae89899be3f69f57879a60_samengevoegde-openbaar-te-maken-stukken-besluit-op-bezwaar-96,ec59d18227ae89899be3f69f57879a60_samengevoegde-openbaar-te-maken-stukken-besluit-op-bezwaar-97,ec59d18227ae89899be3f69f57879a60_samengevoegde-openbaar-te-maken-stukken-besluit-op-bezwaar-98,ec59d18227ae89899be3f69f57879a60_samengevoegde-openbaar-te-maken-stukken-besluit-op-bezwaar-99
pdftotext,212,324,332,70,0,0,234,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ocr,216,324,329,73,272,119,234,331,439,445,...,91,45,98,64,103,102,100,40,104,58
