In [1]:
import sys

# Put the parent directory on the import path (presumably where the `lib`
# package imported below lives -- confirm). The membership guard keeps the
# cell idempotent: re-running it no longer piles duplicate '..' entries
# onto sys.path.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
import json

import pandas as pd
import pyocr
import pyocr.builders
import pytesseract
from PIL import Image
from tqdm import tqdm

from lib.data import load_all_invoices, load_extracted_samples
from lib.ocr import ImagePreprocessParams, ocr_images
from lib.text import extract_invoice

In [3]:
# Collection of known invoices; run_evaluation() tests extracted values for
# membership in it.
all_invoices = load_all_invoices()
# Last expression of the cell: displays how many invoices were loaded.
len(all_invoices)

8411

In [4]:
# Keep only the samples whose "text" field is empty (presumably documents
# without an embedded text layer, i.e. the ones that need OCR -- confirm).
samples = [
    sample
    for sample in load_extracted_samples()
    if len(sample["text"]) == 0
]

In [5]:
def run_evaluation(params):
    """OCR every entry of the global `samples` and tally invoice-extraction outcomes.

    For each sample the images are OCR'd with `params`, `extract_invoice` is
    run on the resulting text, and the outcome is counted in one of five
    buckets. The sample's filename without its ".pdf" extension is used as
    the reference string searched for in the OCR text (presumably the true
    invoice identifier -- confirm against the dataset).

    Args:
        params: Preprocessing parameters forwarded unchanged to `ocr_images`.

    Returns:
        A `(stats, not_detect_but_present_texts)` tuple:

        * `stats` -- dict of counters:
            - "detected_correctly": an invoice was extracted and it is a
              member of `all_invoices`.
            - "detected_incorrectly_but_correct_in_text": an invoice was
              extracted, it is not in `all_invoices`, but the reference
              string does occur in the OCR text.
            - "detected_incorrectly": an invoice was extracted, it is not in
              `all_invoices`, and the reference string is not in the text.
            - "not_detected_but_in_text": nothing was extracted although the
              reference string occurs in the OCR text.
            - "not_detected": nothing was extracted and the reference string
              is not in the text.
        * `not_detect_but_present_texts` -- one diagnostic string per
          "..._in_text" case, formatted as
          "<prefix>|||<reference>|||<OCR context around the reference>".

    NOTE(review): "detected_correctly" only checks that the extracted value
    exists somewhere in `all_invoices`; it does not compare it with this
    sample's own reference string. Confirm that this is the intended metric.
    """
    context_margin = 20  # characters of OCR text kept on each side of a match

    stats = {
        "detected_correctly": 0,
        "detected_incorrectly": 0,
        "detected_incorrectly_but_correct_in_text": 0,
        "not_detected": 0,
        "not_detected_but_in_text": 0,
    }
    not_detect_but_present_texts = []

    def add_to_present_texts(prefix, name, text):
        # Only called when `name` occurs in `text`, so `pos` is never -1.
        pos = text.find(name)
        # The window ends `context_margin` characters *after* the match, so the
        # whole reference string plus trailing context is kept. max() stops a
        # negative start index from wrapping around to the end of the text;
        # slicing clamps the upper bound on its own.
        snippet = text[max(0, pos - context_margin):pos + len(name) + context_margin]
        not_detect_but_present_texts.append(prefix + "|||" + name + "|||" + snippet)

    for sample in tqdm(samples):
        text = ocr_images(sample["images"], params)

        # Strip the extension only when it really is ".pdf"; a blind [:-4]
        # would chop four characters off any other filename.
        sample_filename = sample["filename"]
        if sample_filename.lower().endswith(".pdf"):
            sample_filename = sample_filename[:-4]

        invoice = extract_invoice(text)
        filename_in_text = sample_filename in text

        if invoice is not None:
            # Something was extracted: does it exist in the dataset?
            if invoice in all_invoices:
                stats["detected_correctly"] += 1
            elif filename_in_text:
                stats["detected_incorrectly_but_correct_in_text"] += 1
                add_to_present_texts("wrong match (" + invoice + ")", sample_filename, text)
            else:
                stats["detected_incorrectly"] += 1
        elif filename_in_text:
            # Nothing was extracted although the reference string was OCR'd.
            stats["not_detected_but_in_text"] += 1
            add_to_present_texts("no match", sample_filename, text)
        else:
            stats["not_detected"] += 1

    return stats, not_detect_but_present_texts

In [6]:
# One result row per evaluated `threshold_value`.
runs = []

for thr in (100, 130, 160, 180, 200):
    params = ImagePreprocessParams(threshold_value=thr)
    stats, not_detect_but_present_texts = run_evaluation(params)

    # Counters from this run plus the threshold that produced them.
    runs.append({**stats, "threshold_value": params.threshold_value})

    # The file is rewritten after every threshold, so the rows gathered so
    # far are on disk even if the sweep is interrupted.
    with open("results.json", "w") as f:
        f.write(json.dumps(runs, indent=2))

 12%|█▏        | 9/78 [00:06<00:48,  1.44it/s]