In [1]:
import sys
sys.path.append('..')

In [2]:
import os
from IPython.display import display, clear_output
import ipywidgets as widgets
from PIL import Image
from pathlib import Path
import random

from lib.data import load_extracted_samples
from lib.ocr import OCRParams, ocr_sample
from lib.text import extract_total

In [3]:
# Review queue: only the loaded samples whose "text" entry is empty,
# shuffled with a fixed seed so the order is the same on every run.
samples = [
    candidate
    for candidate in load_extracted_samples()
    if len(candidate["text"]) == 0
]
random.Random(42).shuffle(samples)

# Filenames rejected via the "TOTAL MISMATCH" button (in-memory only).
skip_files = set()

In [4]:
# Root directory holding one sub-directory per sample filename; a confirmed
# total is stored there as "total.txt" (written by report_ok, checked by find_next).
OCR_CACHE_PATH = Path("../data-ocr")

def find_next():
    """Find the next sample that still needs a manual check of its total.

    Walks the module-level ``samples`` list in order and returns the first
    sample that
      * has not been rejected in this session (its filename is not in
        ``skip_files``),
      * has no ``total.txt`` under ``OCR_CACHE_PATH / <filename>`` yet, and
      * yields a non-``None`` value from ``extract_total`` on its OCR text.

    Returns:
        A ``(total, sample)`` tuple, or ``None`` when no sample qualifies.
    """
    params = OCRParams("easyocr", grayscale=False, threshold=None)

    for sample in samples:
        filename = sample["filename"]

        # Rejected via skip(): without this check the very same sample would
        # be returned again on every call.
        if filename in skip_files:
            continue

        # Already confirmed. Checked before OCR so ocr_sample is not called
        # for samples that would be discarded anyway.
        if (OCR_CACHE_PATH / filename / "total.txt").exists():
            continue

        text = ocr_sample(sample, params)
        total = extract_total(text)
        if total is not None:
            return total, sample

    return None

def report_ok(sample, total):
    """Persist a confirmed total for ``sample`` and move on to the next one.

    Creates ``OCR_CACHE_PATH / <filename>`` if needed, writes ``total``
    followed by a newline to ``total.txt`` inside it, then calls
    ``show_next()``.
    """
    sample_dir = OCR_CACHE_PATH / sample["filename"]
    sample_dir.mkdir(parents=True, exist_ok=True)

    confirmed_line = str(total) + "\n"
    (sample_dir / "total.txt").write_text(confirmed_line)

    show_next()
    
def skip(sample):
    """Record ``sample`` as rejected for this session and show the next one.

    The sample's filename is added to the module-level ``skip_files`` set;
    nothing is written to disk.
    """
    rejected_name = sample["filename"]
    skip_files.add(rejected_name)
    show_next()

def show_next():
    """Render the review UI for the next unconfirmed sample.

    Clears the cell output, asks ``find_next()`` for a candidate and shows
    its filename, the extracted total, a "TOTAL MATCH" button (calls
    ``report_ok``), a "TOTAL MISMATCH" button (calls ``skip``) and the image
    stored as the last entry of ``sample["images"]``.

    When ``find_next()`` returns ``None`` a short message is printed instead
    of failing on tuple unpacking.
    """
    clear_output()

    found = find_next()
    if found is None:
        # Nothing left to review: unpacking None would raise TypeError.
        print("No more samples to review.")
        return
    total, sample = found

    # image: read inside a context manager so the file handle is closed
    image_path = sample["images"][-1]
    with open(image_path, 'rb') as image_file:
        image_bytes = image_file.read()
    img = widgets.Image(
        value=image_bytes,
        width=800,
        height=400,
    )

    # ok button
    button_ok = widgets.Button(description='TOTAL MATCH')
    button_ok.on_click(lambda _: report_ok(sample, total))

    # wrong button
    button_wrong = widgets.Button(description='TOTAL MISMATCH')
    button_wrong.on_click(lambda _: skip(sample))

    print(sample["filename"])
    print("TOTAL: " + f'{total:,}')
    display(button_ok)
    display(button_wrong)
    display(img)

In [5]:
# Start the interactive review: shows the first sample returned by find_next();
# each button click then re-renders this cell's output with the next sample.
show_next()

2023-05-18_1169.pdf
TOTAL: 24,651.87


Button(description='TOTAL MATCH', style=ButtonStyle())

Button(description='TOTAL MISMATCH', style=ButtonStyle())

Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x03k\x00\x00\x04l\x08\x02\x00\x00\x00\x8fu\x99\xf1\x…