In [1]:
import sys
# Add the parent directory to the module search path. The `lib.*` imports in
# the next cell presumably resolve from there -- verify against the repo layout.
sys.path.append('..')

In [2]:
from tqdm import tqdm

from lib.data import load_all_invoices, load_extracted_samples
from lib.details_ocr import (extract_lines_ocr, filter_detail_lines,
                             split_detail_line)
from lib.details_text import extract_details_from_text
from lib.invoice import find_invoice_cached
from lib.line_classification import is_professional
from lib.ocr import get_ocrs

In [3]:
# Load every invoice identifier; these become the keys of `invoice_dict`
# (and therefore the rows of the submission file) further down.
all_invoices = load_all_invoices()
# Display how many invoices were loaded.
len(all_invoices)

8411

In [4]:
# Load the extracted samples. Each sample is indexed below by its "filename"
# and "text" keys and is also passed to find_invoice_cached / get_ocrs.
samples = load_extracted_samples()
# Display how many samples were loaded.
len(samples)

8411

In [5]:
# Share of `None` entries returned by filter_detail_lines above which the OCR
# detail lines of a sample are discarded entirely.
MAX_NONE_RATIO = 0.3

# Every known invoice starts without a total (None); only invoices for which a
# non-zero professional total is extracted below receive a value.
invoice_dict = dict.fromkeys(all_invoices)

for sample in tqdm(samples):
    invoice = find_invoice_cached(sample)
    detail_lines = []

    if sample["text"] != "":
        # Non-empty text: parse the detail lines directly from it.
        detail_lines = extract_details_from_text(sample["text"])
    else:
        # No text: fall back to the "paddleocr_deskew" OCR result, if present.
        ocrs = get_ocrs(sample)
        if "paddleocr_deskew" in ocrs:
            # Collect the lines of all pages; extend() avoids rebuilding the
            # list for every page.
            all_lines = []
            for ocr_page in ocrs["paddleocr_deskew"]:
                all_lines.extend(extract_lines_ocr(ocr_page["boxes"]))

            num_none = 0
            num_not_none = 0

            for line in filter_detail_lines(all_lines):
                if line is not None:
                    num_not_none += 1
                    detail_lines.append(split_detail_line(line))
                else:
                    num_none += 1

            # Too many `None` entries: drop the OCR details for this sample.
            # The count is checked first because filter_detail_lines may yield
            # nothing, and dividing by zero would abort the whole run.
            num_candidates = num_none + num_not_none
            if num_candidates and num_none / num_candidates > MAX_NONE_RATIO:
                detail_lines = []

    # Sum the totals of the lines classified as professional; entries that are
    # None are skipped.
    total = sum(
        line["total"]
        for line in detail_lines
        if line is not None and is_professional(line["desc"])
    )

    # A zero sum is treated as "nothing extracted", and samples whose invoice
    # could not be identified are ignored.
    if invoice is not None and total != 0:
        invoice_dict[invoice] = total

100%|██████████| 8411/8411 [11:16<00:00, 12.43it/s]  


In [6]:
# Totals of the invoices for which the extraction above produced a value.
all_totals = [value for value in invoice_dict.values() if value is not None]
extracted_count = len(all_totals)

# Mean extracted total, rounded to two decimals.
avg = round(sum(all_totals) / extracted_count, 2)

# Display: number of extracted totals, that number relative to the sample
# count, and the mean.
(extracted_count, extracted_count / len(samples), avg)

(5249, 0.6240637260729996, 11013.37)

In [9]:
# Write the submission CSV: a header row, then one row per invoice.
with open("submission.csv", 'w') as f:
    f.write('Invoice Number,Total Charged\n')
    for invoice, total in invoice_dict.items():
        # Invoices without an extracted total fall back to `avg`, the mean of
        # the extracted totals computed in the summary cell. Using the variable
        # instead of a pasted number keeps the fallback in sync with the
        # current extraction run.
        charged = avg if total is None else total
        f.write(f'{invoice},{charged}\n')