In [11]:
import os
import subprocess
from subprocess import list2cmdline


def _rm_on_fail(name, ground_truth_dir):
    print(f'rm {name} files (png, box, gt.txt, lstmf)')
    os.remove(os.path.join(ground_truth_dir, f'{name}.box'))
    os.remove(os.path.join(ground_truth_dir, f'{name}.jpg'))
    os.remove(os.path.join(ground_truth_dir, f'{name}.gt.txt'))
    os.remove(os.path.join(ground_truth_dir, f'{name}.lstmf'))


def _run_prepare(name, ground_truth_dir, tesstrain_dir):
    """Prepare one sample for LSTM training.

    Steps, in order:

    1. Rewrite ``<name>.gt.txt`` so that it contains only its first line,
       stripped of surrounding whitespace.
    2. Run ``generate_line_box.py`` from ``tesstrain_dir`` on the image and
       the transcription, capturing its stdout into ``<name>.box``.
    3. Run ``tesseract <name>.jpg <name> --psm 13 lstm.train``.

    If step 2 or step 3 cannot be started or exits with a non-zero status,
    the error is printed, the sample's files are deleted and the function
    returns without running any remaining step.

    Args:
        name: Sample base name, without extension.
        ground_truth_dir: Directory holding ``<name>.jpg`` and
            ``<name>.gt.txt``; generated files are written there too.
        tesstrain_dir: Directory that contains ``generate_line_box.py``.

    Raises:
        OSError: If ``<name>.gt.txt`` cannot be opened for read/write.
    """
    image_path = os.path.join(ground_truth_dir, f'{name}.jpg')
    gt_path = os.path.join(ground_truth_dir, f'{name}.gt.txt')
    box_path = os.path.join(ground_truth_dir, f'{name}.box')

    # Keep only the first line of the transcription, stripped. UTF-8 is
    # requested explicitly instead of relying on the locale default; the
    # box generator below is run with UTF-8 I/O as well.
    with open(gt_path, "r+", encoding="utf-8") as f:
        line = f.readline().strip()
        f.seek(0)
        f.write(line)
        f.truncate()

    # Box generation. The command is passed as an argv list (no shell), so
    # paths need no quoting; the encoding override goes through ``env``.
    box_cmd = [
        "python3", os.path.join(tesstrain_dir, "generate_line_box.py"),
        "-i", image_path,
        "-t", gt_path,
    ]
    child_env = {**os.environ, "PYTHONIOENCODING": "utf-8"}
    try:
        # The box file is closed before any cleanup runs.
        with open(box_path, "w") as box_file:
            # check=True turns a non-zero exit status into CalledProcessError;
            # without it a failed command would go unnoticed.
            subprocess.run(box_cmd, stdout=box_file, env=child_env, check=True)
    except (OSError, subprocess.SubprocessError) as e:
        _discard_failed_sample(name, ground_truth_dir, e)
        return  # the sample is gone; there is nothing left to train on

    # Tesseract command: writes its output next to the image, using the
    # sample base name as the output base.
    train_cmd = [
        "tesseract",
        image_path,
        os.path.join(ground_truth_dir, f"{name}"),
        "--psm", "13", "lstm.train",
    ]
    try:
        subprocess.run(train_cmd, check=True)
    except (OSError, subprocess.SubprocessError) as e:
        _discard_failed_sample(name, ground_truth_dir, e)


def _discard_failed_sample(name, ground_truth_dir, error):
    """Print ``error`` and delete the files of the sample that caused it."""
    print(error)
    try:
        _rm_on_fail(name, ground_truth_dir)
    except FileNotFoundError:
        # Cleanup is best-effort: an artefact that was never produced
        # (e.g. the .lstmf when box generation failed) is not an error.
        pass


In [12]:
import concurrent
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

par_factor = 8


def run_prepare(boxes, ground_truth_dir, tesstrain_dir):
    """Run ``_run_prepare`` for every name in ``boxes`` on a thread pool.

    The pool has ``par_factor`` worker threads. A progress bar sized to
    ``len(boxes)`` advances by one each time a task finishes without
    raising. An exception raised inside a task is re-raised here when that
    task's result is collected.

    Args:
        boxes: Sized iterable of sample base names.
        ground_truth_dir: Passed through to ``_run_prepare``.
        tesstrain_dir: Passed through to ``_run_prepare``.
    """
    with tqdm(total=len(boxes)) as pbar, ThreadPoolExecutor(max_workers=par_factor) as executor:
        # Queue every sample up front, then consume results as they finish.
        pending = []
        for sample_name in boxes:
            pending.append(
                executor.submit(_run_prepare, sample_name, ground_truth_dir, tesstrain_dir)
            )

        for finished in concurrent.futures.as_completed(pending):
            # result() propagates any exception raised by the task.
            finished.result()
            pbar.update(1)

In [13]:
# NOTE(review): machine-specific absolute path; adjust it when running elsewhere.
ground_truth_dir = '/Users/panagoa/PycharmProjects/zbze_ocr/tesseract/finetune_boxes/output'
ground_truth_files = os.listdir(ground_truth_dir)

# Bucket the directory listing by file type. The '.txt' filter matches any
# name ending in '.txt', which includes the '.gt.txt' transcriptions.
txt = [f for f in ground_truth_files if f.endswith('.txt')]
img = [f for f in ground_truth_files if f.endswith('.jpg')]
box = [f for f in ground_truth_files if f.endswith('.box')]
lstmf = [f for f in ground_truth_files if f.endswith('.lstmf')]

print(len(txt), len(img), len(box), len(lstmf))

# Alternative selection: only the images that have no .lstmf yet.
# wo_boxes = set([f.replace('.jpg', '') for f in img]).difference([f.replace('.lstmf', '') for f in lstmf])

# Sample base names: strip only the trailing extension. str.replace would
# also remove a '.jpg' occurring in the middle of a file name.
wo_boxes = {os.path.splitext(f)[0] for f in img}
print(len(wo_boxes))
run_prepare(wo_boxes, ground_truth_dir, tesstrain_dir='../../tesstrain')

3073 3072 3073 3072
3072


100%|██████████| 3072/3072 [01:09<00:00, 43.95it/s]
