In [75]:
from transformers import LayoutLMv3Processor, LayoutLMForTokenClassification, LayoutLMv3ImageProcessor
from PIL import Image, ImageDraw, ImageFont
from difflib import SequenceMatcher
from pathlib import Path
import numpy as np
import pandas as pd
import json
import torch
import glob
from PIL import Image
from datasets import Dataset

In [76]:
# Paths to one sample receipt: the scanned image and the OCR file with the same stem.
example_img = Path('../data/img/00d0108922007.jpg')
# NOTE(review): example_ocr is not read in any cell visible here — confirm it is used later or drop it.
example_ocr = Path('../data/ocr/00d0108922007.csv')

# Open the sample scan and convert it to RGB mode.
image = Image.open(example_img).convert("RGB")
# image.show()

In [77]:
def txt_to_df(path):
    """Parse one OCR annotation file into a DataFrame of boxes and text.

    Each non-blank line is expected to hold eight comma-separated corner
    coordinates followed by the recognised text:
    ``x0,y0,x1,y1,x2,y2,x3,y3,text``. Only fields 0, 1, 4 and 5 are kept as
    the box (``x0, y0, x2, y2``).

    Parameters
    ----------
    path : str or pathlib.Path
        Location of the annotation file. Its stem is stored in the ``fname``
        column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``fname, x0, y0, x2, y2, text``; one row per non-blank line.
        An empty file yields an empty frame with the same columns.

    Raises
    ------
    ValueError
        If a non-blank line has fewer than nine fields, or a coordinate is
        not an integer.
    """
    path = Path(path)  # accept plain strings as well as Path objects
    samples = []
    with open(path, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            # Trailing/blank lines carry no annotation; skip instead of crashing.
            if not line.strip():
                continue

            # Split on the first 8 commas only, so that any commas inside the
            # text (e.g. "1,234.00") stay in the final field instead of being
            # cut off.
            fields = line.split(',', maxsplit=8)
            if len(fields) < 9:
                raise ValueError(
                    f"{path}:{line_no}: expected 8 coordinates and a text field, "
                    f"got {len(fields)} field(s)"
                )

            bbox = np.array([fields[0], fields[1], fields[4], fields[5]], dtype=np.int32)
            text = fields[8].strip()

            samples.append([path.stem, *bbox, text])

    return pd.DataFrame(samples, columns=['fname', 'x0', 'y0', 'x2', 'y2', 'text'])


def normalize_box(x0, y0, x2, y2, width, height):
    """Rescale a pixel-space box to an integer 0-1000 grid.

    Parameters
    ----------
    x0, y0 : number
        One corner of the box, in pixels.
    x2, y2 : number
        The opposite corner of the box, in pixels.
    width, height : number
        Size of the image the box was measured on, in pixels.

    Returns
    -------
    list of int
        ``[x0, y0, x2, y2]`` scaled by ``1000 / width`` (x) and
        ``1000 / height`` (y), truncated toward zero, then clamped to
        ``[0, 1000]``.
    """
    def _scale(value, extent):
        scaled = int(1000 * (value / extent))
        # Boxes that spill slightly past the image edge would otherwise
        # produce coordinates outside the 0-1000 grid.
        return min(1000, max(0, scaled))

    return [
        _scale(x0, width),
        _scale(y0, height),
        _scale(x2, width),
        _scale(y2, height),
    ]

In [78]:
def data_generator():
    """Yield one token-classification sample per image that has OCR output.

    Walks ``../data/img`` for ``*.jpg`` files (in sorted order, so the
    resulting dataset order is reproducible) and pairs each with the
    ``.csv`` of the same stem in ``../data/ocr/``. Images with no OCR file,
    or whose OCR file contains no rows, are skipped.

    Yields
    ------
    dict
        ``id``       - file stem shared by the image and its OCR file.
        ``words``    - list of OCR text strings.
        ``bboxes``   - list of ``[x0, y0, x2, y2]`` boxes from ``normalize_box``.
        ``image``    - path of the image file, as a string.
        ``ner_tags`` - ``"O"`` repeated once per word (placeholder labels).
    """
    # Imported locally under an alias: a later cell rebinds the global name
    # ``Image`` to ``datasets.Image``, which has no ``open``. Re-running this
    # generator after that cell would otherwise fail.
    from PIL import Image as PILImage

    img_folder = Path("../data/img")
    bbox_folder = Path("../data/ocr/")

    for img_file in sorted(img_folder.glob("*.jpg")):
        bbox_path = bbox_folder / f"{img_file.stem}.csv"
        if not bbox_path.is_file():
            continue

        df = txt_to_df(bbox_path)
        if df.empty:
            # Nothing to label; also avoids indexing into an empty frame.
            continue

        # Only the dimensions are needed; close the file handle right away so
        # a large folder does not exhaust open file descriptors.
        with PILImage.open(img_file) as image:
            width, height = image.size

        words = df['text'].tolist()
        bboxes = [
            normalize_box(x0, y0, x2, y2, width, height)
            for x0, y0, x2, y2 in zip(df['x0'], df['y0'], df['x2'], df['y2'])
        ]

        yield {
            'id': img_file.stem,
            'words': words,
            'bboxes': bboxes,
            # Use the path that was actually globbed rather than rebuilding it
            # from a second hard-coded copy of the folder name.
            'image': img_file.as_posix(),
            'ner_tags': ["O"] * len(words),
        }

In [79]:
# Build a Hugging Face Dataset from the samples yielded by data_generator.
# NOTE(review): the log below shows a cached build being reused; changes to the
# data folders may not be reflected until that cache is cleared — confirm.
bill_test = Dataset.from_generator(data_generator)

Using custom data configuration default-38271dbc043f42d9
Found cached dataset generator (/Users/ben/.cache/huggingface/datasets/generator/default-38271dbc043f42d9/0.0.0)


In [80]:
from datasets import Image, Sequence, ClassLabel

# Re-type two columns in one chain: "image" becomes an Image feature and
# "ner_tags" becomes a sequence of ClassLabel over the five tag names.
bill_test = (
    bill_test
    .cast_column("image", Image())
    .cast_column(
        "ner_tags",
        Sequence(
            feature=ClassLabel(
                num_classes=5,
                names=["S-TOTAL", "S-DATE", "S-ADDRESS", "S-COMPANY", "O"],
                id=None,
            ),
            length=-1,
            id=None,
        ),
    )
)

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [81]:
# Display the dataset's feature schema (shown in the output below).
bill_test.features

{'id': Value(dtype='string', id=None),
 'words': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'bboxes': Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None),
 'image': Image(decode=True, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['S-TOTAL', 'S-DATE', 'S-ADDRESS', 'S-COMPANY', 'O'], id=None), length=-1, id=None)}

In [82]:
# Write the dataset to the "bill_dataset/" directory, relative to the current working directory.
bill_test.save_to_disk("bill_dataset/")

Saving the dataset (0/1 shards):   0%|          | 0/562 [00:00<?, ? examples/s]