In [None]:
# Use the %pip magic rather than !pip so the package is installed into the
# environment of the running kernel.
%pip install datasets

In [None]:
# Root output directory for the converted dataset. convert_data() writes each split
# under DATA_DIR/<split>/images, DATA_DIR/<split>/labels and DATA_DIR/<split>/txt_labels.
# NOTE(review): the path looks like a Google Drive mount on Colab, but no mount step is
# visible in this notebook -- confirm Drive is mounted before running the export cells.
DATA_DIR = "/content/drive/MyDrive/OCR/data"

In [None]:
import ast
import json
import os

from datasets import load_dataset
from tqdm.notebook import tqdm

In [None]:
# Load the invoices-and-receipts OCR dataset; the returned object is indexed by
# split name in the cells below (e.g. dataset['train']).
dataset = load_dataset("mychen76/invoices-and-receipts_ocr_v1")

Downloading readme:   0%|          | 0.00/782 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/249M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/18.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/2043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/125 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/70 [00:00<?, ? examples/s]

In [None]:
# Display the available splits together with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'id', 'parsed_data', 'raw_data'],
        num_rows: 2043
    })
    test: Dataset({
        features: ['image', 'id', 'parsed_data', 'raw_data'],
        num_rows: 125
    })
    valid: Dataset({
        features: ['image', 'id', 'parsed_data', 'raw_data'],
        num_rows: 70
    })
})

## Understanding Data

In [None]:
# Peek at `parsed_data` of the first training example. As the output shows, it is a
# JSON string with "xml", "json" and "kie" keys; the "json" value is itself a string
# holding a Python-literal dict with 'header', 'items' and 'summary' entries.
parsed_data = dataset['train']['parsed_data'][0]
parsed_data

'{"xml": "", "json": "{\'header\': {\'invoice_no\': \'40378170\', \'invoice_date\': \'10/15/2012\', \'seller\': \'Patel, Thompson and Montgomery 356 Kyle Vista New James, MA 46228\', \'client\': \'Jackson, Odonnell and Jackson 267 John Track Suite 841 Jenniferville, PA 98601\', \'seller_tax_id\': \'958-74-3511\', \'client_tax_id\': \'998-87-7723\', \'iban\': \'GB77WRBQ31965128414006\'}, \'items\': [{\'item_desc\': \\"Leed\'s Wine Companion Bottle Corkscrew Opener Gift Box Set with Foil Cutter\\", \'item_qty\': \'1,00\', \'item_net_price\': \'7,50\', \'item_net_worth\': \'7,50\', \'item_vat\': \'10%\', \'item_gross_worth\': \'8,25\'}], \'summary\': {\'total_net_worth\': \'$7,50\', \'total_vat\': \'$0,75\', \'total_gross_worth\': \'$8,25\'}}", "kie": ""}'

In [None]:
# Peek at `raw_data` of the first training example. As the output shows, it is a JSON
# string whose "ocr_words" and "ocr_boxes" values are Python-literal strings; each
# entry of "ocr_boxes" pairs a list of four [x, y] corner points with a (text, score)
# tuple.
raw_data = dataset['train']['raw_data'][0]
raw_data

'{"ocr_words": "[\'Invoice no: 40378170\', \'Date of issue:\', \'10/15/2012\', \'Seller:\', \'Client:\', \'Patel, Thompson and Montgomery\', \'Jackson, Odonnell and Jackson.\', \'356 Kyle Vista\', \'267 John Track Suite 841\', \'New James, MA 46228\', \'Jenniferville, PA 98601\', \'Tax Id: 958-74-3511\', \'Tax Id: 998-87-7723\', \'IBAN: GB77WRBQ31965128414006\', \'ITEMS\', \'UM\', \'No.\', \'Description\', \'Qty\', \'Net price\', \'Net worth\', \'VAT [%]\', \'Gross\', \'worth\', \\" Leed\'s Wine Companion Bottle\\", \'1,00\', \'each\', \'7,50\', \'7,50\', \'10%\', \'8,25\', \'Corkscrew Opener Gift Box Set\', \'with Foil Cutter\', \'SUMMARY\', \'VAT [%]\', \'VAT\', \'Net worth\', \'Gross worth\', \'7,50\', \'10%\', \'0,75\', \'8,25\', \'Total\', \'$7,50\', \'$ 0,75\', \'$8,25\']", "ocr_boxes": "[[[[196.0, 110.0], [801.0, 110.0], [801.0, 161.0], [196.0, 161.0]], (\'Invoice no: 40378170\', 0.9985853433609009)], [[[196.0, 212.0], [517.0, 212.0], [517.0, 259.0], [196.0, 259.0]], (\'Date of 

## Parsing Data

In [None]:
def get_ocr_boxes(raw_data):
    """Extract the OCR boxes from one example's ``raw_data`` string.

    ``raw_data`` is a JSON object serialised as text. Its ``"ocr_boxes"`` value is a
    string containing a Python literal (it uses tuples and single-quoted strings, so
    it is not JSON itself) and is therefore parsed with ``ast.literal_eval``.

    Args:
        raw_data: The ``raw_data`` field of a dataset example.

    Returns:
        The evaluated ``"ocr_boxes"`` literal. In the dataset sample this is a list
        of ``[corner_points, (text, score)]`` entries.

    Raises:
        KeyError: If the decoded payload has no ``"ocr_boxes"`` key.
        ValueError, SyntaxError: If the payload or the ``"ocr_boxes"`` literal cannot
            be parsed.
    """
    try:
        # The outer envelope is JSON: json.loads understands null/true/false and
        # decodes JSON escapes (e.g. surrogate pairs) correctly, which
        # ast.literal_eval does not.
        raw_data_dct = json.loads(raw_data)
    except (TypeError, ValueError):
        # Not strict JSON (e.g. a single-quoted Python dict literal): keep the
        # previous behaviour for such payloads.
        raw_data_dct = ast.literal_eval(raw_data)

    return ast.literal_eval(raw_data_dct['ocr_boxes'])

In [None]:
def get_txt_labels(boxes):
    """Return the text of every OCR box, preserving the order of ``boxes``.

    Args:
        boxes: Sequence of OCR entries; the text is read from ``entry[1][0]``
            (the first item of the second element of each entry).

    Returns:
        list: One text value per entry of ``boxes``.
    """
    return [entry[1][0] for entry in boxes]

## Converting to YOLO Format

In [None]:
def find_xy(coords):
    """Compute the axis-aligned extent of a sequence of points.

    Args:
        coords: Non-empty sequence of points; only index 0 (x) and index 1 (y)
            of each point are read.

    Returns:
        tuple: ``(min_x, min_y, max_x, max_y)`` over all points.

    Raises:
        IndexError: If ``coords`` is empty.
    """
    first_point = coords[0]  # raises IndexError for an empty sequence
    xs = [first_point[0]]
    ys = [first_point[1]]

    for point in coords[1:]:
        xs.append(point[0])
        ys.append(point[1])

    return (min(xs), min(ys), max(xs), max(ys))

In [None]:
# YOLO format: one "class_id x_center y_center width height" line per box, where the
# four geometry values are normalised by the image width / height.
def convert_to_yolo(boxes, img_w, img_h, class_id=0):
    """Convert OCR boxes into YOLO label lines.

    For every box, the axis-aligned extent of its corner points (``box[0]``) is
    computed with ``find_xy``, clipped to the image, and expressed as a normalised
    centre / size.

    Args:
        boxes: Sequence of OCR entries whose first element is a sequence of
            ``[x, y]`` corner points in pixels.
        img_w: Image width in pixels, used to normalise x values.
        img_h: Image height in pixels, used to normalise y values.
        class_id: Class index written at the start of every line. Defaults to ``0``
            (single-class text detection), matching the previous hard-coded value.

    Returns:
        list[str]: One ``"class_id x_center y_center w h"`` string per input box, in
        the same order as ``boxes``.

    Raises:
        ZeroDivisionError: If ``img_w`` or ``img_h`` is zero.
    """
    yolo_boxes = []
    for box in boxes:
        coords = box[0]

        min_x, min_y, max_x, max_y = find_xy(coords)

        # Clip the extent to the image so the normalised values stay within [0, 1];
        # corner points lying outside the image would otherwise yield labels that
        # YOLO loaders reject. In-bounds boxes are left untouched. Boxes are never
        # dropped, so the output stays index-aligned with the input.
        min_x = min(max(min_x, 0), img_w)
        max_x = min(max(max_x, 0), img_w)
        min_y = min(max(min_y, 0), img_h)
        max_y = min(max(max_y, 0), img_h)

        x_center = (min_x + max_x)/2
        y_center = (min_y + max_y)/2

        w = max_x - min_x
        h = max_y - min_y

        x_center_n = x_center/img_w
        y_center_n = y_center/img_h

        w_n = w/img_w
        h_n = h/img_h

        yolo_boxes.append(f"{class_id} {x_center_n} {y_center_n} {w_n} {h_n}")

    return yolo_boxes

In [None]:
def convert_data(data, d_type="train"):
    """Export one dataset split to disk as images, YOLO labels and text labels.

    For the example at position ``idx`` three files are written below
    ``DATA_DIR/<d_type>/``:

    * ``images/<idx>.<ext>``: the example image; ``<ext>`` is the lower-cased
      ``img.format`` (``png`` when the image carries no format).
    * ``labels/<idx>.txt``: one YOLO line per OCR box (see ``convert_to_yolo``).
    * ``txt_labels/<idx>.json``: JSON list with the text of each box, in the same
      order as the lines of the label file.

    The three output directories are created if they do not exist yet.

    Args:
        data: One split of the dataset; must support ``len()`` and iteration and
            yield examples with ``'image'`` and ``'raw_data'`` entries.
        d_type: Name of the sub-directory of ``DATA_DIR`` to write into.

    Returns:
        None.
    """
    split_dir = os.path.join(DATA_DIR, d_type)
    images_dir = os.path.join(split_dir, "images")
    labels_dir = os.path.join(split_dir, "labels")
    txt_labels_dir = os.path.join(split_dir, "txt_labels")

    # Create the output tree up front so a fresh run does not fail on the first write.
    for out_dir in (images_dir, labels_dir, txt_labels_dir):
        os.makedirs(out_dir, exist_ok=True)

    for idx, example in tqdm(enumerate(data), total=len(data)):
        img = example['image']
        # img.format can be None; fall back to PNG rather than crashing on
        # None.lower().
        img_ext = (img.format or "png").lower()
        img.save(os.path.join(images_dir, f"{idx}.{img_ext}"))

        raw_data = example['raw_data']
        b_boxes = get_ocr_boxes(raw_data)

        yolo_boxes = convert_to_yolo(b_boxes, img.width, img.height)
        txt_data = "\n".join(yolo_boxes)

        yolo_txt_file = os.path.join(labels_dir, f"{idx}.txt")
        with open(yolo_txt_file, "w", encoding="utf-8") as f:
            f.write(txt_data)

        txt_labels = get_txt_labels(b_boxes)
        labels_txt_file = os.path.join(txt_labels_dir, f"{idx}.json")
        with open(labels_txt_file, "w", encoding="utf-8") as f:
            # Write real JSON (the file has a .json extension) instead of the Python
            # repr of the list, which is not valid JSON.
            json.dump(txt_labels, f, ensure_ascii=False)

In [None]:
# Export the validation split into DATA_DIR/valid.
convert_data(dataset['valid'], "valid")

  0%|          | 0/70 [00:00<?, ?it/s]

In [None]:
# Export the training split into DATA_DIR/train.
convert_data(dataset['train'], "train")

  0%|          | 0/2043 [00:00<?, ?it/s]

In [None]:
# Export the test split into DATA_DIR/test.
convert_data(dataset['test'], "test")

  0%|          | 0/125 [00:00<?, ?it/s]