In [1]:
import pathlib

# The label text is expected to spell the four special symbols as the ASCII
# letters A-D; char_map gives the Unicode symbol each letter is replaced with:
#   A -> ⑆ (transit: used to delimit a bank code)
#   B -> ⑇ (amount: used to delimit a transaction amount)
#   C -> ⑈ (on-us: used to delimit a customer account number)
#   D -> ⑉ (dash: used to delimit parts of numbers, e.g. routing or account numbers)
char_map = {"A": "⑆", "B": "⑇", "C": "⑈", "D": "⑉"}

# convert dataset to EasyOCR format


def pair_images_with_labels(root):
    """Collect every ``*.tif`` under ``root`` together with its ``*.gt.txt`` label.

    Images and labels are matched by name (``foo.tif`` <-> ``foo.gt.txt``),
    not by their position in two independently sorted lists. Sorting both
    lists by ``Path.stem`` is not safe: the label stem is ``foo.gt``, so any
    stem containing a character that sorts below ``.`` (``-``, space, ...)
    orders the two lists differently and silently mispairs them.

    Args:
        root: ``pathlib.Path`` of the directory to search recursively.

    Returns:
        ``(images, labels)``: two lists of paths of equal length, where
        ``labels[i]`` is the ground-truth file for ``images[i]``. Images are
        ordered by stem. Label files without a matching image are ignored.

    Raises:
        FileNotFoundError: if at least one image has no ``.gt.txt`` file.
    """
    label_suffix = ".gt.txt"
    # Key each label by its name without ".gt.txt", which is the image's stem.
    # The outer sorted() makes the winner deterministic if a stem occurs twice.
    labels_by_stem = {
        path.name[: -len(label_suffix)]: path
        for path in sorted(root.rglob("*" + label_suffix))
    }
    image_paths = sorted(root.rglob("*.tif"), key=lambda path: path.stem)

    unlabeled = [path for path in image_paths if path.stem not in labels_by_stem]
    if unlabeled:
        preview = ", ".join(str(path) for path in unlabeled[:5])
        raise FileNotFoundError(
            f"{len(unlabeled)} image(s) have no matching {label_suffix} file, e.g. {preview}"
        )

    return image_paths, [labels_by_stem[path.stem] for path in image_paths]


images, labels = pair_images_with_labels(pathlib.Path("e13b"))

True

In [8]:
import pandas as pd

# Build the labels table (one row per image: file name + label text) and
# write it to data/labels.csv.

# `images` and `labels` are paired purely by position, and zip() would
# silently truncate on a length mismatch, so validate before trusting it.
if len(images) != len(labels):
    raise ValueError(
        f"found {len(images)} images but {len(labels)} label files; cannot pair them"
    )

data = []
for image_path, label_path in zip(images, labels):
    # Guard against the two lists being ordered differently: a mismatch here
    # would attach the wrong text to an image without any visible error.
    if label_path.name != f"{image_path.stem}.gt.txt":
        raise ValueError(
            f"image {image_path.name} is paired with unrelated label {label_path.name}"
        )
    label = label_path.read_text().strip()
    # Swap the ASCII placeholders for the symbols defined in char_map.
    for key, value in char_map.items():
        label = label.replace(key, value)
    data.append({
        "filename": image_path.name,
        "words": label
    })

# Explicit columns keep the frame well-formed (and the next line working)
# even when no image/label pair was found.
df = pd.DataFrame(data, columns=["filename", "words"])
# replace \s+ with ' ' in `words` column
df["words"] = df["words"].str.replace(r"\s+", " ", regex=True)
# to_csv does not create missing parent directories.
pathlib.Path("data").mkdir(parents=True, exist_ok=True)
df.to_csv("data/labels.csv", index=False)

In [11]:
# convert all tifs to jpgs
import cv2
import numpy as np

# NOTE(review): the JPEGs are written to "o/data", while the labels CSV is
# written to "data/" and lists the original ".tif" file names, and the split
# step copies images from "data/". Confirm which layout is intended.
jpg_dir = pathlib.Path("o/data")
# cv2.imwrite does not create missing directories; it just reports failure.
jpg_dir.mkdir(parents=True, exist_ok=True)

for image_path in images:
    image = cv2.imread(str(image_path), cv2.IMREAD_UNCHANGED)
    # imread signals an unreadable/undecodable file by returning None
    # instead of raising, so check before handing it to imwrite.
    if image is None:
        raise ValueError(f"cv2.imread could not read {image_path}")
    jpg_path = jpg_dir / f"{image_path.stem}.jpg"
    # imwrite returns False when the file could not be written.
    written = cv2.imwrite(str(jpg_path), image, [int(cv2.IMWRITE_JPEG_QUALITY), 100])
    if not written:
        raise OSError(f"cv2.imwrite could not write {jpg_path}")

In [20]:
import os
import shutil
# Read every column as text. With pandas' default type inference an
# all-digit `words` column would be parsed as numbers (dropping leading
# zeros) and NA-like strings would become NaN, corrupting the labels.
df = pd.read_csv("data/labels.csv", dtype=str, keep_default_na=False)

# split the dataset into train, val, and test (80% / 10% / 10%)
train_data = df.sample(frac=0.8, random_state=42)
# From here on `df` holds only the rows that did not go into train.
df = df.drop(train_data.index)
val_data = df.sample(frac=0.5, random_state=42)
test_data = df.drop(val_data.index)

# save the new datasets and the images to output/
splits = (("train", train_data), ("val", val_data), ("test", test_data))

# First pass: create each split directory and write its labels.csv.
for split_name, split_df in splits:
    os.makedirs(f"output/{split_name}", exist_ok=True)
    split_df.to_csv(f"output/{split_name}/labels.csv", index=False)

# Second pass: copy each listed image next to its split's labels.csv.
# NOTE(review): this expects the files named in the `filename` column to
# exist under "data/" -- confirm that matches where the images are stored.
for split_name, split_df in splits:
    for row in split_df.itertuples():
        shutil.copy(f"data/{row.filename}", f"output/{split_name}/{row.filename}")
