In [1]:
!sudo apt install tesseract-ocr -y
!pip install pytesseract Pillow

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [3]:
from google.colab import files
# Ask the user to upload the dataset archives; the recorded run uploaded
# test.zip, train.zip and valid.zip. The keys of `uploaded` (the file names)
# are iterated by the extraction cell that follows.
uploaded = files.upload()

Saving test.zip to test.zip
Saving train.zip to train.zip
Saving valid.zip to valid.zip


In [4]:
import os
import zipfile

# Extract every uploaded archive into /content/<archive name without extension>,
# e.g. train.zip -> /content/train.
for zip_file in uploaded.keys():
    # Anything that is not a zip archive would make ZipFile raise BadZipFile,
    # so report it and move on instead of aborting the whole cell.
    if not zipfile.is_zipfile(zip_file):
        print(f"Skipping {zip_file}: not a zip archive")
        continue

    # splitext drops the real extension instead of blindly cutting 4 characters.
    archive_stem = os.path.splitext(os.path.basename(zip_file))[0]
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(os.path.join("/content", archive_stem))


In [5]:
!apt install tesseract-ocr
!pip install pytesseract jiwer opencv-python

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-4.0.0 rapidfuzz-3.13.0


In [None]:
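# Evaluate OCR text quality
# WER - word error rate: word-level errors relative to the reference text
# CER - character error rate: character-level errors relative to the reference text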

In [6]:
import pytesseract
from jiwer import wer, cer
import os

def evaluate_ocr_text_metrics(folder_path):
    """Print the mean per-image WER and CER of Tesseract OCR over one dataset split.

    Every image in ``<folder_path>/images`` is paired with the ``.txt`` file of
    the same base name in ``<folder_path>/labels``. The image is transcribed
    with ``pytesseract.image_to_string`` and scored against the label text with
    jiwer's ``wer`` and ``cer``. Images without a label file are reported and
    skipped.

    Args:
        folder_path: Directory containing the ``images`` and ``labels``
            subfolders.

    Returns:
        None. The averages (or a notice that no pairs were found) are printed.

    Note:
        The printed figures are unweighted means of per-image scores, not
        corpus-level rates.
        NOTE(review): this assumes each ``.txt`` label holds the plain-text
        transcription of its image -- confirm for this dataset.
    """
    image_folder = os.path.join(folder_path, "images")
    label_folder = os.path.join(folder_path, "labels")
    # str.endswith accepts a tuple; matching on the lower-cased name also picks
    # up files such as "scan.JPG" that a case-sensitive check would ignore.
    image_extensions = ('.png', '.jpg', '.jpeg')
    total_wer = []
    total_cer = []

    for file in os.listdir(image_folder):
        if not file.lower().endswith(image_extensions):
            continue

        image_path = os.path.join(image_folder, file)
        label_name = os.path.splitext(file)[0] + ".txt"
        text_path = os.path.join(label_folder, label_name)

        if not os.path.exists(text_path):
            print(f"Missing label for {file}, skipping...")
            continue

        # Ground truth; explicit encoding so the result does not depend on the
        # platform's default locale.
        with open(text_path, 'r', encoding='utf-8') as f:
            gt_text = f.read().strip()

        # OCR prediction
        pred_text = pytesseract.image_to_string(image_path).strip()

        # Per-image scores; jiwer takes (reference, hypothesis).
        total_wer.append(wer(gt_text, pred_text))
        total_cer.append(cer(gt_text, pred_text))

    if total_wer and total_cer:
        avg_wer = sum(total_wer) / len(total_wer)
        avg_cer = sum(total_cer) / len(total_cer)

        print(f"Average WER: {avg_wer:.4f}")
        print(f"Average CER: {avg_cer:.4f}")
    else:
        print("No valid image-label pairs found.")


In [8]:
# Score each dataset split in turn; headers after the first start with a blank line.
for split_header, split_dir in (
    ("=== TRAINING SET ===", "/content/train/train"),
    ("\n=== VALIDATION SET ===", "/content/valid/valid"),
    ("\n=== TEST SET ===", "/content/test/test"),
):
    print(split_header)
    evaluate_ocr_text_metrics(split_dir)

=== TRAINING SET ===
Average WER: 3.9292
Average CER: 6.9174
=== VALIDATION SET ===
Average WER: 2.3216
Average CER: 1.1564

=== TEST SET ===
Average WER: 3.6410
Average CER: 1.6765


In [None]:
# Evaluate box accuracy

In [9]:
import cv2
import pytesseract
import os

def get_boxes(image_path):
    """Return Tesseract's character boxes for an image in top-left-origin pixels.

    The image is loaded with OpenCV and passed to
    ``pytesseract.image_to_boxes``. Each 6-field output line
    (``char x1 y1 x2 y2 page``) is converted from the bottom-left origin used
    in that output to a top-left origin by flipping the y coordinates against
    the image height.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A list of ``(x1, y1, x2, y2)`` integer tuples, one per parsed line, in
        the order Tesseract reported them.

    Raises:
        ValueError: If OpenCV cannot read ``image_path``.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise ValueError(f"Could not read image: {image_path}")
    img_height = img.shape[0]

    # OpenCV stores pixels as BGR, whereas pytesseract assumes RGB input.
    rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    boxes = []
    for line in pytesseract.image_to_boxes(rgb_img).splitlines():
        parts = line.split()
        if len(parts) != 6:
            continue
        left, bottom, right, top = (int(value) for value in parts[1:5])
        # Flip y so that (x1, y1) is the top-left corner of the box.
        boxes.append((left, img_height - top, right, img_height - bottom))
    return boxes

def compute_iou(boxA, boxB):
    """Intersection-over-union of two axis-aligned boxes.

    Both boxes are indexable as ``(x1, y1, x2, y2)`` with ``x1 <= x2`` and
    ``y1 <= y2``. A small epsilon in the denominator keeps the division
    defined when both boxes have zero area.

    Returns:
        The IoU as a float; ``0.0`` when the boxes do not overlap.
    """
    def _area(box):
        return (box[2] - box[0]) * (box[3] - box[1])

    # Extent of the overlap along each axis (negative when disjoint).
    overlap_w = min(boxA[2], boxB[2]) - max(boxA[0], boxB[0])
    overlap_h = min(boxA[3], boxB[3]) - max(boxA[1], boxB[1])
    overlap = max(0, overlap_w) * max(0, overlap_h)

    union = _area(boxA) + _area(boxB) - overlap + 1e-6
    return overlap / float(union)

def evaluate_boxes(folder_path):
    """Print the mean IoU between Tesseract's boxes and ground-truth boxes.

    Every image in ``<folder_path>/images`` is paired with the ``.box`` file of
    the same base name in ``<folder_path>/labels``. Predicted boxes come from
    ``get_boxes``; ground-truth lines with at least five fields are read as
    ``label x1 y1 x2 y2`` and flipped to a top-left origin using the image
    height. Images without a box file, or that OpenCV cannot read, are
    reported and skipped.

    Args:
        folder_path: Directory containing the ``images`` and ``labels``
            subfolders.

    Returns:
        None. The average IoU (or a notice that no pairs were found) is printed.

    Note:
        NOTE(review): boxes are paired by position (first predicted with first
        ground-truth, and so on), so one missed or extra character shifts every
        later pair. Presumably the ``.box`` files use the bottom-left origin of
        Tesseract box files -- confirm against the dataset.
    """
    image_folder = os.path.join(folder_path, "images")
    label_folder = os.path.join(folder_path, "labels")
    # Tuple + lower() so that upper-case extensions such as ".JPG" also match.
    image_extensions = ('.png', '.jpg', '.jpeg')
    total_iou = []

    for file in os.listdir(image_folder):
        if not file.lower().endswith(image_extensions):
            continue

        image_path = os.path.join(image_folder, file)
        box_file_name = os.path.splitext(file)[0] + ".box"
        gt_box_path = os.path.join(label_folder, box_file_name)

        if not os.path.exists(gt_box_path):
            print(f"Missing box label for {file}, skipping...")
            continue

        # The image height is needed to flip the ground-truth y coordinates.
        # cv2.imread returns None on failure, so check before touching .shape.
        image = cv2.imread(image_path)
        if image is None:
            print(f"Could not read image {file}, skipping...")
            continue
        img_height = image.shape[0]

        pred_boxes = get_boxes(image_path)

        gt_boxes = []
        with open(gt_box_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.split()
                if len(parts) >= 5:
                    left, bottom, right, top = (int(value) for value in parts[1:5])
                    gt_boxes.append(
                        (left, img_height - top, right, img_height - bottom)
                    )

        # zip stops at the shorter list, i.e. only the first
        # min(len(pred_boxes), len(gt_boxes)) positions are compared.
        for pred_box, gt_box in zip(pred_boxes, gt_boxes):
            total_iou.append(compute_iou(pred_box, gt_box))

    if total_iou:
        avg_iou = sum(total_iou) / len(total_iou)
        print(f"Average IoU (Box Accuracy): {avg_iou:.4f}")
    else:
        print("No valid image-box pairs found.")


In [10]:
# Score each dataset split in turn; headers after the first start with a blank line.
for split_header, split_dir in (
    ("=== TRAINING SET ===", "/content/train/train"),
    ("\n=== VALIDATION SET ===", "/content/valid/valid"),
    ("\n=== TEST SET ===", "/content/test/test"),
):
    print(split_header)
    evaluate_boxes(split_dir)

=== TRAINING SET ===
Missing box label for receipt_image_49_jpg.rf.59378b405058fd0ee8fe732122557211.jpg, skipping...
Missing box label for 1030-receipt_jpg.rf.affd2e8c410a390cb0b5d51494c225ff.jpg, skipping...
Missing box label for photo_2024-08-30-10-08-21_jpeg.rf.bb5e55b5d43a177f5a5a997ba5ce6cc4.jpg, skipping...
Missing box label for 1006-receipt_jpg.rf.5bb104cb36b4c638853a17e7e1cf58c6.jpg, skipping...
Missing box label for 1005-receipt_jpg.rf.adc7cdeb6cb2211cae2bde26807226ec.jpg, skipping...
Missing box label for receipt_image_630_jpg.rf.a1547656be22b7e6322b6391b8ba13d4.jpg, skipping...
Missing box label for 148_jpg.rf.51051088185a9408a8146dc325379b4d.jpg, skipping...
Missing box label for 1022-receipt_jpg.rf.996b4648103abe9a7688be20fb3e7ccd.jpg, skipping...
Missing box label for receipt_image_379_jpg.rf.7b69c890b8cf7b9be24a9e762bdac6d1.jpg, skipping...
Missing box label for 107_jpg.rf.aaa61206fb42afbc9d6000e4fe1c2c67.jpg, skipping...
Missing box label for receipt_image_616_jpg.rf.b4