Importing the Dataset

In [8]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"mayankverma341","key":"0725e8cbd3bd31dcd476f7a1a84d27bb"}'}

In [9]:
!rm -r ~/.kaggle
!mkdir ~/.kaggle
!mv ./kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [10]:
!kaggle datasets download -d senju14/ocr-dataset-of-multi-type-documents

Dataset URL: https://www.kaggle.com/datasets/senju14/ocr-dataset-of-multi-type-documents
License(s): MIT
Downloading ocr-dataset-of-multi-type-documents.zip to /content
100% 5.14G/5.16G [02:13<00:01, 13.6MB/s]
100% 5.16G/5.16G [02:13<00:00, 41.5MB/s]


In [11]:
import os
dataset = "ocr-dataset-of-multi-type-documents"
zip_file = f"{dataset}.zip"
destination_dir = f"/content/{dataset}"
!unzip -q $zip_file -d $destination_dir
!rm $zip_file

Installing & Importing Required Libraries

In [314]:
!pip install pytesseract
!pip install jiwer
!apt install tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 1 not upgraded.


In [288]:
import pytesseract
from jiwer import cer, wer
from tqdm import tqdm
import json
import cv2
import pandas as pd
import numpy as np

Function to Preprocess Images

In [341]:
def preprocess_image(image_path):
    img = cv2.imread(image_path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    img = cv2.resize(gray, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
    _, thresh = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    kernel = np.array([[0, -1, 0],
                       [-1, 5,-1],
                       [0, -1, 0]])
    sharpened = cv2.filter2D(thresh, -1, kernel)

    return sharpened




In [343]:
dataset_path= "/content/ocr-dataset-of-multi-type-documents/invoice/test/images"
dataset_path_json= "/content/ocr-dataset-of-multi-type-documents/invoice/test/annotations"

files = [f for f in os.listdir(dataset_path)[:50]]

def get_ground_truth_from_json(json_file):

    try:
        with open(json_file, 'r') as f:
            data = json.load(f)

        if 'ocr_boxes' in data:
            text = [box["text"] for box in data["ocr_boxes"]]
            full_text = " ".join(text).strip()
            return full_text

        else:
            return ""
    except Exception as e:
        # print(f"Error reading JSON {json_file}: {e}")
        return ""

In [344]:
with open("/content/ocr-dataset-of-multi-type-documents/invoice/train/annotations/X00016469612.json", "r") as f:
  data = json.load(f)

print(data["file_id"])
entities = data["entities"]
print(entities["company"])

# for box in data["ocr_boxes"]:
# print("Text:", box["text"])
# print("Points:", box["points"])

text = [box["text"] for box in data["ocr_boxes"]]
print(" ".join(text).strip())

X00016469612
BOOK TA .K (TAMAN DAYA) SDN BHD
TAN WOON YANN BOOK TA .K(TAMAN DAYA) SDN BND 789417-W NO.53 55,57 & 59, JALAN SAGU 18, TAMAN DAYA, 81100 JOHOR BAHRU, JOHOR. DOCUMENT NO : TD01167104 DATE: 25/12/2018 8:13:39 PM CASHIER: MANIS MEMBER: CASH BILL CODE/DESC PRICE DISC AMOUNT QTY RM RM 9556939040116 KF MODELLING CLAY KIDDY FISH 1 PC * 9.000 0.00 9.00 TOTAL: ROUR DING ADJUSTMENT: 0.00 ROUND D TOTAL (RM): 9.00 CASH 10.00 CHANGE 1.00 GOODS SOLD ARE NOT RETURNABLE OR EXCHANGEABLE *** *** THANK YOU PLEASE COME AGAIN ! 9.00


In [345]:
psm = [3,4,6,11]

In [346]:
results = []

for filename in tqdm(files, desc="Processing Images"):
    image_path = os.path.join(dataset_path, filename)
    base_name = os.path.splitext(filename)[0]
    json_path = os.path.join(dataset_path_json, base_name + ".json")

    if not os.path.exists(json_path):
        continue

    ground_truth = get_ground_truth_from_json(json_path)

    if not ground_truth.strip():
        continue

    processed_img = preprocess_image(image_path)
    if processed_img is None:
        continue

    for psm_mode in psm:
      custom_config = f'--oem 3 --psm {psm_mode}'
      predicted_text = pytesseract.image_to_string(processed_img, config=custom_config)
      gt_clean = " ".join(ground_truth.split())
      pred_clean = " ".join(predicted_text.split())

      try:
        error_cer = cer(gt_clean.upper(), pred_clean.upper())
        error_wer = wer(gt_clean.upper(), pred_clean.upper())

      except Exception as e:
        # print(e)
        continue

      results.append({"file_name":filename,"psm_mode":psm_mode,"ocr_text":pred_clean.upper(),"ground_truth":gt_clean.upper(),"cer":error_cer,"wer":error_wer})

    df = pd.DataFrame(results)
    # df.head()

Processing Images: 100%|██████████| 50/50 [20:21<00:00, 24.43s/it]


In [352]:
df_psm3 = df[df["psm_mode"] == 3]
df_psm4 = df[df["psm_mode"] == 4]
df_psm6 = df[df["psm_mode"] == 6]
df_psm11 = df[df["psm_mode"] == 11]

for psm_mode in psm:
  x = df[df["psm_mode"]==psm_mode]
  avg_cer = x["cer"].mean()
  avg_wer = x["wer"].mean()
  count = x.shape[0]
  print("\n" + "="*30)
  print(f"Evaluation Complete on {count} images,PSM = {psm_mode}")
  print(f"Average CER: {avg_cer:.4f}")
  print(f"Average WER: {avg_wer:.4f}")
  print("="*30)


Evaluation Complete on 50 images,PSM = 3
Average CER: 0.3208
Average WER: 0.6565

Evaluation Complete on 50 images,PSM = 4
Average CER: 0.3159
Average WER: 0.6404

Evaluation Complete on 50 images,PSM = 6
Average CER: 0.2662
Average WER: 0.5924

Evaluation Complete on 50 images,PSM = 11
Average CER: 0.4807
Average WER: 0.9766


In [353]:
df_psm3.head()

Unnamed: 0,file_name,psm_mode,ocr_text,ground_truth,cer,wer
0,X51006619570.jpg,3,MR. D.I.Y. (4) SDN BHD CO-REG: 860671-D LOT 18...,MR. D.I.Y. (M) SDN BHD CO-REG:860671-D LOT 185...,0.137226,0.364407
4,X51006619507.jpg,3,‘ O GOLDEN ARCHES RESTAURANTS SDN BHD (6535 1-...,GOLDEN ARCHES RESTAURANTS SDN BHD (65351-M) LE...,0.159091,0.580247
8,X51007846379.jpg,3,OV X ) AO6HE! AE A AEON CO. (M) BHD (126926-H)...,"AEON CO. (M) BHD (126926-H) 3RD FLR, AEON TAMA...",0.551224,0.971264
12,X51009568881.jpg,3,HON HWA HARDWARE TRADING COMPANY REG. NO. : 00...,HON HWA HARDWARE TRADING COMPANY REG. NO. : 00...,0.061157,0.252252
16,X51005741944.jpg,3,EEE E ' TAX INVOICE BHPETROL SUBANG AIRPORT JA...,TAX INVOICE BHPETROL SUBANG AIRPORT JALAN LAPA...,0.246617,0.64
