In [3]:
# Use EasyOCR to perform the OCR tasks
# Used to evaluate the effectiveness of my OCR model

# !pip install easyocr
import os
from collections import Counter
from pathlib import Path

import easyocr

# Build the EasyOCR reader once, for the language list ['en']; the evaluation
# call further down passes this same object in as the OCR model.
reader = easyocr.Reader(lang_list=['en'])

In [1]:
# Read each document image with the given OCR model (EasyOCR here) and report
# how far its output is from the matching human-labelled transcription.
def calculate_accuracy(model, documents_dir: str, results_dir: str):
    """Print a character-level difference score for every labelled document image.

    For each image in ``documents_dir`` that has a human-written transcription
    ``<image stem>.txt`` in ``results_dir``, the image is read with
    ``model.readtext`` and the detected text is compared with the transcription
    as multisets of characters (character order is ignored).

    The difference is the number of transcription characters that have no
    matching character in the OCR output, divided by the transcription length.
    It ranges from 0 (every labelled character was found) to 1 (none were).
    Characters that appear only in the OCR output are not penalised.
    An empty transcription yields a difference of 0.0 instead of dividing by zero.

    Files that are not ``.png``/``.jpeg``/``.jpg``/``.webp`` (matched
    case-insensitively) and images without a transcription are skipped.

    Args:
        model: OCR model exposing ``readtext(path)``; each returned item must
            hold the detected text at index 1.
        documents_dir (str): directory containing the document images.
        results_dir (str): directory containing the ``.txt`` transcriptions.
    """
    image_suffixes = {".png", ".jpeg", ".jpg", ".webp"}

    # Sorted so the report order does not depend on the filesystem listing order.
    for fname in sorted(os.listdir(documents_dir)):
        cur_path = os.path.join(documents_dir, fname)
        if not os.path.isfile(cur_path) or Path(fname).suffix.lower() not in image_suffixes:
            continue

        # Look for the transcription before running OCR, so images without a
        # label never pay for model inference.
        label_path = os.path.join(results_dir, Path(fname).with_suffix(".txt").name)
        if not os.path.isfile(label_path):
            continue
        with open(label_path, "r") as f:
            label_text = f.read()

        # Keep only the text component (index 1) of every detection.
        detections = model.readtext(cur_path)
        ocr_text = "".join(detection[1] for detection in detections)

        # Counter subtraction drops zero and negative counts, leaving exactly the
        # labelled characters that the OCR output failed to supply.
        missing = Counter(label_text) - Counter(ocr_text)
        difference = sum(missing.values()) / len(label_text) if label_text else 0.0

        print(f'Difference between {cur_path} and ocr is: {difference}')

In [4]:
# Report, per document, how far EasyOCR's output is from the labelled transcription
calculate_accuracy(model=reader, documents_dir="documents", results_dir="results")

Difference between documents/doc3.webp and ocr is: 0.037037037037037035
Difference between documents/doc1.png and ocr is: 0.03724247226624406
Difference between documents/doc2.png and ocr is: 0.007042253521126761
