In [3]:
!apt-get install -y tesseract-ocr
!pip install pytesseract paddleocr easyocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Collecting pytesseract
  Using cached pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting paddleocr
  Using cached paddleocr-2.8.1-py3-none-any.whl.metadata (19 kB)
Collecting easyocr
  Using cached easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting pyclipper (from paddleocr)
  Using cached pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (9.0 kB)
Collecting lmdb (from paddleocr)
  Using cached lmdb-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting rapidfuzz (from paddleocr)
  Downloading rapidfuzz-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting python-docx (from paddleocr)
  Downloading python_docx-1.1.2-py3-none-any

In [6]:
import time
from PIL import Image
import pytesseract
import numpy as np

# Set the image path
IMAGE_PATH = '/content/test.jpg'

# Import OCR libraries with error handling
try:
    from paddleocr import PaddleOCR
    paddle_available = True
except ImportError:
    print("PaddleOCR import failed. It will be skipped in the benchmark.")
    paddle_available = False

try:
    import easyocr
    easyocr_available = True
except ImportError:
    print("EasyOCR import failed. It will be skipped in the benchmark.")
    easyocr_available = False

# Benchmarking functions
def benchmark_tesseract(image):
    start_time = time.time()
    text = pytesseract.image_to_string(image, lang='eng')
    elapsed_time = time.time() - start_time
    return text, elapsed_time

def benchmark_paddleocr(image):
    if not paddle_available:
        return "PaddleOCR not available", 0
    ocr = PaddleOCR(use_angle_cls=True, lang='en')
    start_time = time.time()
    result = ocr.ocr(np.array(image), cls=True)
    elapsed_time = time.time() - start_time
    text = " ".join([line[1][0] for line in result])
    return text, elapsed_time

def benchmark_easyocr(image):
    if not easyocr_available:
        return "EasyOCR not available", 0
    reader = easyocr.Reader(['en'])
    start_time = time.time()
    result = reader.readtext(np.array(image))
    elapsed_time = time.time() - start_time
    text = " ".join([item[1] for item in result])
    return text, elapsed_time

# Main benchmarking function
def benchmark_all_models(image_path):
    image = Image.open(image_path)
    results = {}

    print("Running Tesseract OCR...")
    results['Tesseract'] = benchmark_tesseract(image)

    print("Running PaddleOCR...")
    results['PaddleOCR'] = benchmark_paddleocr(image)

    print("Running EasyOCR...")
    results['EasyOCR'] = benchmark_easyocr(image)

    return results

# Print results
def print_results(results):
    print("\n=== Summary of Results ===")
    for model_name, (text, exec_time) in results.items():
        print(f"\n{model_name}:")
        print(f"Time: {exec_time:.2f} sec")
        print(f"Extracted Text (first 200 chars): {text[:200]}...")

# Main execution
print(f"Processing image: {IMAGE_PATH}")
results = benchmark_all_models(IMAGE_PATH)
print_results(results)

PaddleOCR import failed. It will be skipped in the benchmark.
Processing image: /content/test.jpg
Running Tesseract OCR...




Running PaddleOCR...
Running EasyOCR...
Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |██████████████████████████████████████████████████| 100.0% Complete
=== Summary of Results ===

Tesseract:
Time: 9.90 sec
Extracted Text (first 200 chars): DIRECTION

[Taam AMA ]

1. Ashok went 8 Kms. South and turned west and

walked 3 Kms, again he turned north and walked 5
Kins. He took a final Turn to east and walked 3
Kins. In which direction was As...

PaddleOCR:
Time: 0.00 sec
Extracted Text (first 200 chars): PaddleOCR not available...

EasyOCR:
Time: 6.60 sec
Extracted Text (first 200 chars): DIRECTION FTT ] 1_ Ashok went & Kms. South and turned west and (a) South-West / #fzur_7fr45 walked 3 Kms; = he turned north ad walked 5 (b) South-East afau_Ya Kms. He took a final Turn to east ad walk...


In [10]:
import time
import subprocess
import sys
import os
from PIL import Image
import pytesseract
import numpy as np

# Set the image path
IMAGE_PATH = '/content/test.jpg'

# Function to install Hindi language pack for Tesseract
def install_hindi_tesseract():
    print("Attempting to install Hindi language pack for Tesseract...")
    try:
        subprocess.run(["sudo", "apt-get", "update"], check=True)
        subprocess.run(["sudo", "apt-get", "install", "-y", "tesseract-ocr-hin"], check=True)
        print("Hindi language pack installed successfully.")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Failed to install Hindi language pack: {e}")
        return False

# Import OCR libraries with error handling
try:
    from paddleocr import PaddleOCR
    paddle_available = True
except ImportError:
    print("PaddleOCR import failed. It will be skipped in the benchmark.")
    paddle_available = False

try:
    import easyocr
    easyocr_available = True
except ImportError:
    print("EasyOCR import failed. It will be skipped in the benchmark.")
    easyocr_available = False

# Benchmarking functions
def benchmark_tesseract(image):
    start_time = time.time()
    text = ""
    try:
        text_eng = pytesseract.image_to_string(image, lang='eng')
        text += f"English:\n{text_eng}\n\n"
    except pytesseract.TesseractError:
        text += "English: Failed to process\n\n"

    try:
        text_hin = pytesseract.image_to_string(image, lang='hin')
        text += f"Hindi:\n{text_hin}"
    except pytesseract.TesseractError:
        print("Hindi language pack not found. Attempting to install...")
        if install_hindi_tesseract():
            try:
                text_hin = pytesseract.image_to_string(image, lang='hin')
                text += f"Hindi:\n{text_hin}"
            except pytesseract.TesseractError:
                text += "Hindi: Failed to process even after installation attempt"
        else:
            text += "Hindi: Failed to install language pack"

    elapsed_time = time.time() - start_time
    return text, elapsed_time

def benchmark_paddleocr(image):
    if not paddle_available:
        return "PaddleOCR not available", 0
    ocr_eng = PaddleOCR(use_angle_cls=True, lang='en')
    ocr_hin = PaddleOCR(use_angle_cls=True, lang='hi')
    start_time = time.time()
    result_eng = ocr_eng.ocr(np.array(image), cls=True)
    result_hin = ocr_hin.ocr(np.array(image), cls=True)
    elapsed_time = time.time() - start_time
    text_eng = " ".join([line[1][0] for line in result_eng])
    text_hin = " ".join([line[1][0] for line in result_hin])
    return f"English:\n{text_eng}\n\nHindi:\n{text_hin}", elapsed_time

def benchmark_easyocr(image):
    if not easyocr_available:
        return "EasyOCR not available", 0
    reader = easyocr.Reader(['en', 'hi'])
    start_time = time.time()
    result = reader.readtext(np.array(image))
    elapsed_time = time.time() - start_time
    text = "\n".join([f"{item[1]} ({item[2]})" for item in result])
    return text, elapsed_time

# Main benchmarking function
def benchmark_all_models(image_path):
    image = Image.open(image_path)
    results = {}

    print("Running Tesseract OCR...")
    results['Tesseract'] = benchmark_tesseract(image)

    if paddle_available:
        print("Running PaddleOCR...")
        results['PaddleOCR'] = benchmark_paddleocr(image)
    else:
        results['PaddleOCR'] = ("PaddleOCR not available", 0)

    if easyocr_available:
        print("Running EasyOCR...")
        results['EasyOCR'] = benchmark_easyocr(image)
    else:
        results['EasyOCR'] = ("EasyOCR not available", 0)

    return results

# Print results
def print_results(results):
    print("\n=== Summary of Results ===")
    for model_name, (text, exec_time) in results.items():
        print(f"\n{model_name}:")
        print(f"Time: {exec_time:.2f} sec")
        print("Extracted Text:")
        print("-" * 40)
        print(text)
        print("-" * 40)

# Check if script is run with sudo
if not os.geteuid() == 0:
    print("This script requires sudo privileges to install packages.")
    print("Please run the script with sudo.")
    sys.exit(1)

# Main execution
print(f"Processing image: {IMAGE_PATH}")
results = benchmark_all_models(IMAGE_PATH)
print_results(results)

PaddleOCR import failed. It will be skipped in the benchmark.
Processing image: /content/test.jpg
Running Tesseract OCR...
Hindi language pack not found. Attempting to install...
Attempting to install Hindi language pack for Tesseract...
Hindi language pack installed successfully.




Running EasyOCR...
Progress: |██████████████████████████████████████████████████| 100.0% Complete
=== Summary of Results ===

Tesseract:
Time: 37.56 sec
Extracted Text:
----------------------------------------
English:
DIRECTION

[Taam AMA ]

1. Ashok went 8 Kms. South and turned west and

walked 3 Kms, again he turned north and walked 5
Kins. He took a final Turn to east and walked 3
Kins. In which direction was Ashok from the start-
ing point ?
aie 8 frat, <faot pt ae Te aR USA al
WsHt 3 fat. wel ae fee See Ht IH AST SR
5 fet. sen sta Fae ye fee al AR AST
3 feat. Aen! seem sinter wad G fre fez
a?
(a) East / Ta (b) North / 33%
(c) West / Tia (d) South / =f&rr
» Starting from a point P, sachin walked 20 m to-
wards South he turned left and walked 30 m. he
then turned left and walked 20 m. he again turned.
left and walked 40 m and reached a Point Q. How
far and in which direction is the point P from the
point Q.?
faut fry PO YE Hed El BP 20 Alex <fart
H A GA H WS We AR YSHL 30 Alet ACT 