In [2]:
import os
import cv2
from PIL import Image
import matplotlib.pyplot as plt

def read_images(input_folder):
    """
    Reads all images from the input folder.

    Args:
    input_folder (str): Path to the input folder containing images.

    Returns:
    List of image file paths.
    """
    image_files = []
    for file in os.listdir(input_folder):
        if file.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
            image_files.append(os.path.join(input_folder, file))
    return image_files

def draw_boxes(image, boxes, color=(0, 255, 0)):
    for box in boxes:
        if len(box) == 4:  # Ensuring box has the correct format
            cv2.rectangle(image, (int(box[0]), int(box[1])), (int(box[2]), int(box[3])), color, 2)
    return image

def save_image(output_folder, filename, image):
    os.makedirs(output_folder, exist_ok=True)
    Image.fromarray(image).save(os.path.join(output_folder, filename + '_highlighted.png'))


In [7]:
!unzip /content/In.zip

Archive:  /content/In.zip
  inflating: In/1.jpg                
  inflating: In/10.jpg               
  inflating: In/11.jpg               
  inflating: In/12.png               
  inflating: In/2.jpg                
  inflating: In/3.jpg                
  inflating: In/4.jpg                
  inflating: In/5.jpg                
  inflating: In/6.jpg                
  inflating: In/7.jpg                
  inflating: In/8.jpg                
  inflating: In/9.jpg                


## easyOCR

In [9]:

#!pip install python-bidi==0.4.2
#!pip install easyocr

import easyocr
def easyocr_ocr(input_folder, output_folder):
    """
    Uses EasyOCR to read text from images in the input folder and save highlighted images in the output folder.

    Args:
    input_folder (str): Path to the input folder containing images.
    output_folder (str): Path to the output folder to save highlighted images.
    """
    reader = easyocr.Reader(['en'])
    image_files = read_images(input_folder)
    for image_file in image_files:
        image = cv2.imread(image_file)
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        easyocr_results = reader.readtext(image_rgb)
        easyocr_boxes = []
        for result in easyocr_results:
            box = result[0]
            easyocr_boxes.append([int(min(point[0] for point in box)), int(min(point[1] for point in box)),
                                  int(max(point[0] for point in box)), int(max(point[1] for point in box))])

        output_image = draw_boxes(image_rgb.copy(), easyocr_boxes, color=(0, 255, 0))  # Green
        filename = os.path.splitext(os.path.basename(image_file))[0]
        save_image(output_folder, filename, output_image)

easyocr_ocr('/content/In', '/content/easyOCR')


## tesseract_ocr library

In [12]:
#!apt-get install tesseract-ocr
#!pip install pytesseract

import pytesseract

def tesseract_ocr(input_folder, output_folder):
    """
    Uses Tesseract OCR to read text from images in the input folder and save highlighted images in the output folder.

    Args:
    input_folder (str): Path to the input folder containing images.
    output_folder (str): Path to the output folder to save highlighted images.
    """
    image_files = read_images(input_folder)
    for image_file in image_files:
        image = cv2.imread(image_file)
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        tesseract_data = pytesseract.image_to_boxes(image).splitlines()
        tesseract_boxes = []
        for line in tesseract_data:
            parts = line.split(' ')
            if len(parts) >= 6:
                x1, y1, x2, y2 = map(int, parts[1:5])
                tesseract_boxes.append((x1, image.shape[0] - y2, x2, image.shape[0] - y1))

        output_image = draw_boxes(image_rgb.copy(), tesseract_boxes, color=(255, 0, 0))  # Red
        filename = os.path.splitext(os.path.basename(image_file))[0]
        save_image(output_folder, filename, output_image)

tesseract_ocr('/content/In', 'pytesseract')


## PaddleOCR

In [16]:
#!pip3 install paddlepaddle paddleocr
from paddleocr import PaddleOCR

def paddleocr_ocr(input_folder, output_folder):
    """
    Uses PaddleOCR to read text from images in the input folder and save highlighted images in the output folder.

    Args:
    input_folder (str): Path to the input folder containing images.
    output_folder (str): Path to the output folder to save highlighted images.
    """
    ocr = PaddleOCR(use_angle_cls=True, lang='en')
    image_files = read_images(input_folder)
    for image_file in image_files:
        image = cv2.imread(image_file)
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        paddleocr_results = ocr.ocr(image_file, cls=True)
        paddleocr_boxes = []
        for result in paddleocr_results:
            for line in result:
                box = line[0]
                paddleocr_boxes.append([int(min(point[0] for point in box)), int(min(point[1] for point in box)),
                                        int(max(point[0] for point in box)), int(max(point[1] for point in box))])

        output_image = draw_boxes(image_rgb.copy(), paddleocr_boxes, color=(0, 0, 255))  # Blue
        filename = os.path.splitext(os.path.basename(image_file))[0]
        save_image(output_folder, filename, output_image)

paddleocr_ocr('/content/In', 'PaddleOCR')


download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 4.00M/4.00M [00:13<00:00, 291kiB/s] 


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10.2M/10.2M [00:10<00:00, 992kiB/s] 


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2.19M/2.19M [00:12<00:00, 178kiB/s]

[2024/07/22 07:39:47] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c




[2024/07/22 07:39:48] ppocr DEBUG: dt_boxes num : 8, elapsed : 0.32001304626464844
[2024/07/22 07:39:48] ppocr DEBUG: cls num  : 8, elapsed : 0.06385326385498047
[2024/07/22 07:39:49] ppocr DEBUG: rec_res num  : 8, elapsed : 0.40788793563842773
[2024/07/22 07:39:49] ppocr DEBUG: dt_boxes num : 28, elapsed : 0.12374424934387207
[2024/07/22 07:39:49] ppocr DEBUG: cls num  : 28, elapsed : 0.07362890243530273
[2024/07/22 07:39:50] ppocr DEBUG: rec_res num  : 28, elapsed : 1.0347530841827393
[2024/07/22 07:39:50] ppocr DEBUG: dt_boxes num : 30, elapsed : 0.2462143898010254
[2024/07/22 07:39:50] ppocr DEBUG: cls num  : 30, elapsed : 0.08752655982971191
[2024/07/22 07:39:52] ppocr DEBUG: rec_res num  : 30, elapsed : 1.578695297241211
[2024/07/22 07:39:53] ppocr DEBUG: dt_boxes num : 6, elapsed : 0.42932844161987305
[2024/07/22 07:39:53] ppocr DEBUG: cls num  : 6, elapsed : 0.022837162017822266
[2024/07/22 07:39:54] ppocr DEBUG: rec_res num  : 6, elapsed : 1.2392323017120361
[2024/07/22 07:39:

## doctr library

In [None]:
import os
import cv2
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

def read_images(input_folder):
    """
    Reads image files from the specified input folder.
    Args:
    input_folder (str): Path to the folder containing the input images.
    Returns:
    list: A list of file paths for the images in the input folder.
    """
    image_files = []
    for file in os.listdir(input_folder):
        if file.lower().endswith(('.png', '.jpg', '.jpeg')):  # Ensure we read only image files
            image_files.append(os.path.join(input_folder, file))
    return image_files

def save_image(output_folder, filename, image):
    """
    Saves the image to the specified output folder.
    Args:
    output_folder (str): Path to the folder where the image will be saved.
    filename (str): The name of the file to save the image as.
    image (numpy.ndarray): The image to save.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    output_path = os.path.join(output_folder, f"{filename}.png")
    cv2.imwrite(output_path, cv2.cvtColor(image, cv2.COLOR_RGB2BGR))

def perform_ocr_doctr(image_path):
    """
    Performs OCR on the image using the doctr library.
    Args:
    image_path (str): Path to the image file.
    Returns:
    Document: The OCR result as a doctr Document object.
    """
    model = ocr_predictor(pretrained=True)
    doc = DocumentFile.from_images(image_path)
    result = model(doc)
    return result

def draw_boxes(image_path, result):
    """
    Draws bounding boxes around the recognized text in the image.
    Args:
    image_path (str): Path to the image file.
    result (Document): The OCR result as a doctr Document object.
    Returns:
    numpy.ndarray: The image with bounding boxes drawn around recognized text.
    """
    image = cv2.imread(image_path)
    height, width, _ = image.shape
    for page in result.pages:
        for block in page.blocks:
            for line in block.lines:
                for word in line.words:
                    # Extract bounding box coordinates
                    (x_min, y_min), (x_max, y_max) = word.geometry
                    # Convert normalized coordinates to pixel coordinates
                    box = [
                        int(x_min * width), int(y_min * height),
                        int(x_max * width), int(y_max * height)
                    ]
                    # Draw rectangle on the image
                    cv2.rectangle(image, (box[0], box[1]), (box[2], box[3]), (0, 255, 0), 2)
    return image

def extract_text(result):
    """
    Extracts text content from the OCR result.
    Args:
    result (Document): The OCR result as a doctr Document object.
    Returns:
    str: The extracted text content.
    """
    text_content = ""
    for page in result.pages:
        for block in page.blocks:
            for line in block.lines:
                for word in line.words:
                    text_content += word.value + " "
                text_content += "\n"
            text_content += "\n"
        text_content += "\n"
    return text_content

def apply_doctr_model(input_folder, output_folder):
    """
    Applies the doctr OCR model to images in the input folder and saves the results.
    Args:
    input_folder (str): Path to the folder containing the input images.
    output_folder (str): Path to the folder where the results will be saved.
    """
    image_files = read_images(input_folder)
    for image_file in image_files:
        result = perform_ocr_doctr(image_file)

        # Draw bounding boxes on the image
        annotated_image = draw_boxes(image_file, result)

        # Save the annotated image
        base_name = os.path.splitext(os.path.basename(image_file))[0]
        save_image(output_folder, base_name, annotated_image)

        # Extract and save text content to a text file
        text_content = extract_text(result)
        text_file = os.path.join(output_folder, f"{base_name}.txt")
        with open(text_file, 'w') as f:
            f.write(text_content)

# Paths
input_folder = '/content/In'
output_folder = '/content/OCR_Doctr'

# Apply the OCR model
apply_doctr_model(input_folder, output_folder)


In [None]:
import pytesseract

def preprocess_and_tesseract_ocr(input_folder, output_folder):
    """
    Preprocess images using OpenCV and uses Tesseract OCR to read text from images in the input folder and save highlighted images in the output folder.

    Args:
    input_folder (str): Path to the input folder containing images.
    output_folder (str): Path to the output folder to save highlighted images.
    """
    image_files = read_images(input_folder)
    for image_file in image_files:
        image = cv2.imread(image_file)
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        blur = cv2.GaussianBlur(gray, (5, 5), 0)
        edged = cv2.Canny(blur, 50, 150)

        tesseract_data = pytesseract.image_to_boxes(edged).splitlines()
        tesseract_boxes = []
        for line in tesseract_data:
            parts = line.split(' ')
            if len(parts) >= 6:
                x1, y1, x2, y2 = map(int, parts[1:5])
                tesseract_boxes.append((x1, image.shape[0] - y2, x2, image.shape[0] - y1))

        output_image = draw_boxes(cv2.cvtColor(edged, cv2.COLOR_GRAY2RGB), tesseract_boxes, color=(255, 0, 0))  # Red
        filename = os.path.splitext(os.path.basename(image_file))[0]
        save_image(output_folder, filename, output_image)

preprocess_and_tesseract_ocr('/content/In', '/content/pytesseract_opncv')


## keras_ocr

In [None]:
#!pip install keras_ocr

import keras_ocr
def keras_ocr_function(input_folder, output_folder):
    """
    Uses Keras-OCR to read text from images in the input folder and save highlighted images in the output folder.
    Args:
    input_folder (str): Path to the input folder containing images.
    output_folder (str): Path to the output folder to save highlighted images.
    """
    pipeline = keras_ocr.pipeline.Pipeline()
    image_files = read_images(input_folder)
    for image_file in image_files:
        image = keras_ocr.tools.read(image_file)
        predictions = pipeline.recognize([image])[0]
        boxes = []
        for prediction in predictions:
            box = prediction[1]
            x1, y1 = int(box[0][0]), int(box[0][1])
            x2, y2 = int(box[2][0]), int(box[2][1])
            boxes.append([x1, y1, x2, y2])

        output_image = draw_boxes(image.copy(), boxes, color=(0, 255, 0))  # Green
        filename = os.path.splitext(os.path.basename(image_file))[0]
        save_image(output_folder, filename, output_image)

keras_ocr_function('/content/In', 'Keras_ocr')


### 👇 failure tries part of the codes 👇

## google_vision_ocr -> non open source library (not working)

In [27]:
from google.cloud import vision
import io

def google_vision_ocr(input_folder, output_folder):
    """
    Uses Google Cloud Vision OCR to read text from images in the input folder and save highlighted images in the output folder.

    Args:
    input_folder (str): Path to the input folder containing images.
    output_folder (str): Path to the output folder to save highlighted images.
    """
    client = vision.ImageAnnotatorClient()
    image_files = read_images(input_folder)
    for image_file in image_files:
        with io.open(image_file, 'rb') as image_file_obj:
            content = image_file_obj.read()
        image = vision.Image(content=content)
        response = client.text_detection(image=image)
        texts = response.text_annotations
        google_boxes = []
        for text in texts:
            vertices = text.bounding_poly.vertices
            google_boxes.append([vertices[0].x, vertices[0].y, vertices[2].x, vertices[2].y])

        image_cv2 = cv2.imread(image_file)
        image_rgb = cv2.cvtColor(image_cv2, cv2.COLOR_BGR2RGB)
        output_image = draw_boxes(image_rgb.copy(), google_boxes, color=(0, 255, 255))  # Yellow
        filename = os.path.splitext(os.path.basename(image_file))[0]
        save_image(output_folder, filename, output_image)

google_vision_ocr('/content/In', 'Google_Vision')


ImportError: cannot import name 'vision' from 'google.cloud' (unknown location)

## aws_textract_ocr -> non open source library (not working)

In [26]:
#!pip install boto3

import boto3
from botocore.exceptions import NoRegionError
def aws_textract_ocr(input_folder, output_folder):
    """
    Uses AWS Textract OCR to read text from images in the input folder and save highlighted images in the output folder.

    Args:
    input_folder (str): Path to the input folder containing images.
    output_folder (str): Path to the output folder to save highlighted images.
    """
    try:
        client = boto3.client('textract', region_name='your-region')  # specify your region
        image_files = read_images(input_folder)
        for image_file in image_files:
            with open(image_file, 'rb') as document:
                image_bytes = bytearray(document.read())
            response = client.detect_document_text(Document={'Bytes': image_bytes})

            aws_boxes = []
            for item in response['Blocks']:
                if item['BlockType'] == 'WORD':
                    box = item['Geometry']['BoundingBox']
                    width, height = Image.open(image_file).size
                    x1 = int(box['Left'] * width)
                    y1 = int(box['Top'] * height)
                    x2 = int((box['Left'] + box['Width']) * width)
                    y2 = int((box['Top'] + box['Height']) * height)
                    aws_boxes.append([x1, y1, x2, y2])

            image_cv2 = cv2.imread(image_file)
            image_rgb = cv2.cvtColor(image_cv2, cv2.COLOR_BGR2RGB)
            output_image = draw_boxes(image_rgb.copy(), aws_boxes, color=(0, 128, 128))  # Teal
            filename = os.path.splitext(os.path.basename(image_file))[0]
            save_image(output_folder, filename, output_image)

    except NoRegionError:
        print("You must specify a region.")
    except Exception as e:
        print(f"An error occurred: {e}")

aws_textract_ocr('/content/In', 'AWS_Textract')


An error occurred: Unable to locate credentials


##  azure_ocr -> non open source library (not working)

In [None]:
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from msrest.authentication import CognitiveServicesCredentials
import io

def azure_ocr(input_folder, output_folder):
    """
    Uses Microsoft Azure OCR to read text from images in the input folder and save highlighted images in the output folder.

    Args:
    input_folder (str): Path to the input folder containing images.
    output_folder (str): Path to the output folder to save highlighted images.
    """
    subscription_key = "your_subscription_key"
    endpoint = "your_endpoint"
    client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))
    image_files = read_images(input_folder)
    for image_file in image_files:
        with io.open(image_file, 'rb') as image_file_obj:
            image_data = image_file_obj.read()
        response = client.read_in_stream(io.BytesIO(image_data), raw=True)
        operation_location = response.headers['Operation-Location']
        operation_id = operation_location.split('/')[-1]

        result = client.get_read_result(operation_id)
        azure_boxes = []
        if result.status == 'succeeded':
            for page in result.analyze_result.read_results:
                for line in page.lines:
                    bounding_box = line.bounding_box
                    x1, y1, x2, y2, x3, y3, x4, y4 = bounding_box
                    azure_boxes.append([int(x1), int(y1), int(x3), int(y3)])

        image_cv2 = cv2.imread(image_file)
        image_rgb = cv2.cvtColor(image_cv2, cv2.COLOR_BGR2RGB)
        output_image = draw_boxes(image_rgb.copy(), azure_boxes, color=(255, 128, 0))  # Orange
        filename = os.path.splitext(os.path.basename(image_file))[0]
        save_image(output_folder, filename, output_image)

azure_ocr('/content/In', 'path/to/output_folder')


## calamari_ocr -> open source but is not working

In [21]:
#!pip install calamari_ocr
from calamari_ocr.ocr import Predictor
import cv2

def calamari_ocr(input_folder, output_folder):
    """
    Uses Calamari OCR to read text from images in the input folder and save highlighted images in the output folder.

    Args:
    input_folder (str): Path to the input folder containing images.
    output_folder (str): Path to the output folder to save highlighted images.
    """
    predictor = Predictor.load('/content/en-default.pyrnn')  # Replace with your model path
    image_files = read_images(input_folder)
    for image_file in image_files:
        image = cv2.imread(image_file)
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        result = predictor.predict(image_rgb)
        boxes = []
        for line in result:
            box = line['bbox']
            x1, y1 = int(box[0]), int(box[1])
            x2, y2 = int(box[2]), int(box[3])
            boxes.append([x1, y1, x2, y2])

        output_image = draw_boxes(image_rgb.copy(), boxes, color=(255, 0, 255))  # Magenta
        filename = os.path.splitext(os.path.basename(image_file))[0]
        save_image(output_folder, filename, output_image)

calamari_ocr('/content/In', 'CalamariOCR')


AttributeError: type object 'Predictor' has no attribute 'load'

## kraken library -> open source but is not working

In [28]:
!pip install kraken
import os
import cv2
from PIL import Image
import kraken
from kraken.lib import models, segmentation
from kraken.lib.util import pil2im, im2pil
from kraken.lib.vgsl import TorchVGSLModel

def read_images(input_folder):
    image_files = []
    for file in os.listdir(input_folder):
        if file.lower().endswith(('.png', '.jpg', '.jpeg')):
            image_files.append(os.path.join(input_folder, file))
    return image_files

def save_image(output_folder, filename, image):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    output_path = os.path.join(output_folder, f"{filename}.png")
    cv2.imwrite(output_path, cv2.cvtColor(image, cv2.COLOR_RGB2BGR))

def perform_ocr(image_path, model_path):
    image = Image.open(image_path).convert('L')
    model = TorchVGSLModel.load_model(model_path)
    line_regions = segmentation.extract_polygons(image, segmentation.segment(im=image, text_direction='horizontal-lr'))
    ocr_text = ""
    for line in line_regions:
        ocr_text += model.predict_string(im2pil(line)) + "\n"
    return ocr_text

def apply_ocropus_model(input_folder, output_folder, model_path):
    image_files = read_images(input_folder)
    for image_file in image_files:
        # Perform OCR
        ocr_text = perform_ocr(image_file, model_path)

        # Save OCR result to a text file
        base_name = os.path.splitext(os.path.basename(image_file))[0]
        text_file = os.path.join(output_folder, f"{base_name}.txt")
        with open(text_file, 'w') as f:
            f.write(ocr_text)

        # Optionally, save highlighted images
        image = cv2.imread(image_file)
        highlighted_image = highlight_text_regions(image, ocr_text)
        save_image(output_folder, base_name, highlighted_image)

def highlight_text_regions(image, ocr_text):
    # Implement your own logic to highlight text regions
    # This is a placeholder function
    return image

# Paths
input_folder = '/path/to/In'
output_folder = '/path/to/OCRopus'
model_path = '/path/to/model.mlmodel'

# Apply the OCR model
apply_ocropus_model(input_folder, output_folder, model_path)


AttributeError: module 'kraken.serialization' has no attribute 'load_any'

In [19]:
#!sudo apt-get update
#!sudo apt-get install gocr

import subprocess

def gocr_ocr(input_folder, output_folder):
    """
    Uses GOCR to read text from images in the input folder and save highlighted images in the output folder.

    Args:
    input_folder (str): Path to the input folder containing images.
    output_folder (str): Path to the output folder to save highlighted images.
    """
    image_files = read_images(input_folder)
    for image_file in image_files:
        image = cv2.imread(image_file)
        image_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        temp_image_path = 'temp_gocr.pnm'
        cv2.imwrite(temp_image_path, image_gray)

        # Run GOCR
        result = subprocess.run(['gocr', '-i', temp_image_path, '-a', '50'], stdout=subprocess.PIPE)
        gocr_output = result.stdout.decode('utf-8')

        # Parse GOCR output to get bounding boxes (GOCR does not provide bounding boxes natively)
        gocr_boxes = []  # Implement a way to parse and convert text to bounding boxes if possible

        output_image = draw_boxes(image.copy(), gocr_boxes, color=(0, 255, 0))  # Green
        filename = os.path.splitext(os.path.basename(image_file))[0]
        save_image(output_folder, filename, output_image)

gocr_ocr('/content/In', 'gocr_ocr')


In [17]:
#!sudo apt-get install ocrad

def ocrad_ocr(input_folder, output_folder):
    """
    Uses Ocrad to read text from images in the input folder and save highlighted images in the output folder.

    Args:
    input_folder (str): Path to the input folder containing images.
    output_folder (str): Path to the output folder to save highlighted images.
    """
    image_files = read_images(input_folder)
    for image_file in image_files:
        temp_image_path = 'temp_ocrad.pnm'
        image = cv2.imread(image_file)
        image_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        cv2.imwrite(temp_image_path, image_gray)

        # Run Ocrad
        result = subprocess.run(['ocrad', temp_image_path], stdout=subprocess.PIPE)
        ocrad_output = result.stdout.decode('utf-8')

        # Parse Ocrad output to get bounding boxes (Ocrad does not provide bounding boxes natively)
        ocrad_boxes = []  # Implement a way to parse and convert text to bounding boxes if possible

        output_image = draw_boxes(image.copy(), ocrad_boxes, color=(0, 255, 0))  # Green
        filename = os.path.splitext(os.path.basename(image_file))[0]
        save_image(output_folder, filename, output_image)

ocrad_ocr('/content/In', 'ocrad_ocr')


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb1 in position 1: invalid start byte

## kraken OCR -> open source but is not working

In [26]:
#!pip install kraken
from PIL import Image
import os
input_folder = '/content/In'
output_folder = '/content/kraken-ocr'
from kraken import binarization, rpred, pageseg, serialization


if not os.path.exists(output_folder):
    os.makedirs(output_folder)

for filename in os.listdir(input_folder):
    if filename.endswith(('.png', '.jpg', '.jpeg', '.tiff')):
        img_path = os.path.join(input_folder, filename)
        img = Image.open(img_path)

        # Binarize image
        bin_img = binarization.nlbin(img)

        # Segment the page to get bounding boxes
        bounds = pageseg.segment(bin_img)

        # Load the Kraken model
        model = serialization.load_any('en-default')

        # Perform OCR
        ocr_records = rpred.rpred(network=model, im=bin_img, bounds=bounds)
        ocr_text = '\n'.join([r['text'] for r in ocr_records])

        # Save OCR text to a file
        txt_filename = os.path.splitext(filename)[0] + '.txt'
        txt_path = os.path.join(output_folder, txt_filename)
        with open(txt_path, 'w') as txt_file:
            txt_file.write(ocr_text)



AttributeError: module 'kraken.serialization' has no attribute 'load_any'

## cuneiform_ocr -> open source but its not working

In [14]:
# !sudo add-apt-repository ppa:alex-p/cuneiform
# !sudo apt-get update
# !sudo apt-get install cuneiform
import subprocess

def cuneiform_ocr(input_folder, output_folder):
    """
    Uses CuneiForm to read text from images in the input folder and save highlighted images in the output folder.

    Args:
    input_folder (str): Path to the input folder containing images.
    output_folder (str): Path to the output folder to save highlighted images.
    """
    image_files = read_images(input_folder)
    for image_file in image_files:
        temp_image_path = 'temp_cuneiform.bmp'
        image = cv2.imread(image_file)
        cv2.imwrite(temp_image_path, image)

        # Run CuneiForm
        subprocess.run(['cuneiform', '-f', 'hocr', '-o', 'temp_cuneiform.html', temp_image_path])

        # Parse HOCR file to get bounding boxes
        with open('temp_cuneiform.html', 'r') as file:
            hocr_data = file.read()

        cuneiform_boxes = []  # Implement a way to parse HOCR data to bounding boxes

        output_image = draw_boxes(image.copy(), cuneiform_boxes, color=(0, 255, 0))  # Green
        filename = os.path.splitext(os.path.basename(image_file))[0]
        save_image(output_folder, filename, output_image)

cuneiform_ocr('/content/In', 'cuneiform_ocr')


## microsoft TrOCR ->suaible for handwritten -> open souce -> its output is undefined   

In [45]:
import os
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

def read_images(input_folder):
    image_files = []
    for file in os.listdir(input_folder):
        if file.lower().endswith(('.png', '.jpg', '.jpeg')):
            image_files.append(os.path.join(input_folder, file))
    return image_files

def perform_ocr_trocr(image_path):
    processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
    model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

    image = Image.open(image_path).convert("RGB")
    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_text

def apply_trocr_model(input_folder, output_folder):
    image_files = read_images(input_folder)
    for image_file in image_files:
        ocr_text = perform_ocr_trocr(image_file)

        # Save OCR result to a text file
        base_name = os.path.splitext(os.path.basename(image_file))[0]
        text_file = os.path.join(output_folder, f"{base_name}.txt")

        # Ensure output folder exists
        os.makedirs(output_folder, exist_ok=True)

        with open(text_file, 'w') as f:
            f.write(ocr_text)

# Paths
input_folder = '/content/In'
output_folder = '/content/OCR_TroCR'

# Apply the OCR model
apply_trocr_model(input_folder, output_folder)

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of VisionEncode