In [18]:
import os
import cv2
from PIL import Image
import matplotlib.pyplot as plt

def read_images(input_folder):
    """
    Collects the paths of all image files located directly in the input folder.

    Entries are matched case-insensitively on their extension
    (.png, .jpg, .jpeg, .bmp, .tiff). Directories are skipped even when their
    name ends with one of those extensions, and the result is sorted by file
    name because os.listdir returns entries in arbitrary order.

    Args:
    input_folder (str): Path to the input folder containing images.

    Returns:
    List of image file paths (input_folder joined with each file name), sorted by file name.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')
    image_files = []
    for file in sorted(os.listdir(input_folder)):
        path = os.path.join(input_folder, file)
        # isfile() keeps folders such as "scans.png/" out of the OCR loop.
        if file.lower().endswith(image_extensions) and os.path.isfile(path):
            image_files.append(path)
    return image_files

def draw_boxes(image, boxes, color=(0, 255, 0)):
    """
    Outlines every box on the image with a 2-pixel rectangle.

    The drawing is done with cv2.rectangle directly on the image that was
    passed in, and that same object is returned. Entries that do not have
    exactly four elements are ignored.

    Args:
    image: Image to draw on (the callers in this notebook pass numpy arrays).
    boxes: Iterable of boxes given as (x1, y1, x2, y2) corner coordinates.
    color (tuple): Colour handed to cv2.rectangle.

    Returns:
    The image that was passed in.
    """
    line_thickness = 2
    for candidate in boxes:
        if len(candidate) != 4:
            continue  # not an (x1, y1, x2, y2) box
        top_left = (int(candidate[0]), int(candidate[1]))
        bottom_right = (int(candidate[2]), int(candidate[3]))
        cv2.rectangle(image, top_left, bottom_right, color, line_thickness)
    return image

def save_image(output_folder, filename, image):
    """
    Saves the image as <output_folder>/<filename>_highlighted.png.

    The output folder is created when it does not exist yet.

    Args:
    output_folder (str): Folder the PNG is written to.
    filename (str): Base name of the output file, without extension.
    image: Array accepted by PIL's Image.fromarray.
    """
    os.makedirs(output_folder, exist_ok=True)
    highlighted = Image.fromarray(image)
    target_path = os.path.join(output_folder, filename + '_highlighted.png')
    highlighted.save(target_path)

def save_text(output_folder, filename, text):
    """
    Writes the text to <output_folder>/<filename>.txt, replacing any existing file.

    The output folder is created when it does not exist yet. The file is
    always written as UTF-8: OCR output regularly contains non-ASCII
    characters, and the platform default encoding may be unable to encode them.

    Args:
    output_folder (str): Folder the text file is written to.
    filename (str): Base name of the output file, without extension.
    text (str): Text content to write.
    """
    os.makedirs(output_folder, exist_ok=True)
    with open(os.path.join(output_folder, filename + '.txt'), 'w', encoding='utf-8') as f:
        f.write(text)


## easyocr_ocr

In [7]:
!pip install python-bidi==0.4.2
!pip install easyocr



In [9]:
import easyocr

def easyocr_ocr(input_folder, output_folder):
    """
    Uses EasyOCR to read text from images in the input folder and save highlighted images and text in the output folder.

    For every image, a PNG with green rectangles around the detected text and
    a text file listing each detected text with its confidence are written.
    Files that OpenCV cannot decode are reported and skipped so that one bad
    file does not abort the whole batch.

    Args:
    input_folder (str): Path to the input folder containing images.
    output_folder (str): Path to the output folder to save highlighted images and text.
    """
    reader = easyocr.Reader(['en'])
    for image_file in read_images(input_folder):
        image = cv2.imread(image_file)
        if image is None:
            # cv2.imread returns None instead of raising when a file cannot be decoded.
            print(f"Skipping unreadable image: {image_file}")
            continue
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        easyocr_boxes = []
        text_parts = []
        for result in reader.readtext(image_rgb):
            box, text, confidence = result[0], result[1], result[2]
            # EasyOCR reports a polygon of corner points; reduce it to its axis-aligned bounding rectangle.
            xs = [point[0] for point in box]
            ys = [point[1] for point in box]
            easyocr_boxes.append([int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys))])
            text_parts.append(f"Text: {text}\nConfidence: {confidence}\n\n")

        output_image = draw_boxes(image_rgb.copy(), easyocr_boxes, color=(0, 255, 0))  # Green
        filename = os.path.splitext(os.path.basename(image_file))[0]
        save_image(output_folder, filename, output_image)
        save_text(output_folder, filename, "".join(text_parts))

# Run EasyOCR on the images in /content/In; results are written to /content/easyOCR.
easyocr_ocr('/content/In', '/content/easyOCR')



Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |██████████████████████████████████████████████████| 100.0% Complete

## tesseract-ocr

In [13]:
!apt-get install tesseract-ocr
!pip install pytesseract

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [19]:
import os
import cv2
from PIL import Image
import pytesseract


def tesseract_ocr(input_folder, output_folder):
    """
    Uses Tesseract OCR to read text from images in the input folder and save highlighted images and text in the output folder.

    Only entries with a confidence above 50 are boxed (in red) and written to
    the text file. Words that Tesseract places on the same text line are
    joined with single spaces and each text line goes on its own output line.
    Files that OpenCV cannot decode are reported and skipped.

    Args:
    input_folder (str): Path to the input folder containing images.
    output_folder (str): Path to the output folder to save highlighted images and text.
    """
    for image_file in read_images(input_folder):
        image = cv2.imread(image_file)
        if image is None:
            # cv2.imread returns None instead of raising when a file cannot be decoded.
            print(f"Skipping unreadable image: {image_file}")
            continue
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        d = pytesseract.image_to_data(image_rgb, output_type=pytesseract.Output.DICT)
        tesseract_boxes = []
        text_lines = []
        current_line = []

        for i in range(len(d['level'])):
            # Tesseract TSV levels: 1 page, 2 block, 3 paragraph, 4 line, 5 word.
            if d['level'][i] == 4:
                # A level-4 row opens a new text line: flush the words gathered for the previous one.
                if current_line:
                    text_lines.append(" ".join(current_line))
                    current_line = []
                continue
            try:
                # conf may be an int, a float or a numeric string depending on the
                # pytesseract/Tesseract version; int(float(...)) accepts all of them.
                confidence = int(float(d['conf'][i]))
            except (TypeError, ValueError):
                continue
            if confidence > 50:  # Confidence threshold
                (x, y, w, h) = (d['left'][i], d['top'][i], d['width'][i], d['height'][i])
                tesseract_boxes.append((x, y, x + w, y + h))
                word = d['text'][i].strip()
                if word:
                    current_line.append(word)
        if current_line:
            text_lines.append(" ".join(current_line))

        output_image = draw_boxes(image_rgb.copy(), tesseract_boxes, color=(255, 0, 0))  # Red
        filename = os.path.splitext(os.path.basename(image_file))[0]
        save_image(output_folder, filename, output_image)
        save_text(output_folder, filename, "\n".join(text_lines))

# Run Tesseract on the images in /content/In; results are written to /content/pytesseract.
tesseract_ocr('/content/In', '/content/pytesseract')


## Paddle OCR

In [23]:
!pip3 install paddlepaddle paddleocr



In [24]:
from paddleocr import PaddleOCR


def paddleocr_ocr(input_folder, output_folder):
    """
    Uses PaddleOCR to read text from images in the input folder and save highlighted images and text in the output folder.

    For every image, a PNG with blue rectangles around the detected text and a
    text file with one "Text: ..., Confidence: ..." line per detection are
    written. Images without any detected text produce an unannotated PNG and
    an empty text file. Files that OpenCV cannot decode are reported and skipped.

    Args:
    input_folder (str): Path to the input folder containing images.
    output_folder (str): Path to the output folder to save highlighted images and text.
    """
    ocr = PaddleOCR(use_angle_cls=True, lang='en')
    for image_file in read_images(input_folder):
        image = cv2.imread(image_file)
        if image is None:
            # cv2.imread returns None instead of raising when a file cannot be decoded.
            print(f"Skipping unreadable image: {image_file}")
            continue
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        paddleocr_results = ocr.ocr(image_file, cls=True)
        paddleocr_boxes = []
        text_lines = []

        for result in paddleocr_results or []:
            if not result:
                # PaddleOCR yields None (or an empty list) for a page with no detected text.
                continue
            for line in result:
                box = line[0]
                text = line[1][0]
                confidence = line[1][1]
                # Reduce the corner points to their axis-aligned bounding rectangle.
                xs = [point[0] for point in box]
                ys = [point[1] for point in box]
                paddleocr_boxes.append([int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys))])
                text_lines.append(f"Text: {text}, Confidence: {confidence:.2f}\n")

        output_image = draw_boxes(image_rgb.copy(), paddleocr_boxes, color=(0, 0, 255))  # Blue
        filename = os.path.splitext(os.path.basename(image_file))[0]
        save_image(output_folder, filename, output_image)
        save_text(output_folder, filename, "".join(text_lines).strip())

# Run PaddleOCR on the images in /content/In; results are written to /content/PaddleOCR.
paddleocr_ocr('/content/In', '/content/PaddleOCR')

download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 4.00M/4.00M [00:16<00:00, 245kiB/s] 


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10.2M/10.2M [00:18<00:00, 545kiB/s] 


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2.19M/2.19M [00:14<00:00, 146kiB/s]

[2024/07/23 04:11:35] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c




[2024/07/23 04:11:37] ppocr DEBUG: dt_boxes num : 6, elapsed : 0.5579695701599121
[2024/07/23 04:11:37] ppocr DEBUG: cls num  : 6, elapsed : 0.059670448303222656
[2024/07/23 04:11:38] ppocr DEBUG: rec_res num  : 6, elapsed : 0.5014560222625732
[2024/07/23 04:11:39] ppocr DEBUG: dt_boxes num : 19, elapsed : 0.7585844993591309
[2024/07/23 04:11:39] ppocr DEBUG: cls num  : 19, elapsed : 0.07429313659667969
[2024/07/23 04:11:52] ppocr DEBUG: rec_res num  : 19, elapsed : 13.349321842193604
[2024/07/23 04:11:53] ppocr DEBUG: dt_boxes num : 12, elapsed : 0.3621659278869629
[2024/07/23 04:11:53] ppocr DEBUG: cls num  : 12, elapsed : 0.031041860580444336
[2024/07/23 04:11:54] ppocr DEBUG: rec_res num  : 12, elapsed : 1.0566926002502441
[2024/07/23 04:11:55] ppocr DEBUG: dt_boxes num : 4, elapsed : 0.3707089424133301
[2024/07/23 04:11:55] ppocr DEBUG: cls num  : 4, elapsed : 0.03740119934082031
[2024/07/23 04:11:55] ppocr DEBUG: rec_res num  : 4, elapsed : 0.39806628227233887
[2024/07/23 04:11:5

## pytesseract with OpenCV preprocessing

In [29]:
import pytesseract

def preprocess_and_tesseract_ocr(input_folder, output_folder):
    """
    Preprocess images using OpenCV and uses Tesseract OCR to read text from images in the input folder and save highlighted images and text in the output folder.

    Each image is converted to grayscale, smoothed with a 5x5 Gaussian blur and
    binarized with Otsu's threshold before it is passed to Tesseract. Every
    entry with a non-negative confidence is boxed in red on the binarized
    image and listed in the text file. Files that OpenCV cannot decode are
    reported and skipped.

    Args:
    input_folder (str): Path to the input folder containing images.
    output_folder (str): Path to the output folder to save highlighted images and text.
    """
    for image_file in read_images(input_folder):
        image = cv2.imread(image_file)
        if image is None:
            # cv2.imread returns None instead of raising when a file cannot be decoded.
            print(f"Skipping unreadable image: {image_file}")
            continue
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        blur = cv2.GaussianBlur(gray, (5, 5), 0)
        _, binary = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        # Use Tesseract with confidence level output
        tesseract_data = pytesseract.image_to_data(binary, output_type=pytesseract.Output.DICT)
        tesseract_boxes = []
        text_lines = []

        for i in range(len(tesseract_data['level'])):
            (x, y, w, h, text, conf) = (tesseract_data['left'][i], tesseract_data['top'][i],
                                        tesseract_data['width'][i], tesseract_data['height'][i],
                                        tesseract_data['text'][i], tesseract_data['conf'][i])
            try:
                # Rows without a recognized word carry conf -1. Depending on the
                # pytesseract version that is the string '-1' or a number, so
                # compare numerically rather than against the string.
                has_confidence = float(conf) >= 0
            except (TypeError, ValueError):
                has_confidence = False
            if has_confidence:
                tesseract_boxes.append((x, y, x + w, y + h))
                text_lines.append(f"Text: {text}, Confidence: {conf}\n")

        output_image = draw_boxes(cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB), tesseract_boxes, color=(255, 0, 0))  # Red
        filename = os.path.splitext(os.path.basename(image_file))[0]
        save_image(output_folder, filename, output_image)
        save_text(output_folder, filename, "".join(text_lines).strip())

# Run the OpenCV-preprocessed Tesseract pipeline on /content/In; results are written to /content/pytesseract_opncv.
preprocess_and_tesseract_ocr('/content/In', '/content/pytesseract_opncv')

## keras_ocr

In [32]:
!pip install keras_ocr

Collecting keras_ocr
  Downloading keras_ocr-0.9.3-py3-none-any.whl (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting efficientnet==1.0.0 (from keras_ocr)
  Downloading efficientnet-1.0.0-py3-none-any.whl (17 kB)
Collecting essential_generators (from keras_ocr)
  Downloading essential_generators-1.0-py3-none-any.whl (9.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Collecting validators (from keras_ocr)
  Downloading validators-0.33.0-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.3/43.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras-applications<=1.0.8,>=1.0.7 (from efficientnet==1.0.0->keras_ocr)
  Downloading Keras_Applications-1.0.8-py3-non

In [35]:
import os
import cv2
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import keras_ocr

def read_images(input_folder):
    """
    Collects the paths of all image files located directly in the input folder.
    Entries are matched case-insensitively on their extension
    (.png, .jpg, .jpeg, .bmp, .tiff). Directories are skipped even when their
    name ends with one of those extensions, and the result is sorted by file
    name because os.listdir returns entries in arbitrary order.
    Args:
    input_folder (str): Path to the input folder containing images.
    Returns:
    List of image file paths (input_folder joined with each file name), sorted by file name.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')
    image_files = []
    for file in sorted(os.listdir(input_folder)):
        path = os.path.join(input_folder, file)
        # isfile() keeps folders such as "scans.png/" out of the OCR loop.
        if file.lower().endswith(image_extensions) and os.path.isfile(path):
            image_files.append(path)
    return image_files

def preprocess_image(image):
    """
    Preprocess the image to improve OCR results.
    The image is converted to grayscale, enlarged by a factor of 2 in both
    directions with bicubic interpolation, and binarized with a Gaussian
    adaptive threshold (block size 31, constant 2).
    Args:
    image (numpy array): The input image in RGB channel order, which is what
        keras_ocr.tools.read (the caller in this notebook) returns.
    Returns:
    numpy array: The preprocessed image, expanded back to three identical channels.
    """
    # The input is RGB, so RGB2GRAY is needed: BGR2GRAY would apply the red and
    # blue luminance weights to the wrong channels.
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
    processed_image = cv2.adaptiveThreshold(resized, 255,
                                            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                            cv2.THRESH_BINARY, 31, 2)
    # Convert back to a 3-channel image
    processed_image = cv2.cvtColor(processed_image, cv2.COLOR_GRAY2RGB)
    return processed_image

def draw_boxes(image, boxes, color=(0, 255, 0)):
    """
    Outlines every box on the image with a 2-pixel rectangle.
    The drawing is done with cv2.rectangle directly on the image that was
    passed in, and that same object is returned. Entries that do not have
    exactly four elements are ignored.
    Args:
    image: Image to draw on (the caller in this notebook passes a numpy array).
    boxes: Iterable of boxes given as (x1, y1, x2, y2) corner coordinates.
    color (tuple): Colour handed to cv2.rectangle.
    Returns:
    The image that was passed in.
    """
    line_thickness = 2
    for candidate in boxes:
        if len(candidate) != 4:
            continue  # not an (x1, y1, x2, y2) box
        top_left = (int(candidate[0]), int(candidate[1]))
        bottom_right = (int(candidate[2]), int(candidate[3]))
        cv2.rectangle(image, top_left, bottom_right, color, line_thickness)
    return image

def save_image(output_folder, filename, image):
    """
    Saves the image as <output_folder>/<filename>_highlighted.png.
    The output folder is created when it does not exist yet.
    Args:
    output_folder (str): Folder the PNG is written to.
    filename (str): Base name of the output file, without extension.
    image: Array accepted by PIL's Image.fromarray.
    """
    os.makedirs(output_folder, exist_ok=True)
    highlighted = Image.fromarray(image)
    target_path = os.path.join(output_folder, filename + '_highlighted.png')
    highlighted.save(target_path)

def save_text(output_folder, filename, text):
    """
    Writes the text to <output_folder>/<filename>.txt, replacing any existing file.
    The output folder is created when it does not exist yet. The file is
    always written as UTF-8: OCR output regularly contains non-ASCII
    characters, and the platform default encoding may be unable to encode them.
    Args:
    output_folder (str): Folder the text file is written to.
    filename (str): Base name of the output file, without extension.
    text (str): Text content to write.
    """
    os.makedirs(output_folder, exist_ok=True)
    with open(os.path.join(output_folder, filename + '.txt'), 'w', encoding='utf-8') as f:
        f.write(text)

def keras_ocr_function(input_folder, output_folder):
    """
    Uses Keras-OCR to read text from images in the input folder and save highlighted images and text in the output folder.
    Recognition runs on the output of preprocess_image, and the green boxes
    are drawn on that preprocessed image, so the saved PNG shows the
    preprocessed picture rather than the original. The text file contains one
    recognized word per line; no confidence values are written.
    Args:
    input_folder (str): Path to the input folder containing images.
    output_folder (str): Path to the output folder to save highlighted images and text.
    """
    pipeline = keras_ocr.pipeline.Pipeline()
    for image_file in read_images(input_folder):
        image = keras_ocr.tools.read(image_file)
        preprocessed_image = preprocess_image(image)
        predictions = pipeline.recognize([preprocessed_image])[0]
        boxes = []
        recognized_words = []

        for prediction in predictions:
            text = prediction[0]
            box = prediction[1]
            recognized_words.append(f"{text}")

            # Use the extremes of all corner points instead of only corners 0 and 2:
            # for rotated text those two corners do not span the whole quadrilateral.
            xs = [point[0] for point in box]
            ys = [point[1] for point in box]
            boxes.append([int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys))])

        output_image = draw_boxes(preprocessed_image.copy(), boxes, color=(0, 255, 0))  # Green
        filename = os.path.splitext(os.path.basename(image_file))[0]
        save_image(output_folder, filename, output_image)
        save_text(output_folder, filename, "\n".join(recognized_words))

# Usage example: 'Keras_ocr' is a relative path, so the output folder is
# created under the current working directory.
keras_ocr_function('/content/In', 'Keras_ocr')


Looking for /root/.keras-ocr/craft_mlt_25k.h5
Looking for /root/.keras-ocr/crnn_kurapan.h5






In [36]:
# Bundle the Keras-OCR results into Keras_ocr.zip in the current working directory.
# NOTE(review): the previous cell writes to the relative folder 'Keras_ocr'; the
# absolute path below only matches it when the working directory is /content.
!zip -r Keras_ocr.zip /content/Keras_ocr

  adding: content/Keras_ocr/ (stored 0%)
  adding: content/Keras_ocr/12.txt (deflated 49%)
  adding: content/Keras_ocr/5.txt (deflated 31%)
  adding: content/Keras_ocr/4_highlighted.png (deflated 0%)
  adding: content/Keras_ocr/10.txt (deflated 43%)
  adding: content/Keras_ocr/7.txt (deflated 27%)
  adding: content/Keras_ocr/2.txt (deflated 38%)
  adding: content/Keras_ocr/8.txt (deflated 4%)
  adding: content/Keras_ocr/12_highlighted.png (deflated 12%)
  adding: content/Keras_ocr/6_highlighted.png (deflated 0%)
  adding: content/Keras_ocr/9.txt (deflated 8%)
  adding: content/Keras_ocr/6.txt (deflated 36%)
  adding: content/Keras_ocr/3.txt (deflated 26%)
  adding: content/Keras_ocr/7_highlighted.png (deflated 0%)
  adding: content/Keras_ocr/4.txt (deflated 35%)
  adding: content/Keras_ocr/11.txt (deflated 47%)
  adding: content/Keras_ocr/8_highlighted.png (deflated 0%)
  adding: content/Keras_ocr/1_highlighted.png (deflated 0%)
  adding: content/Keras_ocr/11_highlighted.png (deflated 

In [None]:
import os
import cv2
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

def read_images(input_folder):
    """
    Lists the image files found directly in the given folder.
    Only names ending in .png, .jpg or .jpeg (compared case-insensitively)
    are kept; the order is whatever os.listdir yields.
    Args:
    input_folder (str): Path to the folder containing the input images.
    Returns:
    list: The matching file names, each joined onto input_folder.
    """
    accepted_suffixes = ('.png', '.jpg', '.jpeg')  # Ensure we read only image files
    return [
        os.path.join(input_folder, entry)
        for entry in os.listdir(input_folder)
        if entry.lower().endswith(accepted_suffixes)
    ]

def save_image(output_folder, filename, image):
    """
    Writes the image to <output_folder>/<filename>.png using OpenCV.
    The folder is created first if it is missing. The array is run through
    cv2.COLOR_RGB2BGR before cv2.imwrite, i.e. it is treated as an RGB image.
    Args:
    output_folder (str): Path to the folder where the image will be saved.
    filename (str): Name of the output file, without the .png extension.
    image (numpy.ndarray): The image to save.
    """
    folder_missing = not os.path.exists(output_folder)
    if folder_missing:
        os.makedirs(output_folder)
    destination = os.path.join(output_folder, f"{filename}.png")
    bgr_pixels = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    cv2.imwrite(destination, bgr_pixels)

def perform_ocr_doctr(image_path, model=None):
    """
    Performs OCR on the image using the doctr library.
    Building the pretrained predictor is expensive, so it is created once on
    the first call and reused by later calls instead of being rebuilt for
    every image.
    Args:
    image_path (str): Path to the image file.
    model: Optional, already constructed doctr predictor to use instead of the
        cached default one.
    Returns:
    Document: The OCR result as a doctr Document object.
    """
    if model is None:
        # Cache the predictor on the function object so repeated calls share it.
        model = getattr(perform_ocr_doctr, "_cached_model", None)
        if model is None:
            model = ocr_predictor(pretrained=True)
            perform_ocr_doctr._cached_model = model
    doc = DocumentFile.from_images(image_path)
    result = model(doc)
    return result

def draw_boxes(image_path, result):
    """
    Draws bounding boxes around the recognized text in the image.
    The image is loaded from disk and returned in RGB channel order, which is
    what save_image in this cell expects (it converts RGB to BGR before writing).
    Args:
    image_path (str): Path to the image file.
    result (Document): The OCR result as a doctr Document object.
    Returns:
    numpy.ndarray: The RGB image with bounding boxes drawn around recognized text.
    Raises:
    ValueError: If the image file cannot be read by OpenCV.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None instead of raising when a file cannot be decoded.
        raise ValueError(f"Could not read image: {image_path}")
    # cv2.imread yields BGR. Without this conversion the RGB->BGR step in
    # save_image would swap the red and blue channels of the saved picture.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    height, width = image.shape[:2]
    for page in result.pages:
        for block in page.blocks:
            for line in block.lines:
                for word in line.words:
                    # Extract bounding box coordinates
                    (x_min, y_min), (x_max, y_max) = word.geometry
                    # Convert normalized coordinates to pixel coordinates
                    box = [
                        int(x_min * width), int(y_min * height),
                        int(x_max * width), int(y_max * height)
                    ]
                    # Draw rectangle on the image (green is the same triple in RGB and BGR)
                    cv2.rectangle(image, (box[0], box[1]), (box[2], box[3]), (0, 255, 0), 2)
    return image

def extract_text(result):
    """
    Flattens an OCR result into plain text.
    Every word is followed by a single space, every line by a newline, and
    one further newline is emitted after each block and after each page.
    Args:
    result (Document): The OCR result as a doctr Document object.
    Returns:
    str: The extracted text content.
    """
    chunks = []
    for page in result.pages:
        for block in page.blocks:
            for line in block.lines:
                chunks.extend(word.value + " " for word in line.words)
                chunks.append("\n")  # end of line
            chunks.append("\n")  # end of block
        chunks.append("\n")  # end of page
    return "".join(chunks)

def apply_doctr_model(input_folder, output_folder):
    """
    Applies the doctr OCR model to images in the input folder and saves the results.
    For each image, an annotated PNG and a UTF-8 text file with the recognized
    text are written to the output folder, both named after the input file.
    Args:
    input_folder (str): Path to the folder containing the input images.
    output_folder (str): Path to the folder where the results will be saved.
    """
    for image_file in read_images(input_folder):
        result = perform_ocr_doctr(image_file)

        # Draw bounding boxes on the image
        annotated_image = draw_boxes(image_file, result)

        # Save the annotated image
        base_name = os.path.splitext(os.path.basename(image_file))[0]
        save_image(output_folder, base_name, annotated_image)

        # Extract and save text content to a text file
        text_content = extract_text(result)
        # Do not rely on save_image having created the folder as a side effect.
        os.makedirs(output_folder, exist_ok=True)
        text_file = os.path.join(output_folder, f"{base_name}.txt")
        # Explicit UTF-8: recognized text may contain characters the platform
        # default encoding cannot represent.
        with open(text_file, 'w', encoding='utf-8') as f:
            f.write(text_content)

# Paths: images are read from input_folder; the annotated PNGs and the .txt
# files are written to output_folder.
input_folder = '/content/In'
output_folder = '/content/OCR_Doctr'

# Apply the OCR model to every image found in input_folder
apply_doctr_model(input_folder, output_folder)