## Installing Libraries

In [20]:
!pip install pytesseract
!pip install streamlit
!pip install easyocr
!pip install transformers
!pip install matplotlib
!pip install opencv-python
!pip install cdifflib


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip

## 1. Text Extraction

In [46]:
import os
import json
import cv2
from PIL import Image
import easyocr
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import pytesseract
import numpy as np
# Load the OCR models
reader = easyocr.Reader(['en'])  # Load the English language model
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# Define a function to extract text using PyTesseract
def extract_text_pytesseract(image_file):
    image = cv2.imread(image_file)
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(rgb)
    results = pytesseract.image_to_data(pil_image, output_type=pytesseract.Output.DICT)
    return ' '.join(results['text'])

# Define a function to extract text using EasyOCR
def extract_text_easyocr(image_file):
    image = cv2.imread(image_file)
    pil_image = Image.fromarray(image)
    np_array = np.array(pil_image)
    result_easyocr = reader.readtext(np_array)
    return ' '.join([res[1] for res in result_easyocr])

# Define a function to extract text using TrOCR
def extract_text_trocr(image_file):
    image = Image.open(image_file).convert("RGB")
    pixel_values = processor(image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_text

# Create a dictionary to store the results
results = {}

from tqdm import tqdm

# Iterate over all 75 images
results = {}

for filename in tqdm(os.listdir('selected_images'), desc='Processing images', unit='images'):
    image_file = os.path.join('selected_images', filename)
    pytesseract_text = extract_text_pytesseract(image_file)
    easyocr_text = extract_text_easyocr(image_file)
    trocr_text = extract_text_trocr(image_file)

    results[filename] = {}
    # Store the result
    results[filename]["pytesseract"] = pytesseract_text
    results[filename]["easyocr"] = easyocr_text
    results[filename]["trocr"] = trocr_text


# Create a JSON file with the final results
with open('ocr_results.json', 'w') as f:
    json.dump([{'image_name': filename, 'text': text} for filename, text in results.items()], f, indent=4)

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing images: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 75/75 [01:21<00:00,  1.09s/images]


## 2. Manual inspection, loading/updating  ocr_result.json on streamlit GUI app to fix text conflix

In [48]:
! streamlit run app.py

[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://192.168.18.47:8501[0m
[0m
^C
[34m  Stopping...[0m


![image.png](attachment:1550b626-1662-4892-83ad-afb6314bf9ff.png)

## 3. Similarity approch, the closest to the manual annotated text is the winner

In [49]:
import json
from difflib import SequenceMatcher

def string_similarity_voting_system(json_data):
    results = []
    for item in json_data:
        pytesseract_output = item["text"]["pytesseract"]
        easyocr_output = item["text"]["easyocr"]
        trocr_output = item["text"]["trocr"]
        manual_output = item["text"]["manual"]

        # Calculate string similarity scores
        pytesseract_similarity = SequenceMatcher(None, pytesseract_output, manual_output).ratio()
        easyocr_similarity = SequenceMatcher(None, easyocr_output, manual_output).ratio()
        trocr_similarity = SequenceMatcher(None, trocr_output, manual_output).ratio()

        similarities = {"pytesseract": pytesseract_similarity, "easyocr": easyocr_similarity, "trocr": trocr_similarity}

        # Determine the winner
        winner = max(similarities, key=similarities.get)

        # Store the result
        results.append({"image_name": item["image_name"], "text": item["text"][winner]})

    return results


with open("ocr_results.json") as f:
    json_data = json.load(f)

results = string_similarity_voting_system(json_data)

# Create a JSON file with the final results
with open('final_results.json', 'w') as f:
    json.dump(results, f, indent=4)

In [50]:
! cat final_results.json

[
    {
        "image_name": "000b92a513b07d1a7838287c2cea3baa7f2307a0_page_1_16.jpg",
        "text": "PowerPoint Access"
    },
    {
        "image_name": "0029db0b87cd8ea0e8a1c19ba75cca343bf196dd_page_3_2.jpg",
        "text": "Master of Technology (Information Systems) CGPA: 8.7"
    },
    {
        "image_name": "000b92a513b07d1a7838287c2cea3baa7f2307a0_page_1_19.jpg",
        "text": "Languages"
    },
    {
        "image_name": "006f2d3583a5835146032764078a22a5b1d82a34_page_2_6.jpg",
        "text": "Manager \u2014 Industrial loT & Analytics"
    },
    {
        "image_name": "001aed822d7ac999bf36e50069086c2c525c1902_page_1_16.jpg",
        "text": "strategies."
    },
    {
        "image_name": "0029db0b87cd8ea0e8a1c19ba75cca343bf196dd_page_2_31.jpg",
        "text": "Backend : Golang"
    },
    {
        "image_name": "002e36ce81b25b2579f584404d644eea5ed298bd_page_1_36.jpg",
        "text": "and procedures."
    },
    {
        "image_name": "000ed2333452da408fab1e0626