<a href="https://colab.research.google.com/github/mlvssyaswanth/colab-files/blob/main/OCR_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers torch torchvision torchaudio pillow opencv-python-headless

In [None]:
# Mount Google Drive at /content/drive so files stored under MyDrive can be
# read and written through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import cv2
import torch
import numpy as np
import matplotlib.pyplot as plt
from google.colab import drive
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import zipfile
import glob
import shutil

In [None]:
# Folder on the mounted Drive that is scanned for dataset ZIP archives.
dataset_folder = "/content/drive/MyDrive/OCR_Dataset"  # Update this path
# Local folder the archives are unpacked into.
extract_path = "/content/extracted_OCR_Dataset/"  # Temporary extraction folder
# Text file the OCR output is written to.
output_text_file = "/content/ocr_results.txt"

In [None]:
# Unpack every ZIP archive found directly inside dataset_folder into extract_path.
os.makedirs(extract_path, exist_ok=True)

# The extension check is case-insensitive so archives named e.g. "DATA.ZIP" are
# picked up too. os.listdir returns names in arbitrary order; sorting makes the
# extraction order (and therefore which archive wins when two contain the same
# member path) reproducible between runs.
zip_files = sorted(f for f in os.listdir(dataset_folder) if f.lower().endswith(".zip"))

for zip_file in zip_files:
    zip_path = os.path.join(dataset_folder, zip_file)
    print(f" Extracting: {zip_file}...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

# Only claim success when something was actually extracted.
if zip_files:
    print(f" All ZIP files extracted to: {extract_path}")
else:
    print(f" No ZIP files found in: {dataset_folder}")

In [None]:
# Find All Images (Including Subfolders)
# Files are selected by their lower-cased extension instead of a glob character
# class, so ".jpeg" and upper-case names such as ".JPG" / ".PNG" are included
# and look-alike extensions (".jng", ".ppg") are not.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

# glob returns matches in arbitrary order; sorting keeps any later slice of
# image_files reproducible from run to run.
image_files = sorted(
    path
    for path in glob.glob(os.path.join(extract_path, "**", "*"), recursive=True)
    if os.path.isfile(path) and path.lower().endswith(IMAGE_EXTENSIONS)
)

print(f"Found {len(image_files)} images in dataset.")
print(" Example image paths:", image_files[:5])

# Fail early: nothing downstream can work without at least one image.
if not image_files:
    raise RuntimeError(" No images found! Check dataset structure.")

In [None]:
# Preprocess Images for OCR
def preprocess_image(image_path):
    """Load an image from disk and prepare it for OCR.

    Steps, in order: read the file with OpenCV, convert BGR to grayscale,
    apply a 5x5 Gaussian blur, apply an adaptive Gaussian threshold
    (block size 11, constant 2, max value 255), then resize to 1024x1024
    with cubic interpolation.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The processed single-channel image, or None when the file could not
        be read or any step raised an exception. A message is printed in
        both failure cases.
    """
    try:
        bgr = cv2.imread(image_path)
        if bgr is not None:
            grayscale = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
            blurred = cv2.GaussianBlur(grayscale, (5, 5), 0)
            thresholded = cv2.adaptiveThreshold(
                blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
            )
            # Resize for better OCR accuracy
            return cv2.resize(thresholded, (1024, 1024), interpolation=cv2.INTER_CUBIC)

        # imread signals an unreadable file by returning None instead of raising.
        print(f" Could not read image: {image_path}")
    except Exception as err:
        # Best effort: report the failure and let the caller skip this image.
        print(f" Error processing {image_path}: {err}")
    return None

In [None]:
#  Load TrOCR Model
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The processor and the model are loaded from the same checkpoint name.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")
model = model.to(device)

In [None]:
# Perform OCR with TrOCR for First 10,000 Images
# Maps image path -> recognized text for every image that was processed.
ocr_results = {}

#  Limit processing to 10,000 images (fewer if the dataset is smaller)
max_images = min(10000, len(image_files))

# Images that preprocess_image could not read or process.
skipped_images = 0

for i, img_path in enumerate(image_files[:max_images]):  # Process first 10,000 images
    print(f"🔍 Processing {i+1}/{max_images}: {img_path}")

    # Preprocess the image; None means it could not be read or processed.
    processed_img = preprocess_image(img_path)
    if processed_img is None:
        skipped_images += 1
        continue

    # Wrap the OpenCV array in a PIL image and convert it to RGB before
    # handing it to the processor.
    pil_image = Image.fromarray(processed_img).convert("RGB")

    #  Prepare image for TrOCR
    pixel_values = processor(pil_image, return_tensors="pt").pixel_values.to(device)

    #  Perform OCR (inference only, so gradient tracking is disabled)
    # NOTE(review): generate() runs with its default generation settings;
    # confirm the default output length is long enough for these images.
    with torch.no_grad():
        generated_ids = model.generate(pixel_values)
    extracted_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    print(f"📄 Extracted Text:\n{extracted_text}")

    #  Store result
    ocr_results[img_path] = extracted_text

# Report what was actually processed rather than the number attempted.
print(f" Finished processing {len(ocr_results)} images ({skipped_images} skipped).")

In [None]:
!pip install python-Levenshtein

import Levenshtein

# ✅ Step 1: Load Ground Truth Data
# Dictionary {image_path: actual_text} used as the reference for comparison.
# Only paths that also appear as keys of ocr_results are compared, so the keys
# here must match the image paths used during OCR exactly.
# NOTE(review): the two entries below look like placeholders — replace them
# with real labels before trusting the reported accuracy.
ground_truth = {
    "/content/extracted_OCR_Dataset/image1.jpg": "This is the correct text",
    "/content/extracted_OCR_Dataset/image2.jpg": "Another example of OCR",
    # Add more image-to-text mappings
}

# ✅ Step 2: Define Accuracy Calculation Function
def _edit_distance(reference, hypothesis):
    """Return the Levenshtein (edit) distance between two sequences.

    Items are compared with ``!=``, so this works both on strings
    (character by character) and on lists of words (token by token).
    """
    # Two-row dynamic programme: previous[j] is the distance between the
    # reference items consumed so far and the first j hypothesis items.
    previous = list(range(len(hypothesis) + 1))
    for i, ref_item in enumerate(reference, start=1):
        current = [i]
        for j, hyp_item in enumerate(hypothesis, start=1):
            substitution = previous[j - 1] + (ref_item != hyp_item)
            current.append(min(previous[j] + 1, current[j - 1] + 1, substitution))
        previous = current
    return previous[-1]


def calculate_accuracy(ocr_results, ground_truth):
    """Print character- and word-level accuracy of OCR output against labels.

    For every image path present in both dictionaries, the character error
    rate (CER) is the edit distance between the two strings divided by the
    reference length, and the word error rate (WER) is the edit distance
    between the whitespace-split word lists divided by the reference word
    count. Accuracy is reported as ``(1 - mean error rate) * 100`` and can be
    negative when the prediction needs more edits than the reference has items.

    Args:
        ocr_results: Mapping of image path to recognized text.
        ground_truth: Mapping of image path to the expected text.

    Returns:
        None. Results are printed. If no ground-truth image has an OCR result,
        a warning is printed instead of accuracy figures.
    """
    total_cer = 0.0  # Sum of per-image character error rates
    total_wer = 0.0  # Sum of per-image word error rates
    evaluated = 0    # Images that have both a label and an OCR result

    for img_path, actual_text in ground_truth.items():
        if img_path not in ocr_results:
            continue
        predicted_text = ocr_results[img_path]
        evaluated += 1

        # ✅ Compute Character Error Rate (CER)
        total_cer += _edit_distance(actual_text, predicted_text) / max(1, len(actual_text))

        # ✅ Compute Word Error Rate (WER) on word tokens, not on characters
        actual_words = actual_text.split()
        predicted_words = predicted_text.split()
        total_wer += _edit_distance(actual_words, predicted_words) / max(1, len(actual_words))

    # Average over the images actually scored; labels without an OCR result
    # must not count as error-free, and an empty set must not divide by zero.
    if evaluated == 0:
        print("⚠️ No ground-truth images were found in the OCR results; nothing to evaluate.")
        return

    # ✅ Compute Final Accuracy Scores
    char_accuracy = (1 - (total_cer / evaluated)) * 100
    word_accuracy = (1 - (total_wer / evaluated)) * 100

    print(f"✅ OCR Accuracy Results:")
    print(f"📊 Evaluated {evaluated} of {len(ground_truth)} ground-truth images")
    print(f"🎯 Character-Level Accuracy: {char_accuracy:.2f}%")
    print(f"🎯 Word-Level Accuracy: {word_accuracy:.2f}%")

# ✅ Step 3: Run Accuracy Calculation
# Compares the OCR output with the labels in ground_truth and prints
# character-level and word-level accuracy percentages.
calculate_accuracy(ocr_results, ground_truth)

In [None]:
# ✅ Step 10: Save OCR Results to a Text File
# One record per image: the image path, the recognized text, then a separator line.
# The encoding is pinned to UTF-8 so non-ASCII characters in the recognized text
# are written the same way regardless of the runtime's default encoding.
with open(output_text_file, "w", encoding="utf-8") as f:
    for img, text in ocr_results.items():
        f.write(f"Image: {img}\nExtracted Text:\n{text}\n")
        f.write("="*50 + "\n")

print(f"✅ OCR results saved to: {output_text_file}")

# ✅ Step 11: Download OCR Results
from google.colab import files
files.download(output_text_file)

In [None]:
# ✅ Step 12: Save OCR Results to Google Drive
drive_save_path = "/content/drive/MyDrive/OCR_Results.txt"
# Copy rather than move: the local results file stays in place, so this cell
# and the download step can be re-run without a FileNotFoundError. An existing
# file at the destination is overwritten.
shutil.copy(output_text_file, drive_save_path)
print(f"✅ OCR results saved to Google Drive at: {drive_save_path}")

In [None]:
# ✅ Define save path
model_save_path = "/content/trained_trocr_model"

# Save the model and then the processor into the same folder.
# NOTE(review): no training step is visible in this notebook, so despite the
# folder name this appears to be the pretrained checkpoint as loaded — confirm.
for artifact in (model, processor):
    artifact.save_pretrained(model_save_path)

print(f"✅ Model saved at: {model_save_path}")

In [None]:
import shutil
from google.colab import files

# ✅ Compress the model folder into a ZIP file
# make_archive appends ".zip" to the base name, so this writes trocr_model.zip
# (relative to the current working directory) containing the contents of
# model_save_path.
shutil.make_archive("trocr_model", 'zip', model_save_path)

# ✅ Download the ZIP file
# The file name must match the archive produced by make_archive above.
files.download("trocr_model.zip")

In [None]:
# Destination folder for the model on the mounted Drive.
# NOTE(review): this reuses the name drive_save_path, which the OCR-results
# cell also assigns (to a text file path); the later assignment wins.
drive_save_path = "/content/drive/MyDrive/trained_trocr_model"

# ✅ Move model to Google Drive
# shutil.move relocates the folder, so model_save_path no longer exists after
# this runs and re-running the cell fails. If drive_save_path already exists
# as a directory, the source folder is moved inside it rather than replacing it.
shutil.move(model_save_path, drive_save_path)

print(f"✅ Model saved to Google Drive at: {drive_save_path}")