<a href="https://colab.research.google.com/github/mlvssyaswanth/3D-Background/blob/main/OCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers torch torchvision torchaudio pillow opencv-python-headless



In [None]:
# Attach Google Drive to the Colab filesystem at /content/drive.
# Later cells read the dataset from, and write results to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import cv2
import torch
import numpy as np
import matplotlib.pyplot as plt
from google.colab import drive
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import zipfile
import glob
import shutil

In [None]:
# Repeat of the mount call made in an earlier cell; the recorded output for this cell
# reports that Drive was already mounted at /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Paths shared by the cells below.
dataset_folder = "/content/drive/MyDrive/OCR_Dataset"  # Drive folder scanned for .zip archives; update to your own location
extract_path = "/content/extracted_OCR_Dataset/"  # Local folder the archives are unpacked into
output_text_file = "/content/ocr_results.txt"  # Text file the OCR results are written to

In [None]:
# Unpack every ZIP archive found in the dataset folder into the local extraction folder.
os.makedirs(extract_path, exist_ok=True)

# os.listdir() returns entries in arbitrary order, so sort to extract archives in the
# same sequence on every run (this matters if two archives contain the same file name).
# The extension check is case-insensitive so "DATA.ZIP" is picked up as well.
zip_files = sorted(f for f in os.listdir(dataset_folder) if f.lower().endswith(".zip"))

for zip_file in zip_files:
    zip_path = os.path.join(dataset_folder, zip_file)
    print(f" Extracting: {zip_file}...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

# Only claim success when at least one archive was actually extracted.
if zip_files:
    print(f" All ZIP files extracted to: {extract_path}")
else:
    print(f" No ZIP files found in: {dataset_folder}")

 Extracting: Components-Real.zip...
 Extracting: Date-Real.zip...
 Extracting: Date-Synth.zip...
 Extracting: Products-Real.zip...
 All ZIP files extracted to: /content/extracted_OCR_Dataset/


In [None]:
# Find all images (including subfolders).
# Extensions are compared case-insensitively against an explicit set, so ".jpeg" and
# ".JPG" are included and look-alike suffixes such as ".jng" are not.
image_extensions = {".jpg", ".jpeg", ".png"}

# glob returns paths in arbitrary order; sorting makes the "first N images" subset
# used by the OCR loop identical from run to run.
image_files = sorted(
    path
    for path in glob.glob(os.path.join(extract_path, "**", "*"), recursive=True)
    if os.path.splitext(path)[1].lower() in image_extensions and os.path.isfile(path)
)

print(f"Found {len(image_files)} images in dataset.")
print(" Example image paths:", image_files[:5])

# Stop here rather than letting the later cells run on an empty list.
if not image_files:
    raise RuntimeError(" No images found! Check dataset structure.")

Found 131769 images in dataset.
 Example image paths: ['/content/extracted_OCR_Dataset/Date-Synth/images/55156.jpg', '/content/extracted_OCR_Dataset/Date-Synth/images/62308.jpg', '/content/extracted_OCR_Dataset/Date-Synth/images/92161.jpg', '/content/extracted_OCR_Dataset/Date-Synth/images/110046.jpg', '/content/extracted_OCR_Dataset/Date-Synth/images/60880.jpg']


In [None]:
# Preprocess Images for OCR
def preprocess_image(image_path):
    """Load an image from disk and binarize it for OCR.

    Steps: read with OpenCV, convert BGR to grayscale, apply a 5x5 Gaussian
    blur, apply Gaussian adaptive thresholding (block size 11, constant 2),
    then resize the result to 1024x1024 with bicubic interpolation.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The processed single-channel image array, or None when the file could
        not be read or any step raised an exception (a message is printed in
        both cases).
    """
    try:
        bgr = cv2.imread(image_path)
        if bgr is not None:
            blurred = cv2.GaussianBlur(cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY), (5, 5), 0)
            binary = cv2.adaptiveThreshold(
                blurred,
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY,
                11,
                2,
            )
            # Upscale to a fixed square size before handing the image to the OCR model
            return cv2.resize(binary, (1024, 1024), interpolation=cv2.INTER_CUBIC)

        # cv2.imread signals an unreadable file by returning None instead of raising
        print(f" Could not read image: {image_path}")
        return None
    except Exception as exc:
        # Best-effort: report the failure and let the caller skip this image
        print(f" Error processing {image_path}: {exc}")
        return None

In [None]:
# Load the TrOCR processor and model from the same Hugging Face checkpoint
trocr_checkpoint = "microsoft/trocr-base-printed"

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

processor = TrOCRProcessor.from_pretrained(trocr_checkpoint)
model = VisionEncoderDecoderModel.from_pretrained(trocr_checkpoint)
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.13k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.50.0"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_mod

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Perform OCR with TrOCR on (at most) the first 10,000 images.
# Results are collected in ocr_results as {image_path: extracted_text}.
ocr_results = {}

# Limit processing to 10,000 images (or fewer if the dataset is smaller)
max_images = min(10000, len(image_files))

# Images that preprocess_image() could not turn into an array are skipped and counted,
# so the final summary reflects what was actually OCR'd.
skipped_images = 0

for i, img_path in enumerate(image_files[:max_images]):
    print(f"🔍 Processing {i+1}/{max_images}: {img_path}")

    # Preprocess the image; None means it could not be read or processed
    processed_img = preprocess_image(img_path)
    if processed_img is None:
        skipped_images += 1
        continue

    # Convert the OpenCV array to a PIL RGB image for the processor
    pil_image = Image.fromarray(processed_img).convert("RGB")

    # Prepare the image tensor for TrOCR on the selected device
    pixel_values = processor(pil_image, return_tensors="pt").pixel_values.to(device)

    # Inference only: no gradients are needed for generation
    with torch.no_grad():
        generated_ids = model.generate(pixel_values)
    extracted_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    print(f"📄 Extracted Text:\n{extracted_text}")

    # Store result
    ocr_results[img_path] = extracted_text

print(f" Finished processing {len(ocr_results)}/{max_images} images ({skipped_images} skipped).")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
11/2012
🔍 Processing 8335/10000: /content/extracted_OCR_Dataset/Date-Synth/images/97424.jpg
📄 Extracted Text:
09/2018
🔍 Processing 8336/10000: /content/extracted_OCR_Dataset/Date-Synth/images/92048.jpg
📄 Extracted Text:
12-09-2026
🔍 Processing 8337/10000: /content/extracted_OCR_Dataset/Date-Synth/images/37136.jpg
📄 Extracted Text:
2029 01
🔍 Processing 8338/10000: /content/extracted_OCR_Dataset/Date-Synth/images/94748.jpg
📄 Extracted Text:
POPA QUID 22
🔍 Processing 8339/10000: /content/extracted_OCR_Dataset/Date-Synth/images/24667.jpg
📄 Extracted Text:
2206.2015
🔍 Processing 8340/10000: /content/extracted_OCR_Dataset/Date-Synth/images/122738.jpg
📄 Extracted Text:
31/N6V/17@
🔍 Processing 8341/10000: /content/extracted_OCR_Dataset/Date-Synth/images/20530.jpg
📄 Extracted Text:
AUG/29/16
🔍 Processing 8342/10000: /content/extracted_OCR_Dataset/Date-Synth/images/113808.jpg
📄 Extracted Text:
07.APR.25:
🔍 Processing 8343/10000: /c

In [None]:
!pip install python-Levenshtein

import Levenshtein

# Step 1: Ground-truth transcriptions, keyed by image path: {image_path: actual_text}.
# NOTE(review): the two entries below look like placeholders. Their paths do not resemble
# the extracted dataset paths printed earlier (e.g. .../Date-Synth/images/55156.jpg), so
# presumably they never match a key in ocr_results. Replace with real annotations. TODO confirm.
ground_truth = {
    "/content/extracted_OCR_Dataset/image1.jpg": "This is the correct text",
    "/content/extracted_OCR_Dataset/image2.jpg": "Another example of OCR",
    # Add more image-to-text mappings
}

# ✅ Step 2: Define Accuracy Calculation Function
def _edit_distance(reference, hypothesis):
    """Return the Levenshtein (edit) distance between two sequences.

    Works on any sequences of comparable items, so the same routine counts
    character edits (two strings) and word edits (two lists of tokens).
    """
    # Classic two-row dynamic programme: `previous` holds distances for the
    # reference prefix processed so far against every hypothesis prefix.
    previous = list(range(len(hypothesis) + 1))
    for i, ref_item in enumerate(reference, start=1):
        current = [i]
        for j, hyp_item in enumerate(hypothesis, start=1):
            substitution = previous[j - 1] + (ref_item != hyp_item)
            current.append(min(previous[j] + 1, current[j - 1] + 1, substitution))
        previous = current
    return previous[-1]


def calculate_accuracy(ocr_results, ground_truth):
    """Print character-level and word-level OCR accuracy.

    For every image present in both mappings, the character error rate (CER)
    is the character edit distance divided by the reference length, and the
    word error rate (WER) is the word edit distance divided by the number of
    reference words. Each accuracy is reported as (1 - mean error rate) * 100,
    so it can be negative when a prediction is much longer than its reference.

    Args:
        ocr_results: Mapping of image path to predicted text.
        ground_truth: Mapping of image path to reference text.

    Returns:
        None. Results are printed. If no ground-truth image has a prediction,
        a notice is printed and no accuracy is reported.
    """
    total_cer = 0.0  # Sum of per-image character error rates
    total_wer = 0.0  # Sum of per-image word error rates
    evaluated = 0    # Images that have both a reference and a prediction

    for img_path, actual_text in ground_truth.items():
        if img_path not in ocr_results:
            continue
        predicted_text = ocr_results[img_path]
        evaluated += 1

        # Character Error Rate: edits between the raw strings
        total_cer += _edit_distance(actual_text, predicted_text) / max(1, len(actual_text))

        # Word Error Rate: edits between the token lists, not between characters
        actual_words = actual_text.split()
        predicted_words = predicted_text.split()
        total_wer += _edit_distance(actual_words, predicted_words) / max(1, len(actual_words))

    # Average only over images that were actually scored; otherwise unmatched
    # ground-truth entries would count as perfect predictions.
    if evaluated == 0:
        print("No ground-truth entries matched the OCR results; nothing to evaluate.")
        return

    char_accuracy = (1 - (total_cer / evaluated)) * 100
    word_accuracy = (1 - (total_wer / evaluated)) * 100

    print("✅ OCR Accuracy Results:")
    print(f"Images evaluated: {evaluated}/{len(ground_truth)}")
    print(f"🎯 Character-Level Accuracy: {char_accuracy:.2f}%")
    print(f"🎯 Word-Level Accuracy: {word_accuracy:.2f}%")

# Step 3: Score the OCR output against the ground-truth mapping (the function prints its results)
calculate_accuracy(ocr_results, ground_truth)

Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages:

In [None]:
# Save OCR results to a text file: one record per image, separated by a line of "=".
# The encoding is set explicitly so non-ASCII characters in the recognised text are
# written the same way regardless of the runtime's locale.
with open(output_text_file, "w", encoding="utf-8") as f:
    for img, text in ocr_results.items():
        f.write(f"Image: {img}\nExtracted Text:\n{text}\n")
        f.write("="*50 + "\n")

print(f"✅ OCR results saved to: {output_text_file}")

# Offer the results file as a browser download
from google.colab import files
files.download(output_text_file)

✅ OCR results saved to: /content/ocr_results.txt


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Save OCR results to Google Drive.
# Copy rather than move: the local file stays available, so this cell (and any cell
# that reads output_text_file) can be re-run without a FileNotFoundError.
# An existing file at the destination is overwritten.
drive_save_path = "/content/drive/MyDrive/OCR_Results.txt"
shutil.copy2(output_text_file, drive_save_path)
print(f"✅ OCR results saved to Google Drive at: {drive_save_path}")

✅ OCR results saved to Google Drive at: /content/drive/MyDrive/OCR_Results.txt


In [None]:
# Local directory that receives the model weights and the processor files
model_save_path = "/content/trained_trocr_model"

# NOTE(review): no training step is visible in this notebook, so these are presumably the
# pretrained weights exactly as loaded. Confirm before describing them as "trained".
# Both objects expose save_pretrained(); write the model first, then the processor.
for artifact in (model, processor):
    artifact.save_pretrained(model_save_path)

print(f"✅ Model saved at: {model_save_path}")

✅ Model saved at: /content/trained_trocr_model


In [None]:
import shutil
from google.colab import files

# Bundle the contents of the saved model directory into trocr_model.zip
# (base_name is relative, so the archive is created in the current working directory)
shutil.make_archive(base_name="trocr_model", format="zip", root_dir=model_save_path)

# Offer the archive as a browser download
files.download("trocr_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
drive_save_path = "/content/drive/MyDrive/trained_trocr_model"

# Copy the model folder to Google Drive.
# shutil.move() would place the folder *inside* drive_save_path when that directory
# already exists (or fail if the nested folder exists too), and it would remove the
# local folder that the archive cell reads from. copytree with dirs_exist_ok=True
# overwrites files in place, so the cell can be re-run safely.
shutil.copytree(model_save_path, drive_save_path, dirs_exist_ok=True)

print(f"✅ Model saved to Google Drive at: {drive_save_path}")

✅ Model saved to Google Drive at: /content/drive/MyDrive/trained_trocr_model
