In [1]:
import cv2
import numpy as np
from ultralytics import YOLO
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch

# -----------------------------
# Choose the compute device: "cuda" when torch reports CUDA as available,
# otherwise fall back to "cpu". The choice is echoed so the run log shows it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# -----------------------------
# Load models
# YOLO word detector, built from the local weights file. It is not moved with
# .to(device) here (the original author's note says YOLO does not support it).
# NOTE(review): confirm against the installed ultralytics version.
yolo_model = YOLO("./word_Detection.pt")
# TrOCR (for OCR on line images): processor and model come from the same
# checkpoint; only the model is moved to the selected device.
trocr_checkpoint = "microsoft/trocr-large-handwritten"
trocr_processor = TrOCRProcessor.from_pretrained(trocr_checkpoint)
trocr_model = VisionEncoderDecoderModel.from_pretrained(trocr_checkpoint)
trocr_model = trocr_model.to(device)

# -----------------------------
# Read the input image from disk
image_path = "./Post_Mortem.jpg"  # update with your image path
original_image = cv2.imread(image_path)
# A None result means the image could not be loaded; fail early with a hint.
if original_image is None:
    raise ValueError("Could not load image. Check the file path.")
# Only height and width are needed afterwards (used to clamp detected boxes).
h, w = original_image.shape[:2]

# -----------------------------
# Run YOLO inference to detect words
results = yolo_model(image_path)
detections = results[0].boxes  # Assumes detections are stored here

# Collect each detected word's bounding box as integer pixel coordinates,
# clamped to the image so the values can be used directly as slice indices.
word_boxes = []  # Each entry: (x_min, y_min, x_max, y_max)
# Convert every box to a plain Python list in a single call instead of
# indexing and converting one box at a time.
for xyxy in detections.xyxy.tolist():
    x_min, y_min, x_max, y_max = map(int, xyxy)
    # Ensure coordinates are within image bounds
    x_min = max(0, x_min)
    y_min = max(0, y_min)
    x_max = min(w, x_max)
    y_max = min(h, y_max)
    # Integer truncation and clamping can collapse a box to zero width or
    # height. Such a box contains no pixels: it would drag down the average
    # word height and can produce an empty crop, so it is dropped.
    if x_max <= x_min or y_max <= y_min:
        continue
    word_boxes.append((x_min, y_min, x_max, y_max))

# -----------------------------
# Derive the line-grouping tolerance from the typical word height
heights = [y_max - y_min for _, y_min, _, y_max in word_boxes]
if heights:
    avg_height = np.mean(heights)
else:
    # No boxes detected: fall back to a fixed height of 20
    avg_height = 20
# Two boxes are treated as being on the same line when their vertical centers
# differ by less than this fraction of the average height.
vertical_threshold = 0.6 * avg_height  # Adjust multiplier as needed

# -----------------------------
# Group word boxes into lines based on vertical proximity.
# Boxes are visited in word_boxes order. Each box joins the first existing
# line whose mean vertical center lies within vertical_threshold of the box's
# own center; if none qualifies, the box starts a new line.
lines = []  # Each element will be a list of word boxes
line_center_sums = []  # Running sum of vertical centers, parallel to `lines`
for box in word_boxes:
    center_y = (box[1] + box[3]) / 2
    for idx, line in enumerate(lines):
        # Mean center of the line from the running sum, so the line's members
        # are not re-scanned for every candidate box.
        avg_center = line_center_sums[idx] / len(line)
        if abs(center_y - avg_center) < vertical_threshold:
            line.append(box)
            line_center_sums[idx] += center_y
            break
    else:
        # No existing line was close enough: open a new one with this box.
        lines.append([box])
        line_center_sums.append(center_y)

# For each line, sort the boxes from left to right (by x_min)
for line in lines:
    line.sort(key=lambda b: b[0])

# -----------------------------
# Merge word boxes in each line and run TrOCR to extract text
line_boxes = []  # Each entry: (x_min, y_min, x_max, y_max, line_text)
for line in lines:
    # Merge: union of all boxes in the line
    x_min_line = min(b[0] for b in line)
    y_min_line = min(b[1] for b in line)
    x_max_line = max(b[2] for b in line)
    y_max_line = max(b[3] for b in line)

    # A union with zero width or height would give an empty crop, and the
    # color conversion below fails on an empty image. Skip such lines instead
    # of aborting the whole run.
    if x_max_line <= x_min_line or y_max_line <= y_min_line:
        continue

    # Crop the merged region from the original image. The image was read with
    # OpenCV, so it is converted from BGR to RGB before being handed to PIL.
    line_region = original_image[y_min_line:y_max_line, x_min_line:x_max_line]
    line_region_pil = Image.fromarray(cv2.cvtColor(line_region, cv2.COLOR_BGR2RGB))

    # Run TrOCR on the line image; gradients are not needed for inference.
    # NOTE(review): generate() runs with its default generation settings. If
    # the default length limit is short, long lines may come back truncated;
    # confirm and consider passing an explicit max_new_tokens.
    pixel_values = trocr_processor(images=line_region_pil, return_tensors="pt").pixel_values.to(device)
    with torch.no_grad():
        generated_ids = trocr_model.generate(pixel_values)
    line_text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

    line_boxes.append((x_min_line, y_min_line, x_max_line, y_max_line, line_text))

# -----------------------------
# Put the lines in reading order: ascending by the top edge (y_min) of each
# merged line box. The list is updated in place.
line_boxes[:] = sorted(line_boxes, key=lambda entry: entry[1])

# -----------------------------
# Post-process: Remove duplicate words between consecutive lines
clean_lines = []
previous_words = []  # Words of the most recently cleaned line (empty at start)
for *_, text in line_boxes:
    current_words = text.split()
    # When the current line opens with the same word (ignoring case) that
    # closed the previous cleaned line, drop that leading word.
    if previous_words and current_words:
        if previous_words[-1].lower() == current_words[0].lower():
            current_words = current_words[1:]
    clean_lines.append(" ".join(current_words))
    previous_words = current_words

# Join the cleaned lines, one per output line, into the final text
final_text = "\n".join(clean_lines)

print("Extracted Paragraph:")
print(final_text)

Using device: cuda


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.49.0"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> 


image 1/1 D:\SBI Life Hack-AI-Thon\Handwritten Extraction\Post_Mortem.jpg: 640x608 136 0s, 70.5ms
Speed: 3.0ms preprocess, 70.5ms inference, 76.5ms postprocess per image at shape (1, 3, 640, 608)
Extracted Paragraph:
a )8 (24 .
What Benson Ferorted . You
R. G. KAR MEDICAL COLLEGE & HOSPITAL , KOLKATA
Date # case and symptoms # Prescription # Die
Immediate demands :
Enquiry and Post mortem to too conducted .
under supervision of judicial magistrate
and Judicial magistrate should be frequent
imagistroare
air the time of boat roofism .
video Rewading should be done frompushsory )
d"from Representative
2 . Post mortem should be done under
supervision of a broad formed , composting of : "
a multiple senior Autopsy subgeons .
# Preference of " Female Faculty from
Foregressive Department
a also involve faculty of topestic
Department of the other institute .
of poor mom should be done by today ( a jafas )
evening .
A post mortum done under different a senior
revisents , on spot of Autopsy .
f