In [None]:
!pip install ultralytics
!pip install transformers
!pip install torchvision


Collecting ultralytics
  Downloading ultralytics-8.3.134-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Downloading n

In [None]:
# Mount Google Drive at /content/drive; every path used by the later cells lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from google.colab import drive
import os
import zipfile

# Mount Drive (a no-op message is printed when it is already mounted).
drive.mount('/content/drive')

# Archive holding the training dataset.
zip_path = '/content/drive/MyDrive/Dehaldo/DEHADO-AI_TRAINING_DATASET.zip'

# Extract next to the archive, into "<archive name without .zip>_unzipped".
zip_dir = os.path.dirname(zip_path)
zip_name = os.path.splitext(os.path.basename(zip_path))[0]
extract_folder = os.path.join(zip_dir, f"{zip_name}_unzipped")

# Create the output folder if it doesn't exist
os.makedirs(extract_folder, exist_ok=True)

# Extract every member of the archive; this runs on each execution of the cell,
# overwriting files that were extracted previously.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

print(f"Unzipped successfully to: {extract_folder}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Unzipped successfully to: /content/drive/MyDrive/Dehaldo/DEHADO-AI_TRAINING_DATASET_unzipped


In [None]:
import torch
from PIL import Image
import os
import json
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from ultralytics import YOLO

# === Paths ===
image_dir = "/content/drive/MyDrive/Dehaldo/DEHADO-AI_TRAINING_DATASET_unzipped/DEHADO-AI_TRAINING_DATASET/IMAGES_750"
yolo_model_path = "/content/drive/MyDrive/Dehaldo/best_yolov8s.pt"
output_dir = "/content/drive/MyDrive/Dehaldo/OCR_OUTPUTS"

os.makedirs(output_dir, exist_ok=True)

# === Load YOLOv8 model (text-region detector) ===
yolo_model = YOLO(yolo_model_path)

# === Load TrOCR model and processor (text recognizer) ===
# NOTE(review): "trocr-base-stage1" looks like the pre-training checkpoint rather than a
# handwriting fine-tuned one — confirm this is the intended model.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-stage1")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-stage1")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()  # inference only

# === Get image filenames and select 10% from start + 10% from end ===
all_images = sorted(f for f in os.listdir(image_dir) if f.lower().endswith(('.jpg', '.jpeg', '.png')))
num_total = len(all_images)

# Size of each slice. Slicing from an explicit start index (instead of a negative
# index) matters when this rounds down to 0: all_images[-0:] would be the WHOLE list,
# whereas all_images[num_total:] is correctly empty.
edge_count = int(num_total * 0.1)
first_10_percent = all_images[:edge_count]
last_10_percent = all_images[num_total - edge_count:]
selected_images = first_10_percent + last_10_percent

print(f"Processing {len(selected_images)} images out of {num_total} (10% from start + 10% from end)")

# === Process each selected image ===
# For every image: detect text regions with YOLO, recognize each crop with TrOCR,
# and write one JSON file of {"box", "text"} records keyed by the image path.
for img_name in selected_images:
    image_path = os.path.join(image_dir, img_name)
    try:
        image = Image.open(image_path).convert("RGB")
    except Exception as e:
        # Unreadable image: report it and move on to the next file.
        print(f"Failed to load {image_path}: {e}")
        continue

    # Run YOLOv8 on image
    # NOTE(review): the detector reads the file itself while crops come from the PIL
    # image above — presumably both share the same pixel orientation; confirm for
    # images carrying EXIF rotation.
    results = yolo_model(image_path)
    boxes = results[0].boxes.xyxy.cpu().numpy()  # x1, y1, x2, y2
    boxes = [list(map(int, box)) for box in boxes]

    result_list = []

    for box in boxes:
        x1, y1, x2, y2 = box

        # Truncating coordinates to int can collapse a very thin detection into a
        # zero-width or zero-height region; skip it rather than feed an empty crop
        # to the OCR preprocessor.
        if x2 <= x1 or y2 <= y1:
            print(f"Skipping empty box {box} in {img_name}")
            continue

        cropped = image.crop((x1, y1, x2, y2))

        # Preprocess and predict
        pixel_values = processor(images=cropped, return_tensors="pt").pixel_values.to(device)

        with torch.no_grad():
            generated_ids = model.generate(pixel_values)
            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        result_list.append({
            "box": [x1, y1, x2, y2],
            "text": text
        })

    # Save result to JSON file (same base name as the image)
    json_filename = os.path.splitext(img_name)[0] + ".json"
    json_path = os.path.join(output_dir, json_filename)
    with open(json_path, "w") as f:
        json.dump({image_path: result_list}, f, indent=4)

    print(f"Saved: {json_path}")


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_mod

Processing 150 images out of 750 (10% from start + 10% from end)

image 1/1 /content/drive/MyDrive/Dehaldo/DEHADO-AI_TRAINING_DATASET_unzipped/DEHADO-AI_TRAINING_DATASET/IMAGES_750/MIT_1.jpg: 640x480 21 handwrittens, 12.8ms
Speed: 3.2ms preprocess, 12.8ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 480)
Saved: /content/drive/MyDrive/Dehaldo/OCR_OUTPUTS/MIT_1.json

image 1/1 /content/drive/MyDrive/Dehaldo/DEHADO-AI_TRAINING_DATASET_unzipped/DEHADO-AI_TRAINING_DATASET/IMAGES_750/MIT_10.jpg: 640x480 24 handwrittens, 7.0ms
Speed: 3.1ms preprocess, 7.0ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 480)
Saved: /content/drive/MyDrive/Dehaldo/OCR_OUTPUTS/MIT_10.json

image 1/1 /content/drive/MyDrive/Dehaldo/DEHADO-AI_TRAINING_DATASET_unzipped/DEHADO-AI_TRAINING_DATASET/IMAGES_750/MIT_100.jpg: 640x480 20 handwrittens, 6.9ms
Speed: 3.0ms preprocess, 6.9ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 480)
Saved: /content/drive/MyDrive/Dehaldo/OCR_OUT

In [None]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0


In [None]:
import os
import json
from difflib import SequenceMatcher
from glob import glob

# --- Utility Functions ---
def iou(boxA, boxB):
    """Return the intersection-over-union of two boxes indexed as [x1, y1, x2, y2].

    The result is 0.0 when the intersection area is zero; otherwise it is
    intersection / (areaA + areaB - intersection).
    """
    # Overlap extent along each axis, floored at zero for disjoint boxes.
    overlap_w = max(0, min(boxA[2], boxB[2]) - max(boxA[0], boxB[0]))
    overlap_h = max(0, min(boxA[3], boxB[3]) - max(boxA[1], boxB[1]))
    intersection = overlap_w * overlap_h
    if intersection == 0:
        return 0.0

    def _area(box):
        return (box[2] - box[0]) * (box[3] - box[1])

    union = _area(boxA) + _area(boxB) - intersection
    return intersection / float(union)

def similarity(a, b):
    """Return the difflib SequenceMatcher ratio (0.0 to 1.0) between ``a`` and ``b``."""
    matcher = SequenceMatcher(a=a, b=b)
    return matcher.ratio()

# --- Paths ---
pred_dir = "/content/drive/MyDrive/Dehaldo/OCR_OUTPUTS/"
gt_dir = "/content/drive/MyDrive/Dehaldo/DEHADO-AI_TRAINING_DATASET_unzipped/DEHADO-AI_TRAINING_DATASET/LABELS_750"
output_dir = "/content/drive/MyDrive/Dehaldo/EVALUATION_MATCHED_RESULTS/"
os.makedirs(output_dir, exist_ok=True)

# Get all image names (label files define the universe; take first and last 10%)
image_names = sorted([os.path.basename(f).replace(".json", "") for f in glob(os.path.join(gt_dir, "*.json"))])
n_total = len(image_names)
first_10_percent = image_names[:int(n_total * 0.1)]
last_10_percent = image_names[int(n_total * 0.9):]

# For each selected image, pair every ground-truth field with one OCR prediction and
# write the pairs to "<name>_matched.json".
for name in first_10_percent + last_10_percent:
    pred_file = os.path.join(pred_dir, name + ".json")
    gt_file = os.path.join(gt_dir, name + ".json")

    if not os.path.exists(pred_file) or not os.path.exists(gt_file):
        print(f"Skipping {name}: Missing file(s)")
        continue

    with open(pred_file) as f:
        pred_json = json.load(f)
    with open(gt_file) as f:
        gt_entries = json.load(f)

    # Extract predictions (TrOCR format): either {image_path: [entries]} or a bare list.
    if isinstance(pred_json, dict):
        pred_entries = next(iter(pred_json.values()), [])
    else:
        pred_entries = pred_json

    # An image with no detections yields an empty list, so only peek at element 0
    # when there is one. Entries stored as stringified dicts are parsed back.
    if pred_entries and isinstance(pred_entries[0], str):
        pred_entries = [json.loads(p.replace("'", '"')) for p in pred_entries]

    used_pred_indices = set()
    matched_output = []

    for gt in gt_entries:
        gt_box = gt["Coordinate"]
        gt_text = gt["Field value"].strip()
        field_name = gt.get("Field name", "")  # If field name exists

        best_match, best_score, best_idx = "", 0, -1

        # Among unused predictions overlapping this field (IoU > 0.3), keep the one
        # whose text is most similar to the ground truth.
        for i, pred in enumerate(pred_entries):
            if i in used_pred_indices:
                continue
            pred_box, pred_text = pred["box"], pred["text"]

            if iou(gt_box, pred_box) > 0.3:
                sim = similarity(gt_text.lower(), pred_text.strip().lower())
                if sim > best_score:
                    best_score, best_match, best_idx = sim, pred_text.strip(), i

        if best_idx != -1:
            # Each prediction may be paired with at most one field.
            used_pred_indices.add(best_idx)

        # best_match is still "" when nothing matched, so unmatched fields are
        # recorded with an empty prediction.
        matched_output.append({
            "field_name": field_name,
            "ground_truth": gt_text,
            "prediction": best_match
        })

    # Save to individual JSON file
    out_path = os.path.join(output_dir, name + "_matched.json")
    with open(out_path, "w") as f:
        json.dump(matched_output, f, indent=2, ensure_ascii=False)

    print(f"Saved matched file: {out_path}")


Saved matched file: /content/drive/MyDrive/Dehaldo/EVALUATION_MATCHED_RESULTS/MIT_1_matched.json
Saved matched file: /content/drive/MyDrive/Dehaldo/EVALUATION_MATCHED_RESULTS/MIT_10_matched.json
Saved matched file: /content/drive/MyDrive/Dehaldo/EVALUATION_MATCHED_RESULTS/MIT_100_matched.json
Saved matched file: /content/drive/MyDrive/Dehaldo/EVALUATION_MATCHED_RESULTS/MIT_101_matched.json
Saved matched file: /content/drive/MyDrive/Dehaldo/EVALUATION_MATCHED_RESULTS/MIT_102_matched.json
Saved matched file: /content/drive/MyDrive/Dehaldo/EVALUATION_MATCHED_RESULTS/MIT_103_matched.json
Saved matched file: /content/drive/MyDrive/Dehaldo/EVALUATION_MATCHED_RESULTS/MIT_104_matched.json
Saved matched file: /content/drive/MyDrive/Dehaldo/EVALUATION_MATCHED_RESULTS/MIT_105_matched.json
Saved matched file: /content/drive/MyDrive/Dehaldo/EVALUATION_MATCHED_RESULTS/MIT_106_matched.json
Saved matched file: /content/drive/MyDrive/Dehaldo/EVALUATION_MATCHED_RESULTS/MIT_108_matched.json
Saved matched

In [None]:
import json
import re
import difflib

def clean_number_spaces(prediction: str) -> str:
    """Delete whitespace runs that sit directly between two digits.

    Whitespace next to a non-digit on either side is left untouched.
    """
    digit_gap = re.compile(r'(?<=\d)\s+(?=\d)')
    return digit_gap.sub('', prediction)

def clean_date_prediction(prediction: str) -> str:
    """Reformat an OCR'd date as NN/NN/YYYY.

    Spaces are removed and '|' is read as the digit '1'. The first two 1-2 digit
    groups followed by a 4-digit group are then zero-padded and joined with '/',
    in the order they appear (no day/month swapping is attempted). When no such
    pattern is found, the space-stripped text is returned unchanged.
    """
    compact = prediction.replace(' ', '').replace('|', '1')
    found = re.search(r'(\d{1,2})[^\d]*(\d{1,2})[^\d]*(\d{4})', compact)
    if found is None:
        return compact
    first, second, year = found.groups()
    return "/".join((first.zfill(2), second.zfill(2), year))

def normalize_fuzzy(prediction: str, valid_list: list) -> str:
    """Snap ``prediction`` to its closest entry in ``valid_list``.

    The text is stripped, periods are removed, en dashes become hyphens and the
    result is capitalized before matching with difflib (cutoff 0.5). If nothing
    is close enough, the original ``prediction`` is returned untouched.
    """
    candidate = prediction.strip().replace('.', '').replace('–', '-').capitalize()
    for hit in difflib.get_close_matches(candidate, valid_list, n=1, cutoff=0.5):
        return hit
    return prediction

def normalize_languages(prediction: str, valid_languages: list) -> str:
    """Normalize a comma- or semicolon-separated list of language names.

    Each fragment is stripped, capitalized and replaced by its closest entry in
    ``valid_languages`` (difflib, cutoff 0.4); a fragment with no close match is
    kept in its capitalized form. Duplicates are dropped and first-occurrence
    order is preserved. Blank fragments are ignored.

    Returns the surviving names joined with ", " (an empty string if none).
    """
    normalized = []
    seen = set()

    for fragment in re.split(r'[;,]', prediction):
        cleaned = fragment.strip().capitalize()
        if not cleaned:
            # A trailing or doubled separator produces an empty fragment; keeping it
            # would add a dangling ", " to the result.
            continue

        match = difflib.get_close_matches(cleaned, valid_languages, n=1, cutoff=0.4)
        corrected = match[0] if match else cleaned
        if corrected not in seen:
            seen.add(corrected)
            normalized.append(corrected)

    return ', '.join(normalized)

def normalize_blood_group(prediction: str) -> str:
    """Normalize an OCR'd blood group to one of A+/A-/B+/B-/AB+/AB-/O+/O-.

    Spaces and periods are removed, and a stray leading 't' or 'f' (e.g. 'tA',
    'fA') is dropped. The group letters are then read case-insensitively, with
    'AB' tested before 'A' and 'B' so that AB is not truncated. A dash right
    after the letters gives a negative group; anything else (including a 't'
    misread of '+', as in 'At', or a missing sign) is treated as positive.

    Returns "Invalid" when the text does not start with a blood group letter.
    """
    cleaned = prediction.strip().replace(" ", "").replace(".", "")

    # Drop a spurious leading 't' / 'f' in front of the group letter.
    if cleaned.lower().startswith(("t", "f")):
        cleaned = cleaned[1:]

    lowered = cleaned.lower()
    # Longest prefix first: "ab" must be checked before "a" and "b".
    for letters in ("ab", "a", "b", "o"):
        if lowered.startswith(letters):
            remainder = cleaned[len(letters):]
            # Hyphen, en dash, em dash and minus sign all count as a negative marker.
            sign = "-" if remainder[:1] in ("-", "–", "—", "−") else "+"
            return letters.upper() + sign

    return "Invalid"  # Not recognizable as a blood group

def normalize_nationality(prediction: str) -> str:
    """Return 'Indian' when the text contains 'indian' (any case), else the text as-is."""
    mentions_indian = "indian" in prediction.lower()
    return "Indian" if mentions_indian else prediction

def clean_pancard(prediction: str) -> str:
    """Strip every whitespace character from a PAN card prediction."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub('', prediction)

def clean_address(prediction: str) -> str:
    """Tidy an OCR'd address string.

    Applies, in order: canonicalize "H. No" variants to "H.No.", remove spaces
    around a '/' between digits, join digit groups separated by whitespace, and
    squeeze runs of whitespace to one space. Leading and trailing whitespace is
    trimmed before and after.
    """
    rewrites = (
        # "H . No", "h.no." and similar -> "H.No."
        (r'\bH\s*\.\s*No\b\.?', 'H.No.', re.IGNORECASE),
        # "70 / 9" -> "70/9"
        (r'(?<=\d)\s*/\s*(?=\d)', '/', 0),
        # "09 2248" -> "092248"
        (r'(?<=\d)\s+(?=\d)', '', 0),
        # Multiple whitespace characters -> a single space
        (r'\s{2,}', ' ', 0),
    )

    text = prediction.strip()
    for pattern, replacement, flags in rewrites:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text.strip()



def postprocess_predictions(input_path, output_path):
    """Clean every prediction in a matched-results JSON file and save the result.

    ``input_path`` must hold a JSON list of entries, each read through its
    "field_name" and "prediction" keys. The cleaner is chosen from the lower-cased
    field name; fields with no dedicated cleaner only get digit spacing removed.
    Each entry's "prediction" is overwritten with the cleaned text and the list is
    written to ``output_path`` as UTF-8 JSON. A message is printed for every value
    that a cleaner reports as "Invalid".
    """
    qualifications = [
        "Graduate", "Post-Graduate", "Undergraduate",
        "Diploma", "Doctorate", "Post-Doctorate", "10th Pass", "12th Pass"
    ]
    genders = ["Male", "Female"]
    marital_statuses = ["Married", "Single", "Divorced", "Widow"]
    languages = [
        "Hindi", "English", "Gujarati", "Marathi", "Telugu",
        "Kannada", "Tamil", "Punjabi", "Bengali", "Urdu"
    ]

    def pick_cleaner(field):
        # The first matching rule wins; the order mirrors the field priority.
        if "dateofbirth" in field or field == "date":
            return clean_date_prediction
        if "qualification" in field:
            return lambda text: normalize_fuzzy(text, qualifications)
        if "gender" in field:
            return lambda text: normalize_fuzzy(text, genders)
        if "maritalstatus" in field:
            return lambda text: normalize_fuzzy(text, marital_statuses)
        if "languageknown" in field:
            return lambda text: normalize_languages(text, languages)
        if "bloodgroup" in field:
            return normalize_blood_group
        if "nationality" in field:
            return normalize_nationality
        if "pancard" in field:
            return clean_pancard
        if "presentaddress" in field or "permanentaddress" in field:
            return clean_address
        return clean_number_spaces

    with open(input_path, 'r', encoding='utf-8') as source:
        entries = json.load(source)

    for entry in entries:
        field = entry["field_name"].lower()
        prediction = entry["prediction"]

        cleaned = pick_cleaner(field)(prediction)

        if cleaned == "Invalid":
            print(f"Invalid value detected in field '{field}' with prediction '{prediction}'")

        entry["prediction"] = cleaned

    with open(output_path, 'w', encoding='utf-8') as sink:
        json.dump(entries, sink, indent=2, ensure_ascii=False)

    print(f"✅ Cleaned output written to {output_path}")


In [None]:
# Run the post-processing on a single matched-results file; the cleaned copy is
# written next to it with a "_cleaned" suffix.
input_json = "/content/drive/MyDrive/Dehaldo/EVALUATION_MATCHED_RESULTS/MIT_168_matched.json"
output_json = "/content/drive/MyDrive/Dehaldo/EVALUATION_MATCHED_RESULTS/MIT_168_matched_cleaned.json"

postprocess_predictions(input_json, output_json)


'8134149090'

In [None]:
import os
import json
import re
import difflib
from difflib import SequenceMatcher
from glob import glob

# ---------- UTILITY FUNCTIONS ----------
def iou(boxA, boxB):
    """Return the intersection-over-union of two boxes indexed as [x1, y1, x2, y2].

    The result is 0.0 when the intersection area is zero; otherwise it is
    intersection / (areaA + areaB - intersection).
    """
    # Overlap extent along each axis, floored at zero for disjoint boxes.
    overlap_w = max(0, min(boxA[2], boxB[2]) - max(boxA[0], boxB[0]))
    overlap_h = max(0, min(boxA[3], boxB[3]) - max(boxA[1], boxB[1]))
    intersection = overlap_w * overlap_h
    if intersection == 0:
        return 0.0

    def _area(box):
        return (box[2] - box[0]) * (box[3] - box[1])

    union = _area(boxA) + _area(boxB) - intersection
    return intersection / float(union)

def similarity(a, b):
    """Return the difflib SequenceMatcher ratio (0.0 to 1.0) between ``a`` and ``b``."""
    matcher = SequenceMatcher(a=a, b=b)
    return matcher.ratio()

# ---------- POSTPROCESSING ----------
def clean_number_spaces(prediction):
    """Delete whitespace runs that sit directly between two digits."""
    digit_gap = re.compile(r'(?<=\d)\s+(?=\d)')
    return digit_gap.sub('', prediction)

def clean_date_prediction(prediction):
    """Reformat an OCR'd date as NN/NN/YYYY.

    Spaces are removed and '|' is read as the digit '1'. The first two 1-2 digit
    groups followed by a 4-digit group are zero-padded and joined with '/', in
    the order they appear. Without such a pattern, the space-stripped text is
    returned unchanged.
    """
    compact = prediction.replace(' ', '').replace('|', '1')
    found = re.search(r'(\d{1,2})[^\d]*(\d{1,2})[^\d]*(\d{4})', compact)
    if found is None:
        return compact
    first, second, year = found.groups()
    return "/".join((first.zfill(2), second.zfill(2), year))

def normalize_fuzzy(prediction, valid_list):
    """Snap ``prediction`` to its closest entry in ``valid_list`` (difflib, cutoff 0.5).

    Matching is done on a stripped, period-free, capitalized copy with en dashes
    turned into hyphens. The original ``prediction`` is returned when nothing matches.
    """
    candidate = prediction.strip().replace('.', '').replace('–', '-').capitalize()
    for hit in difflib.get_close_matches(candidate, valid_list, n=1, cutoff=0.5):
        return hit
    return prediction

def normalize_languages(prediction, valid_languages):
    """Normalize a comma- or semicolon-separated list of language names.

    Each fragment is stripped, capitalized and replaced by its closest entry in
    ``valid_languages`` (difflib, cutoff 0.4); a fragment with no close match is
    kept in its capitalized form. Duplicates are dropped and first-occurrence
    order is preserved. Blank fragments are ignored.

    Returns the surviving names joined with ", " (an empty string if none).
    """
    normalized = []
    seen = set()
    for fragment in re.split(r'[;,]', prediction):
        cleaned = fragment.strip().capitalize()
        if not cleaned:
            # A trailing or doubled separator produces an empty fragment; keeping it
            # would add a dangling ", " to the result.
            continue
        match = difflib.get_close_matches(cleaned, valid_languages, n=1, cutoff=0.4)
        corrected = match[0] if match else cleaned
        if corrected not in seen:
            seen.add(corrected)
            normalized.append(corrected)
    return ', '.join(normalized)

def normalize_blood_group(prediction):
    """Normalize an OCR'd blood group to one of A+/A-/B+/B-/AB+/AB-/O+/O-.

    Spaces and periods are removed, and a stray leading 't' or 'f' is dropped.
    The group letters are read case-insensitively, with 'AB' tested before 'A'
    and 'B' so that AB is not truncated. A dash right after the letters gives a
    negative group; anything else (including 'At', where 't' is a misread '+',
    or a missing sign) is treated as positive.

    Returns "Invalid" when the text does not start with a blood group letter.
    """
    cleaned = prediction.strip().replace(" ", "").replace(".", "")

    # Drop a spurious leading 't' / 'f' in front of the group letter.
    if cleaned.lower().startswith(("t", "f")):
        cleaned = cleaned[1:]

    lowered = cleaned.lower()
    # Longest prefix first: "ab" must be checked before "a" and "b".
    for letters in ("ab", "a", "b", "o"):
        if lowered.startswith(letters):
            remainder = cleaned[len(letters):]
            # Hyphen, en dash, em dash and minus sign all count as a negative marker.
            sign = "-" if remainder[:1] in ("-", "–", "—", "−") else "+"
            return letters.upper() + sign

    return "Invalid"

def normalize_nationality(prediction):
    """Return 'Indian' when the text contains 'indian' (any case), else the text as-is."""
    if "indian" in prediction.lower():
        return "Indian"
    return prediction

def clean_pancard(prediction):
    """Strip every whitespace character from a PAN card prediction."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub('', prediction)

def clean_address(prediction):
    """Tidy an OCR'd address string.

    Applies, in order: canonicalize "H. No" variants to "H.No.", remove spaces
    around a '/' between digits, join digit groups separated by whitespace, and
    squeeze runs of whitespace to one space. The text is trimmed before and after.
    """
    rewrites = (
        (r'\bH\s*\.\s*No\b\.?', 'H.No.', re.IGNORECASE),  # "H . No" -> "H.No."
        (r'(?<=\d)\s*/\s*(?=\d)', '/', 0),                # "70 / 9" -> "70/9"
        (r'(?<=\d)\s+(?=\d)', '', 0),                     # "09 2248" -> "092248"
        (r'\s{2,}', ' ', 0),                              # squeeze whitespace runs
    )
    text = prediction.strip()
    for pattern, replacement, flags in rewrites:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text.strip()

def postprocess(field, prediction):
    """Clean ``prediction`` with the cleaner that matches the field name.

    ``field`` is lower-cased and tested against the known field keywords in a
    fixed order; the first hit decides the cleaner. Fields without a dedicated
    cleaner only have whitespace between digits removed.
    """
    field = field.lower()

    # Dates are the only rule with an exact-name alternative, so handle them first.
    if field == "date" or "dateofbirth" in field:
        return clean_date_prediction(prediction)

    keyword_cleaners = (
        (("qualification",), lambda text: normalize_fuzzy(text, ["Graduate", "Post-Graduate", "Undergraduate", "Diploma", "Doctorate", "Post-Doctorate", "10th Pass", "12th Pass"])),
        (("gender",), lambda text: normalize_fuzzy(text, ["Male", "Female"])),
        (("maritalstatus",), lambda text: normalize_fuzzy(text, ["Married", "Single", "Divorced", "Widow"])),
        (("languageknown",), lambda text: normalize_languages(text, ["Hindi", "English", "Gujarati", "Marathi", "Telugu", "Kannada", "Tamil", "Punjabi", "Bengali", "Urdu"])),
        (("bloodgroup",), normalize_blood_group),
        (("nationality",), normalize_nationality),
        (("pancard",), clean_pancard),
        (("presentaddress", "permanentaddress"), clean_address),
    )
    for keywords, cleaner in keyword_cleaners:
        if any(keyword in field for keyword in keywords):
            return cleaner(prediction)

    return clean_number_spaces(prediction)

# ---------- MAIN LOGIC ----------
# For each selected image: pair ground-truth fields with OCR predictions, clean the
# matched text with the field-specific post-processing, and save {"text", "bbox"} records.
pred_dir = "/content/drive/MyDrive/Dehaldo/OCR_OUTPUTS/"
gt_dir = "/content/drive/MyDrive/Dehaldo/DEHADO-AI_TRAINING_DATASET_unzipped/DEHADO-AI_TRAINING_DATASET/LABELS_750"
output_dir = "/content/drive/MyDrive/Dehaldo/POSTPROCESSED_OUTPUTS"
os.makedirs(output_dir, exist_ok=True)

image_names = sorted([os.path.basename(f).replace(".json", "") for f in glob(os.path.join(gt_dir, "*.json"))])
n_total = len(image_names)
first_10_percent = image_names[:int(n_total * 0.1)]
last_10_percent = image_names[int(n_total * 0.9):]

for name in first_10_percent + last_10_percent:
    pred_file = os.path.join(pred_dir, name + ".json")
    gt_file = os.path.join(gt_dir, name + ".json")

    if not os.path.exists(pred_file) or not os.path.exists(gt_file):
        print(f"Skipping {name}: Missing file(s)")
        continue

    with open(pred_file) as f:
        pred_json = json.load(f)
    with open(gt_file) as f:
        gt_entries = json.load(f)

    # Predictions are stored either as {image_path: [entries]} or as a bare list.
    if isinstance(pred_json, dict):
        pred_entries = next(iter(pred_json.values()), [])
    else:
        pred_entries = pred_json

    # An image with no detections yields an empty list, so only peek at element 0
    # when there is one. Entries stored as stringified dicts are parsed back.
    if pred_entries and isinstance(pred_entries[0], str):
        pred_entries = [json.loads(p.replace("'", '"')) for p in pred_entries]

    used_pred_indices = set()
    final_output = []

    for gt in gt_entries:
        gt_box = gt["Coordinate"]
        gt_text = gt["Field value"].strip()
        field_name = gt.get("Field name", "")

        best_match, best_score, best_idx, best_box = "", 0, -1, None

        # Among unused predictions overlapping this field (IoU > 0.3), keep the one
        # whose text is most similar to the ground truth.
        for i, pred in enumerate(pred_entries):
            if i in used_pred_indices:
                continue
            pred_box, pred_text = pred["box"], pred["text"]

            if iou(gt_box, pred_box) > 0.3:
                sim = similarity(gt_text.lower(), pred_text.strip().lower())
                if sim > best_score:
                    best_score = sim
                    best_match = pred_text.strip()
                    best_idx = i
                    best_box = pred_box

        if best_idx != -1:
            # Each prediction may be paired with at most one field.
            used_pred_indices.add(best_idx)
            cleaned_text = postprocess(field_name, best_match)
            # Predictions that clean down to an empty string are not written out.
            if cleaned_text:
                final_output.append({
                    "text": cleaned_text,
                    "bbox": best_box
                })

    out_path = os.path.join(output_dir, name + ".json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(final_output, f, indent=2, ensure_ascii=False)

    print(f"✅ Saved: {out_path}")


✅ Saved: /content/drive/MyDrive/Dehaldo/POSTPROCESSED_OUTPUTS/MIT_1.json
✅ Saved: /content/drive/MyDrive/Dehaldo/POSTPROCESSED_OUTPUTS/MIT_10.json
✅ Saved: /content/drive/MyDrive/Dehaldo/POSTPROCESSED_OUTPUTS/MIT_100.json
✅ Saved: /content/drive/MyDrive/Dehaldo/POSTPROCESSED_OUTPUTS/MIT_101.json
✅ Saved: /content/drive/MyDrive/Dehaldo/POSTPROCESSED_OUTPUTS/MIT_102.json
✅ Saved: /content/drive/MyDrive/Dehaldo/POSTPROCESSED_OUTPUTS/MIT_103.json
✅ Saved: /content/drive/MyDrive/Dehaldo/POSTPROCESSED_OUTPUTS/MIT_104.json
✅ Saved: /content/drive/MyDrive/Dehaldo/POSTPROCESSED_OUTPUTS/MIT_105.json
✅ Saved: /content/drive/MyDrive/Dehaldo/POSTPROCESSED_OUTPUTS/MIT_106.json
✅ Saved: /content/drive/MyDrive/Dehaldo/POSTPROCESSED_OUTPUTS/MIT_108.json
✅ Saved: /content/drive/MyDrive/Dehaldo/POSTPROCESSED_OUTPUTS/MIT_11.json
✅ Saved: /content/drive/MyDrive/Dehaldo/POSTPROCESSED_OUTPUTS/MIT_110.json
✅ Saved: /content/drive/MyDrive/Dehaldo/POSTPROCESSED_OUTPUTS/MIT_111.json
✅ Saved: /content/drive/MyDri

In [None]:
!pip install textblob pyspellchecker
!python -m textblob.download_corpora


Collecting pyspellchecker
  Downloading pyspellchecker-0.8.2-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.2-py3-none-any.whl (7.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.2
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk

In [None]:
import os
import json
from difflib import SequenceMatcher
from jiwer import wer, cer
from glob import glob

# --- Utility Functions ---
def iou(boxA, boxB):
    """Return the intersection-over-union of two boxes indexed as [x1, y1, x2, y2].

    The result is 0.0 when the intersection area is zero; otherwise it is
    intersection / (areaA + areaB - intersection).
    """
    # Overlap extent along each axis, floored at zero for disjoint boxes.
    overlap_w = max(0, min(boxA[2], boxB[2]) - max(boxA[0], boxB[0]))
    overlap_h = max(0, min(boxA[3], boxB[3]) - max(boxA[1], boxB[1]))
    intersection = overlap_w * overlap_h
    if intersection == 0:
        return 0.0

    def _area(box):
        return (box[2] - box[0]) * (box[3] - box[1])

    union = _area(boxA) + _area(boxB) - intersection
    return intersection / float(union)

def similarity(a, b):
    """Return the difflib SequenceMatcher ratio (0.0 to 1.0) between ``a`` and ``b``."""
    matcher = SequenceMatcher(a=a, b=b)
    return matcher.ratio()

# --- Paths ---
#pred_dir = "/content/drive/MyDrive/Dehaldo/OCR_OUTPUTS/"
pred_dir = "/content/drive/MyDrive/Dehaldo/POSTPROCESSED_OUTPUTS"
gt_dir = "/content/drive/MyDrive/Dehaldo/DEHADO-AI_TRAINING_DATASET_unzipped/DEHADO-AI_TRAINING_DATASET/LABELS_750"

# Get all image names
image_names = sorted([os.path.basename(f).replace(".json", "") for f in glob(os.path.join(gt_dir, "*.json"))])
n_total = len(image_names)

# First and last 10% of images
first_10_percent = image_names[:int(n_total * 0.1)]
last_10_percent = image_names[int(n_total * 0.9):]

# --- Initialize Metrics ---
total_exact_matches = 0
total_fields = 0
total_doc_matches = 0   # images whose every field matched exactly
all_gt_texts = []
all_pred_texts = []
similarity_sum = 0
similarity_count = 0
cer_per_image = {}

# --- Process First 10% and Last 10% of Images ---
for name in first_10_percent + last_10_percent:
    pred_file = os.path.join(pred_dir, name + ".json")
    gt_file = os.path.join(gt_dir, name + ".json")

    if not os.path.exists(pred_file) or not os.path.exists(gt_file):
        print(f"Skipping {name}: Missing file(s)")
        continue

    with open(pred_file) as f:
        pred_json = json.load(f)
    with open(gt_file) as f:
        gt_entries = json.load(f)

    if not gt_entries:
        # Nothing to score; also avoids handing jiwer an empty reference list.
        print(f"Skipping {name}: No ground-truth fields")
        continue

    # Extract predictions: either {image_path: [entries]} or a bare list.
    if isinstance(pred_json, dict):
        pred_entries = next(iter(pred_json.values()), [])
    else:
        pred_entries = pred_json

    # A file with no predictions is a valid empty list, so only peek at element 0
    # when there is one. Entries stored as stringified dicts are parsed back.
    if pred_entries and isinstance(pred_entries[0], str):
        pred_entries = [json.loads(p.replace("'", '"')) for p in pred_entries]

    used_pred_indices = set()
    gt_texts, pred_texts = [], []
    exact_matches = 0

    for gt in gt_entries:
        gt_box = gt["Coordinate"]
        gt_text = gt["Field value"]
        best_match, best_score, best_idx = "", 0, -1

        for i, pred in enumerate(pred_entries):
            if i in used_pred_indices:
                continue
            # Post-processed files store the box under "bbox", raw OCR files under
            # "box"; accept both so either pred_dir above can be evaluated.
            pred_box = pred["bbox"] if "bbox" in pred else pred["box"]
            pred_text = pred["text"]

            if iou(gt_box, pred_box) > 0.3:
                sim = similarity(gt_text.strip().lower(), pred_text.strip().lower())
                if sim > best_score:
                    best_score, best_match, best_idx = sim, pred_text, i

        gt_clean = gt_text.strip().lower()
        if best_idx != -1:
            used_pred_indices.add(best_idx)
            pred_clean = best_match.strip().lower()
            gt_texts.append(gt_clean)
            pred_texts.append(pred_clean)
            similarity_sum += similarity(gt_clean, pred_clean)
            similarity_count += 1
            if gt_clean == pred_clean:
                exact_matches += 1
        else:
            # Unmatched field: scored against an empty prediction, similarity 0.
            gt_texts.append(gt_clean)
            pred_texts.append("")
            similarity_sum += 0.0
            similarity_count += 1

    # Accumulate metrics
    total_exact_matches += exact_matches
    total_fields += len(gt_entries)
    all_gt_texts.extend(gt_texts)
    all_pred_texts.extend(pred_texts)
    if exact_matches == len(gt_entries):
        total_doc_matches += 1

    # Compute and store per-image CER
    image_cer = cer(gt_texts, pred_texts)
    cer_per_image[name] = image_cer

    print(f"[{name}] Exact Match: {exact_matches}/{len(gt_entries)}  CER: {image_cer:.3f}")

# --- Final Metrics ---
print("\n========== Overall Evaluation ==========")
print(f"Total Images Evaluated       : {len(cer_per_image)}")
print(f"Total Fields                 : {total_fields}")
print(f"Total Exact Text Matches     : {total_exact_matches}")
print(f"Text Field Accuracy          : {total_exact_matches / total_fields:.3f}" if total_fields else "N/A")
print(f"Document-Level Accuracy      : {total_doc_matches / len(cer_per_image):.3f}" if cer_per_image else "N/A")
# WER / CER are guarded like the ratios above: they are undefined when nothing was evaluated.
print(f"Word Error Rate (WER)        : {wer(all_gt_texts, all_pred_texts):.3f}" if all_gt_texts else "N/A")
print(f"Character Error Rate (CER)   : {cer(all_gt_texts, all_pred_texts):.3f}" if all_gt_texts else "N/A")
print(f"Average Text Similarity      : {similarity_sum / similarity_count:.3f}" if similarity_count else "N/A")

# --- Top 5 images with lowest CER ---
sorted_cer = sorted(cer_per_image.items(), key=lambda x: x[1])
print("\nTop 5 Images with Best CER:")
for name, c in sorted_cer[:5]:
    print(f"{name}: CER = {c:.3f}")


[MIT_1] Exact Match: 9/20  CER: 0.128
[MIT_10] Exact Match: 5/20  CER: 0.244
[MIT_100] Exact Match: 7/20  CER: 0.155
[MIT_101] Exact Match: 6/20  CER: 0.214
[MIT_102] Exact Match: 4/20  CER: 0.235
[MIT_103] Exact Match: 3/20  CER: 0.345
[MIT_104] Exact Match: 4/20  CER: 0.276
[MIT_105] Exact Match: 4/20  CER: 0.346
[MIT_106] Exact Match: 9/20  CER: 0.106
[MIT_108] Exact Match: 4/20  CER: 0.338
[MIT_11] Exact Match: 4/20  CER: 0.323
[MIT_110] Exact Match: 5/19  CER: 0.305
[MIT_111] Exact Match: 9/19  CER: 0.188
[MIT_113] Exact Match: 1/17  CER: 0.741
[MIT_114] Exact Match: 1/20  CER: 0.573
[MIT_115] Exact Match: 2/20  CER: 0.340
[MIT_116] Exact Match: 5/20  CER: 0.297
[MIT_117] Exact Match: 9/20  CER: 0.211
[MIT_118] Exact Match: 6/21  CER: 0.262
[MIT_119] Exact Match: 10/20  CER: 0.121
[MIT_12] Exact Match: 5/20  CER: 0.231
[MIT_120] Exact Match: 8/20  CER: 0.151
[MIT_121] Exact Match: 4/20  CER: 0.224
[MIT_122] Exact Match: 8/21  CER: 0.202
[MIT_123] Exact Match: 10/20  CER: 0.198
[MI

In [None]:
import os
import json
from difflib import SequenceMatcher
from jiwer import wer, cer
from glob import glob

# --- Utility Functions ---
def iou(boxA, boxB):
    """Return the intersection-over-union of two axis-aligned boxes.

    Each box is an indexable sequence whose items 0 and 1 are used as the
    lower x/y corner and items 2 and 3 as the upper x/y corner.

    Returns:
        0.0 when the boxes do not overlap (or only touch); otherwise the
        intersection area divided by the union area, as a float.
    """
    # Corners of the overlapping rectangle (if there is one).
    left = max(boxA[0], boxB[0])
    top = max(boxA[1], boxB[1])
    right = min(boxA[2], boxB[2])
    bottom = min(boxA[3], boxB[3])

    # Clamp each side at zero so disjoint boxes give a zero-area overlap.
    overlap_w = max(0, right - left)
    overlap_h = max(0, bottom - top)
    overlap = overlap_w * overlap_h
    if overlap == 0:
        return 0.0

    area_a = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    area_b = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    union = float(area_a + area_b - overlap)
    return overlap / union

def similarity(a, b):
    """Return the difflib similarity ratio of ``a`` and ``b`` (a float in [0, 1])."""
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

# --- Paths ---
pred_dir = "/content/drive/MyDrive/Dehaldo/POSTPROCESSED_OUTPUTS"
gt_dir = "/content/drive/MyDrive/Dehaldo/DEHADO-AI_TRAINING_DATASET_unzipped/DEHADO-AI_TRAINING_DATASET/LABELS_750"

# --- Settings ---
# Minimum IoU between a ground-truth box and a predicted box for the
# prediction to be considered a candidate for that ground-truth field.
IOU_MATCH_THRESHOLD = 0.3


# --- Prediction-file helpers ---
def _parse_pred_entry(entry):
    """Return one prediction as a dict, decoding it first if it is a string.

    String entries are tried as JSON, then as a Python literal (handles
    single-quoted dict reprs whose text contains apostrophes), and finally
    with the legacy quote-swap so previously readable files keep working.
    """
    if not isinstance(entry, str):
        return entry
    try:
        return json.loads(entry)
    except json.JSONDecodeError:
        pass
    try:
        return ast.literal_eval(entry)
    except (ValueError, SyntaxError):
        return json.loads(entry.replace("'", '"'))


def _extract_pred_entries(pred_json):
    """Return the list of prediction dicts held in a loaded prediction file.

    A dict payload contributes its first value; any other payload is used
    as-is. A missing or empty payload yields an empty list instead of raising.
    """
    if isinstance(pred_json, dict):
        raw_entries = next(iter(pred_json.values()), None)
    else:
        raw_entries = pred_json
    return [_parse_pred_entry(entry) for entry in (raw_entries or [])]


# Get all image names (label-file stems) in sorted order.
# splitext drops only the extension, so `name + ".json"` below always
# rebuilds the original file name.
image_names = sorted(
    os.path.splitext(os.path.basename(path))[0]
    for path in glob(os.path.join(gt_dir, "*.json"))
)
n_total = len(image_names)

# First and last 10% of images
first_10_percent = image_names[:int(n_total * 0.1)]
last_10_percent = image_names[int(n_total * 0.9):]

# --- Initialize Metrics ---
total_exact_matches = 0
total_fields = 0
total_doc_matches = 0
all_gt_texts = []
all_pred_texts = []
similarity_sum = 0
similarity_count = 0
cer_per_image = {}

# --- Process Images ---
for name in first_10_percent + last_10_percent:
    pred_file = os.path.join(pred_dir, name + ".json")
    gt_file = os.path.join(gt_dir, name + ".json")

    if not os.path.exists(pred_file) or not os.path.exists(gt_file):
        print(f"Skipping {name}: Missing file(s)")
        continue

    with open(pred_file, encoding="utf-8") as f:
        pred_json = json.load(f)
    with open(gt_file, encoding="utf-8") as f:
        gt_entries = json.load(f)

    # An empty label file has nothing to score and would otherwise cause a
    # division by zero in the per-document accuracy below.
    if not gt_entries:
        print(f"Skipping {name}: No ground-truth fields")
        continue

    # Extract predictions (TrOCR format)
    pred_entries = _extract_pred_entries(pred_json)

    used_pred_indices = set()
    gt_texts, pred_texts = [], []
    exact_matches = 0

    for gt in gt_entries:
        gt_box = gt["Coordinate"]
        gt_clean = gt["Field value"].strip().lower()
        # Start below any achievable similarity so that an overlapping
        # prediction with 0.0 similarity is still matched (and scored as a
        # wrong reading) rather than being treated as a missing detection.
        best_match, best_score, best_idx = "", -1.0, -1

        for i, pred in enumerate(pred_entries):
            # Each prediction may be assigned to at most one field.
            if i in used_pred_indices:
                continue

            if iou(gt_box, pred["bbox"]) > IOU_MATCH_THRESHOLD:
                pred_clean = pred["text"].strip().lower()
                sim = similarity(gt_clean, pred_clean)
                # Strict comparison: on ties the earliest candidate wins.
                if sim > best_score:
                    best_match, best_score, best_idx = pred_clean, sim, i

        gt_texts.append(gt_clean)
        if best_idx != -1:
            used_pred_indices.add(best_idx)
            pred_texts.append(best_match)
            similarity_sum += best_score
            if gt_clean == best_match:
                exact_matches += 1
        else:
            # No overlapping prediction: score the field against empty text.
            pred_texts.append("")
        similarity_count += 1

    # Accumulate metrics
    total_exact_matches += exact_matches
    total_fields += len(gt_entries)
    all_gt_texts.extend(gt_texts)
    all_pred_texts.extend(pred_texts)

    # A document counts as correct only if every field matched exactly.
    if exact_matches == len(gt_entries):
        total_doc_matches += 1

    # Compute and store per-image CER
    image_cer = cer(gt_texts, pred_texts)
    cer_per_image[name] = image_cer

    print(f"[{name}] Exact Match: {exact_matches}/{len(gt_entries)}  CER: {image_cer*100:.2f}%")

# --- Final Metrics ---
if all_gt_texts:
    wer_score = wer(all_gt_texts, all_pred_texts)
    cer_score = cer(all_gt_texts, all_pred_texts)
else:
    # Nothing was evaluated: report the error rates as undefined rather than
    # calling jiwer on empty input or implying a perfect 0% error.
    print("No images were evaluated; error rates are undefined.")
    wer_score = cer_score = float("nan")
field_acc = total_exact_matches / total_fields if total_fields else 0
doc_acc = total_doc_matches / len(cer_per_image) if cer_per_image else 0
avg_sim = similarity_sum / similarity_count if similarity_count else 0

print("\n========== Overall Evaluation ==========")
print(f"Total Images Evaluated       : {len(cer_per_image)}")
print(f"Total Fields                 : {total_fields}")
print(f"Total Exact Text Matches     : {total_exact_matches}")
print(f"Text Field Accuracy          : {field_acc * 100:.2f}%")
print(f"Document-Level Accuracy      : {doc_acc * 100:.2f}%")
print(f"Word Error Rate (WER)        : {wer_score * 100:.2f}%")
print(f"Character Error Rate (CER)   : {cer_score * 100:.2f}%")
print(f"Average Text Similarity      : {avg_sim * 100:.2f}%")

# --- Final Score ---
# Weighted blend: 35% (100 - WER), 35% (100 - CER), 15% field accuracy,
# 15% document accuracy, all expressed in percent.
final_score = (0.35 * (100 - wer_score * 100) +
               0.35 * (100 - cer_score * 100) +
               0.15 * (field_acc * 100) +
               0.15 * (doc_acc * 100))

print(f"\nFinal Composite Score        : {final_score:.2f}/100")

# --- Top 5 images with best CER ---
sorted_cer = sorted(cer_per_image.items(), key=lambda x: x[1])
print("\nTop 5 Images with Best CER:")
for name, c in sorted_cer[:5]:
    print(f"{name}: CER = {c * 100:.2f}%")


[MIT_1] Exact Match: 9/20  CER: 12.77%
[MIT_10] Exact Match: 5/20  CER: 24.42%
[MIT_100] Exact Match: 7/20  CER: 15.46%
[MIT_101] Exact Match: 6/20  CER: 21.41%
[MIT_102] Exact Match: 4/20  CER: 23.51%
[MIT_103] Exact Match: 3/20  CER: 34.49%
[MIT_104] Exact Match: 4/20  CER: 27.63%
[MIT_105] Exact Match: 4/20  CER: 34.64%
[MIT_106] Exact Match: 9/20  CER: 10.56%
[MIT_108] Exact Match: 4/20  CER: 33.80%
[MIT_11] Exact Match: 4/20  CER: 32.31%
[MIT_110] Exact Match: 5/19  CER: 30.49%
[MIT_111] Exact Match: 9/19  CER: 18.75%
[MIT_113] Exact Match: 1/17  CER: 74.15%
[MIT_114] Exact Match: 1/20  CER: 57.27%
[MIT_115] Exact Match: 2/20  CER: 34.00%
[MIT_116] Exact Match: 5/20  CER: 29.67%
[MIT_117] Exact Match: 9/20  CER: 21.07%
[MIT_118] Exact Match: 6/21  CER: 26.18%
[MIT_119] Exact Match: 10/20  CER: 12.07%
[MIT_12] Exact Match: 5/20  CER: 23.10%
[MIT_120] Exact Match: 8/20  CER: 15.08%
[MIT_121] Exact Match: 4/20  CER: 22.39%
[MIT_122] Exact Match: 8/21  CER: 20.24%
[MIT_123] Exact Matc