In [None]:
%matplotlib inline

# Prerequisites and imports

In [None]:
!apt-get install -y lz4
!pip install matplotlib
!pip install gdown
!pip install paddleocr
!pip install paddlepaddle
!pip install editdistance
!pip install yolov5
!pip install ultralytics
!pip install albumentations

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  lz4
0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.
Need to get 90.0 kB of archives.
After this operation, 236 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 lz4 amd64 1.9.3-2build2 [90.0 kB]
Fetched 90.0 kB in 1s (128 kB/s)
Selecting previously unselected package lz4.
(Reading database ... 126281 files and directories currently installed.)
Preparing to unpack .../lz4_1.9.3-2build2_amd64.deb ...
Unpacking lz4 (1.9.3-2build2) ...
Setting up lz4 (1.9.3-2build2) ...
Processing triggers for man-db (2.10.2-1) ...
Collecting paddleocr
  Downloading paddleocr-3.1.0-py3-none-any.whl.metadata (22 kB)
Collecting paddlex>=3.1.0 (from paddlex[ie,multimodal,ocr,trans]>=3.1.0->paddleocr)
  Downloading paddlex-3.1.3-py3-none-any.whl.metadata (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import zipfile
import subprocess
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torchvision.transforms.functional as TF
import torch.nn.functional as F
from PIL import Image, ImageDraw, ImageFont
import random
from tqdm import tqdm
from pathlib import Path
import cv2
import warnings
import editdistance
import yolov5
from paddleocr import PaddleOCR
import albumentations as A
from albumentations.pytorch import ToTensorV2
import shutil
from matplotlib import font_manager

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [None]:
# Notebook-wide locations and model choice.
BASE_DIR = os.getcwd()  # downloads and extracted data are addressed relative to the launch directory
print(f"BASE_DIR: {BASE_DIR}")

# YOLOv5 variant: used as the torch.hub model name and as the "<name>.pt" weights file.
YOLO_MODEL = "yolov5s"

# NOTE(review): the download log in this notebook shows the font being saved under
# {BASE_DIR}/CCPD/, not directly under {BASE_DIR}/ - confirm this path before FONT_PATH is used.
FONT_PATH = f"{BASE_DIR}/NotoSansTC-VariableFont_wght.ttf"

BASE_DIR: /content


# Download and decompress dataset

In [None]:
!gdown --folder https://drive.google.com/drive/folders/19lbkl8seJ56jQuj7hVq5sv5Dj-G4RvM-?usp=sharing

Retrieving folder contents
Processing file 1_u2OTTt2l81jIhS0PTP6ekpfkchfHdoZ NotoSansTC-VariableFont_wght.ttf
Processing file 1isPOnFzFXBd35Rk_bWiVjpVcCVssO_0q ccpd_train.tar.lz4
Processing file 1SIeRKfz7JvpXPG-VvDxhs1ry0N4m-zyy ccpd_val.tar.lz4
Processing file 1Smvr3gTDAed6K6mW5yZDSx--bsEU6Gat NotoSansSC-VariableFont_wght.ttf
Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1_u2OTTt2l81jIhS0PTP6ekpfkchfHdoZ
To: /content/CCPD/NotoSansTC-VariableFont_wght.ttf
100% 11.9M/11.9M [00:00<00:00, 48.8MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1isPOnFzFXBd35Rk_bWiVjpVcCVssO_0q
From (redirected): https://drive.google.com/uc?id=1isPOnFzFXBd35Rk_bWiVjpVcCVssO_0q&confirm=t&uuid=e69e52cf-1c46-4015-bb2e-e19973d0c46f
To: /content/CCPD/ccpd_train.tar.lz4
100% 6.57G/6.57G [01:39<00:00, 66.3MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1SIeRKfz7JvpXP

Decompress the train and validation archives (`ccpd_train.tar.lz4`, `ccpd_val.tar.lz4`) into `CCPD2019/`.

In [None]:
# Unpack both downloaded archives into CCPD2019/.
archive_dir = f"{BASE_DIR}/CCPD"
extract_dir = f"{BASE_DIR}/CCPD2019"
os.makedirs(extract_dir, exist_ok=True)

for archive_name in ("ccpd_train.tar.lz4", "ccpd_val.tar.lz4"):
    archive_path = f"{archive_dir}/{archive_name}"
    print(archive_path)
    # check=True: a failed or partial extraction raises here instead of
    # surfacing later as missing images.
    subprocess.run(
        ["tar", "--use-compress-program=lz4", "-xf", archive_path, "-C", extract_dir],
        check=True,
    )

/content/CCPD/ccpd_train.tar.lz4


CompletedProcess(args=['tar', '--use-compress-program=lz4', '-xvf', '/content/CCPD/ccpd_val.tar.lz4', '-C', '/content/CCPD2019'], returncode=0)

# Set parameters

In [None]:
NUM_SAMPLES = 10000    # upper bound on images kept per split by the subsampling cell
NUM_EPOCHS = 30        # epochs for both the YOLOv5 run and the old custom training loop
BATCH_SIZE = 32        # used by the DataLoaders and passed to YOLOv5 as --batch
IOU_THRESHOLD = 0.5    # a prediction counts as correct when IoU >= this value
LEARNING_RATE = 1e-3   # Adam learning rate of the old custom training loop

# Seed the generators this notebook draws from (random.sample for the random subset,
# DataLoader shuffling, weight initialisation) so that reruns are comparable.
# NOTE(review): Albumentations may use its own random state - confirm whether it needs separate seeding.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Dataset preparation

In [None]:
# Directories of the two extracted splits.
TRAIN_DATASET, VAL_DATASET = (
    f'{BASE_DIR}/CCPD2019/ccpd_train',
    f'{BASE_DIR}/CCPD2019/ccpd_val',
)
print(f"TRAIN_DATASET: {TRAIN_DATASET}")
print(f"VAL_DATASET: {VAL_DATASET}")

TRAIN_DATASET: /content/CCPD2019/ccpd_train
VAL_DATASET: /content/CCPD2019/ccpd_val


In [None]:
FILENAME_SPLITTER = '&'  # separates x and y inside each coordinate pair of a CCPD filename


class CCPDDataset(Dataset):
    """License-plate dataset whose annotations are encoded in each image filename.

    A filename consists of '-'-separated fields; this class uses field index 2
    (bounding box "x1&y1_x2&y2") and field index 4 (character indices "i_j_k...").

    Each item is a tuple ``(image, bbox, license_plate_string, char_indices)``:
      * image: tensor produced by ``transform`` (or ``ToTensor`` when no transform is given)
      * bbox: float32 tensor ``[x1, y1, x2, y2]`` normalised to [0, 1] by the image size
      * license_plate_string: decoded plate text
      * char_indices: long tensor with the raw character indices
    """

    # Character tables indexed by the values stored in the filename:
    # position 0 -> provinces, position 1 -> alphabets, positions 2..6 -> ads.
    provinces = ["皖", "沪", "津", "渝", "冀", "晋", "蒙", "辽", "吉", "黑", "苏", "浙", "京", "闽", "赣", "鲁", "豫", "鄂", "湘", "粤", "桂", "琼", "川", "贵", "云", "藏", "陕", "甘", "青", "宁", "新", "警", "学", "O"]
    alphabets = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
                 'X', 'Y', 'Z', 'O']
    ads = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
           'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'O']

    def __init__(self, data_dir, split_file=None, transform=None):
        """Index the ``.jpg`` files of ``data_dir``.

        Args:
            data_dir: directory containing the images.
            split_file: optional text file listing one filename per line; a leading
                ``ccpd_base/`` is stripped. Only listed images are kept. If the file
                does not exist a warning is printed and the dataset is empty.
            transform: optional Albumentations-style callable invoked as
                ``transform(image=..., bboxes=[...], class_labels=[...])``.
        """
        self.data_dir = data_dir
        self.transform = transform
        all_image_files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.jpg')]

        print(f"Found {len(all_image_files)} image files in the data directory.")

        if split_file:
            if not os.path.exists(split_file):
                print(f"Warning: Split file not found at {split_file}")
                self.image_files = []  # No files if split file is missing
            else:
                print("Splitting")
                with open(split_file, 'r') as f:
                    # A set keeps the membership test below O(1) per image;
                    # the 'ccpd_base/' prefix is removed so entries are bare filenames.
                    split_filenames = {line.strip().replace('ccpd_base/', '') for line in f if line.strip()}

                self.image_files = [p for p in all_image_files if os.path.basename(p) in split_filenames]
                print(f"Found {len(self.image_files)} image files matching the split file.")
        else:
            self.image_files = all_image_files
            print("No split file provided, using all image files.")

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load image ``idx`` and return ``(image, bbox, plate_string, char_indices)``."""
        img_path = self.image_files[idx]
        # convert() returns a loaded copy, so the file handle can be closed right away.
        with Image.open(img_path) as img_file:
            image = img_file.convert('RGB')
        bbox, char_indices = self._extract_annotations_from_filename(os.path.basename(img_path))

        license_plate_string = self._indices_to_string(char_indices)

        image_np = np.array(image)
        img_h, img_w = image_np.shape[:2]

        # Absolute pixel coordinates in the original image (pascal_voc order).
        bbox_abs = [bbox[0], bbox[1], bbox[2], bbox[3]]
        # The same box normalised by the original image size.
        bbox_orig_norm = [
            bbox_abs[0] / img_w,
            bbox_abs[1] / img_h,
            bbox_abs[2] / img_w,
            bbox_abs[3] / img_h,
        ]

        if self.transform:
            transformed = self.transform(
                image=image_np,
                bboxes=[bbox_abs],   # ABSOLUTE pixel coords
                class_labels=[0]
            )
            image = transformed['image']

            # Normalise by the size of the image that actually came out of the
            # pipeline instead of assuming a fixed 416x416 output.
            if torch.is_tensor(image):
                out_h, out_w = image.shape[-2:]   # CxHxW tensor
            else:
                out_h, out_w = image.shape[:2]    # HxWxC array

            if len(transformed['bboxes']) == 0:
                # The pipeline dropped the box. Fall back to the original box in
                # normalised form: this ignores any geometric augmentation but
                # keeps the values in [0, 1].
                bbox_final = bbox_orig_norm
            else:
                x1, y1, x2, y2 = transformed['bboxes'][0][:4]
                bbox_final = [x1 / out_w, y1 / out_h, x2 / out_w, y2 / out_h]

            bbox = torch.tensor(bbox_final, dtype=torch.float32)
        else:
            # No transform: plain tensor conversion, box relative to the original size.
            image = transforms.ToTensor()(image)
            bbox = torch.tensor(bbox_orig_norm, dtype=torch.float32)

        return image, bbox, license_plate_string, torch.tensor(char_indices, dtype=torch.long)

    def _extract_annotations_from_filename(self, filename):
        """Parse the bounding box and character indices out of a CCPD filename.

        Args:
            filename: image basename, with or without extension.

        Returns:
            ``(bbox, char_indices)`` where ``bbox`` is an ``(x1, y1, x2, y2)`` tuple of
            ints in pixels and ``char_indices`` is a list of ints.

        Raises:
            ValueError: if the filename has fewer than five '-'-separated fields
                or a field contains non-integer values.
        """
        filename_no_ext = os.path.splitext(filename)[0]
        parts = filename_no_ext.split('-')
        if len(parts) < 5:
            raise ValueError(f"Unexpected CCPD filename format: {filename!r}")

        # Field index 2: "x1&y1_x2&y2" (left-up and right-bottom corners).
        bbox_coords_str = parts[2].split('_')
        left_up = [int(coord) for coord in bbox_coords_str[0].split(FILENAME_SPLITTER)]
        right_bottom = [int(coord) for coord in bbox_coords_str[1].split(FILENAME_SPLITTER)]
        bbox = (left_up[0], left_up[1], right_bottom[0], right_bottom[1])

        # Field index 4: character indices, e.g. "0_0_22_27_27_33_16".
        char_indices = [int(index) for index in parts[4].split('_')]

        # The remaining fields (area, tilt, vertices, brightness, blurriness) are not
        # needed by the detection/recognition baseline and are therefore not parsed.
        return bbox, char_indices

    def _indices_to_string(self, char_indices):
        """Decode character indices into the plate string.

        Index 0 is looked up in ``provinces``, index 1 in ``alphabets`` and
        indices 2..6 in ``ads``; anything beyond seven entries is ignored.
        """
        license_plate = ""
        if len(char_indices) > 0:
            license_plate += self.provinces[char_indices[0]]
        if len(char_indices) > 1:
            license_plate += self.alphabets[char_indices[1]]
        for i in range(2, min(len(char_indices), 7)):
            license_plate += self.ads[char_indices[i]]

        return license_plate

In [None]:
class SmallSubsetDataset(torch.utils.data.Dataset):
    """View onto at most ``max_samples`` items of another dataset.

    The wrapped dataset must expose ``image_files``, ``provinces``, ``alphabets``,
    ``ads`` and ``_indices_to_string``; these are mirrored here so the subset can
    be used wherever the base dataset is.

    Args:
        base_dataset: dataset to draw from.
        max_samples: upper bound on the subset size (capped at ``len(base_dataset)``).
        random_subset: if True pick the items with ``random.sample``; otherwise
            take the first ``max_samples`` items.
    """

    def __init__(self, base_dataset, max_samples=100, random_subset=False):
        self.base_dataset = base_dataset
        self.max_samples = min(max_samples, len(base_dataset))

        # Remember positions in the base dataset once, so __getitem__ does not
        # have to search the base file list on every access.
        population = range(len(base_dataset.image_files))
        if random_subset:
            self._indices = random.sample(population, self.max_samples)
        else:
            self._indices = list(population[:self.max_samples])

        self.image_files = [base_dataset.image_files[i] for i in self._indices]

        self.provinces = base_dataset.provinces
        self.alphabets = base_dataset.alphabets
        self.ads = base_dataset.ads

    def __len__(self):
        return self.max_samples

    def __getitem__(self, idx):
        # Delegate to the base dataset at the remembered position.
        return self.base_dataset[self._indices[idx]]

    def _indices_to_string(self, indices):
        return self.base_dataset._indices_to_string(indices)

In [None]:
# Define your transforms
# Output size of both pipelines; every Resize below uses these two values.
height = 416
width = 416

# Albumentations pipelines are used (instead of plain torchvision transforms) so that
# the bounding box is transformed together with the image.
transform_train = A.Compose([
    A.Resize(height=height, width=width),  # fixed image size
    # Apply at most one degradation; inside OneOf the inner p values act as
    # relative selection weights, the outer p decides whether any is applied.
    A.OneOf([
        A.MotionBlur(p=0.2),
        A.MedianBlur(blur_limit=3, p=0.1),
        A.GaussianBlur(p=0.1),
        A.GaussNoise(p=0.2)
    ], p=0.3),
    A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1, p=0.5),
    A.RandomBrightnessContrast(p=0.5),
    A.ShiftScaleRotate(
        shift_limit=0.02, scale_limit=0.1, rotate_limit=5,
        border_mode=0, p=0.5
    ),
    # NOTE(review): a horizontal flip mirrors the plate text while the returned
    # plate string stays unchanged - fine for detection, confirm before using
    # this pipeline for recognition.
    A.HorizontalFlip(p=0.5),
    # ImageNet channel statistics.
    A.Normalize(
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225)
    ),
    ToTensorV2()
],
    bbox_params=A.BboxParams(
        format='pascal_voc',
        label_fields=['class_labels'],
        min_visibility=0.0,            # 0.0 disables visibility-based filtering of boxes
        #filter_lost_elements=True      # filters out bboxes lost after transform
    )
)


# Validation: deterministic resize + normalisation only.
transform_val = A.Compose([
    A.Resize(height=height, width=width),
    A.Normalize(mean=(0.485, 0.456, 0.406),
                std=(0.229, 0.224, 0.225)),
    ToTensorV2()
],
    bbox_params=A.BboxParams(format='pascal_voc', label_fields=['class_labels'])
)

In [None]:
# Training images go through the augmenting pipeline, validation images through
# the resize-and-normalise pipeline.
train_dataset = CCPDDataset(
    data_dir=TRAIN_DATASET,
    transform=transform_train,
)
val_dataset = CCPDDataset(
    data_dir=VAL_DATASET,
    transform=transform_val,
)

Found 100000 image files in the data directory.
No split file provided, using all image files.
Found 99996 image files in the data directory.
No split file provided, using all image files.


Optional: keep at most `NUM_SAMPLES` images per split so that experiments run faster.

In [None]:
# Train subset: the first NUM_SAMPLES files. Validation subset: a random draw.
# NOTE: this rebinds train_dataset / val_dataset, so re-running the cell wraps
# the previous subsets again rather than the full datasets.
train_dataset = SmallSubsetDataset(
    train_dataset,
    max_samples=NUM_SAMPLES,
)
val_dataset = SmallSubsetDataset(
    val_dataset,
    max_samples=NUM_SAMPLES,
    random_subset=True,
)
print(f"Subset of train dataset has size {len(train_dataset)}")
print(f"Subset of evaluation dataset has size {len(val_dataset)}")

Subset of train dataset has size 1000
Subset of evaluation dataset has size 1000


In [None]:
def custom_collate(batch):
    """Merge dataset items into one batch.

    Each item is ``(image, bbox, plate_string, char_indices)``. Tensors are
    stacked along a new leading dimension; plate strings stay a Python list.

    Returns:
        ``(images, bboxes, license_plate_strings, char_indices)``
    """
    def column(position):
        # Collect the `position`-th field of every sample, preserving batch order.
        return [sample[position] for sample in batch]

    return (
        torch.stack(column(0)),
        torch.stack(column(1)),
        column(2),
        torch.stack(column(3)),
    )

In [None]:
# Only the training loader shuffles; both use the custom collate function so
# that plate strings stay a plain list.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=custom_collate,
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=custom_collate,
)

print(f"Number of images in the training dataset: {len(train_dataset)}")
print(f"Number of images in the evaluation dataset: {len(val_dataset)}")

Number of images in the training dataset: 1000
Number of images in the evaluation dataset: 1000


# Visualize the dataset

In [None]:
# Access a few samples directly from the dataset
num_samples_to_check = 5 # Number of samples to inspect


def imshow(img):
    """Display a normalised CxHxW image tensor.

    The ImageNet mean/std normalisation applied by the transforms is undone and
    the result is clipped to [0, 1] before plotting.
    """
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    pixels = std * img.numpy().transpose((1, 2, 0)) + mean
    pixels = np.clip(pixels, 0, 1)
    plt.imshow(pixels)
    plt.axis('off')
    plt.show()


def show_samples(dataset, title, count=num_samples_to_check):
    """Print the labels of, and display, the first ``count`` samples of ``dataset``.

    ``dataset`` must expose ``image_files`` aligned with its indexing.
    """
    print(f"\n\n---*** {title} ***---")
    for i in range(min(count, len(dataset))):
        image, bbox, license_plate_string, char_indices_tensor = dataset[i]

        print(f"\n--- Sample {i} ---")
        print("Original filename:", os.path.basename(dataset.image_files[i]))
        print("Bounding box:", bbox)
        print("License plate string:", license_plate_string)
        print("Character indices tensor:", char_indices_tensor)
        imshow(image)


show_samples(train_dataset, "TRAINING DATASET")
show_samples(val_dataset, "EVALUATION DATASET")

# Get a single batch from the dataloader
# for images, bboxes, license_plates, char_indices_tensor in train_dataloader:
#     print("\n--- First Batch Labels ---")
#     print("Bounding box batch:", bboxes)
#     print("License plate batch:", license_plates)
#     print("Character indices batch tensor:", char_indices_tensor)
#     # Display the first image in the batch
#     if len(images) > 0:
#         print("Displaying the first image in the batch:")
#         imshow(images[0])

#     # Break after the first batch
#     break




---*** TRAINING DATASET ***---

--- Sample 0 ---
Original filename: 0111494252874-90_82-268&522_482&587-498&599_269&588_247&509_476&520-0_0_11_32_32_30_17-155-33.jpg
Bounding box: tensor([0.30314, 0.42485, 0.62072, 0.49702])
License plate string: 皖AM886T
Character indices tensor: tensor([ 0,  0, 11, 32, 32, 30, 17])

--- Sample 1 ---
Original filename: 019838362069-90_74-212&484_511&569-510&570_221&555_191&470_480&485-0_0_21_27_1_31_31-75-29.jpg
Bounding box: tensor([0.29365, 0.40129, 0.68436, 0.50042])
License plate string: 皖AX3B77
Character indices tensor: tensor([ 0,  0, 21, 27,  1, 31, 31])

--- Sample 2 ---
Original filename: 0125778256705-89_86-230&561_421&629-421&625_226&625_223&561_418&561-0_0_1_1_31_29_32-143-79.jpg
Bounding box: tensor([0.29853, 0.47563, 0.58324, 0.55979])
License plate string: 皖ABB758
Character indices tensor: tensor([ 0,  0,  1,  1, 31, 29, 32])

--- Sample 3 ---
Original filename: 0316594827586-103_78-258&506_507&645-500&645_250&579_256&498_506&564-0_0_7

# Baseline model (detection)

In [None]:
# Clone the YOLOv5 repository once; a second `git clone` into an existing
# folder fails, which would break re-running this cell.
yolov5_dir = f"{BASE_DIR}/yolov5"
if not os.path.isdir(yolov5_dir):
    subprocess.run(
        ["git", "clone", "https://github.com/ultralytics/yolov5", yolov5_dir],
        check=True,
    )
# Install required packages (check=True surfaces a failed install immediately)
subprocess.run(["pip", "install", "-r", f"{yolov5_dir}/requirements.txt"], check=True)

Cloning into 'yolov5'...
remote: Enumerating objects: 17511, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 17511 (delta 5), reused 0 (delta 0), pack-reused 17493 (from 3)[K
Receiving objects: 100% (17511/17511), 16.65 MiB | 28.56 MiB/s, done.
Resolving deltas: 100% (11996/11996), done.


CompletedProcess(args=['pip', 'install', '-r', '/content/yolov5/requirements.txt'], returncode=0)

In [None]:
# Load a YOLOv5 model (options: yolov5n, yolov5s, yolov5m, yolov5l, yolov5x)
# NOTE: the old training-loop cell further down rebinds `model` to the custom detector.
model = torch.hub.load("ultralytics/yolov5", YOLO_MODEL)  # Default: yolov5s

# Example usage, kept as comments so the cell does not echo a string as its output:
#
# # Define the input image source (URL, local file, PIL image, OpenCV frame, numpy array, or list)
# img = "https://ultralytics.com/images/zidane.jpg"  # Example image
#
# # Perform inference (handles batching, resizing, normalization automatically)
# results = model(img)
#
# # Process the results (options: .print(), .show(), .save(), .crop(), .pandas())
# results.print()  # Print results to console
# results.show()  # Display results in a window
# results.save()  # Save results to runs/detect/exp

Downloading: "https://github.com/ultralytics/yolov5/zipball/master" to /root/.cache/torch/hub/master.zip
YOLOv5 🚀 2025-7-13 Python-3.11.13 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)

Downloading https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s.pt to yolov5s.pt...
100%|██████████| 14.1M/14.1M [00:00<00:00, 176MB/s]

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


'\n# Define the input image source (URL, local file, PIL image, OpenCV frame, numpy array, or list)\nimg = "https://ultralytics.com/images/zidane.jpg"  # Example image\n\n# Perform inference (handles batching, resizing, normalization automatically)\nresults = model(img)\n\n# Process the results (options: .print(), .show(), .save(), .crop(), .pandas())\nresults.print()  # Print results to console\nresults.show()  # Display results in a window\nresults.save()  # Save results to runs/detect/exp\n'

# Old baseline model

In [None]:
class LicensePlateDetector(nn.Module):
    """Small CNN that regresses one bounding box per image.

    Four conv -> batch-norm -> ReLU stages (32, 64, 128, 256 channels). The
    first three are followed by a 2x2 max-pool, each halving the spatial size;
    the last is followed by global average pooling. A linear layer maps the
    256 pooled features to four values squashed into (0, 1) by a sigmoid,
    interpreted as x1, y1, x2, y2.
    """

    def __init__(self):
        super().__init__()
        channel_plan = [(3, 32), (32, 64), (64, 128), (128, 256)]
        last_stage = len(channel_plan) - 1

        layers = []
        for stage_no, (c_in, c_out) in enumerate(channel_plan):
            layers.append(nn.Conv2d(c_in, c_out, 3, stride=1, padding=1))  # keeps spatial size
            layers.append(nn.BatchNorm2d(c_out))
            layers.append(nn.ReLU())
            if stage_no == last_stage:
                layers.append(nn.AdaptiveAvgPool2d(1))  # collapse to 1x1
            else:
                layers.append(nn.MaxPool2d(2))          # halve height and width
        self.features = nn.Sequential(*layers)
        self.regressor = nn.Linear(256, 4)  # x1,y1,x2,y2

    def forward(self, x):
        pooled = self.features(x)
        flat = pooled.view(pooled.size(0), -1)
        return torch.sigmoid(self.regressor(flat))


# Training Loop (detection)

In [None]:
def _find_annotation_parser(dataset):
    """Return the filename-annotation parser of ``dataset`` or of a wrapped ``base_dataset``, else None."""
    while dataset is not None:
        parser = getattr(dataset, '_extract_annotations_from_filename', None)
        if callable(parser):
            return parser
        dataset = getattr(dataset, 'base_dataset', None)
    return None


def export_to_yolo(subset_dataset, output_dir):
    """Write ``subset_dataset`` to disk in YOLO layout.

    Creates ``<output_dir>/images`` (copies of the original image files) and
    ``<output_dir>/labels`` (one ``.txt`` per image containing
    ``0 x_center y_center width height``, all normalised to [0, 1]).

    The label is derived from the annotation encoded in the original filename,
    so it always matches the copied, un-augmented image. Taking the box from
    ``subset_dataset[i]`` instead would return it after the dataset's transform
    (flip/shift/rotate for the training pipeline), which does not match the
    copied file. That path is only used as a fallback when no filename parser
    is available.

    Args:
        subset_dataset: dataset exposing ``image_files`` (or wrapping a
            ``base_dataset`` that does).
        output_dir: destination directory; created if missing.

    Raises:
        ValueError: if no image path list can be located.
    """
    image_dir = os.path.join(output_dir, 'images')
    label_dir = os.path.join(output_dir, 'labels')
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(label_dir, exist_ok=True)

    # Locate the list of original image paths.
    if hasattr(subset_dataset, 'image_files'):
        image_files = subset_dataset.image_files
    elif hasattr(subset_dataset, 'base_dataset') and hasattr(subset_dataset.base_dataset, 'image_files'):
        image_files = subset_dataset.base_dataset.image_files
    else:
        raise ValueError("Cannot locate image path")

    parse_annotations = _find_annotation_parser(subset_dataset)

    for i in tqdm(range(len(subset_dataset)), desc=f"Exporting to {output_dir}"):
        img_path = image_files[i]
        img_name = os.path.basename(img_path)

        # Save image to new folder
        shutil.copyfile(img_path, os.path.join(image_dir, img_name))

        # Only the size is needed; the context manager closes the file handle.
        with Image.open(img_path) as img_file:
            img_w, img_h = img_file.size

        if parse_annotations is not None:
            # Pixel coordinates in the original image.
            x1, y1, x2, y2 = parse_annotations(img_name)[0]
        else:
            # Fallback: normalised box as returned by the dataset item.
            _, bbox, _, _ = subset_dataset[i]
            x1 = bbox[0].item() * img_w
            y1 = bbox[1].item() * img_h
            x2 = bbox[2].item() * img_w
            y2 = bbox[3].item() * img_h

        # Keep the box inside the image so the normalised values stay in [0, 1].
        x1, x2 = (min(max(v, 0), img_w) for v in (x1, x2))
        y1, y2 = (min(max(v, 0), img_h) for v in (y1, y2))

        # Convert bbox to YOLO format (x_center, y_center, width, height)
        x_center = (x1 + x2) / 2 / img_w
        y_center = (y1 + y2) / 2 / img_h
        box_w = (x2 - x1) / img_w
        box_h = (y2 - y1) / img_h

        # Save label; splitext swaps only the extension, not every '.jpg' in the name.
        label_path = os.path.join(label_dir, os.path.splitext(img_name)[0] + '.txt')
        with open(label_path, 'w') as f:
            f.write(f"0 {x_center:.6f} {y_center:.6f} {box_w:.6f} {box_h:.6f}\n")


## Use the whole dataset

In [None]:
export_to_yolo(train_dataset)
export_to_yolo(val_dataset)

In [None]:
# Create data file
ccpd_yaml_path = f"{BASE_DIR}/ccpd.yaml"

with open(ccpd_yaml_path, 'w') as f:
    f.write(f"""train: {BASE_DIR}/CCPD2019/ccpd_train
val: {BASE_DIR}/CCPD2019/ccpd_val

nc: 1
names: ['license_plate']
""")

## OR use a subset

In [None]:
# Export labels
# Anchored to BASE_DIR so the output location matches the absolute paths written
# to ccpd_subset.yaml even if the working directory has changed.
export_to_yolo(train_dataset, f"{BASE_DIR}/ccpd_subset/train")
export_to_yolo(val_dataset, f"{BASE_DIR}/ccpd_subset/val")

Exporting to ccpd_subset/train: 100%|██████████| 1000/1000 [00:20<00:00, 49.77it/s]
Exporting to ccpd_subset/val: 100%|██████████| 1000/1000 [00:15<00:00, 63.62it/s]


In [None]:
# YOLOv5 dataset description for the exported subset: one class, absolute split paths.
ccpd_yaml_path = f"{BASE_DIR}/ccpd_subset.yaml"

yaml_text = f"""train: {BASE_DIR}/ccpd_subset/train
val: {BASE_DIR}/ccpd_subset/val

nc: 1
names: ['license_plate']
"""

with open(ccpd_yaml_path, 'w') as f:
    f.write(yaml_text)


## Train

In [None]:
# Train
# Runs YOLOv5's train.py on the dataset described by ccpd_yaml_path, starting
# from the pretrained "<YOLO_MODEL>.pt" weights. check=True makes a failed
# training run raise here instead of being discovered in the evaluation cell.
subprocess.run(
    [
        "python",
        "-W", "ignore::FutureWarning",
        "-W", "ignore::DeprecationWarning",
        f"{BASE_DIR}/yolov5/train.py",
        "--img", "416",
        "--batch", str(BATCH_SIZE),
        "--epochs", str(NUM_EPOCHS),
        "--data", str(ccpd_yaml_path),
        "--weights", f"{YOLO_MODEL}.pt",
        "--name", "ccpd_yolo_baseline",
    ],
    check=True,
)



# Old training loop

In [None]:
class GIoULoss(nn.Module):
    """Generalised IoU loss for boxes given as ``[x1, y1, x2, y2]`` rows.

    ``forward`` takes two ``[B, 4]`` tensors (predictions, targets) and returns
    the batch mean of ``1 - GIoU``. Widths and heights of the boxes and of
    their intersection are clamped at zero; a small epsilon keeps the
    divisions finite.
    """

    def forward(self, pred_boxes, target_boxes):
        eps = 1e-7
        px1, py1, px2, py2 = pred_boxes[:, :4].unbind(dim=1)
        tx1, ty1, tx2, ty2 = target_boxes[:, :4].unbind(dim=1)

        def clamped_area(x1, y1, x2, y2):
            # Area with negative extents treated as zero.
            return (x2 - x1).clamp(min=0) * (y2 - y1).clamp(min=0)

        # Intersection rectangle.
        inter_area = clamped_area(
            torch.max(px1, tx1), torch.max(py1, ty1),
            torch.min(px2, tx2), torch.min(py2, ty2),
        )
        union_area = clamped_area(px1, py1, px2, py2) + clamped_area(tx1, ty1, tx2, ty2) - inter_area + eps
        iou = inter_area / union_area

        # Smallest axis-aligned box enclosing both boxes.
        enclose_w = torch.max(px2, tx2) - torch.min(px1, tx1)
        enclose_h = torch.max(py2, ty2) - torch.min(py1, ty1)
        enclose_area = enclose_w * enclose_h + eps

        giou = iou - (enclose_area - union_area) / enclose_area
        return (1 - giou).mean()


# Old baseline: train the custom regression detector with the GIoU loss.
# Prefer the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

# NOTE: this rebinds `model`, which the detection cell above assigned to the
# torch.hub YOLOv5 model.
model = LicensePlateDetector().to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
#criterion = nn.MSELoss()
#criterion = nn.SmoothL1Loss()
criterion = GIoULoss()

# Halve the learning rate every 10 epochs.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

for epoch in range(NUM_EPOCHS):
    model.train()
    epoch_loss = 0  # sum of the batch losses of this epoch
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}")
    for i, (images, bboxes, _, _) in enumerate(progress_bar):
        #print("images.shape:", images.shape)
        #print("bboxes.shape:", bboxes.shape)
        images = images.to(device)
        bboxes = bboxes.to(device)

        optimizer.zero_grad()
        outputs = model(images)

        loss = criterion(outputs, bboxes)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        # Show the running mean of the batch losses seen so far.
        progress_bar.set_postfix(loss=epoch_loss / (i + 1))
    progress_bar.close()

    # One scheduler step per epoch.
    scheduler.step()
    # The validation pass and per-epoch summary below are disabled: they sit
    # inside a string literal and therefore never execute.
    """
    model.eval()
    with torch.no_grad():
        val_loss = 0
        for images, bboxes, _, _ in val_dataloader:
            images = images.to(device)
            bboxes = bboxes.to(device)
            outputs = model(images)
            vloss = criterion(outputs, bboxes)
            val_loss += vloss.item()
        val_loss /= len(val_dataloader)
    print(f"Validation loss: {val_loss:.4f}")

    print(f"Epoch {epoch+1}, Loss: {epoch_loss / len(train_dataloader):.4f}")
    """


Device: cuda


Epoch 1:   0%|          | 0/32 [00:00<?, ?it/s]


ValueError: Expected x_min for bbox [      -46.5       197.5       498.5       778.5           0] to be in the range [0.0, 1.0], got -46.5.

# Evaluation (detection)

In [None]:
# Locate the weights of the most recent "ccpd_yolo_baseline*" training run instead
# of hard-coding one run folder: YOLOv5 appends a counter to the run name when the
# folder already exists, so the exact name differs between sessions.
yolo_runs_dir = Path(BASE_DIR) / "yolov5" / "runs" / "train"
weight_candidates = sorted(
    yolo_runs_dir.glob("ccpd_yolo_baseline*/weights/best.pt"),
    key=lambda p: p.stat().st_mtime,
)
if not weight_candidates:
    raise FileNotFoundError(
        f"No trained weights found under {yolo_runs_dir}; run the training cell first."
    )
best_weights = weight_candidates[-1]
print(f"Evaluating weights: {best_weights}")

subprocess.run(
    [
        "python",
        f"{BASE_DIR}/yolov5/val.py",        # path to val.py
        "--data", f"{ccpd_yaml_path}",      # path to your dataset config
        "--weights", str(best_weights),     # trained model
        "--img", "416",                     # image size
        "--task", "val",                    # or "test" if evaluating on test set
    ],
    check=True,
)


# Old evaluation

In [None]:
def compute_iou(box1, box2):
    """Intersection-over-union of two boxes.

    Args:
        box1, box2: indexable ``[x1, y1, x2, y2]`` (tensors, arrays or lists),
            normalised coordinates in the range 0-1.

    Returns:
        The IoU, or ``0.0`` when the union area is exactly zero.
    """
    ax1, ay1, ax2, ay2 = box1[0], box1[1], box1[2], box1[3]
    bx1, by1, bx2, by2 = box2[0], box2[1], box2[2], box2[3]

    # Overlap extents; a negative extent means the boxes do not overlap.
    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
    inter_area = overlap_w * overlap_h

    union_area = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter_area

    return 0.0 if union_area == 0 else inter_area / union_area

def evaluate_model(model, dataloader, device, iou_threshold=IOU_THRESHOLD):
    """Evaluate a box-regression model by IoU against the ground truth.

    Args:
        model: callable mapping an image batch to ``[B, 4]`` normalised boxes.
        dataloader: yields ``(images, bboxes, _, _)`` batches.
        device: device the images are moved to before the forward pass.
        iou_threshold: a sample counts as a correct detection when its IoU is
            at least this value. The default is the value of ``IOU_THRESHOLD``
            at the time this function is defined.

    Returns:
        ``(avg_iou, accuracy)``; both are 0 when the dataloader is empty.
        The same two numbers are also printed.
    """
    model.eval()
    total_iou = 0.0
    total_samples = 0
    correct_detections = 0

    with torch.no_grad():
        progress_bar = tqdm(dataloader, desc="Evaluating")
        for images, bboxes, _, _ in progress_bar:
            # One device -> host transfer per batch rather than one per sample.
            outputs = model(images.to(device)).cpu().numpy()
            targets = bboxes.cpu().numpy()

            for pred_box, gt_box in zip(outputs, targets):
                iou = float(compute_iou(pred_box, gt_box))
                total_iou += iou
                total_samples += 1
                if iou >= iou_threshold:
                    correct_detections += 1

            # Update the running metrics while the bar is still active.
            if total_samples > 0:
                progress_bar.set_postfix(
                    avg_iou=f"{total_iou / total_samples:.4f}",
                    accuracy=f"{correct_detections / total_samples:.4f}",
                )

    avg_iou = total_iou / total_samples if total_samples > 0 else 0
    accuracy = correct_detections / total_samples if total_samples > 0 else 0

    print(f"\nAverage IoU: {avg_iou:.4f}")
    print(f"Detection accuracy (IoU >= {iou_threshold}): {accuracy:.4f}")

    return avg_iou, accuracy


In [None]:
# Score the current `model` on the validation loader, using the default IoU threshold.
evaluate_model(model, val_dataloader, device)

Evaluating: 100%|█████████████████████████████████████████████████████████| 313/313 [02:09<00:00,  2.41it/s]


Average IoU: 0.3699
Detection accuracy (IoU >= 0.5): 0.3237





## Visualize evaluation results (detection)

In [None]:
def visualize_batch_predictions(images, gt_bboxes, pred_bboxes, max_samples=4):
    """Plot the first samples of a batch with their ground-truth and predicted boxes.

    Args:
        images: Batch tensor [B, 3, H, W].
        gt_bboxes: Batch tensor [B, 4] of normalized ground-truth boxes.
        pred_bboxes: Batch tensor [B, 4] of normalized predicted boxes.
        max_samples: Upper bound on the number of images displayed.
    """
    sample_count = min(len(images), max_samples)
    per_sample_inputs = (images, gt_bboxes, pred_bboxes)

    for sample_idx in range(sample_count):
        # Index each batch in turn: image, ground truth, prediction.
        visualize_prediction(*(batch[sample_idx] for batch in per_sample_inputs))

def visualize_random_predictions(images, gt_bboxes, pred_bboxes, max_samples=4):
    """Plot a random selection of samples from a batch.

    Up to ``max_samples`` distinct indices are drawn with ``random.sample``
    and each selected sample is passed to ``visualize_prediction``.
    """
    available = len(images)
    how_many = min(max_samples, available)
    chosen = random.sample(range(available), how_many)

    for sample_idx in chosen:
        visualize_prediction(
            images[sample_idx],
            gt_bboxes[sample_idx],
            pred_bboxes[sample_idx],
        )

def visualize_prediction(image_tensor, gt_bbox, pred_bbox):
    """Show one image with its ground-truth (green) and predicted (red) box.

    Args:
        image_tensor: Tensor [3, H, W]; the normalization is undone with the
            mean/std constants below before display. May live on any device.
        gt_bbox: Normalized [x1, y1, x2, y2] ground-truth box.
        pred_bbox: Normalized [x1, y1, x2, y2] predicted box.
    """
    # Bring the image to the CPU first: the mean/std constants are CPU
    # tensors, so a GPU image would otherwise fail with a device mismatch.
    image = image_tensor.detach().cpu().float()

    mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

    image = (image * std + mean).clamp(0, 1).permute(1, 2, 0).numpy()

    fig, ax = plt.subplots(1)
    ax.imshow(image)

    h, w, _ = image.shape

    def _make_rect(bbox, color, label):
        # Convert normalized coords to plain Python floats in pixel units so
        # matplotlib never receives tensor scalars.
        x1, y1, x2, y2 = (float(value) for value in bbox)
        return patches.Rectangle((x1 * w, y1 * h), (x2 - x1) * w, (y2 - y1) * h,
                                 linewidth=2, edgecolor=color, facecolor='none', label=label)

    ax.add_patch(_make_rect(gt_bbox, 'g', 'Ground Truth'))
    ax.add_patch(_make_rect(pred_bbox, 'r', 'Prediction'))
    ax.legend()
    plt.show()


In [None]:
# Define the device in this cell so it does not depend on a later cell having
# been run first (a fresh-kernel run otherwise stops with a NameError here).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Take one validation batch and run the detector on it.
images, gt_bboxes, _, _ = next(iter(val_dataloader))
images = images.to(device)

model.eval()
with torch.no_grad():
    pred_bboxes = model(images).cpu()

# Plotting happens on the CPU.
visualize_random_predictions(images.cpu(), gt_bboxes.cpu(), pred_bboxes, max_samples=4)

NameError: name 'device' is not defined

# Perform detection

In [None]:
# Checkpoint to load (presumably the best weights of the YOLOv5 training run
# named in the path — confirm the run name matches your training output).
YOLO_WEIGHTS_PATH = f'{BASE_DIR}/yolov5/runs/train/ccpd_yolo_baseline2/weights/best.pt'
# Load the trained YOLOv5 model
# NOTE: this rebinds the global `model`. Earlier cells call `model` as the
# bounding-box regressor (evaluate_model / visualization), so re-running them
# after this cell will use the YOLOv5 model instead.
model = torch.hub.load('ultralytics/yolov5', 'custom', path=YOLO_WEIGHTS_PATH)
model.conf = 0.25  # confidence threshold (you can adjust)

def detect_plates(image_path):
    """Detect license plates in an image file and return their crops.

    The global YOLOv5 ``model`` is run on ``image_path``; each detection box
    is clamped to the image bounds and used to slice the image as loaded by
    OpenCV.

    Args:
        image_path: Path of the image file to process.

    Returns:
        list of ``(cropped_image, (x1, y1, x2, y2))`` tuples, one per
        detection with a non-empty box. ``cropped_image`` is a slice of the
        array returned by ``cv2.imread``.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    results = model(image_path)
    detections = results.xyxy[0]  # [x1, y1, x2, y2, conf, class]

    # Load image as numpy; imread returns None instead of raising on failure.
    image = cv2.imread(str(image_path))
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    img_h, img_w = image.shape[:2]

    cropped_images = []
    # tolist() copies all detections to the host once.
    for detection in detections.tolist():
        x1, y1, x2, y2 = (int(value) for value in detection[:4])

        # Clamp to the image: a negative index would silently wrap around
        # when slicing a numpy array.
        x1, y1 = max(0, x1), max(0, y1)
        x2, y2 = min(img_w, x2), min(img_h, y2)
        if x2 <= x1 or y2 <= y1:
            continue  # nothing to crop for a zero-area box

        cropped_images.append((image[y1:y2, x1:x2], (x1, y1, x2, y2)))

    return cropped_images  # list of (cropped_image, bbox)


Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2025-7-13 Python-3.11.13 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)

Fusing layers... 
Model summary: 157 layers, 7012822 parameters, 0 gradients, 15.8 GFLOPs
Adding AutoShape... 


# Baseline model (recognition)

**Step 1: Data preparation for the recognition model.** The recognition model should see only the license-plate region of each image, so we crop it out using the ground-truth bounding boxes. The helper below performs the cropping: the bounding boxes from `CCPDDataset` are normalized `[x1, y1, x2, y2]`, so they are first denormalized to pixel coordinates relative to the 416x416 input image, and the plate region is then cropped and resized.

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def crop_license_plate(image_tensor_batch, gt_bboxes_batch, target_size=(64, 128)):
    """
    Crops license plates from a batch of image tensors using ground truth bounding boxes.

    Args:
        image_tensor_batch (torch.Tensor): Batch of images, shape (B, C, H, W).
                                           Assumed to be normalized if model expects normalized input.
        gt_bboxes_batch (torch.Tensor): Batch of normalized bounding boxes (x1, y1, x2, y2),
                                        shape (B, 4). Coordinates are relative to image_tensor_batch.
        target_size (tuple): Desired (height, width) for the cropped plate.

    Returns:
        torch.Tensor: Batch of cropped and resized license plate images, shape
        (B, C, target_size[0], target_size[1]). A box that is empty after clamping
        to the image yields an all-zero plate (and a printed warning).
    """
    batch_size, channels, H, W = image_tensor_batch.shape
    target_h, target_w = target_size[0], target_size[1]

    if batch_size == 0:
        # torch.stack cannot stack an empty list; return an empty batch instead.
        return image_tensor_batch.new_zeros((0, channels, target_h, target_w))

    # Denormalize every box with one tensor op and copy to the host once,
    # instead of four device syncs (int(tensor)) per sample.
    scale = gt_bboxes_batch.new_tensor([W, H, W, H])
    pixel_boxes = (gt_bboxes_batch[:, :4] * scale).tolist()

    cropped_plates = []
    for i in range(batch_size):
        img = image_tensor_batch[i]  # Single image (C, H, W)
        raw_x1, raw_y1, raw_x2, raw_y2 = pixel_boxes[i]

        # Truncate to integer pixels and clamp to the image bounds
        x1 = max(0, int(raw_x1))
        y1 = max(0, int(raw_y1))
        x2 = min(W, int(raw_x2))
        y2 = min(H, int(raw_y2))

        crop_width = x2 - x1
        crop_height = y2 - y1

        if crop_width <= 0 or crop_height <= 0:
            # Invalid bbox: substitute an all-zero plate that already has the
            # target size and matches the image's dtype/device so it stacks cleanly.
            print(f"Warning: Invalid bbox {gt_bboxes_batch[i]} resulted in zero/negative crop size. Using black image.")
            plate = torch.zeros((channels, target_h, target_w), dtype=img.dtype, device=img.device)
        else:
            # crop expects (top, left, height, width)
            plate_crop = TF.crop(img, top=y1, left=x1, height=crop_height, width=crop_width)
            plate = TF.resize(plate_crop, target_size, antialias=True) # antialias for newer torchvision

        cropped_plates.append(plate)

    return torch.stack(cropped_plates)

**Step 2: Design the recognition model (`LicensePlateRecognizer`).** This model takes the cropped license-plate image (e.g., 64x128 pixels) and outputs a prediction for each of the 7 character positions. The positions use three character sets: *provinces* (Chinese characters representing provinces), *alphabets* (a single letter), and *ads* (alphanumeric characters: digits and letters; 'I' and 'O' are sometimes excluded, but CCPD includes 'O'). `CCPDDataset` already defines these lists: `self.provinces` (34 classes, including 'O' for other/unknown), `self.alphabets` (25 classes, including 'O'), and `self.ads` (35 classes: A-Z excluding I, plus 0-9, plus 'O'). The model has a shared CNN backbone that extracts features from the plate, followed by separate fully connected "heads" that each predict one character.

In [None]:
class LicensePlateRecognizer(nn.Module):
    """CNN that classifies each of the 7 characters of a cropped license plate.

    A shared convolutional backbone feeds seven independent linear heads:
    one for the province, one for the alphabet letter and five for the
    remaining "ads" characters. ``forward`` returns the seven logit tensors
    as a list because the number of classes differs between heads.
    """

    def __init__(self, num_provinces, num_alphabets, num_ads, input_height=64, input_width=128):
        super().__init__()
        self.num_provinces = num_provinces
        self.num_alphabets = num_alphabets
        self.num_ads = num_ads

        # One entry per backbone stage: (in_channels, out_channels, pool window).
        # Each stage is Conv3x3(pad=1) -> ReLU -> MaxPool(window, stride=window).
        # For a 64x128 input the feature map goes
        # 64x128 -> 32x64 -> 16x32 -> 8x16 -> 4x16; the last pool only halves
        # the height to keep more width.
        stage_specs = (
            (3, 32, (2, 2)),
            (32, 64, (2, 2)),
            (64, 128, (2, 2)),
            (128, 256, (2, 1)),
        )
        backbone = []
        for in_channels, out_channels, pool_window in stage_specs:
            backbone.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
            backbone.append(nn.ReLU())
            backbone.append(nn.MaxPool2d(kernel_size=pool_window, stride=pool_window))
        self.conv_layers = nn.Sequential(*backbone)

        # Height is halved four times (/16), width three times (/8).
        self.flattened_size = 256 * (input_height // 16) * (input_width // 8)

        # One classification head per character position.
        self.fc_province = nn.Linear(self.flattened_size, num_provinces)
        self.fc_alphabet = nn.Linear(self.flattened_size, num_alphabets)
        # Five 'ads' heads, registered as fc_ad1 ... fc_ad5.
        for slot in range(1, 6):
            setattr(self, f"fc_ad{slot}", nn.Linear(self.flattened_size, num_ads))

    def forward(self, x):
        """Return a list of 7 logit tensors, one (B, num_classes) tensor per position."""
        features = self.conv_layers(x)
        features = features.view(features.shape[0], -1)  # (B, flattened_size)

        heads = (
            self.fc_province,
            self.fc_alphabet,
            self.fc_ad1,
            self.fc_ad2,
            self.fc_ad3,
            self.fc_ad4,
            self.fc_ad5,
        )
        # A list rather than a stacked tensor, since class counts differ per head.
        return [head(features) for head in heads]

# Training (recognition)

**Step 3: Define the loss function.** We apply `nn.CrossEntropyLoss` to each of the 7 character predictions and sum the seven losses into a single training loss.

In [None]:
# Shared criterion applied independently to every character position.
recognition_criterion = nn.CrossEntropyLoss()

def calculate_recognition_loss(predictions_list, char_indices_batch):
    """
    Sums the cross-entropy losses of the 7 character positions.

    Args:
        predictions_list (list of torch.Tensor): 7 tensors of raw logits; entry i has
                                                 shape (Batch_size, Num_classes_for_position_i).
        char_indices_batch (torch.Tensor): Ground truth character indices, shape (Batch_size, 7);
                                           column i holds the targets for position i.

    Returns:
        torch.Tensor: The summed loss over the 7 positions.
    """
    per_position_losses = (
        recognition_criterion(predictions_list[position], char_indices_batch[:, position])
        for position in range(7)
    )
    return sum(per_position_losses)

Step 4: Training Loop for the Recognition Model

In [None]:
# File the trained recognition weights are saved to by the training cell.
# Alternative filename kept for reference:
#RECOGNITION_MODEL_PATH = f"{BASE_DIR}/license_plate_recognizer.pth"
RECOGNITION_MODEL_PATH = f"{BASE_DIR}/recognition_model.pth"

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Training Configuration ---
RECOGNITION_EPOCHS = 10 # Adjust as needed
# Single source of truth for the plate size: used both to build the model and
# to crop/resize the plates, so the two cannot drift apart.
PLATE_CROP_TARGET_SIZE = (64, 128) # (height, width) for cropped plates

# Class counts come from the dataset instance (train_dataset must be initialized)
num_provinces = len(train_dataset.provinces)
num_alphabets = len(train_dataset.alphabets)
num_ads = len(train_dataset.ads)

recognition_model = LicensePlateRecognizer(
    num_provinces=num_provinces,
    num_alphabets=num_alphabets,
    num_ads=num_ads,
    input_height=PLATE_CROP_TARGET_SIZE[0],
    input_width=PLATE_CROP_TARGET_SIZE[1]
).to(device)

recognition_optimizer = optim.Adam(recognition_model.parameters(), lr=1e-4) # Start with a smaller LR

print(f"Starting recognition model training on {device}...")
print(f"Num provinces: {num_provinces}, Num alphabets: {num_alphabets}, Num ads: {num_ads}")


for epoch in range(RECOGNITION_EPOCHS):
    recognition_model.train()
    epoch_rec_loss = 0

    # Same train_dataloader as for detection; only the plate crops are used here
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{RECOGNITION_EPOCHS} (Recognition)")

    for i, (images_batch, gt_bboxes_batch, _, char_indices_batch) in enumerate(progress_bar):
        images_batch = images_batch.to(device)       # Full images (B, C, 416, 416)
        gt_bboxes_batch = gt_bboxes_batch.to(device) # Normalized GT bboxes (B, 4)
        char_indices_batch = char_indices_batch.to(device) # GT char indices (B, 7)

        # 1. Crop license plates using GT bounding boxes.
        # The crops inherit the device of images_batch, so no extra transfer is needed.
        cropped_plates_batch = crop_license_plate(images_batch, gt_bboxes_batch, target_size=PLATE_CROP_TARGET_SIZE)
        # cropped_plates_batch is (B, C, PLATE_CROP_TARGET_SIZE[0], PLATE_CROP_TARGET_SIZE[1])

        # 2. Forward pass through recognition model
        recognition_optimizer.zero_grad()
        predictions_list = recognition_model(cropped_plates_batch)

        # 3. Calculate loss
        rec_loss = calculate_recognition_loss(predictions_list, char_indices_batch)

        # 4. Backward pass and optimize
        rec_loss.backward()
        recognition_optimizer.step()

        epoch_rec_loss += rec_loss.item()
        progress_bar.set_postfix(rec_loss=epoch_rec_loss / (i + 1))

    # max(..., 1) keeps an empty dataloader from raising ZeroDivisionError
    avg_epoch_rec_loss = epoch_rec_loss / max(len(train_dataloader), 1)
    print(f"Epoch {epoch+1} Recognition Training: Avg. Loss: {avg_epoch_rec_loss:.4f}")

# Save the trained recognition model
print(f"Saving recognition model to {RECOGNITION_MODEL_PATH}")
torch.save(recognition_model.state_dict(), RECOGNITION_MODEL_PATH)
print("Recognition model saved.")

print("Recognition model training finished.")

Starting recognition model training on cuda...
Num provinces: 34, Num alphabets: 25, Num ads: 35


Epoch 1/10 (Recognition): 100%|██████████| 32/32 [00:15<00:00,  2.10it/s, rec_loss=18.5]


Epoch 1 Recognition Training: Avg. Loss: 18.5122


Epoch 2/10 (Recognition): 100%|██████████| 32/32 [00:15<00:00,  2.09it/s, rec_loss=16.3]


Epoch 2 Recognition Training: Avg. Loss: 16.2975


Epoch 3/10 (Recognition): 100%|██████████| 32/32 [00:15<00:00,  2.06it/s, rec_loss=16.2]


Epoch 3 Recognition Training: Avg. Loss: 16.2088


Epoch 4/10 (Recognition): 100%|██████████| 32/32 [00:15<00:00,  2.13it/s, rec_loss=16.1]


Epoch 4 Recognition Training: Avg. Loss: 16.1489


Epoch 5/10 (Recognition): 100%|██████████| 32/32 [00:15<00:00,  2.09it/s, rec_loss=16.1]


Epoch 5 Recognition Training: Avg. Loss: 16.0797


Epoch 6/10 (Recognition): 100%|██████████| 32/32 [00:15<00:00,  2.12it/s, rec_loss=16]


Epoch 6 Recognition Training: Avg. Loss: 16.0083


Epoch 7/10 (Recognition): 100%|██████████| 32/32 [00:15<00:00,  2.01it/s, rec_loss=16]


Epoch 7 Recognition Training: Avg. Loss: 15.9513


Epoch 8/10 (Recognition): 100%|██████████| 32/32 [00:15<00:00,  2.08it/s, rec_loss=15.9]


Epoch 8 Recognition Training: Avg. Loss: 15.9039


Epoch 9/10 (Recognition): 100%|██████████| 32/32 [00:15<00:00,  2.11it/s, rec_loss=15.9]


Epoch 9 Recognition Training: Avg. Loss: 15.8607


Epoch 10/10 (Recognition): 100%|██████████| 32/32 [00:15<00:00,  2.08it/s, rec_loss=15.8]

Epoch 10 Recognition Training: Avg. Loss: 15.7740
Saving recognition model to /content/recognition_model.pth
Recognition model saved.
Recognition model training finished.





# Evaluation (recognition)

**Step 5: Evaluate the recognition model.** To see how well the model does on the validation set, we compute two metrics: (1) per-character accuracy — for each of the 7 positions, how often the predicted character is correct; and (2) full-plate accuracy — how often the entire 7-character plate string is predicted correctly.

In [None]:
# Create validation dataloader if not already done
# Ensure val_dataset is created with the same transforms and uses the val split file
# val_dataset = CCPDDataset(data_dir=VAL_DATASET, split_file=VAL_SPLIT_FILE, transform=transform)

In [None]:
#val_dataset_small_rec = SmallSubsetDataset(val_dataset, max_samples=NUM_SAMPLES) # Smaller subset for faster eval
#val_dataloader_rec = DataLoader(val_dataset_small_rec, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate)

In [None]:
# Disabled: this first version of evaluate_recognition_model is wrapped in a
# bare string literal, so it is never defined. The version that also plots
# example crops (defined in a later cell) is the one the notebook calls.
"""
def evaluate_recognition_model(model, dataloader, device, dataset_instance, crop_target_size):
    model.eval()
    total_samples = 0

    # For per-character accuracy
    correct_chars_counts = [0] * 7 # One counter for each of the 7 character positions
    total_chars_counts = [0] * 7

    # For full plate accuracy
    correct_full_plates = 0

    # To store some example predictions
    example_predictions = []
    max_examples_to_show = 5

    with torch.no_grad():
        progress_bar = tqdm(dataloader, desc="Evaluating Recognition Model")
        for images_batch, gt_bboxes_batch, gt_plate_strings_batch, char_indices_batch in progress_bar:
            images_batch = images_batch.to(device)
            gt_bboxes_batch = gt_bboxes_batch.to(device)
            char_indices_batch = char_indices_batch.to(device) # (B, 7)

            # 1. Crop plates
            cropped_plates_batch = crop_license_plate(images_batch, gt_bboxes_batch, target_size=crop_target_size)

            # 2. Get predictions
            predictions_list = model(cropped_plates_batch.to(device)) # List of 7 tensors (B, Num_Classes)

            batch_size = char_indices_batch.size(0)
            total_samples += batch_size

            # Process predictions for the batch
            # predicted_indices_batch will be (B, 7)
            predicted_indices_batch = torch.zeros_like(char_indices_batch)
            for char_pos in range(7):
                # predictions_list[char_pos] is (B, Num_Classes_for_this_pos)
                # Take argmax to get the predicted class index
                predicted_indices_batch[:, char_pos] = torch.argmax(predictions_list[char_pos], dim=1)

            # 3. Compare with ground truth
            for i in range(batch_size): # Iterate over samples in the batch
                gt_indices_sample = char_indices_batch[i] # (7)
                pred_indices_sample = predicted_indices_batch[i] # (7)

                is_full_plate_correct = True
                for char_pos in range(7):
                    total_chars_counts[char_pos] += 1
                    if gt_indices_sample[char_pos] == pred_indices_sample[char_pos]:
                        correct_chars_counts[char_pos] += 1
                    else:
                        is_full_plate_correct = False

                if is_full_plate_correct:
                    correct_full_plates += 1

                # Store some examples
                if len(example_predictions) < max_examples_to_show:
                    gt_str = dataset_instance._indices_to_string(gt_indices_sample.cpu().tolist())
                    pred_str = dataset_instance._indices_to_string(pred_indices_sample.cpu().tolist())
                    example_predictions.append({"gt": gt_str, "pred": pred_str, "correct": is_full_plate_correct})

    # Calculate accuracies
    per_char_accuracies = [(correct_chars_counts[j] / total_chars_counts[j] if total_chars_counts[j] > 0 else 0) for j in range(7)]
    full_plate_accuracy = correct_full_plates / total_samples if total_samples > 0 else 0

    print("\n--- Recognition Evaluation Results ---")
    for j in range(7):
        print(f"Character Position {j+1} Accuracy: {per_char_accuracies[j]:.4f}")
    print(f"Full License Plate Accuracy: {full_plate_accuracy:.4f}")

    print("\n--- Example Predictions ---")
    for ex in example_predictions:
        print(f"GT: {ex['gt']:<10} | Pred: {ex['pred']:<10} | Correct: {ex['correct']}")

    return per_char_accuracies, full_plate_accuracy
"""

In [None]:
# Disabled: the call below sits inside a bare string literal and never runs.
# The evaluation that actually executes is the later cell that also shows
# example images.
"""
print("\nEvaluating recognition model on validation set...")
# Pass the original val_dataset instance for _indices_to_string method
# and the crop_target_size used during training/inference
evaluate_recognition_model(recognition_model, val_dataloader, device, val_dataset, PLATE_CROP_TARGET_SIZE)
"""

Same evaluation but with images

In [None]:
def unnormalize_image(tensor_image, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
    """Undo a per-channel ``(x - mean) / std`` normalization and clip to [0, 1].

    Args:
        tensor_image (torch.Tensor): Image of shape (C, H, W) (a leading batch
            dimension also broadcasts correctly).
        mean: Per-channel means, as a sequence or a tensor of C values
            (any shape with C elements, e.g. (C,) or (C, 1, 1)).
        std: Per-channel standard deviations, same conventions as ``mean``.

    Returns:
        torch.Tensor: ``tensor_image * std + mean`` clamped to [0, 1], on the
        same device as ``tensor_image``.
    """
    # Compute in at least float32 so integer or half inputs do not truncate
    # the statistics; float64 inputs stay float64.
    stats_dtype = torch.promote_types(tensor_image.dtype, torch.float32)

    def _channel_stats(values):
        # Handles sequences and tensors alike: move to the image's device and
        # reshape to (C, 1, 1) so the values broadcast over H and W.
        stats = torch.as_tensor(values, dtype=stats_dtype, device=tensor_image.device)
        return stats.reshape(-1, 1, 1)

    unnormalized_image = tensor_image * _channel_stats(std) + _channel_stats(mean)
    return torch.clamp(unnormalized_image, 0, 1) # Clip to [0, 1] range

def _show_recognition_examples(examples, max_cols=3):
    """Plot stored example crops in a grid, titled with their GT/predicted strings."""
    num_examples = len(examples)
    cols = min(num_examples, max_cols)
    rows = (num_examples + cols - 1) // cols

    # squeeze=False always yields a 2-D axes array, so a single example needs
    # no special-casing before flattening.
    fig, axes = plt.subplots(rows, cols, figsize=(cols * 5, rows * 3), squeeze=False)
    axes = axes.flatten()

    for ax, ex in zip(axes, examples):
        # Unnormalize and permute for display (C, H, W) -> (H, W, C)
        img_to_show = unnormalize_image(ex['image']).permute(1, 2, 0).numpy()

        ax.imshow(img_to_show)
        title_color = 'green' if ex['correct'] else 'red'
        ax.set_title(f"GT: {ex['gt']}\nPred: {ex['pred']}", color=title_color, fontsize=10)
        ax.axis('off')

    # Hide any unused subplots
    for ax in axes[num_examples:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()


def evaluate_recognition_model(model, dataloader, device, dataset_instance, crop_target_size, num_examples_to_show=5):
    """Evaluate the recognition model on ground-truth plate crops.

    Plates are cropped with ``crop_license_plate`` using the ground-truth
    boxes, classified by ``model``, and compared with the ground-truth
    character indices. Per-position and full-plate accuracies are printed,
    and up to ``num_examples_to_show`` crops are plotted with their
    ground-truth and predicted strings.

    Args:
        model: Recognition model returning a list of 7 logit tensors.
        dataloader: Yields (images, gt_bboxes, gt_plate_strings, char_indices).
        device: Device used for inference.
        dataset_instance: Object providing ``_indices_to_string`` to decode
            a list of 7 indices into a plate string.
        crop_target_size: (height, width) passed to ``crop_license_plate``.
        num_examples_to_show: Maximum number of example crops to plot.

    Returns:
        tuple: ``(per_char_accuracies, full_plate_accuracy)`` where the first
        item is a list of 7 floats.
    """
    num_positions = 7
    model.eval()
    total_samples = 0

    # Per-position correct counts (kept on the CPU) and full-plate hits
    correct_chars_counts = torch.zeros(num_positions, dtype=torch.long)
    correct_full_plates = 0

    # Example predictions with their cropped images
    example_predictions_with_images = []

    with torch.no_grad():
        progress_bar = tqdm(dataloader, desc="Evaluating Recognition Model")
        for images_batch, gt_bboxes_batch, _, char_indices_batch in progress_bar:
            images_batch = images_batch.to(device)
            gt_bboxes_batch = gt_bboxes_batch.to(device)
            char_indices_batch = char_indices_batch.to(device) # (B, 7)

            # 1. Crop plates -> (B, C, H_crop, W_crop) on `device`
            cropped_plates_batch = crop_license_plate(images_batch, gt_bboxes_batch, target_size=crop_target_size)

            # 2. Get predictions: list of 7 tensors (B, Num_Classes)
            predictions_list = model(cropped_plates_batch)

            # 3. Compare with ground truth, vectorized over the whole batch
            # instead of one device sync per character.
            predicted_indices_batch = torch.stack(
                [logits.argmax(dim=1) for logits in predictions_list[:num_positions]], dim=1
            ) # (B, 7)
            matches = predicted_indices_batch == char_indices_batch[:, :num_positions] # (B, 7) bool
            plate_correct = matches.all(dim=1) # (B,)

            batch_size = char_indices_batch.size(0)
            total_samples += batch_size
            correct_chars_counts += matches.sum(dim=0).cpu()
            correct_full_plates += int(plate_correct.sum())

            # Keep the first few samples (in dataloader order) as examples
            still_needed = max(num_examples_to_show - len(example_predictions_with_images), 0)
            for i in range(min(still_needed, batch_size)):
                example_predictions_with_images.append({
                    "image": cropped_plates_batch[i].cpu(), # (C, H_crop, W_crop), on CPU for plotting
                    "gt": dataset_instance._indices_to_string(char_indices_batch[i].cpu().tolist()),
                    "pred": dataset_instance._indices_to_string(predicted_indices_batch[i].cpu().tolist()),
                    "correct": bool(plate_correct[i])
                })

    # Calculate accuracies
    per_char_accuracies = [
        (correct / total_samples if total_samples > 0 else 0)
        for correct in correct_chars_counts.tolist()
    ]
    full_plate_accuracy = correct_full_plates / total_samples if total_samples > 0 else 0

    print("\n--- Recognition Evaluation Results ---")
    for position, position_accuracy in enumerate(per_char_accuracies, start=1):
        print(f"Character Position {position} Accuracy: {position_accuracy:.4f}")
    print(f"Full License Plate Accuracy: {full_plate_accuracy:.4f}")

    print("\n--- Example Predictions with Images ---")
    if not example_predictions_with_images:
        print("No examples to show.")
    else:
        _show_recognition_examples(example_predictions_with_images)

    return per_char_accuracies, full_plate_accuracy

In [None]:
# Requires recognition_model, val_dataloader, device, val_dataset and PLATE_CROP_TARGET_SIZE.
# `val_dataloader_rec` is only created in a commented-out cell, so referencing it
# fails on a fresh kernel; evaluate on `val_dataloader` instead.

print("\nEvaluating recognition model on validation set with image examples...")
evaluate_recognition_model(
    recognition_model,
    val_dataloader,
    device,
    val_dataset, # Pass the CCPDDataset instance (it must provide _indices_to_string; use the wrapped base dataset if val_dataset is a SmallSubsetDataset)
    PLATE_CROP_TARGET_SIZE,
    num_examples_to_show=10 # Or any number you prefer
)


Evaluating recognition model on validation set with image examples...


Evaluating Recognition Model: 100%|██████████| 32/32 [00:12<00:00,  2.48it/s]



--- Recognition Evaluation Results ---
Character Position 1 Accuracy: 0.9620
Character Position 2 Accuracy: 0.9180
Character Position 3 Accuracy: 0.0340
Character Position 4 Accuracy: 0.0850
Character Position 5 Accuracy: 0.0930
Character Position 6 Accuracy: 0.1220
Character Position 7 Accuracy: 0.0980
Full License Plate Accuracy: 0.0000

--- Example Predictions with Images ---


([0.962, 0.918, 0.034, 0.085, 0.093, 0.122, 0.098], 0.0)

# YOLOv5 + PDLPR

In [None]:
#FOR TESTING
NUM_SAMPLES = 10
# Keep the full dataset under its own name instead of rebinding `val_dataset`
# from the CCPDDataset to its subset wrapper within the same cell.
val_dataset_full = CCPDDataset(data_dir=VAL_DATASET, transform=transform_val)
val_dataset = SmallSubsetDataset(val_dataset_full, max_samples=NUM_SAMPLES, random_subset=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate)
print(len(val_dataset))

Found 99996 image files in the data directory.
No split file provided, using all image files.
10


In [None]:
# Config
CONF_THRESHOLD = 0.25  # detections below this confidence are discarded
# NOTE: reassigning IOU_THRESHOLD here does not change the default already
# bound in evaluate_model's signature (defaults are evaluated at def time).
IOU_THRESHOLD = 0.5
font_size = 24
font_path = FONT_PATH
# Output directory, relative to the current working directory.
save_dir = Path("results_with_ocr")
save_dir.mkdir(exist_ok=True)

# Silence deprecation/future warnings for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Best effort: fall back to PIL's built-in default font if the configured
# font cannot be loaded.
try:
    font = ImageFont.truetype(font_path, font_size)
    print(f"Using font: {font.getname()}")
except Exception as e:
    print(f"Could not load font: {e}")
    font = ImageFont.load_default()

# Load models
detector = yolov5.load('keremberke/yolov5m-license-plate')
ocr_model = PaddleOCR(lang='ch', use_angle_cls=True)

# Clear previous results (only regular files directly inside save_dir)
for file in save_dir.glob("*"):
    if file.is_file():
        file.unlink()

# Metrics counters
num_correct_detections = 0
num_total_gt = 0
num_total_pred = 0
num_correct_ocr = 0

def expand_box(x1, y1, x2, y2, img_w, img_h, ratio=0.1):
    """Grow a box by ``ratio`` of its width/height on every side, clamped to the image.

    Args:
        x1, y1, x2, y2: Box corners in pixels.
        img_w, img_h: Image width and height used as upper bounds.
        ratio: Fraction of the box width (height) added on the left and right
            (top and bottom).

    Returns:
        tuple: ``(x1, y1, x2, y2)`` of the expanded box as ints within
        ``[0, img_w]`` x ``[0, img_h]``.
    """
    margin_x = ratio * (x2 - x1)
    margin_y = ratio * (y2 - y1)

    left = max(0, int(x1 - margin_x))
    top = max(0, int(y1 - margin_y))
    right = min(img_w, int(x2 + margin_x))
    bottom = min(img_h, int(y2 + margin_y))

    return left, top, right, bottom

def iou(boxA, boxB):
    """Intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.

    Only the first four entries of each box are read. Returns ``0`` when the
    union has zero area, so two degenerate boxes never trigger a division by
    zero.
    """
    # Corners of the overlap rectangle (may be inverted when the boxes are disjoint).
    left, top = max(boxA[0], boxB[0]), max(boxA[1], boxB[1])
    right, bottom = min(boxA[2], boxB[2]), min(boxA[3], boxB[3])
    overlap = max(0, right - left) * max(0, bottom - top)

    area_a = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    area_b = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    union = area_a + area_b - overlap

    return overlap / union if union != 0 else 0

# Run detector + OCR over every validation sample, save an annotated image per
# sample, and accumulate detection / OCR counters for the report below.
total_images = len(val_dataset)  # number of images in the evaluated dataset
processed_images = 0

with tqdm(total=total_images, desc="Processing") as pbar:
    for batch in val_dataloader:
        images, gt_bboxes, gt_texts, _ = batch
        batch_size = images.size(0)
        for i in range(batch_size):
            image_tensor = images[i]  # [C,H,W]
            gt_bbox = gt_bboxes[i]
            gt_text = gt_texts[i]

            # Tensor -> uint8 BGR image for OpenCV / the detector.
            # NOTE(review): assumes the tensor holds RGB values in [0, 1]
            # (i.e. the validation transform does not mean/std-normalise) — confirm.
            img_np = image_tensor.permute(1, 2, 0).cpu().numpy()
            img_np = (img_np * 255).astype(np.uint8)
            img_bgr = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
            h, w = img_bgr.shape[:2]

            # Ground-truth bbox is scaled by image width/height to pixel coordinates.
            x1_gt = int(gt_bbox[0].item() * w)
            y1_gt = int(gt_bbox[1].item() * h)
            x2_gt = int(gt_bbox[2].item() * w)
            y2_gt = int(gt_bbox[3].item() * h)
            gt_box = [x1_gt, y1_gt, x2_gt, y2_gt]
            num_total_gt += 1

            # Detection: keep only boxes above the confidence threshold.
            results = detector(img_bgr)
            preds = results.pred[0].cpu().numpy()
            preds = preds[preds[:, 4] >= CONF_THRESHOLD]

            pred_boxes = [pred[:4] for pred in preds]
            num_total_pred += len(pred_boxes)

            # Pick the predicted box that overlaps the ground truth the most.
            best_iou = 0
            matched_idx = -1
            for j, pbox in enumerate(pred_boxes):
                iou_score = iou(gt_box, pbox)
                if iou_score > best_iou:
                    best_iou = iou_score
                    matched_idx = j

            detection_correct = False
            pred_text = "unreadable"
            # Reset per sample: this is the label drawn next to a matched box.
            # Without the reset, a sample whose OCR returns nothing would either
            # raise NameError (first sample) or reuse the previous sample's text.
            pred_text_clean = pred_text

            if best_iou >= IOU_THRESHOLD:
                num_correct_detections += 1
                detection_correct = True

                px1, py1, px2, py2 = map(int, pred_boxes[matched_idx])
                px1, py1, px2, py2 = expand_box(px1, py1, px2, py2, w, h)

                plate_crop = img_bgr[py1:py2, px1:px2]
                ocr_results = None
                # An empty crop cannot be resized or recognised; treat it as unreadable.
                if plate_crop.size > 0:
                    # Upscale very small crops to at least 100x32 before OCR.
                    if plate_crop.shape[0] < 32 or plate_crop.shape[1] < 100:
                        scale_h = max(32, plate_crop.shape[0])
                        scale_w = max(100, plate_crop.shape[1])
                        plate_crop = cv2.resize(plate_crop, (scale_w, scale_h))

                    plate_crop_rgb = cv2.cvtColor(plate_crop, cv2.COLOR_BGR2RGB)
                    ocr_results = ocr_model.ocr(plate_crop_rgb)

                if ocr_results:
                    texts = []
                    scores = []
                    for res_dict in ocr_results:
                        texts.extend(res_dict.get('rec_texts', []))
                        scores.extend(res_dict.get('rec_scores', []))
                    # argmax on an empty score list raises, so require both.
                    if texts and scores:
                        best_idx = np.argmax(scores)
                        pred_text = texts[best_idx]
                        # The separator dot is ignored on both sides of the comparison.
                        pred_text_clean = pred_text.replace('·', '')
                        gt_text_clean = gt_text.replace('·', '')
                        # A plate counts as read correctly with at most one edit.
                        if editdistance.eval(pred_text_clean, gt_text_clean) <= 1:
                            num_correct_ocr += 1

            # Visualization
            img_pil = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
            draw = ImageDraw.Draw(img_pil)

            print(f"gt_text: '{gt_text}'")

            # Ground truth: cyan box with its text above, kept inside the image.
            draw.rectangle(gt_box, outline="cyan", width=2)
            gt_text_y = max(0, min(gt_box[1] - 30, h - font_size))
            draw.text((gt_box[0], gt_text_y), gt_text, font=font, fill="cyan")

            # All predicted boxes in magenta.
            for pbox in pred_boxes:
                pbox_int = list(map(int, pbox))
                draw.rectangle(pbox_int, outline="magenta", width=2)

            # Recognised text (or "unreadable") below the matched, expanded box.
            if detection_correct:
                pred_text_y = max(0, min(py2 + 5, h - font_size))
                draw.text((px1, pred_text_y), pred_text_clean, font=font, fill="magenta")

            img_out = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)
            # Save results; files are numbered by the running ground-truth count.
            out_path = save_dir / f"det_{num_total_gt}.jpg"
            cv2.imwrite(str(out_path), img_out)

            processed_images += 1
            pbar.update(1)


# Metrics report
precision = num_correct_detections / num_total_pred if num_total_pred > 0 else 0
recall = num_correct_detections / num_total_gt if num_total_gt > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
# OCR accuracy is measured only over plates whose detection matched the ground truth.
ocr_accuracy = num_correct_ocr / num_correct_detections if num_correct_detections > 0 else 0

print("\n--- Evaluation Results ---")
print(f"Detection Precision: {precision:.3f}")
print(f"Detection Recall: {recall:.3f}")
print(f"Detection F1-score: {f1:.3f}")
print(f"OCR Accuracy (on detected plates): {ocr_accuracy:.3f}")

print("DONE.")


Using font: ('Noto Sans SC', 'Thin')


[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_doc_ori), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('UVDoc', None)[0m
[33mThe model(UVDoc) is not supported to run in MKLDNN mode! Using `paddle` instead![0m
[32mUsing official model (UVDoc), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_textline_ori), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mUsing official model (PP-OCRv5_server_det), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('PP-OCRv5_server_rec', None)[0m
[32mUsing official model (PP-OCRv5_server_rec), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

Processing:  10%|█         | 1/10 [00:02<00:19,  2.20s/it]

gt_text: '皖A0N718'


Processing:  20%|██        | 2/10 [00:03<00:15,  1.96s/it]

gt_text: '皖AMR868'


Processing:  30%|███       | 3/10 [00:06<00:14,  2.02s/it]

gt_text: '皖M33778'


Processing:  40%|████      | 4/10 [00:08<00:13,  2.19s/it]

gt_text: '皖AX6V78'


Processing:  50%|█████     | 5/10 [00:10<00:10,  2.09s/it]

gt_text: '皖AY3D16'


Processing:  60%|██████    | 6/10 [00:12<00:07,  1.95s/it]

gt_text: '皖AT1T28'


Processing:  70%|███████   | 7/10 [00:13<00:05,  1.89s/it]

gt_text: '皖AS7C86'


Processing:  80%|████████  | 8/10 [00:15<00:03,  1.91s/it]

gt_text: '皖AHH488'


Processing:  90%|█████████ | 9/10 [00:17<00:01,  1.86s/it]

gt_text: '皖AJ3S89'


Processing: 100%|██████████| 10/10 [00:19<00:00,  1.92s/it]

gt_text: '皖AJ660Z'

--- Evaluation Results ---
Detection Precision: 1.000
Detection Recall: 1.000
Detection F1-score: 1.000
OCR Accuracy (on detected plates): 0.800
DONE.





# Pipeline evaluation

In [None]:
def compute_iou(box1, box2):
    """Return the intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.

    The boxes may be tensors, arrays or plain sequences; the pipeline passes
    normalized (0-1) coordinates. A zero-area union yields ``0.0``.
    """
    # Overlap rectangle: the largest top-left corner and the smallest bottom-right one.
    inter_left = max(box1[0], box2[0])
    inter_top = max(box1[1], box2[1])
    inter_right = min(box1[2], box2[2])
    inter_bottom = min(box1[3], box2[3])

    # Negative extents mean the boxes do not overlap along that axis.
    inter_area = max(0, inter_right - inter_left) * max(0, inter_bottom - inter_top)

    area_first = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area_second = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union_area = area_first + area_second - inter_area

    if union_area == 0:
        return 0.0
    return inter_area / union_area

def predict_license_plate(image_path, detector_model, recognizer_model, dataset_instance, device, full_image_transform, plate_crop_target_size):
    """Run the detector -> crop -> recognizer pipeline on a single image file.

    Args:
        image_path: Path of the image to process; it is opened with PIL as RGB.
        detector_model: Model called on the transformed image batch; the first
            element of its output is used as the plate bounding box.
        recognizer_model: Model called on the cropped plate; it returns one
            prediction per plate character position.
        dataset_instance: Object providing ``_indices_to_string`` to decode the
            predicted class indices.
        device: Torch device the image and crop are moved to.
        full_image_transform: Callable applied to the PIL image; its result is
            batched with ``unsqueeze(0)``.
        plate_crop_target_size: Forwarded to ``crop_license_plate`` as ``target_size``.

    Returns:
        ``(predicted_string, predicted_bbox, cropped_plate_image)`` with the
        bbox clamped to ``[0, 1]`` and both tensors moved to the CPU.
    """
    detector_model.eval()
    recognizer_model.eval()

    pil_image = Image.open(image_path).convert('RGB')
    image_batch = full_image_transform(pil_image).unsqueeze(0).to(device)

    # Stage 1: bounding-box regression on the full image.
    with torch.no_grad():
        bbox_batch = detector_model(image_batch)
    predicted_bbox = bbox_batch[0].clamp(0, 1)

    # Stage 2: cut the plate out of the (already transformed) image tensor.
    plate_batch = crop_license_plate(
        image_batch,
        predicted_bbox.unsqueeze(0),
        target_size=plate_crop_target_size,
    )
    cropped_plate_image = plate_batch[0]

    # Stage 3: one prediction per character position; argmax picks the class index.
    with torch.no_grad():
        position_outputs = recognizer_model(cropped_plate_image.unsqueeze(0).to(device))

    predicted_indices = []
    for position_output in position_outputs:
        predicted_indices.append(torch.argmax(position_output, dim=1).item())
    predicted_string = dataset_instance._indices_to_string(predicted_indices)

    return predicted_string, predicted_bbox.cpu(), cropped_plate_image.cpu()

def predict_license_plate_yolo(image_path, detector_model, recognizer_model, dataset_instance, device, full_image_transform, plate_crop_target_size):
    """Detect a plate with YOLOv5 and read it with the recognizer model.

    Args:
        image_path: Path of the image; it is opened with PIL for cropping and
            also passed as-is to the YOLOv5 model.
        detector_model: YOLOv5 model whose results expose ``.xyxy``.
        recognizer_model: Model called on the normalized plate crop; it returns
            one prediction per plate character position.
        dataset_instance: Object providing ``_indices_to_string`` to decode the
            predicted class indices.
        device: Torch device the plate tensor is moved to.
        full_image_transform: Unused here; kept so both predictors share one
            call signature.
        plate_crop_target_size: ``(width, height)`` of the crop fed to the
            recognizer (it is passed directly to ``PIL.Image.resize``).

    Returns:
        ``(predicted_string, pred_bbox, plate_tensor)`` where ``pred_bbox`` is
        ``[x1, y1, x2, y2]`` divided by the image width/height and
        ``plate_tensor`` is the CPU copy of the recognizer input. When the
        detector finds nothing the result is ``("", zeros(4), None)``.
    """
    detector_model.eval()
    recognizer_model.eval()

    # --- Load and prepare image ---
    img = Image.open(image_path).convert('RGB')
    width, height = img.size

    # --- YOLOv5 Detection ---
    results = detector_model(image_path)
    detections = results.xyxy[0].cpu().numpy()

    if len(detections) == 0:
        # No detections found
        empty_bbox = torch.tensor([0, 0, 0, 0], dtype=torch.float32)
        return "", empty_bbox, None

    # Use the detection with highest confidence (column 4); columns 0-3 are the
    # absolute pixel corners of the box.
    best_det = detections[detections[:, 4].argmax()]
    x1, y1, x2, y2 = best_det[:4]

    # --- Normalize bbox so it is comparable with normalized ground truth ---
    pred_bbox = torch.tensor([
        x1 / width,
        y1 / height,
        x2 / width,
        y2 / height
    ], dtype=torch.float32)

    # --- Crop the license plate region ---
    transform_plate_crop = A.Compose([
        A.Resize(plate_crop_target_size[1], plate_crop_target_size[0]),  # (H, W)
        A.Normalize(mean=(0.5,), std=(0.5,)),  # or use per-channel RGB values if needed
        ToTensorV2()
    ])
    plate_crop = img.crop((x1, y1, x2, y2)).resize(plate_crop_target_size)
    plate_np = np.array(plate_crop)  # PIL -> NumPy
    transformed = transform_plate_crop(image=plate_np)
    plate_tensor = transformed["image"].unsqueeze(0).to(device)

    # --- Recognition ---
    with torch.no_grad():
        predictions_list = recognizer_model(plate_tensor)

    # One prediction per character position; argmax picks the class index.
    predicted_indices = [torch.argmax(pred, dim=1).item() for pred in predictions_list]
    predicted_string = dataset_instance._indices_to_string(predicted_indices)

    return predicted_string, pred_bbox, plate_tensor.cpu()

In [None]:
# Select the detector: True -> YOLOv5 weights, False -> the custom LicensePlateDetector.
USE_YOLO = True

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

print("\n\n--- Starting Full Pipeline Evaluation on Validation Set ---")

# --- Evaluation Parameters ---
NUM_EVAL_SAMPLES = 2000
IOU_THRESHOLD = 0.7 # A stricter threshold for what counts as a good detection

# --- Load models and validation dataset ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if USE_YOLO:
    # Load YOLOv5 model
    loaded_detector = torch.hub.load('ultralytics/yolov5', 'custom', path=YOLO_WEIGHTS_PATH, source='github')
    loaded_detector.conf = 0.25  # adjust confidence threshold if needed
    predict_fn = predict_license_plate_yolo
else:
    # Load Detector Model
    loaded_detector = LicensePlateDetector().to(device)
    loaded_detector.load_state_dict(torch.load(DETECTOR_MODEL_PATH, map_location=device))
    loaded_detector.eval()
    predict_fn = predict_license_plate


# Create a validation dataset instance for evaluation
val_dataset_instance = CCPDDataset(data_dir=VAL_DATASET, transform=transform_val)

# Load Recognizer Model
loaded_recognizer = LicensePlateRecognizer(
    num_provinces=len(val_dataset_instance.provinces),
    num_alphabets=len(val_dataset_instance.alphabets),
    num_ads=len(val_dataset_instance.ads)
).to(device)
loaded_recognizer.load_state_dict(torch.load(RECOGNITION_MODEL_PATH, map_location=device))
loaded_recognizer.eval()

# Clamp the sample count once, before the loop, so the progress bar total, the
# message below and the final denominators all use the number actually evaluated.
if NUM_EVAL_SAMPLES > len(val_dataset_instance):
    print(f"\nWarning: Requested {NUM_EVAL_SAMPLES} samples, but validation set only has {len(val_dataset_instance)}. Evaluating all of them.")
    NUM_EVAL_SAMPLES = len(val_dataset_instance)

print(f"Models loaded. Evaluating on {NUM_EVAL_SAMPLES} samples from the validation set...")

# --- Initialize metric counters ---
iou_scores = []
correct_detections = 0
correct_recognitions = 0
total_chars = 0
correct_chars = 0
e2e_correct_matches = 0

# --- Evaluation Loop ---
for i in tqdm(range(NUM_EVAL_SAMPLES), desc="Evaluating Pipeline"):
    # Ground truth comes from the dataset; the predictors re-read the image from disk.
    _, gt_bbox, gt_string, _ = val_dataset_instance[i]
    sample_image_path = val_dataset_instance.image_files[i]

    # Run the full end-to-end pipeline to get predictions
    pred_string, pred_bbox, _ = predict_fn(
        image_path=sample_image_path,
        detector_model=loaded_detector,
        recognizer_model=loaded_recognizer,
        dataset_instance=val_dataset_instance,
        device=device,
        full_image_transform=transform_val,
        plate_crop_target_size=PLATE_CROP_TARGET_SIZE
    )

    # --- Calculate and Aggregate Metrics for this sample ---

    # 1. Detection Metrics
    # Named `sample_iou` so the `iou()` helper defined in an earlier cell is not shadowed.
    sample_iou = compute_iou(pred_bbox.cpu().numpy(), gt_bbox.cpu().numpy())
    iou_scores.append(sample_iou)
    is_detection_correct = sample_iou >= IOU_THRESHOLD
    if is_detection_correct:
        correct_detections += 1

    # 2. Recognition Metrics
    is_recognition_correct = (pred_string == gt_string)
    if is_recognition_correct:
        correct_recognitions += 1

    # Character Recognition Rate (CRR): position-wise matches; positions missing
    # from a shorter prediction count as errors because zip stops at the shorter string.
    correct_chars += sum(pred_char == gt_char for pred_char, gt_char in zip(pred_string, gt_string))
    total_chars += len(gt_string)

    # 3. End-to-End Metric
    if is_detection_correct and is_recognition_correct:
        e2e_correct_matches += 1

# --- Calculate Final Averages and Accuracies ---
# np.mean of an empty list is nan (with a RuntimeWarning); report 0 like the other metrics.
avg_iou = np.mean(iou_scores) if iou_scores else 0.0
detection_accuracy = correct_detections / NUM_EVAL_SAMPLES if NUM_EVAL_SAMPLES > 0 else 0
recognition_accuracy = correct_recognitions / NUM_EVAL_SAMPLES if NUM_EVAL_SAMPLES > 0 else 0
character_recognition_rate = correct_chars / total_chars if total_chars > 0 else 0
e2e_accuracy = e2e_correct_matches / NUM_EVAL_SAMPLES if NUM_EVAL_SAMPLES > 0 else 0

# --- Print Final Results Summary ---
print("\n" + "="*45)
print("--- FULL PIPELINE EVALUATION RESULTS ---")
print("="*45)
print(f"Evaluated on: {NUM_EVAL_SAMPLES} samples")
print("-" * 45)
print("DETECTION METRICS:")
print(f"  - Average IoU: {avg_iou:.4f}")
print(f"  - Detection Accuracy (IoU >= {IOU_THRESHOLD}): {detection_accuracy * 100:.2f}%")
print("-" * 45)
print("RECOGNITION METRICS:")
print(f"  - Exact Match Accuracy (Full Plate): {recognition_accuracy * 100:.2f}%")
print(f"  - Character Recognition Rate (CRR): {character_recognition_rate * 100:.2f}%")
print("-" * 45)
print("END-TO-END (E2E) METRICS:")
print(f"  - E2E Accuracy (Correct Detection & Recognition): {e2e_accuracy * 100:.2f}%")
print("="*45)