# Computer vision project

- Carlotta Anna Maria Ciani 1881291
- Michela Fuselli 1883535
- Simone Federico Laganà 1946083

Project 4 car plate recognition

Folder structure in this notebook
 - imports: contains the imports and the libraries needed to run the whole notebook
 - globals: it includes the global variables
 - utils: contains various functions as well as the definition of the evaluator class, and the function to perform baseline plate detection
 - network: defines the networks for PDLPR and the CNN CTC network
 - data: defining the dataset class
 - train: this is divided into the training code for YOLO, for PDLPR and for CNN + CTC; the latter is found under the name "Baseline method", which also contains the implementation of the traditional plate detection method.
 - test: Here there is the testing for the single components YOLO, PDLPR and CNN + CTC as well as the pipeline for yolo + pdlpr and the baseline pipeline.

## Imports

In [None]:
%pip install ultralytics
%pip install easyocr
%pip install gdown 

In [None]:
#from google.colab import drive
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
import torchvision.transforms.functional as TF
import torch.nn.functional as F
import torchvision.transforms as T
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.optim as optim
from torch.optim import Adam, SGD

import numpy as np
from itertools import product
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from PIL import Image
from ultralytics import YOLO
from tqdm import tqdm
import cv2
import easyocr
import json
import gdown

#drive.mount('/content/drive')
#path_to_shared_folder = '/content/drive/MyDrive/cv_project_folder/'

# Download the best PDLPR checkpoint from Google Drive: the file is too big to
# be stored in the repository.
url = "https://drive.google.com/uc?id=1PR8ygH66VKKDOaFpoxzR7aLqc_H5VUtC"
output = "models/pdlpr_10_0.0001_16.pt"
# Make sure the destination folder exists before the checkpoint is written into it.
os.makedirs(os.path.dirname(output), exist_ok=True)
# Skip the download when the checkpoint is already on disk, so that
# "Restart & Run All" does not fetch the same large file again.
if not os.path.exists(output):
    gdown.download(url, output, quiet=False)

## Globals

initialization of all the global variables used in this notebook

In [None]:
#General

# Character tables used by the CCPD file names: a plate is stored as indices
# into PROVINCES (1st character), ALPHABETS (2nd) and ADS (all the others)
PROVINCES = list("皖沪津渝冀晋蒙辽吉黑苏浙京闽赣鲁豫鄂湘粤桂琼川贵云藏陕甘青宁新警学O")
ALPHABETS = list("ABCDEFGHJKLMNPQRSTUVWXYZO")
ADS = list("ABCDEFGHJKLMNPQRSTUVWXYZ0123456789O")

# Every distinct character that can appear in a plate, in sorted order
CHAR_LIST = sorted({*PROVINCES, *ALPHABETS, *ADS})

# Two-way lookup between characters and class indices. Real characters are
# numbered from 1 because index 0 is reserved for the CTC blank.
CHAR_IDX = {symbol: position for position, symbol in enumerate(CHAR_LIST, start=1)}
IDX_CHAR = {position: symbol for symbol, position in CHAR_IDX.items()}
IDX_CHAR[0] = '_'  # blank character for CTC

# Dataset root folder (relative path) and image size setting.
# NOTE(review): IMAGE_SIZE_Y is presumably the YOLO input resolution — confirm against the training cell.
DATASET_PATH_Y      = f"dataset"
IMAGE_SIZE_Y        = 640

# YOLOv5 hyperparameters
IOU_THRESHOLD = 0.7
BATCH_SIZE_TRAIN_Y  = 20        # the reference paper uses 50
BATCH_SIZE_TEST_Y   = 4
EPOCHS_TRAIN_Y      = 25        # the reference paper uses 300
LR_INIT_Y           = 0.001     # initial learning rate


# PDLPR hyperparameters (they also appear in the file names of the saved metric plots)
BATCH_SIZE_PDLPR = 16
LR_PDLPR = 1e-4 # original note: "0.00001, mostly used" (presumably the value tried most often — confirm)
NUM_EPOCHS_PDLPR = 5
WEIGHT_DECAY_PDLPR = 0.0001

# CNN + CTC hyperparameters
# (the values below are the best-performing ones among those tried)
BATCH_SIZE_CNN = 32
LR_CNN = 0.001
NUM_EPOCHS_CNN = 60
WEIGHT_DECAY_CNN = 0.0001


## Utils

### useful functions

In [None]:
# Dataset root as a Path object; initialize_labels builds its sub-folders from it
base_dir = Path(DATASET_PATH_Y)

This code extracts the labels from the image file names and saves them into folders that mirror the structure of the dataset.

In [None]:
def _parse_ccpd_stem(stem):
    """Decode the bounding box and the plate string stored in a CCPD file stem.

    The stem is a "-"-separated list of fields; field 2 is the bounding box
    ("x1&y1_x2&y2") and field 4 is the plate as "_"-separated indices into
    PROVINCES (1st character), ALPHABETS (2nd) and ADS (the following six).

    Returns:
        ((x_min, y_min, x_max, y_max), plate) with integer pixel coordinates
        and the plate as an 8-character string.

    Raises:
        IndexError / ValueError when the stem does not follow this layout
        (for instance when fewer than 8 plate indices are present).
    """
    fields = stem.split("-")

    # Field 2: two opposite corners of the box
    corners = fields[2].split("_")
    x1, y1 = map(int, corners[0].split("&"))
    x2, y2 = map(int, corners[1].split("&"))
    bbox = (min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2))

    # Field 4: plate indices, e.g. "0_0_22_27_27_33_16_..."
    character_id_list = fields[4].split("_")
    plate = PROVINCES[int(character_id_list[0])] + ALPHABETS[int(character_id_list[1])]
    # The six remaining characters (positions 2..7) are all mapped through ADS
    for position in range(2, 8):
        plate += ADS[int(character_id_list[position])]

    return bbox, plate


def initialize_labels():
    """Create the YOLO labels, the plate crops and the plate-string labels.

    For every split ("train", "val", "test") each .jpg in
    ``base_dir/images/<split>`` is processed and three files are written,
    all named after the image stem:

    * ``labels/<split>/<stem>.txt``       - YOLO line "0 cx cy w h" (normalized)
    * ``crops/<split>/<stem>.jpg``        - the image cropped to the plate box
    * ``labels_pdlpr/<split>/<stem>.txt`` - the plate as a string

    Images whose file name cannot be parsed are reported and skipped.
    """
    for split in ("train", "val", "test"):
        image_dir = base_dir / "images" / split
        label_dir = base_dir / "labels" / split
        crops_dir = base_dir / "crops" / split
        label_pdlpr_dir = base_dir / "labels_pdlpr" / split

        # Create the output folders if they do not exist yet
        for out_dir in (label_dir, crops_dir, label_pdlpr_dir):
            out_dir.mkdir(parents=True, exist_ok=True)

        # tqdm draws the progress bar; the glob is materialised so it knows the total
        for image_path in tqdm(list(image_dir.glob("*.jpg")), desc=f"Processing - {split}", unit="img"):
            try:
                (x_min, y_min, x_max, y_max), plate = _parse_ccpd_stem(image_path.stem)
            except Exception as e:
                # Best effort: a malformed file name must not stop the whole run
                print(f"Skipping {image_path.name}: {e}")
                continue

            # The context manager closes the file even if cropping or saving fails
            with Image.open(image_path) as img:
                # The image size is needed to normalize the coordinates
                img_width, img_height = img.size
                # Crop to the bounding box and store the result in the crops folder
                cropped_img = img.crop((x_min, y_min, x_max, y_max))
                cropped_img.save(crops_dir / (image_path.stem + ".jpg"))

            # Normalize the bounding box for the YOLO format
            x_center = ((x_min + x_max) / 2) / img_width
            y_center = ((y_min + y_max) / 2) / img_height
            width = (x_max - x_min) / img_width
            height = (y_max - y_min) / img_height

            # 0 is the class id (single class: license plate); the other
            # values are floats with 6 digits after the decimal point
            label_str = f"0 {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}"

            # Save the YOLO label under the same name as the image
            label_path = label_dir / (image_path.stem + ".txt")
            with open(label_path, "w", encoding="utf-8") as f:
                f.write(label_str + "\n")

            # Save the plate string used as the PDLPR label
            label_pdl_pr_path = label_pdlpr_dir / (image_path.stem + ".txt")
            with open(label_pdl_pr_path, "w", encoding="utf-8") as f:
                f.write(plate + "\n")

These are some useful functions that are used in this notebook

In [None]:
def yoloprediction_to_pdlpr_input(x_center, y_center, width, height, image_path):
    """Crop the region predicted by YOLO out of the image (the input for PDLPR).

    Args:
        x_center, y_center, width, height: box in normalized YOLO format,
            i.e. fractions of the image width / height.
        image_path: path of the full image the prediction refers to.

    Returns:
        The cropped PIL image.
    """
    # The context manager releases the file handle once the crop is taken
    with Image.open(image_path) as img:
        image_width, image_height = img.size

        # Scale the normalized box back to pixels
        x_center_pixel = x_center * image_width
        y_center_pixel = y_center * image_height
        width_pixel = width * image_width
        height_pixel = height * image_height

        x_min = int(x_center_pixel - width_pixel / 2)
        x_max = int(x_center_pixel + width_pixel / 2)
        y_min = int(y_center_pixel - height_pixel / 2)
        y_max = int(y_center_pixel + height_pixel / 2)

        # Crop the image according to the bounding box coordinates
        cropped_img = img.crop((x_min, y_min, x_max, y_max))
        # Make sure the pixel data is in memory before the source file is closed
        cropped_img.load()

    return cropped_img

This function computes the intersection over union given two boxes [x1, y1, x2, y2]

In [None]:
def compute_iou(box_1, box_2):
    """Intersection over union of two boxes given as [x1, y1, x2, y2].

    The result is the overlap area divided by the area of the union; boxes
    that do not overlap give 0.
    """
    # Width and height of the overlap, clamped at 0 for disjoint boxes
    overlap_w = max(0, min(box_1[2], box_2[2]) - max(box_1[0], box_2[0]))
    overlap_h = max(0, min(box_1[3], box_2[3]) - max(box_1[1], box_2[1]))
    intersection = overlap_w * overlap_h

    area_1 = (box_1[2] - box_1[0]) * (box_1[3] - box_1[1])
    area_2 = (box_2[2] - box_2[0]) * (box_2[3] - box_2[1])

    # The 1e-6 term avoids a division by zero when both boxes are empty
    union = float(area_1 + area_2 - intersection + 1e-6)
    return intersection / union

`target_to_index` converts the character ids stored in the file name (which refer to the separate province, alphabet and ads tables) into the ids used by the model, i.e. the positions in the sorted list of all the possible characters.

index_to_target converts the index list into the plate characters.

In [None]:
def target_to_index(target_list):
    """Map the 8 per-table plate ids to ids of the unified vocabulary.

    Position 0 is looked up in PROVINCES, position 1 in ALPHABETS and
    positions 2..7 in ADS; every resulting character is then translated
    through CHAR_IDX, the index used by the model.
    """
    # One lookup table per plate position
    tables = [PROVINCES, ALPHABETS] + [ADS] * 6
    return [CHAR_IDX[table[target_list[position]]] for position, table in enumerate(tables)]


def index_to_target(index_list):
    """Translate a sequence of vocabulary ids into the list of their characters (via IDX_CHAR)."""
    return [IDX_CHAR[idx] for idx in index_list]


These functions retrieve the coordinates of the true bounding box (ground truth) for the validation and test splits. The returned format is [x1, y1, x2, y2]; if no label is found or the label is invalid, None is returned.

In [None]:
def _load_gt_box_from_label(image_path, split):
    """Read the YOLO label of ``image_path`` for ``split`` and return it in pixels.

    Args:
        image_path: ``pathlib.Path`` of the image; the label is looked up as
            ``dataset/labels/<split>/<image stem>.txt``.
        split: name of the label sub-folder ("val" or "test").

    Returns:
        ``[x1, y1, x2, y2]`` in absolute pixel coordinates, or ``None`` (after
        printing a warning) when the label file is missing, empty or malformed.
        Only the first object of the label file is considered.
    """
    label_path = Path("dataset/labels") / split / (image_path.stem + ".txt")

    if not label_path.exists():
        print(f"[WARN] No label found for {image_path.name}")
        return None

    # Only the first line (first object) is needed
    with open(label_path, "r") as f:
        first_line = f.readline()

    if not first_line:
        print(f"[WARN] Empty label file for {image_path.name}")
        return None

    try:
        # Expected layout: "class x_center y_center width height".
        # A non-numeric token or a wrong number of tokens raises ValueError.
        _, x_center, y_center, w, h = map(float, first_line.split())
    except ValueError:
        print(f"[WARN] Label parse failed for {image_path.name}")
        return None

    # Only the image size is needed, so the header is read without decoding the pixels
    with Image.open(image_path) as img:
        img_w, img_h = img.size

    # Convert from normalized to absolute coordinates
    cx, cy = x_center * img_w, y_center * img_h
    bw, bh = w * img_w, h * img_h

    x1, y1 = cx - bw / 2, cy - bh / 2
    x2, y2 = cx + bw / 2, cy + bh / 2

    return [x1, y1, x2, y2]


def load_gt_box_from_label_validation(image_path):
    """Ground-truth box [x1, y1, x2, y2] of a validation image, or None if the label is unavailable."""
    return _load_gt_box_from_label(image_path, "val")


def load_gt_box_from_label_test(image_path):
    """Ground-truth box [x1, y1, x2, y2] of a test image, or None if the label is unavailable."""
    return _load_gt_box_from_label(image_path, "test")


Function that builds the character vocabulary (every character found in the label files, Chinese province characters included) that is used to build idx2char and char2idx. The inputs are **label_folder** (str), the path to the folder containing the license plate label .txt files, **file_name** (str), the .json file in which the vocabulary is saved for later use, and **include_blank** (bool), which tells whether to reserve index 0 for the CTC blank token ('-').

In [None]:
def build_vocab(label_folder, file_name, include_blank=True):
    """Build the character vocabulary from the plate label files and save it.

    Args:
        label_folder: folder containing the plate label .txt files.
        file_name: path of the .json file that receives the char -> index map.
        include_blank: when True, index 0 is reserved for the CTC blank "-".

    Returns:
        (char2idx, idx2char): the two lookup dictionaries.
    """
    seen_chars = set()
    label_names = (name for name in os.listdir(label_folder) if name.endswith(".txt"))
    for name in label_names:
        with open(os.path.join(label_folder, name), "r", encoding="utf-8") as handle:
            # Every (upper-cased) character of the label joins the vocabulary
            seen_chars |= set(handle.read().strip().upper())

    # The CTC blank, when requested, always owns index 0
    char2idx = {"-": 0} if include_blank else {}
    idx2char = {0: "-"} if include_blank else {}
    first_index = len(char2idx)

    # Sorted so that the assignment of indices is reproducible
    for index, ch in enumerate(sorted(seen_chars), start=first_index):
        char2idx[ch] = index
        idx2char[index] = ch

    with open(file_name, "w", encoding="utf-8") as handle:
        json.dump(char2idx, handle, ensure_ascii=False, indent=2)

    print(f"[vocab] Built vocabulary with {len(char2idx)} characters.")
    return char2idx, idx2char

Function that loads the vocabulary

In [None]:
def load_vocab(path="vocab.json"):
    """Load a char -> index vocabulary from a JSON file and derive the inverse map.

    Returns:
        (char_idx, idx_char): the mapping as stored in the file, and the
        inverse mapping with the indices converted to ``int`` keys.
    """
    with open(path, "r", encoding="utf-8") as handle:
        char_idx = json.load(handle)

    idx_char = {}
    for character, index in char_idx.items():
        idx_char[int(index)] = character
    return char_idx, idx_char

Function that plots PDLPR metrics comparing training and validation, in particular the metrics are: character accuracy, sequence accuracy and levenshtein distance

In [None]:
def _plot_train_val_curve(epochs, train_values, val_values, metric_label, title, y_label, file_path):
    """Draw one train-vs-validation curve on a new figure and save it to ``file_path``."""
    def to_scalars(values):
        # Tensors are detached and moved to the CPU so that matplotlib gets plain numbers
        return [v.detach().cpu().item() if torch.is_tensor(v) else v for v in values]

    plt.figure()
    plt.plot(epochs, to_scalars(train_values), label=f"Train {metric_label}")
    plt.plot(epochs, to_scalars(val_values), label=f"Val {metric_label}")
    plt.title(title)
    plt.xlabel("Epoch")
    plt.ylabel(y_label)
    plt.legend()
    plt.grid(True)
    plt.savefig(file_path, dpi=300)


def plot_metrics(train_seq, val_seq, train_char, val_char, train_lev, val_lev):
    """Plot the PDLPR training vs. validation metrics, one figure per metric.

    Each argument is a sequence with one value per epoch (plain numbers or
    tensors); the x axis covers epochs 1..NUM_EPOCHS_PDLPR, so every sequence
    must hold NUM_EPOCHS_PDLPR values. The figures (sequence accuracy,
    character accuracy, Levenshtein distance) are saved as PNG files in
    ``metrics_images/``, with the PDLPR hyperparameters in the file name.
    """
    epochs = range(1, NUM_EPOCHS_PDLPR + 1)

    # savefig does not create missing folders, so make sure the target exists
    Path("metrics_images").mkdir(parents=True, exist_ok=True)

    _plot_train_val_curve(
        epochs, train_seq, val_seq,
        "Seq Accuracy", "Sequence Accuracy over Epochs", "Accuracy",
        f"metrics_images/seq_accs_plot_{NUM_EPOCHS_PDLPR}_{LR_PDLPR}_{BATCH_SIZE_PDLPR}.png",
    )
    _plot_train_val_curve(
        epochs, train_char, val_char,
        "Char Accuracy", "Char Accuracy over Epochs", "Accuracy",
        f"metrics_images/char_accs_plot{NUM_EPOCHS_PDLPR}_{LR_PDLPR}_{BATCH_SIZE_PDLPR}.png",
    )
    _plot_train_val_curve(
        epochs, train_lev, val_lev,
        "Levenshtein distance", "Levenshtein distance over Epochs", "Lev distance",
        f"metrics_images/levenshtein_plot{NUM_EPOCHS_PDLPR}_{LR_PDLPR}_{BATCH_SIZE_PDLPR}.png",
    )

Functions for processing the batches returned from the dataset class

In [None]:
# functions for pdlpr
def custom_collate_simple(batch):
    """Identity collate function: the batch is returned exactly as received."""
    return batch

def custom_collate(batch):
    """Collate samples into a batch of stacked crops plus the list of plate strings.

    Only the "cropped_image" and "pdlpr_plate_string" fields of each sample
    are kept; the images are stacked along a new first dimension.
    """
    crops = []
    plate_strings = []
    for sample in batch:
        crops.append(sample["cropped_image"])
        plate_strings.append(sample["pdlpr_plate_string"])

    return {
        "cropped_image": torch.stack(crops),
        "pdlpr_plate_string": plate_strings,
    }

def custom_collate_2(batch):
    """Collate samples into a batch of stacked crops plus the list of plate index labels.

    Only the "cropped_image" and "pdlpr_plate_idx" fields of each sample are
    kept; the images are stacked, the index labels stay a plain list.
    """
    stacked_crops = torch.stack(list(map(lambda sample: sample["cropped_image"], batch)))
    index_labels = list(map(lambda sample: sample["pdlpr_plate_idx"], batch))
    return {"cropped_image": stacked_crops, "pdlpr_plate_idx": index_labels}

### Evaluator class

Evaluator class: this class helps in computing the metrics used to evaluate the models (char accuracy, sequence accuracy and lev distance). Since there are a lot of variables to keep track of, we found it more convenient to implement a class.

There are two different versions of the same methods: greedy_decode/greedy_decode_idx and update/update_baseline. This is because the two models produce slightly different outputs, so they need a different treatment when decoding and when comparing the predictions to the labels.

In [None]:
class Evaluator:
    """Running character- and sequence-level accuracy for CTC-style predictions.

    Two decode/update pairs are provided because the two models produce
    differently shaped outputs:

    * ``greedy_decode`` / ``update`` take batch-first logits ``[B, T, C]`` and
      compare decoded strings (characters are looked up in ``idx2char``).
    * ``greedy_decode_idx`` / ``update_baseline`` take time-first logits
      ``[T, B, C]`` and compare index sequences directly.

    Only character accuracy and sequence accuracy are accumulated by this class.
    """

    def __init__(self, idx2char=None, blank_index=0):
        """
        Args:
            idx2char: mapping from class index to character, used by
                ``greedy_decode``; defaults to an empty mapping.
            blank_index: index of the CTC blank class.
        """
        # A fresh dict per instance: a mutable default would be shared by all Evaluators
        self.idx2char = {} if idx2char is None else idx2char
        self.blank_index = blank_index
        self.reset()

    def reset(self):
        """Clear all the accumulated counters."""
        self.total_chars = 0
        self.correct_chars = 0
        self.correct_seqs = 0
        self.total_samples = 0

    def greedy_decode(self, logits):
        """Greedy CTC decoding of batch-first logits ``[B, T, C]`` into strings.

        The most likely class is taken at each step, consecutive repetitions
        are merged and blanks are dropped.
        """
        predictions = torch.argmax(logits, dim=-1)  # [B, T]
        decoded = []

        for prediction in predictions:
            prev = self.blank_index
            chars = []
            for idx in prediction:
                idx = idx.item()
                if idx != self.blank_index and idx != prev:
                    chars.append(self.idx2char[idx])
                prev = idx
            decoded.append("".join(chars))
        return decoded

    def greedy_decode_idx(self, logits):
        """Greedy CTC decoding of time-first logits ``[T, B, C]`` into index lists.

        Same collapsing rule as ``greedy_decode``, but the class indices are
        returned as they are instead of being mapped to characters.
        """
        predictions = torch.argmax(logits, dim=2)   # [T, B]
        predictions = predictions.transpose(0, 1)   # [B, T]
        final_predictions = []
        # Iterate over the prediction array of each sample in the batch
        for prediction in predictions:
            before = self.blank_index
            reduced = []
            for t_index in prediction:
                t_index = t_index.item()
                # Keep the index only if it is not the blank and it differs from the previous step
                if t_index != self.blank_index and t_index != before:
                    reduced.append(t_index)
                before = t_index
            final_predictions.append(reduced)
        return final_predictions

    def update(self, logits, target_strs):
        """Accumulate the statistics for a batch of ``[B, T, vocab_size]`` logits.

        Characters are compared position by position up to the length of the
        shorter string; a sequence counts as correct only on an exact match.
        """
        pred_strs = self.greedy_decode(logits)

        for pred, true in zip(pred_strs, target_strs):
            self.total_samples += 1
            self.total_chars += len(true)
            self.correct_chars += sum(p == t for p, t in zip(pred, true))
            if pred == true:
                self.correct_seqs += 1

    def update_baseline(self, logits, labels):
        """Accumulate the statistics for time-first logits and index labels.

        ``labels`` is an iterable of 1-D tensors of class indices (anything
        exposing ``tolist()``), one per sample of the batch.
        """
        final_predictions = self.greedy_decode_idx(logits)
        for pred_idx_list, label in zip(final_predictions, labels):
            label_list = label.tolist()
            if pred_idx_list == label_list:
                self.correct_seqs += 1

            self.total_samples += 1
            self.total_chars += len(label_list)
            # Position-wise comparison, truncated to the shorter sequence
            self.correct_chars += sum(
                1 for pred_idx, label_idx in zip(pred_idx_list, label_list) if pred_idx == label_idx
            )

    def compute(self):
        """Return the accumulated metrics as a dict (0.0 when nothing was accumulated)."""
        char_acc = self.correct_chars / self.total_chars if self.total_chars > 0 else 0.0
        seq_acc = self.correct_seqs / self.total_samples if self.total_samples > 0 else 0.0
        return {
            "char_accuracy": char_acc,
            "seq_accuracy": seq_acc,
        }

    def print(self):
        """Print the current character and sequence accuracy."""
        metrics = self.compute()
        print(f"Character accuracy:  {metrics['char_accuracy']:.4f}")
        print(f"Sequence accuracy:   {metrics['seq_accuracy']:.4f}")

### Function for baseline plate detection

The functions below are used in the "training" of the plate detection phase of the baseline method; they rely on traditional (non-learned) computer vision techniques.

In [None]:

def get_label_yolo(label_path):
    """Read the first line of a YOLO label file and return its box as a tensor.

    The leading class id is dropped; the returned float32 tensor holds the
    remaining values of the line (x, y, width, height).
    """
    with open(label_path, "r", encoding="utf-8") as handle:
        first_line = handle.readline()

    values = [float(token) for token in first_line.strip().split()]
    box_values = values[1:]  # skip the class id
    return torch.tensor(box_values, dtype=torch.float32)


def get_ground_truth_coordinates(yolo_tensor, image_width, image_height):
    """Convert a normalized YOLO box (cx, cy, w, h) to pixel corners.

    Returns:
        ``[x_min, y_min, x_max, y_max]`` scaled by the image width / height.
    """
    center_x, center_y, box_w, box_h = yolo_tensor.tolist()
    half_w = box_w / 2
    half_h = box_h / 2

    return [
        (center_x - half_w) * image_width,
        (center_y - half_h) * image_height,
        (center_x + half_w) * image_width,
        (center_y + half_h) * image_height,
    ]


def plate_detector(image_path, true_coordinates):
    """Locate the license plate in an image with traditional techniques.

    Green regions are isolated with an HSV threshold, their contours are
    filtered by aspect ratio and area, and every surviving region is read
    with OCR. Each candidate is scored from the length of the text found and
    from its IoU with ``true_coordinates``; the best one is returned.

    NOTE(review): the score uses the ground-truth box, so this function can
    only be called where the true coordinates are available.
    NOTE(review): relies on a module-level ``reader`` that is not defined in
    this cell (presumably an ``easyocr.Reader`` created elsewhere in the
    notebook — confirm it is defined before the first call).

    Args:
        image_path: path of the image to analyse.
        true_coordinates: ground-truth box as [x1, y1, x2, y2].

    Returns:
        ``(bbox, text)`` for the best candidate, with ``bbox`` as
        [x1, y1, x2, y2], or ``None`` when no candidate passes the filters.

    Raises:
        FileNotFoundError: if the image cannot be read.
    """
    # Load the image; cv2.imread returns None instead of raising on failure
    img = cv2.imread(str(image_path))
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Isolate the green parts (the plates have black text on a green background)
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)              # BGR -> HSV, to filter colours by hue
    lower_green = np.array([40, 40, 40])                    # HSV range for green
    upper_green = np.array([80, 255, 255])
    mask = cv2.inRange(hsv, lower_green, upper_green)       # binary mask

    # Morphology: the edge detector returns multiple fragmented contours,
    # so a closing is applied to join nearby lines together
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    cleaned_mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)

    # Edge detector --> Canny
    edges = cv2.Canny(cleaned_mask, 100, 200)

    # Find the contours (borders)
    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Geometric filter + OCR check, to discard the regions that cannot be a plate
    candidates = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        aspect_ratio = w / float(h)
        area = w * h
        if not (2.5 < aspect_ratio < 6 and 1000 < area < 40000):
            continue

        roi = img_rgb[y:y+h, x:x+w]
        result = reader.readtext(roi)

        # Only the text of the first OCR detection is used
        clean_text = ""
        if result:
            clean_text = result[0][1].strip().replace(" ", "").replace("\n", "")

        iou = compute_iou([x, y, x+w, y+h], true_coordinates)
        # Texts shorter than 4 characters do not contribute to the score
        ocr_score = len(clean_text) if len(clean_text) >= 4 else 0

        # OCR weighs more, but the IoU is still taken into account
        score = 1.5 * (ocr_score / 8.0) + 0.5 * iou

        candidates.append({
            "bbox": [x, y, x+w, y+h],
            "text": clean_text,
            "score": score
        })

    if not candidates:
        return None

    best = max(candidates, key=lambda c: c["score"])
    return best["bbox"], best["text"]


## Data

Definition of the dataset class

In [None]:

class CCPDDataset(Dataset):
    """Dataset over one split of the CCPD data prepared by ``initialize_labels``.

    The expected layout under ``base_dir`` is ``images/<split>``,
    ``labels/<split>`` (YOLO boxes), ``crops/<split>`` (plate crops) and
    ``labels_pdlpr/<split>`` (plate strings). ``get_dataset(split)`` must be
    called before the dataset is indexed; ``get_dataloaders`` builds the
    loaders for the three splits.
    """

    def __init__(self, base_dir, transform=None):
        """
        Args:
            base_dir: root folder of the dataset.
            transform: optional callable applied to both the full image and the crop.
        """
        self.base_dir = Path(base_dir)
        self.transform = transform

    def get_dataset(self, split):
        """Point this dataset at ``split`` ("train", "val" or "test") and return ``self``.

        Raises:
            FileNotFoundError: if the split has no images or no crops.
        """
        self.image_dir = self.base_dir / "images" / split
        self.label_yolo_dir = self.base_dir / "labels" / split
        # Crops and plate labels used by the character recognition part
        self.crops_dir = self.base_dir / "crops" / split
        self.label_pdlpr_dir = self.base_dir / "labels_pdlpr" / split

        # List all the image files of the current split
        self.image_files = sorted(list(self.image_dir.glob("*.jpg")))

        # List the cropped image files as well
        self.cropped_image_files = sorted(list(self.crops_dir.glob("*.jpg")))

        # Basic validation to ensure that the files exist
        if not self.image_files:
            raise FileNotFoundError(f"No .jpg images found in {self.image_dir}")
        if not self.cropped_image_files:
            raise FileNotFoundError(f"No cropped .jpg images found in {self.crops_dir}")

        return self

    def __len__(self):
        """Number of images in the current split."""
        return len(self.image_files)

    def __getitem__(self, index):
        """Return the sample at ``index`` as a dict.

        Keys: 'full_image_original' (untransformed RGB image), 'full_image',
        'cropped_image', 'yolo_bbox_label' (float32 tensor with x_center,
        y_center, width, height), 'pdlpr_plate_string', 'pdlpr_plate_idx'
        (long tensor of unified-vocabulary ids) and 'image_name'.
        """
        img_path = self.image_files[index]
        yolo_label_path = self.label_yolo_dir / (img_path.stem + ".txt")
        # The crop is paired by file name (the one written by initialize_labels):
        # pairing by list position would silently return the wrong crop as soon
        # as the two folders do not contain exactly the same files
        cropped_img_path = self.crops_dir / (img_path.stem + ".jpg")
        pdlpr_label_path = self.label_pdlpr_dir / (img_path.stem + ".txt")

        img_name = img_path.name

        # Convert to RGB so that the models always receive 3 channels;
        # convert() returns a new image, so the files can be closed right away
        with Image.open(img_path) as raw_image:
            full_image = raw_image.convert("RGB")
        with Image.open(cropped_img_path) as raw_crop:
            cropped_image = raw_crop.convert("RGB")

        # Read the YOLO label (bounding box) from the text file
        with open(yolo_label_path, "r", encoding="utf-8") as f:
            yolo_label_str = f.readline().strip()

        if not yolo_label_str:
            raise ValueError(f"Empty label in {yolo_label_path}")

        # YOLO format: "class_id x_center y_center width height"; there is a
        # single class, so the class id is discarded
        parts = list(map(float, yolo_label_str.split()))
        yolo_bbox = torch.tensor(parts[1:], dtype=torch.float32)

        # Read the PDLPR label (license plate string)
        with open(pdlpr_label_path, "r", encoding="utf-8") as f:
            pdlpr_plate_str = f.readline().strip()

        # Index label used by the CNN + CTC model: the per-table ids stored in
        # field 4 of the file name are converted to the unified vocabulary
        fields = img_path.name.split("-")
        plate_id = [int(c) for c in fields[4].split("_")]
        plate_id = target_to_index(plate_id)
        pdlpr_label_idx = torch.tensor(plate_id, dtype=torch.long)

        # Keep the untransformed image; it is defined even when no transform is set
        full_image_original = full_image
        if self.transform:
            full_image = self.transform(full_image)
            cropped_image = self.transform(cropped_image)

        return {
            'full_image_original': full_image_original,
            'full_image': full_image,
            'cropped_image': cropped_image,
            'yolo_bbox_label': yolo_bbox,
            'pdlpr_plate_string': pdlpr_plate_str,
            'pdlpr_plate_idx': pdlpr_label_idx,
            'image_name': img_name
        }

    @staticmethod
    def get_dataloaders(base_dir, batch_size = 8, transform = None, collate_fn=None):
        """Build the train / val / test dataloaders (only train is shuffled).

        Every split gets its own ``CCPDDataset`` instance: ``get_dataset``
        mutates the dataset it is called on, so sharing a single instance
        would make all three loaders read the split selected last.

        Returns:
            (train_loader, val_loader, test_loader)
        """
        def make_loader(split, shuffle):
            dataset = CCPDDataset(base_dir=base_dir, transform=transform).get_dataset(split)
            return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn = collate_fn)

        train_loader = make_loader("train", True)
        val_loader = make_loader("val", False)
        test_loader = make_loader("test", False)

        return train_loader, val_loader, test_loader


## Network

### Network for PDLPR

#### IGFE Feature extractor

In [None]:
class FocusStructure(nn.Module):
    """Focus layer: space-to-depth rearrangement followed by a 3x3 convolution.

    The input is split into its four interleaved sub-grids, which are stacked
    along the channel axis ([B, C, H, W] -> [B, 4C, H/2, W/2]) before the
    convolution, batch norm and LeakyReLU are applied.
    """

    def __init__(self, in_channels=3, out_channels=64):
        super().__init__()
        stacked_channels = in_channels * 4
        layers = [
            nn.Conv2d(stacked_channels, out_channels, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(0.1),
        ]
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        # Row offset varies slowest: top-left, top-right, bottom-left, bottom-right
        patches = [x[..., row::2, col::2] for row in (0, 1) for col in (0, 1)]
        stacked = torch.cat(patches, dim=1)  # [B, 4C, H/2, W/2]
        return self.conv(stacked)


In [None]:
class ConvDownSampling(nn.Module):
    """Stride-2 3x3 convolution + batch norm + LeakyReLU; halves the spatial size."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        strided_conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1)
        batch_norm = nn.BatchNorm2d(out_channels)
        # LeakyReLU (negative slope 0.1) is the activation used throughout the feature extractor
        activation = nn.LeakyReLU(0.1)
        self.block = nn.Sequential(strided_conv, batch_norm, activation)

    def forward(self, x):
        downsampled = self.block(x)
        return downsampled

In [None]:
class ResBlock(nn.Module):
    def __init__(self, channels):
        super(ResBlock, self).__init__()
        self.block = nn.Sequential(
            nn.Conv2d(channels, channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(channels),
            nn.LeakyReLU(0.1),
            nn.Conv2d(channels, channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(channels),
        )
        self.relu = nn.LeakyReLU(0.1)

    def forward(self, x):
        return self.relu(x + self.block(x))

In [None]:
class IGFE(nn.Module):
    """Feature extractor: focus layer, two downsampling stages with residual
    blocks, and a final 1x1 convolution to 512 channels.

    The shapes in the comments are the ones noted by the authors for a
    [3, 48, 144] input.
    """

    def __init__(self):
        super().__init__()
        self.focus = FocusStructure(3, 64)         # [3,48,144] -> [64,24,72]
        self.down1 = ConvDownSampling(64, 128)     # -> [128,12,36]
        self.res1 = ResBlock(128)
        self.res2 = ResBlock(128)
        self.down2 = ConvDownSampling(128, 256)    # -> [256,6,18]
        self.res3 = ResBlock(256)
        self.res4 = ResBlock(256)
        self.final_conv = nn.Conv2d(256, 512, kernel_size=1)  # -> [512,6,18]

    def forward(self, x):
        # The stages are applied strictly in sequence
        stages = (
            self.focus,
            self.down1, self.res1, self.res2,
            self.down2, self.res3, self.res4,
            self.final_conv,
        )
        for stage in stages:
            x = stage(x)
        return x  # [B, 512, 6, 18]

#### Encoder

In [None]:
# Positional Encoding
class PositionalEncoding(nn.Module):
    """Learned positional embedding added to a [B, max_len, dim] sequence."""

    def __init__(self, dim, max_len=108):
        super(PositionalEncoding, self).__init__()
        initial_embedding = torch.randn(1, max_len, dim)  # [1, 108, 512] with the defaults used by the encoder
        self.pos_embed = nn.Parameter(initial_embedding)

    def forward(self, x):
        # The leading dimension of size 1 broadcasts over the batch
        return torch.add(x, self.pos_embed)

In [None]:
# Encoder block
class EncoderBlock(nn.Module):
    """Encoder block: 1x1 conv expansion, self-attention, 1x1 conv reduction,
    then a residual connection followed by layer normalization.

    Input and output have shape [B, N, dim].
    """

    def __init__(self, dim=512, inner_dim=1024, n_heads=8):
        super(EncoderBlock, self).__init__()
        self.expand = nn.Conv1d(dim, inner_dim, kernel_size=1)
        self.attn = nn.MultiheadAttention(embed_dim=inner_dim, num_heads=n_heads, batch_first=True)
        self.reduce = nn.Conv1d(inner_dim, dim, kernel_size=1)
        self.norm = nn.LayerNorm(dim)

    def forward(self, x):
        residual = x                                                    # [B, N, dim]
        # Conv1d works on [B, C, N], hence the transposes around each convolution
        widened = self.expand(x.transpose(1, 2)).transpose(1, 2)        # [B, N, inner_dim]
        attended, _ = self.attn(widened, widened, widened)              # self-attention
        narrowed = self.reduce(attended.transpose(1, 2)).transpose(1, 2)  # back to [B, N, dim]
        return self.norm(narrowed + residual)                           # residual + norm

In [None]:
# Encoder (3 blocks by default)
class PDLPR_Encoder(nn.Module):
    """Encoder: positional encoding followed by ``depth`` stacked EncoderBlocks.

    Args:
        dim: channel size of the incoming feature map and of every token.
        n_heads: number of attention heads in each block.
        depth: number of encoder blocks.
    """

    def __init__(self, dim=512, n_heads=8, depth=3):
        super().__init__()
        self.pos_enc = PositionalEncoding(dim)
        encoder_layers = [EncoderBlock(dim, 1024, n_heads) for _ in range(depth)]
        self.blocks = nn.Sequential(*encoder_layers)

    def forward(self, x):
        """Turn a [B, C, H, W] feature map into a [B, H*W, C] token sequence and encode it."""
        batch, channels, height, width = x.shape
        # Merge the two spatial axes into one token axis, then move channels
        # last because the attention layers expect [B, N, C].
        tokens = x.view(batch, channels, height * width).transpose(1, 2)
        tokens = self.pos_enc(tokens)   # add positional encoding
        return self.blocks(tokens)      # [B, H*W, C]; [B, 108, 512] for the default setup

#### Parallel decoder

In [None]:
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Linear -> Dropout.

    Args:
        dim: input and output feature size.
        hidden_dim: width of the intermediate layer.
        dropout: dropout probability applied after the second linear layer.
    """

    def __init__(self, dim, hidden_dim=2048, dropout=0.1):
        super().__init__()
        layers = [
            nn.Linear(dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout),
        ]
        # Kept as a Sequential named ``net`` so state_dict keys stay "net.<i>.*".
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the network to the last dimension of ``x``."""
        transformed = self.net(x)
        return transformed

In [None]:
class DecoderBlock(nn.Module):
    """Decoder layer: masked self-attention, cross-attention, feed-forward.

    Each of the three sub-layers is wrapped in a residual connection followed
    by its own LayerNorm (post-norm arrangement).
    """

    def __init__(self, dim, n_heads=8):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(dim, n_heads, batch_first=True)
        self.cross_attn = nn.MultiheadAttention(dim, n_heads, batch_first=True)
        self.ff = FeedForward(dim)

        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.norm3 = nn.LayerNorm(dim)

    def forward(self, tgt, memory, tgt_mask=None):
        """Decode ``tgt`` [B, T, dim] while attending to ``memory`` [B, S, dim].

        ``tgt_mask`` is forwarded as ``attn_mask`` to the self-attention layer.
        """
        # 1) (masked) self-attention over the target sequence
        self_ctx, _ = self.self_attn(tgt, tgt, tgt, attn_mask=tgt_mask)
        hidden = self.norm1(tgt + self_ctx)

        # 2) cross-attention: queries from the decoder, keys/values from memory
        cross_ctx, _ = self.cross_attn(hidden, memory, memory)
        hidden = self.norm2(hidden + cross_ctx)

        # 3) position-wise feed-forward network
        return self.norm3(hidden + self.ff(hidden))

In [None]:
class ParallelDecoder(nn.Module):
    """Decoder that predicts all ``seq_len`` output positions in one pass.

    A learnable query tensor (``char_embed``) is refined by ``num_blocks``
    DecoderBlocks that attend to the encoder output, then projected to
    per-position vocabulary logits.

    Args:
        dim: feature size of queries and encoder memory.
        vocab_size: number of output classes of the classifier.
        num_heads: attention heads per decoder block.
        num_blocks: number of stacked decoder blocks.
        seq_len: number of output positions (query vectors).
    """

    def __init__(self, dim=512, vocab_size=70, num_heads=8, num_blocks=3, seq_len=18):
        super().__init__()
        self.seq_len = seq_len
        self.char_embed = nn.Parameter(torch.randn(1, seq_len, dim))
        self.vocab_size = vocab_size
        self.dim = dim

        self.blocks = nn.ModuleList([
            DecoderBlock(dim, num_heads) for _ in range(num_blocks)
        ])
        self.classifier = nn.Linear(dim, vocab_size)

    def update_vocab_size(self, new_vocab_size):
        """Resize the classifier to ``new_vocab_size`` classes, keeping old weights.

        Rows for classes present in both the old and new layer are copied; any
        extra rows keep their fresh initialisation. Nothing happens when the
        size is unchanged.

        NOTE: the replacement layer owns new Parameter objects. An optimizer
        created before this call does not track them until they are registered
        (e.g. with ``optimizer.add_param_group``).
        """
        if new_vocab_size == self.vocab_size:
            return

        print(f"Updating vocab size from {self.vocab_size} to {new_vocab_size}")
        old_classifier = self.classifier

        # Create the new layer on the same device and with the same dtype as
        # the one it replaces.
        new_classifier = nn.Linear(self.dim, new_vocab_size).to(
            device=old_classifier.weight.device,
            dtype=old_classifier.weight.dtype,
        )

        # Copy overlapping weights
        num_to_copy = min(old_classifier.out_features, new_vocab_size)
        with torch.no_grad():
            new_classifier.weight[:num_to_copy] = old_classifier.weight[:num_to_copy]
            new_classifier.bias[:num_to_copy] = old_classifier.bias[:num_to_copy]

        self.classifier = new_classifier
        self.vocab_size = new_vocab_size

    def generate_mask(self, size, device=None):
        """Return a [size, size] boolean mask that is True above the diagonal.

        True entries mark positions that may not be attended to (future
        positions). ``device`` optionally selects where the mask is created.
        """
        return torch.triu(torch.ones(size, size, device=device), diagonal=1).bool()

    def forward(self, memory):
        """Compute logits of shape [B, seq_len, vocab_size] from ``memory`` [B, S, dim]."""
        B = memory.size(0)
        x = self.char_embed.expand(B, -1, -1)  # [B, T, dim]
        # Build the mask directly on the target device instead of creating it
        # on the CPU and copying it over at every forward pass.
        tgt_mask = self.generate_mask(self.seq_len, device=memory.device)  # [T, T]

        for block in self.blocks:
            x = block(x, memory, tgt_mask)

        logits = self.classifier(x)  # [B, T, vocab_size]
        return logits

### Network for character recognition of baseline method CNN CTC

This is the model class for the CNN + CTC network used in the second part of the baseline method (character recognition). It includes two dropout layers to reduce overfitting.

In [None]:
class CNN_CTC_model(nn.Module):
    """CNN + CTC character recogniser used by the baseline pipeline.

    Three convolutional stages (two of them followed by 2x2 max pooling)
    produce a feature map whose width is used as the CTC time axis. Every
    column is flattened, projected to 256 features and classified into
    ``num_char`` classes.

    Args:
        num_char: number of output classes (characters plus the CTC blank).
        hidden_size: channel count of the last convolutional layer.

    The first linear layer assumes a feature-map height of 12, i.e. a 48-pixel
    high input after the two 2x2 poolings.
    """

    def __init__(self, num_char, hidden_size):
        super().__init__()
        self.num_char = num_char
        self.hidden_size = hidden_size

        self.features = nn.Sequential(
            # 1 input channel because we use grayscale images
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(128, self.hidden_size, kernel_size=3, stride=1, padding=1),
            nn.ReLU()
        )

        feature_height = 12   # 48 input rows halved twice by the max-pool layers
        projection_dim = 256  # width of the per-column projection

        # Two linear layers do the final classification. The classifier must
        # consume exactly what ``linear`` produces; tying it to ``hidden_size``
        # instead only worked when hidden_size happened to be 256.
        self.linear = nn.Linear(self.hidden_size * feature_height, projection_dim)
        self.dropout = nn.Dropout(p=0.3)
        self.classifier = nn.Linear(projection_dim, self.num_char)

    def forward(self, x):
        """Map images [B, 1, 48, 144] to CTC logits [T=36, B, num_char]."""
        x = self.features(x)       # [B, hidden_size, 12, 36]

        # CTC expects the time axis first; the feature-map width plays that role.
        x = x.permute(3, 0, 1, 2)  # [36, B, hidden_size, 12]
        # The number of frames (T = width = 36) must exceed the number of
        # characters to decode.
        x = x.flatten(2)           # [36, B, hidden_size * 12]
        x = self.linear(x)
        x = self.dropout(x)        # [36, B, 256]
        x = self.classifier(x)     # [36, B, num_char]

        return x  # one vector of num_char logits for each of the 36 positions

## Train

### Train YOLO

In [None]:
def get_model_name():
    """Build the YOLO weights file name from the training hyper-parameters.

    The name encodes epochs, batch size, initial learning rate and image size,
    all read from the module-level ``*_Y`` constants.
    """
    stem = f"yolov5_epochs{EPOCHS_TRAIN_Y}_bs{BATCH_SIZE_TRAIN_Y}_lr{LR_INIT_Y}_imgs{IMAGE_SIZE_Y}"
    return stem + ".pt"

def get_run_name():
    """Return a unique run name: the model file name with ".pt" removed."""
    weights_file = get_model_name()
    return weights_file.replace(".pt", "")


def train_yolo():
    """Train the YOLOv5 plate detector, or reuse already saved weights.

    If the weights file named by ``get_model_name()`` exists, training is
    skipped and the path of that file is returned. Otherwise a model is built
    from ``yolov5s.yaml``, trained on ``ccpd.yaml``, saved under that name, and
    the path of the best checkpoint of the run is returned.

    Returns:
        str: path to a weights file that can be passed to ``YOLO(...)``.
    """
    model_name = get_model_name()
    run_name = get_run_name()

    if os.path.exists(model_name):
        print(f"[INFO] Model {model_name} already exists ---> SKIP training!!")
        # Return a path (not a model object) so both branches have the same
        # return type.
        return model_name

    # Pick the training device automatically instead of hard-coding "mps":
    # Apple GPU if present, else the first CUDA GPU, else the CPU.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        train_device = "mps"
    elif torch.cuda.is_available():
        train_device = 0
    else:
        train_device = "cpu"

    # Create an untrained model based on the configuration params
    model = YOLO("yolov5s.yaml")

    model.train(
        data    = f"ccpd.yaml",         # path to .yaml file for the configuration
        epochs  = EPOCHS_TRAIN_Y,
        batch   = BATCH_SIZE_TRAIN_Y,
        lr0     = LR_INIT_Y,
        imgsz   = IMAGE_SIZE_Y,
        save    = True,                 # save the training checkpoints and weights of the final model
        device  = train_device,
        project = f"runs/train",        # directory where to save the outputs of training
        name    = run_name,             # sub-directory of the project folder for training logs and outputs
        val     = True,                 # run validation here to create results.csv and .png
        plots   = True
    )

    model.save(model_name)

    # Ultralytics appends a numeric suffix to the run directory when it already
    # exists, so prefer the checkpoint path reported by the trainer and fall
    # back to the conventional location.
    trainer_best = getattr(getattr(model, "trainer", None), "best", None)
    if trainer_best is not None and Path(trainer_best).exists():
        return str(trainer_best)

    best_model_path = f"runs/train/{run_name}/weights/best.pt"

    return best_model_path




# TRAIN
# train_yolo() either trains the detector or skips training when the weights
# file already exists; its return value is handed to YOLO(...) below.
train_model_path = train_yolo()

run_name = get_run_name()

# VALIDATION after training
# Load and use the best model best.pt --> create a model instance initialized with the trained weights
best_model = YOLO(train_model_path, verbose = False)

# Inside results: mAP@0.5, mAP@0.5:0.95, precision, recall, confusion matrix, PR curve, F1 curve, ... --> are saved in runs/detect
results = best_model.val(
    data    = f"ccpd.yaml",
    split   = 'val',
    iou     = IOU_THRESHOLD,
    device  = "cpu",
    name    = f"{run_name}_VAL_iou{int(IOU_THRESHOLD*100)}",
)

# Directory where the mean-IoU summary of this cell is written (created if missing).
image_dir = Path(f"dataset/images/val")
output_dir = Path(f"runs/val") / f"{get_run_name()}_VAL_iou{int(IOU_THRESHOLD * 100)}"
output_dir.mkdir(parents=True, exist_ok=True)

# One IoU value per predicted box, collected over the whole validation set.
iou_list = []

# Loop over images (sorted so the order is deterministic)
for image_path in sorted(image_dir.glob("*.jpg")):
    # Predict (at most 5 boxes per image)
    result = best_model(image_path, max_det=5, verbose = False)[0]
    predictions = result.boxes.xyxy.cpu().numpy()  # shape: (N, 4)

    real_box = load_gt_box_from_label_validation(image_path)
    if real_box is None:
        # skip image if no GT or invalid
        continue

    # Compute IoU between every predicted box and the true one.
    # NOTE: every prediction contributes to the mean, while images with no
    # prediction contribute nothing.
    for predicted_box in predictions:
        iou = compute_iou(predicted_box, real_box)
        iou_list.append(iou)

# Compute average among all iou values (0.0 when nothing was predicted)
if iou_list:
    mean_iou = sum(iou_list) / len(iou_list)
else:
    mean_iou = 0.0

# Save in .txt
txt_path = output_dir / "mean_iou.txt"
with open(txt_path, "w") as f:
    f.write(f"Mean IoU over validation set: {mean_iou:.4f}\n")

print(f"[INFO] Mean IoU saved to {txt_path}")


### PDLPR training function - IGFE, Encoder, Parallel decoder

Validation function

In [None]:
def validate(model_parts, evaluator, val_loader, char_idx, idx_char, device):
    """Evaluate the PDLPR model on ``val_loader`` and return the metrics.

    Args:
        model_parts: tuple ``(igfe, encoder, decoder)``.
        evaluator: kept for interface compatibility; a fresh ``Evaluator`` is
            always created so results never mix with earlier statistics.
        val_loader: iterable of batches with the keys ``"cropped_image"`` and
            ``"pdlpr_plate_string"``.
        char_idx: char -> index vocabulary; extended in place when a label
            contains an unseen character.
        idx_char: index -> char vocabulary; extended in place together with
            ``char_idx``.
        device: device the images are moved to.

    Returns:
        The value of ``evaluator.compute()``; callers in this notebook read its
        ``"seq_accuracy"`` and ``"char_accuracy"`` entries.

    Side effects:
        The three modules are left in eval mode; callers that resume training
        afterwards must switch them back with ``.train()``.
    """
    igfe, encoder, decoder = model_parts

    igfe.eval()
    encoder.eval()
    decoder.eval()

    evaluator = Evaluator(idx_char)
    pbar = tqdm(val_loader, desc=f"Validating")

    with torch.no_grad():
        for batch in pbar:
            images = batch["cropped_image"].to(device)
            label_strs = batch["pdlpr_plate_string"]

            unknown = set(c for s in label_strs for c in s if c not in char_idx)
            if unknown:
                # Extend the vocabulary in label order so that index
                # assignment is deterministic.
                for c in ''.join(label_strs):
                    if c not in char_idx:
                        idx = len(char_idx)
                        char_idx[c] = idx
                        idx_char[idx] = c
                # Report the characters that were actually unknown (the loop
                # variable would only hold the last character of the batch).
                print(f"unknown character(s) {sorted(unknown)}. Vocabulary updated")
                decoder.update_vocab_size(len(char_idx))

            # Forward
            features = igfe(images)
            encoded = encoder(features)
            logits = decoder(encoded)

            evaluator.update(logits, label_strs)

    metrics = evaluator.compute()
    evaluator.print()
    return metrics

Training function

In [None]:
def train(model_parts, evaluator, train_loader, val_loader, char_idx, idx_char, num_epochs, optimizer ,device):
    """Train the PDLPR model (IGFE -> encoder -> parallel decoder) with CTC loss.

    After every epoch the vocabulary is written to ``vocab.json`` and the model
    is validated with ``validate``. At the end a checkpoint, a text summary, a
    loss plot and the accuracy plots are written to disk.

    Args:
        model_parts: tuple ``(igfe, encoder, decoder)``.
        evaluator: kept for interface compatibility; a fresh ``Evaluator`` is
            created at the start of every epoch.
        train_loader: batches with the keys ``"cropped_image"`` and
            ``"pdlpr_plate_string"``.
        val_loader: validation batches with the same keys.
        char_idx: char -> index vocabulary, extended in place when needed.
        idx_char: index -> char vocabulary, extended in place when needed.
        num_epochs: number of passes over ``train_loader`` (at least 1).
        optimizer: optimizer holding the parameters of the three modules.
        device: device used for images and targets.

    Returns:
        tuple ``(train_losses, train_seq_accs, train_char_accs)`` where
        ``train_losses`` holds one float per training batch and the two
        accuracy lists hold one value per epoch.

    Raises:
        ValueError: if ``num_epochs`` is smaller than 1.

    Relies on the module-level ``ctc_loss`` criterion and on the
    ``NUM_EPOCHS_PDLPR``, ``LR_PDLPR`` and ``BATCH_SIZE_PDLPR`` constants for
    output file names.
    """
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")

    igfe, encoder, decoder = model_parts
    total_loss = 0.0
    num_steps = 0

    train_losses = []      # one float per batch
    train_seq_accs = []
    train_char_accs = []

    val_char_accs = []
    val_seq_accs = []

    # Output folders used below; create them up front so a long run cannot
    # fail at the very end because a directory is missing.
    for out_dir in ("models", "results", "metrics_images"):
        os.makedirs(out_dir, exist_ok=True)

    for epoch in range(num_epochs):
        # validate() leaves the modules in eval mode, so training mode has to
        # be restored at the start of every epoch.
        igfe.train()
        encoder.train()
        decoder.train()

        evaluator = Evaluator(idx2char=idx_char)

        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
        for batch in loop:
            images = batch["cropped_image"].to(device)
            label_strs = batch["pdlpr_plate_string"]

            # update the vocabulary if an unknown character is found
            unknown = set(c for s in label_strs for c in s if c not in char_idx)
            if unknown:
                # Extend in label order so index assignment is deterministic.
                for c in ''.join(label_strs):
                    if c not in char_idx:
                        idx = len(char_idx)
                        char_idx[c] = idx
                        idx_char[idx] = c
                print(f"unknown character(s) {sorted(unknown)}. Vocabulary updated")
                # update the decoder with the new vocabulary size but keeping the old weights
                old_classifier = getattr(decoder, "classifier", None)
                decoder.update_vocab_size(len(char_idx))
                new_classifier = getattr(decoder, "classifier", None)
                if new_classifier is not None and new_classifier is not old_classifier:
                    # The resized layer has new parameters the optimizer has
                    # never seen; register them or they would not be trained.
                    optimizer.add_param_group({"params": list(new_classifier.parameters())})

            # label encoding in order to compute the loss
            targets = torch.tensor([char_idx[c] for s in label_strs for c in s], dtype=torch.long).to(device)
            target_lengths = torch.tensor([len(s) for s in label_strs], dtype=torch.long).to(device)

            # Forward pass
            features = igfe(images)
            encoded = encoder(features)
            logits = decoder(encoded)                            # [B, T, C]
            log_probs = logits.log_softmax(2).permute(1, 0, 2)  # [T, B, C]

            # Every sample uses all T decoder positions as CTC input length.
            input_lengths = torch.full((logits.size(0),), logits.size(1), dtype=torch.long).to(device)

            loss = ctc_loss(log_probs, targets, input_lengths, target_lengths)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Store plain floats: keeping the loss tensors would hold on to
            # tensor memory for the whole run.
            loss_value = loss.item()
            total_loss += loss_value
            num_steps += 1
            train_losses.append(loss_value)

            # metrics updating using the evaluator
            evaluator.update(logits, label_strs)
            loop.set_postfix(loss=loss_value)

        print(f"Metrics at epoch {epoch+1}")
        evaluator.print()
        metrics = evaluator.compute()
        train_seq_accs.append(metrics["seq_accuracy"])
        train_char_accs.append(metrics["char_accuracy"])

        # saving the (possibly extended) vocabulary
        with open(f"vocab.json", "w", encoding="utf-8") as f:
            json.dump(char_idx, f, ensure_ascii=False, indent=2)

        # VALIDATION
        val_evaluator = Evaluator(idx_char)
        val_metrics = validate(
            model_parts=(igfe, encoder, decoder),
            evaluator=val_evaluator,
            val_loader=val_loader,
            char_idx=char_idx,
            idx_char=idx_char,
            device=device
        )
        val_seq_accs.append(val_metrics["seq_accuracy"])
        val_char_accs.append(val_metrics["char_accuracy"])

    # Saving the model for testing, the models will have as input the images cropped by YOLO
    checkpoint_path = f'models/pdlpr_{NUM_EPOCHS_PDLPR}_{LR_PDLPR}_{BATCH_SIZE_PDLPR}.pt'
    torch.save({
            'epoch': num_epochs,
            'igfe_state_dict': igfe.state_dict(),
            'encoder_state_dict': encoder.state_dict(),
            'decoder_state_dict': decoder.state_dict(),
            'loss': total_loss,
            'train_losses': train_losses,
            'train_seq_accs': train_seq_accs,
            'train_char_accs': train_char_accs
        }, checkpoint_path)
    print(f"Model saved in {checkpoint_path}")

    print("END OF TRAINING, results:\n")

    print(f"number of epochs: {num_epochs}")
    print(f"learning rate: {LR_PDLPR}")
    print(f"batch size: {BATCH_SIZE_PDLPR}")
    # total_loss is summed over every batch of every epoch, so average over
    # the number of optimisation steps actually taken.
    print(f"Loss: {total_loss / max(num_steps, 1):.4f}")
    evaluator.print()

    train_seq_accuracy = metrics['seq_accuracy']
    val_seq_accuracy = val_metrics["seq_accuracy"]
    train_char_accuracy = metrics['char_accuracy']
    val_char_accuracy = val_metrics['char_accuracy']

    results_path = f"results/PDLPR-{NUM_EPOCHS_PDLPR}_{LR_PDLPR}_{BATCH_SIZE_PDLPR}.txt"
    with open(results_path, "w") as f:
        f.write(f"Final train accuracy: {train_seq_accuracy:.4f}\n")
        f.write(f"Final validation accuracy: {val_seq_accuracy:.4f}\n")
        f.write(f"Final character train accuracy: {train_char_accuracy:.4f}\n")
        f.write(f"Final character validation accuracy: {val_char_accuracy:.4f}\n")

    print(f"results saved in {results_path}")

    # plot the loss; there is one value per training batch, not per epoch
    steps = range(1, len(train_losses) + 1)
    plt.figure()
    plt.plot(steps, train_losses, label="Train Loss")
    plt.title("Loss over Training Steps")
    plt.xlabel("Training step (batch)")
    plt.ylabel("CTC Loss")
    plt.legend()
    plt.grid(True)
    plt.savefig(f"metrics_images/loss_plot_{num_epochs}_{LR_PDLPR}_{BATCH_SIZE_PDLPR}.png", dpi=300)

    # plot train and validation metrics
    print("Plotting metrics.........")
    plot_metrics(train_seq_accs, val_seq_accs, train_char_accs, val_char_accs)

    return train_losses, train_seq_accs, train_char_accs

Defining transformations to apply on the images and load data

In [None]:
# Build the image preprocessing pipeline and the train/val/test data loaders
# used by the PDLPR training cells below.
if __name__ == "__main__":
    # Resize every crop to 48x144, convert it to a tensor and normalise each of
    # the 3 channels with mean 0.5 and std 0.5.
    transform = transforms.Compose([
        transforms.Resize((48, 144)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5]*3, std=[0.5]*3)
    ])

    # loading data

    # NOTE(review): `dataset` is not used by the cells visible here; the loaders
    # come from CCPDDataset.get_dataloaders — confirm whether it is still needed.
    dataset = CCPDDataset(base_dir=f"dataset", transform=transform)
    train_loader, val_loader, test_loader = CCPDDataset.get_dataloaders(
        base_dir=f"dataset",
        batch_size=BATCH_SIZE_PDLPR,
        transform=transform,
        collate_fn= custom_collate
    )

Create the vocabulary if it does not already exist, and define the char-to-index and index-to-char mappings

In [None]:
# loading vocabulary
# Reuse vocab.json when it exists; otherwise build it from the training labels.
# Both helpers return the pair (char -> index, index -> char).
if os.path.exists(f'vocab.json'):
    print(f"vocab.json found — loading...")
    char_idx, idx_char = load_vocab(f'vocab.json')
else:
    print("vocab.json not found — building it from labels...")
    char_idx, idx_char = build_vocab(f"dataset/labels_pdlpr/train", "vocab.json")

# Number of classes the decoder's classifier is created with.
vocab_size = len(char_idx)

Defining training arguments

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
evaluator = Evaluator(idx_char)
# CTC criterion with class index 0 as the blank symbol.
# NOTE(review): assumes the vocabulary reserves index 0 for the blank — confirm in build_vocab.
ctc_loss = nn.CTCLoss(blank=0)
decoder_seq_len = 18  # From ParallelDecoder
decoder = ParallelDecoder(dim=512, vocab_size=vocab_size, seq_len=decoder_seq_len).to(device).train()
encoder = PDLPR_Encoder().to(device).train()
igfe = IGFE().to(device).train()
# A single Adam optimizer updates the parameters of all three modules.
params = list(igfe.parameters()) + list(encoder.parameters()) + list(decoder.parameters())
optimizer = optim.Adam(params, lr=LR_PDLPR)

Training

In [None]:
print("Starting training..........")
# train() returns (train_losses, train_seq_accs, train_char_accs) in this exact
# order, so the names on the left must follow it.
train_losses, train_seq_accs, train_char_accs = train(
    model_parts=(igfe, encoder, decoder),
    evaluator=evaluator,
    train_loader=train_loader,
    val_loader=val_loader,
    char_idx=char_idx,
    idx_char=idx_char,
    num_epochs=NUM_EPOCHS_PDLPR,
    optimizer=optimizer,
    device=device
)

### train Baseline method

This section trains the second part of the baseline (character recognition). The first part, licence plate detection, is implemented with traditional techniques that involve no learning, so distinguishing between training and testing does not make sense for it.

So this is the code for the first part

In [None]:
# Initialize the reader just once: simplified chinese and english
# NOTE(review): `reader` is not referenced in this cell; presumably plate_detector
# uses it as a global — confirm.
reader = easyocr.Reader(['ch_sim', 'en'])

images_dir = Path(f"dataset/images/train/")
labels_dir = Path(f"dataset/labels/train/")

diff_results_dir = Path("results")
diff_results_dir.mkdir(parents=True, exist_ok=True)
diff_results_txt = diff_results_dir / f"BL_iou_ocr6.txt"
# Start from an empty report; every later write appends to it. All writes use
# UTF-8 because OCR text may contain Chinese characters.
open(diff_results_txt, "w", encoding="utf-8").close()

total_iou = 0.0
num_iou = 0
num_passed_iou = 0      # counts values >= IOU_THRESHOLD

# Sorting makes the report order deterministic and lets tqdm show a total.
image_paths = sorted(images_dir.glob("*.jpg"))

for image_path in tqdm(image_paths, desc="Processing images", unit="img"):
    image_name = image_path.name
    label_path = labels_dir / (image_path.stem + ".txt")

    if not label_path.exists():
        print(f"NO label for {image_name}")
        continue

    # Only the image size is needed here; the context manager closes the file
    # handle again instead of leaking one per image.
    with Image.open(image_path) as pil_image:
        width, height = pil_image.size
    yolo_tensor = get_label_yolo(label_path)

    true_coordinates = get_ground_truth_coordinates(yolo_tensor, width, height)

    candidate_bounding_box = plate_detector(image_path, true_coordinates)

    if not candidate_bounding_box:
        with open(diff_results_txt, "a", encoding="utf-8") as f:
            f.write(f"{image_name}\n")
            f.write("IoU: 0.000\n")
            f.write("OCR: NONE\n")
            f.write(f"Box GT: {true_coordinates}\n")
            f.write("Box Pred: NONE\n")
            f.write("---\n")

        # A missed detection counts as an IoU of 0 in the average.
        num_iou += 1
        continue

    predict_bbox, ocr_text  = candidate_bounding_box
    iou_diff = compute_iou(predict_bbox, true_coordinates)

    total_iou += iou_diff
    num_iou += 1
    if iou_diff >= IOU_THRESHOLD:
        num_passed_iou += 1

    with open(diff_results_txt, "a", encoding="utf-8") as f:
        f.write(f"{image_name}\n")
        f.write(f"IoU: {iou_diff:.3f}\n")
        f.write(f"OCR: {ocr_text}\n")
        f.write(f"Box GT: {true_coordinates}\n")
        f.write(f"Box Pred: {predict_bbox}\n")
        f.write("---\n")

if num_iou > 0:
    avg_iou = total_iou / num_iou
    pass_rate = (num_passed_iou / num_iou) * 100
else:
    avg_iou = 0.0
    pass_rate = 0.0

# Report the threshold that was actually applied instead of a hard-coded 0.7.
summary_avg = f"\n AVERAGE IoU over {num_iou} predictions: {avg_iou:0.4f}"
summary_pass = f"\n IoU pass rate (>= {IOU_THRESHOLD}) {pass_rate:0.2f}"

with open(diff_results_txt, "a", encoding="utf-8") as f:
    f.write(summary_avg)
    f.write(summary_pass)

print(summary_avg)
print(summary_pass)


In the training of CNN + CTC we try different hyperparameters combinations to select the best ones. The resulting images and txt files are saved in local folders.

In [None]:
#Hyperparameters combination
batch_sizes = [64, 32]
learning_rates = [0.001]
weight_decays = [1e-4, 5e-4]
epochs = [40, 60]

CHAR_LIST = sorted(set(PROVINCES+ALPHABETS+ADS))
PLATE_LENGTH = 8

NUM_CHAR = len(CHAR_LIST) + 1 #since we include the blank character

combinations = product(batch_sizes, learning_rates, weight_decays, epochs)

# Output folders used by every run; create them once up front.
for out_dir in ("models", "metrics_images", "results"):
    os.makedirs(out_dir, exist_ok=True)

#executing the training and validation for all the possible combinations to get the best one
for bs, lr, wd, ne in combinations:

    #Hyperparameters
    BATCH_SIZE = bs
    LR = lr
    WEIGHT_DECAY = wd
    NUM_EPOCHS = ne

    SAVE_NAME = f"n_epochs_{NUM_EPOCHS}_bs_{BATCH_SIZE}_LR_{LR}_wd_{WEIGHT_DECAY}"

    print(f"training with {SAVE_NAME}")

    # A fresh, randomly initialised model is trained for every combination.
    model = CNN_CTC_model(num_char=NUM_CHAR, hidden_size=256)
    ctc_loss = nn.CTCLoss(blank=0)

    preprocess = transforms.Compose([
        transforms.Grayscale(),             # convert to 1 channel
        transforms.Resize((48, 144)),       # resize to H=48, W=144
        transforms.ToTensor(),              # [C, H, W]
        transforms.Normalize((0.5,), (0.5,))
    ])

    train_dataloader, val_dataloader, test_dataloader = CCPDDataset.get_dataloaders(base_dir=DATASET_PATH_Y, batch_size=BATCH_SIZE, transform=preprocess, collate_fn = custom_collate_2)
    # Adam with L2 weight decay over all model parameters
    optimizer = Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    #optimizer = SGD(model.parameters(), lr=LR, momentum=0.9, weight_decay=WEIGHT_DECAY)

    #initialize the device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Per-epoch history, used for the plots and the final summary
    accuracy_val=[]
    accuracy_train=[]
    total_train_loss=[]
    char_accuracy_train =[]
    char_accuracy_val =[]

    epsilon = 1e-6
    #TRAIN LOOP
    for e in range(NUM_EPOCHS):
        evaluator = Evaluator()
        model.train()
        train_loss= 0.0
        train_acc = []
        val_acc = []
        train_char_acc =[]
        val_char_acc = []
        B_size = 0

        # iterate over all the batches of the training set
        for batch in tqdm(train_dataloader, desc=f"Epoch {e + 1}/{NUM_EPOCHS} [train]", leave=False):
            images = batch["cropped_image"]
            labels = batch["pdlpr_plate_idx"]

            images = [img.to(device) for img in images]
            labels = [lab.to(device) for lab in labels]

            # Stack per batch processing
            images = torch.stack(images)
            labels = torch.stack(labels)

            #CTC loss expects the targets as one flat vector, so all the label
            #indices of the batch are flattened into a single list
            flat_labels_list = labels.view(-1)
            #model output converted to log-probabilities, as required by CTC
            output_logits = model(images)
            output_probabilities = F.log_softmax(output_logits, dim=2)
            #the model outputs T frames for every sample of the batch
            T = output_logits.size(0)
            #get the current batch size
            B_size = images.size(0)
            #CTC needs the length of every input and of every target because
            #the targets are passed as one big vector
            input_lengths = torch.full((B_size,), T, dtype=torch.long).to(device)
            target_lengths = torch.full((B_size,), PLATE_LENGTH, dtype=torch.long).to(device)

            #CTC loss
            loss = ctc_loss(output_probabilities, flat_labels_list, input_lengths, target_lengths)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            # metrics are computed per batch, so the evaluator is reset first
            evaluator.reset()
            evaluator.update_baseline(output_logits, labels)

            metrics = evaluator.compute()
            #metrics for the whole batch
            mean_batch_train_char_acc = metrics["char_accuracy"]
            mean_batch_train_acc = metrics["seq_accuracy"]
            train_acc.append(mean_batch_train_acc)
            train_char_acc.append(mean_batch_train_char_acc)

        #compute the mean of the full and character accuracy for training
        #for the whole epoch
        mean_train_acc = sum(train_acc)/len(train_acc)
        mean_train_char_acc = sum(train_char_acc)/len(train_char_acc)
        # train_loss is a sum of per-batch losses, so the epoch mean divides by
        # the number of batches (not by the size of the last batch).
        train_loss = train_loss/len(train_acc)

        #append the result to lists in order to plot them
        accuracy_train.append(mean_train_acc)
        char_accuracy_train.append(mean_train_char_acc)
        total_train_loss.append(train_loss)

        #Validation phase
        model.eval()
        with torch.no_grad():
            for batch in tqdm(val_dataloader, desc=f"Epoch {e + 1}/{NUM_EPOCHS} [val]", leave=False):
                images = batch["cropped_image"]
                labels = batch["pdlpr_plate_idx"]

                images = [img.to(device) for img in images]
                labels = [lab.to(device) for lab in labels]

                images = torch.stack(images)
                labels = torch.stack(labels)

                output_logits = model(images)

                evaluator.reset()
                evaluator.update_baseline(output_logits, labels)
                metrics = evaluator.compute()

                #metrics for the whole batch
                mean_batch_val_char_acc = metrics["char_accuracy"]
                mean_batch_val_acc = metrics["seq_accuracy"]
                val_acc.append(mean_batch_val_acc)
                val_char_acc.append(mean_batch_val_char_acc)

        #compute the mean of the validation accuracies
        mean_val_acc = sum(val_acc)/len(val_acc)
        mean_val_char_acc = sum(val_char_acc)/len(val_char_acc)

        accuracy_val.append(mean_val_acc)
        char_accuracy_val.append(mean_val_char_acc)

        print(f"Epoch {e +1}/{NUM_EPOCHS} - train loss: {train_loss} - train acc: {mean_train_acc} - train char acc: {mean_train_char_acc} - val acc: {mean_val_acc} --  val char acc: {mean_val_char_acc}" )

    #Saving the model
    torch.save(model.state_dict(), f"models/CNNCTC-{SAVE_NAME}.pth")

    #Plotting the plate (sequence) accuracy for train and validation
    plt.figure(figsize=(8, 5))
    plt.plot(range(1, NUM_EPOCHS + 1), accuracy_train, label="train acc", marker='o')
    plt.plot(range(1, NUM_EPOCHS + 1), accuracy_val, label="validation acc", marker='s')
    plt.xlabel("epoch")
    plt.ylabel("accuracy")
    plt.title("Train and validation plate accuracy per epoch")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"metrics_images/train_validation_CNNCTC-{SAVE_NAME}.png")
    # Two figures are created per combination; close them so they do not pile
    # up in memory over the grid search.
    plt.close()

    #Plotting the character accuracy for train and validation
    plt.figure(figsize=(8, 5))
    plt.plot(range(1, NUM_EPOCHS + 1), char_accuracy_train, label="char train acc", marker='o')
    plt.plot(range(1, NUM_EPOCHS + 1), char_accuracy_val, label="char validation acc", marker='s')
    plt.xlabel("epoch")
    plt.ylabel("accuracy")
    plt.title("Train and validation character accuracy per epoch")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"metrics_images/char_train_validation_CNNCTC-{SAVE_NAME}.png")
    plt.close()

    #getting the last accuracy values for train and validation
    final_train_acc = accuracy_train[-1]
    final_val_acc = accuracy_val[-1]

    final_char_train_acc = char_accuracy_train[-1]
    final_char_val_acc = char_accuracy_val[-1]

    with open(f"results/CNNCTC-{SAVE_NAME}.txt", "w") as f:
        f.write(f"Final train accuracy: {final_train_acc:.4f}\n")
        f.write(f"Final validation accuracy: {final_val_acc:.4f}\n")
        f.write(f"Final character train accuracy: {final_char_train_acc:.4f}\n")
        f.write(f"Final character validation accuracy: {final_char_val_acc:.4f}\n")


## Test

### test Yolo

In [None]:
def get_model_name():
    """Return the YOLO weights file name derived from the training hyper-parameters.

    NOTE: this repeats the definition from the YOLO training section so the
    test section can be run on its own; keep the two in sync.
    """
    stem = f"yolov5_epochs{EPOCHS_TRAIN_Y}_bs{BATCH_SIZE_TRAIN_Y}_lr{LR_INIT_Y}_imgs{IMAGE_SIZE_Y}"
    return stem + ".pt"


def get_run_name():
    """Return the training run name: the model file name with ".pt" removed."""
    weights_file = get_model_name()
    return weights_file.replace(".pt", "")


def test_yolo():
    """Evaluate the trained YOLO detector on the test split.

    Loads ``runs/train/<run_name>/weights/best.pt`` and runs Ultralytics
    validation with ``split="test"``.

    Returns:
        tuple ``(results, model, output_dir)``: the object returned by
        ``model.val``, the loaded model, and the directory
        ``runs/test/<run_name>_TEST_iou<NN>`` used for the outputs.

    Raises:
        FileNotFoundError: if the best checkpoint of the run does not exist.
    """
    # Retrieve the name of the training run
    run_name = get_run_name()

    # Load the actual trained model weights --> this ensures that testing is run on the best version of the model
    model_path = Path(f"runs/train") / run_name / "weights" / "best.pt"

    if not model_path.exists():
        raise FileNotFoundError(f"Model file not found at {model_path}")

    # Name the output after the run (not after the stem of "best.pt") and
    # derive the suffix from IOU_THRESHOLD instead of a hard-coded 70, so the
    # returned directory is the one the validator writes to.
    output_name = f"{run_name}_TEST_iou{int(IOU_THRESHOLD * 100)}"
    output_dir = Path(f"runs/test") / output_name
    output_dir.mkdir(parents=True, exist_ok=True)

    model = YOLO(model_path)

    # NOTE(review): in Ultralytics `iou` is documented as the NMS IoU
    # threshold — confirm this is the intended use of IOU_THRESHOLD.
    results = model.val(
        data     = f"ccpd.yaml",
        split    = "test",
        iou      = IOU_THRESHOLD,
        device   = "cpu",
        name     = output_name,
        project  = f"runs/test",
        exist_ok = True     # reuse output_dir instead of creating "<name>2", "<name>3", ...
    )

    return results, model, output_dir


# Checkpoint evaluated by test_yolo(): derived from the current hyper-parameters
# rather than from a hard-coded run directory.
run_name = get_run_name()
model_path = f"runs/train/{run_name}/weights/best.pt"

# TESTING
results, model_test, test_output_dir = test_yolo()

# Name of the folder that holds the test outputs
output_name = test_output_dir.name

# Save metrics in a file
metrics_path = test_output_dir / "test_metrics.txt"
test_output_dir.mkdir(parents=True, exist_ok=True)
with open(metrics_path, "w") as f:
    f.write(f"Model: {model_path}\n")
    f.write(f"IoU Threshold: {IOU_THRESHOLD}\n\n")
    f.write(f"mAP@0.5:      {results.box.map50:.4f}\n")
    f.write(f"mAP@0.5:0.95: {results.box.map:.4f}\n")
    f.write(f"Precision:    {results.box.mp:.4f}\n")
    f.write(f"Recall:       {results.box.mr:.4f}\n")


# Compute IoU
image_dir = Path(f"dataset/images/test")

# One IoU value per predicted box, collected over the whole test set.
iou_list = []

# Loop over images
for image_path in sorted(image_dir.glob("*.jpg")):
    # Predict (at most 5 boxes per image)
    result = model_test(str(image_path), max_det=5, verbose = False)[0]
    predictions = result.boxes.xyxy.cpu().numpy()  # shape: (N, 4)

    real_box = load_gt_box_from_label_test(image_path)
    if real_box is None:
        # skip image if no GT or invalid
        continue

    # Compute IoU between every predicted box and the true one
    for predicted_box in predictions:
        iou = compute_iou(predicted_box, real_box)
        iou_list.append(iou)

# Compute average among all iou values (0.0 when nothing was predicted)
if iou_list:
    mean_iou = sum(iou_list) / len(iou_list)
else:
    mean_iou = 0.0

# Save in test_metrics.txt
with open(metrics_path, "a") as f:
    f.write(f"Mean IoU over test set: {mean_iou:.4f}\n")

print(f"[INFO] Mean IoU saved to {metrics_path}")



print("\n TESTING complete!")
# test_output_dir already is the full path of the output folder.
print(f"Results saved to: {test_output_dir}")



### Test PDLPR

Initializing parameters and load trained pdlpr model

In [None]:
# Rebuild the preprocessing pipeline and the data loaders for the PDLPR test
# section, so it can run without the training cells.
if __name__ == "__main__":
    # Resize every crop to 48x144, convert it to a tensor and normalise each of
    # the 3 channels with mean 0.5 and std 0.5.
    transform = transforms.Compose([
        transforms.Resize((48, 144)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5]*3, std=[0.5]*3)
    ])

    # loading data

    # NOTE(review): `dataset` is not used by the cells visible here — confirm
    # whether it is still needed.
    dataset = CCPDDataset(base_dir=f"dataset", transform=transform)
    train_loader, val_loader, test_loader = CCPDDataset.get_dataloaders(
        base_dir=f"dataset",
        batch_size=BATCH_SIZE_PDLPR,
        transform=transform,
        collate_fn= custom_collate
    )

In [None]:
# loading vocabulary
# Reuse vocab.json when it exists; otherwise build it from the training labels.
if os.path.exists(f'vocab.json'):
    print(f"vocab.json found — loading...")
    char_idx, idx_char = load_vocab(f'vocab.json')
else:
    print("vocab.json not found — building it from labels...")
    char_idx, idx_char = build_vocab(f"dataset/labels_pdlpr/train", "vocab.json")

# The decoder's classifier is sized from the vocabulary; it must match the
# classifier stored in the checkpoint for load_state_dict below to succeed.
vocab_size = len(char_idx)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
evaluator = Evaluator(idx_char)
ctc_loss = nn.CTCLoss(blank=0)
decoder_seq_len = 18  # From ParallelDecoder
# The modules are created in training mode here; test() switches them to eval mode.
decoder = ParallelDecoder(dim=512, vocab_size=vocab_size, seq_len=decoder_seq_len).to(device).train()
encoder = PDLPR_Encoder().to(device).train()
igfe = IGFE().to(device).train()

# load pre trained model if needed
# NOTE(review): torch.load unpickles the file; this checkpoint is downloaded
# from Google Drive at the top of the notebook, so only load it if the source
# is trusted.
if os.path.exists(f'models/pdlpr_10_0.0001_16.pt'):
    print("checkpoint found. Loading state dict......")
    checkpoint = torch.load( f'models/pdlpr_10_0.0001_16.pt', map_location=device)
    igfe.load_state_dict(checkpoint["igfe_state_dict"])
    encoder.load_state_dict(checkpoint["encoder_state_dict"])
    decoder.load_state_dict(checkpoint["decoder_state_dict"])
    #optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

else:
    # NOTE: execution continues with randomly initialised weights in this case.
    print("checkpoint not found. Please train the model first")

Test function PDLPR

In [None]:
def test(model_parts, evaluator, test_loader, char_idx, idx_char, device):
    """Evaluate the PDLPR recogniser (IGFE -> encoder -> decoder) on the test loader.

    Args:
        model_parts: tuple ``(igfe, encoder, decoder)``; all three are put in
            eval mode.
        evaluator: kept for interface compatibility. It is replaced by a fresh
            ``Evaluator(idx_char)`` so the metrics only cover this run.
        test_loader: iterable of batches providing ``"cropped_image"`` and
            ``"pdlpr_plate_string"``.
        char_idx: character -> index mapping. Extended in place when a label
            contains a character that is not in the vocabulary yet.
        idx_char: index -> character mapping, extended together with
            ``char_idx``.
        device: device the image batch is moved to before the forward pass.

    Returns:
        ``(metrics, test_char_accs, test_seq_accs)``: the result of the final
        ``evaluator.compute()`` and the ``"char_accuracy"`` /
        ``"seq_accuracy"`` values reported by ``evaluator.compute()`` after
        each batch.

    Side effects:
        Rewrites ``vocab.json`` with the (possibly extended) ``char_idx`` and
        prints the metrics through ``evaluator.print()``.
    """
    igfe, encoder, decoder = model_parts

    # keep track of metrics for plot
    test_seq_accs = []
    test_char_accs = []

    igfe.eval()
    encoder.eval()
    decoder.eval()

    # A new evaluator is used on purpose: the one passed in is ignored.
    evaluator = Evaluator(idx_char)
    pbar = tqdm(test_loader, desc="Testing")

    with torch.no_grad():
        for batch in pbar:
            images = batch["cropped_image"].to(device)
            label_strs = batch["pdlpr_plate_string"]

            unknown = {c for s in label_strs for c in s if c not in char_idx}
            if unknown:
                # Extend the vocabulary in label order so new indices are
                # assigned deterministically.
                for c in ''.join(label_strs):
                    if c not in char_idx:
                        idx = len(char_idx)
                        char_idx[c] = idx
                        idx_char[idx] = c
                # Report the characters that were actually unknown (the loop
                # variable above only holds the last character of the labels).
                print(f"unknown character(s) {sorted(unknown)}. Vocabulary updated")
                # update the decoder with the new vocabulary size but keeping the old weights
                decoder.update_vocab_size(len(char_idx))

            # Forward
            features = igfe(images)
            encoded = encoder(features)
            logits = decoder(encoded)

            evaluator.update(logits, label_strs)
            metrics = evaluator.compute()
            test_seq_accs.append(metrics["seq_accuracy"])
            test_char_accs.append(metrics["char_accuracy"])

    # saving the new vocabulary
    with open("vocab.json", "w", encoding="utf-8") as f:
        json.dump(char_idx, f, ensure_ascii=False, indent=2)

    metrics = evaluator.compute()
    evaluator.print()
    return metrics, test_char_accs, test_seq_accs

Start testing

In [None]:
# Run the PDLPR evaluation on the cropped-plate test loader and keep both the
# final metrics and the per-batch accuracy histories.
print("Starting testing............")
test_metrics, test_char_accs, test_seq_accs = test(
    device=device,
    idx_char=idx_char,
    char_idx=char_idx,
    test_loader=test_loader,
    evaluator=evaluator,
    model_parts=(igfe, encoder, decoder),
)

### Test pipeline yolo + PDLPR

Function that crops the images using trained yolo model

In [None]:
def crop_image_yolo(yolo_model, image):
    """Crop ``image`` to the first bounding box returned by ``yolo_model``.

    Args:
        yolo_model: callable invoked as ``yolo_model(image, verbose=False)``;
            the first element of its result must expose ``.boxes``.
        image: object with a ``crop((x1, y1, x2, y2))`` method (e.g. a PIL
            image).

    Returns:
        The cropped image, or ``None`` (after printing a message) when the
        model reports no boxes.
    """
    detections = yolo_model(image, verbose=False)[0].boxes

    if detections is not None and len(detections) > 0:
        # Only the first box is used; coordinates are truncated to integers.
        left, top, right, bottom = detections.xyxy[0].cpu().numpy().astype(int)
        return image.crop((left, top, right, bottom))

    print("No car plate detected")
    return None

Function that tests the total pipeline

In [None]:
def test(model_parts, yolo_model, transform, evaluator, test_loader, char_idx, idx_char, device):
    """Evaluate the full YOLO + PDLPR pipeline on the test loader.

    For every batch the first sample is taken, the plate is cropped with
    ``crop_image_yolo`` and the crop is recognised by IGFE -> encoder ->
    decoder. Images on which YOLO finds no plate are skipped and therefore do
    not contribute to the metrics.

    NOTE: this definition replaces the earlier PDLPR-only ``test`` function of
    the notebook, since both share the same name.

    Args:
        model_parts: tuple ``(igfe, encoder, decoder)``; all three are put in
            eval mode.
        yolo_model: detector passed to ``crop_image_yolo``.
        transform: callable turning the cropped image into a tensor.
        evaluator: kept for interface compatibility. It is replaced by a fresh
            ``Evaluator(idx_char)`` so the metrics only cover this run.
        test_loader: iterable of batches; only ``batch[0]`` is used, so it is
            meant to be run with a batch size of 1.
        char_idx: character -> index mapping, extended in place when a label
            contains an unseen character.
        idx_char: index -> character mapping, extended together with
            ``char_idx``.
        device: device the input tensor is moved to.

    Returns:
        ``(metrics, predicted_strings)``: the result of ``evaluator.compute()``
        and the list of ``evaluator.greedy_decode(logits)`` outputs, one per
        processed image.
    """
    igfe, encoder, decoder = model_parts

    igfe.eval()
    encoder.eval()
    decoder.eval()

    predicted_strings = []

    # A new evaluator is used on purpose: the one passed in is ignored.
    evaluator = Evaluator(idx_char)
    pbar = tqdm(test_loader, desc="Testing")

    with torch.no_grad():
        for batch in pbar:
            sample = batch[0]
            img = sample["full_image_original"]
            label_strs = sample["pdlpr_plate_string"]

            # Crop using YOLO
            cropped_img = crop_image_yolo(yolo_model, img)
            if cropped_img is None:
                print("No plate detected — skipping image")
                continue

            # Single-image batch: add the batch dimension and move to device.
            images = transform(cropped_img).unsqueeze(0).to(device)

            unknown = {c for s in label_strs for c in s if c not in char_idx}
            if unknown:
                # Extend the vocabulary in label order so new indices are
                # assigned deterministically.
                for c in ''.join(label_strs):
                    if c not in char_idx:
                        idx = len(char_idx)
                        char_idx[c] = idx
                        idx_char[idx] = c
                # Report the characters that were actually unknown (the loop
                # variable above only holds the last character of the label).
                print(f"unknown character(s) {sorted(unknown)}. Vocabulary updated")
                # update the decoder with the new vocabulary size but keeping the old weights
                decoder.update_vocab_size(len(char_idx))

            # Forward
            features = igfe(images)
            encoded = encoder(features)
            logits = decoder(encoded)

            evaluator.update(logits, [label_strs])
            pred_str = evaluator.greedy_decode(logits)
            predicted_strings.append(pred_str)

        metrics = evaluator.compute()
        evaluator.print()

    return metrics, predicted_strings

In [None]:
# Trained YOLO detector used for the first stage of the pipeline.
yolo_model = YOLO("runs/train/yolov5_epochs20_bs8_lr0.001_imgs640/weights/best.pt")

# Test data, one image per batch (the pipeline test reads batch[0] only).
dataset = CCPDDataset(transform=transform, base_dir="dataset")
_, _, test_loader = CCPDDataset.get_dataloaders(
    collate_fn=custom_collate_simple,
    transform=transform,
    batch_size=1,
    base_dir="dataset",
)

# Vocabulary: reuse the saved mapping when it exists, otherwise build it from
# the training labels.
if not os.path.exists('vocab.json'):
    print("vocab.json not found — building it from labels...")
    char2idx, idx2char = build_vocab("dataset/labels_pdlpr/train", "vocab.json")
else:
    print("vocab.json found — loading...")
    char2idx, idx2char = load_vocab('vocab.json')

vocab_size = len(char2idx)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

evaluator = Evaluator(idx2char)

In [None]:
# Run the YOLO + PDLPR pipeline on the test loader and show a few predictions.
print("start testing.........")
metrics, predicted_strings = test(
    device=device,
    idx_char=idx2char,
    char_idx=char2idx,
    test_loader=test_loader,
    evaluator=evaluator,
    transform=transform,
    yolo_model=yolo_model,
    model_parts=(igfe, encoder, decoder),
)

first_predictions = predicted_strings[:5]
print("First 5 predicted strings:", first_predictions)

### test Baseline method

This cell tests the second part of the baseline method, the CNN + CTC recogniser. Before running it, make sure the Evaluator class (found in Train) has been initialized.

In [None]:
CHAR_LIST = sorted(set(PROVINCES+ALPHABETS+ADS))
PLATE_LENGTH = 8

NUM_CHAR = len(CHAR_LIST) + 1 #since we include the blank character

# Hyperparameters: they only determine the checkpoint / result file names here.
BATCH_SIZE = 32
LR = 0.001
WEIGHT_DECAY = 0.0001
NUM_EPOCHS = 60

SAVE_NAME = f"n_epochs_{NUM_EPOCHS}_bs_{BATCH_SIZE}_LR_{LR}_wd_{WEIGHT_DECAY}"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CNN_CTC_model(num_char=NUM_CHAR, hidden_size=256)
ctc_loss = nn.CTCLoss(blank=0)

# Grayscale 48x144 input, normalised with mean 0.5 / std 0.5.
preprocess = transforms.Compose([
    transforms.Grayscale(),
    transforms.Resize((48, 144)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

_, _, test_dataloader = CCPDDataset.get_dataloaders(base_dir=DATASET_PATH_Y, batch_size=BATCH_SIZE, transform=preprocess, collate_fn= custom_collate_2)

#TESTING PHASE
if os.path.exists(f"models/CNNCTC-{SAVE_NAME}.pth"):
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
    model.load_state_dict(torch.load(f"models/CNNCTC-{SAVE_NAME}.pth", map_location=device))
else:
    # Without a checkpoint the metrics below come from an untrained model.
    print("model not found. Please train the model first")
# Always move the model: the inputs are moved to `device` in the loop below.
model.to(device)
model.eval()

test_acc = []
char_test_acc = []
batch_sizes = []

evaluator = Evaluator()
# Loop through the test data and collect the accuracy scores of every batch.
with torch.no_grad():
    for batch in test_dataloader:
        images = torch.stack([img.to(device) for img in batch["cropped_image"]])
        labels = torch.stack([lab.to(device) for lab in batch["pdlpr_plate_idx"]])

        output_logits = model(images)

        # The evaluator is reset so that compute() describes this batch only.
        evaluator.reset()
        evaluator.update_baseline(output_logits, labels)
        metrics = evaluator.compute()

        test_acc.append(metrics["seq_accuracy"])
        char_test_acc.append(metrics["char_accuracy"])
        batch_sizes.append(labels.size(0))

total_samples = sum(batch_sizes)
if total_samples == 0:
    raise RuntimeError("The test dataloader produced no samples; nothing to evaluate")

# Weight every batch by its size so that a smaller last batch does not bias
# the mean (all stacked labels of a batch have the same length).
mean_acc = sum(acc * n for acc, n in zip(test_acc, batch_sizes)) / total_samples
mean_char_acc = sum(acc * n for acc, n in zip(char_test_acc, batch_sizes)) / total_samples
print(f"Test result accuracy: {mean_acc:.4f}")
print(f"Test result char accuracy: {mean_char_acc:.4f}")

# Save the test accuracies, creating the output folder when it is missing.
os.makedirs("results", exist_ok=True)
with open(f"results/CNNCTC-test-{SAVE_NAME}.txt", "w") as f:
    f.write(f"Final testing accuracy: {mean_acc:.4f}\n")
    f.write(f"Final testing character accuracy: {mean_char_acc:.4f}\n")

### Test pipeline baseline method

Here we implement the pipeline test, which evaluates the two parts of a method working together. For the baseline, the plate bounding box is first computed with traditional methods, the image is then cropped to that box, and finally character recognition is performed on the crop with the CNN + CTC model.

The function plate_detector is defined in utils.

In [None]:
import time
reader = easyocr.Reader(['ch_sim', 'en'])

CHAR_LIST = sorted(set(PROVINCES+ALPHABETS+ADS))
PLATE_LENGTH = 8

# Fill the shared vocabulary dictionaries; index 0 is reserved for the CTC blank.
for idx, char in enumerate(CHAR_LIST):
    CHAR_IDX[char] = idx + 1  # start from 1
    IDX_CHAR[idx + 1] = char
IDX_CHAR[0] = '_'  # blank character for CTC

NUM_CHAR = len(CHAR_LIST) + 1 #since we include the blank character

# Hyperparameters: they only determine the checkpoint / result file names here.
BATCH_SIZE = 32
LR = 0.001
WEIGHT_DECAY = 0.0001
NUM_EPOCHS = 60
SAVE_NAME = f"n_epochs_{NUM_EPOCHS}_bs_{BATCH_SIZE}_LR_{LR}_wd_{WEIGHT_DECAY}"

cnn_ctc_model = CNN_CTC_model(num_char=NUM_CHAR, hidden_size=256)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Grayscale 48x144 input, normalised with mean 0.5 / std 0.5.
preprocess = transforms.Compose([
    transforms.Grayscale(),
    transforms.Resize((48, 144)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

preprocess_dataset = transforms.Compose([
    transforms.ToTensor()
])

# Load the CNN + CTC model used for the second part of the pipeline.
print(f"models/CNNCTC-{SAVE_NAME}.pth")

if os.path.exists(f"models/CNNCTC-{SAVE_NAME}.pth"):
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
    cnn_ctc_model.load_state_dict(torch.load(f"models/CNNCTC-{SAVE_NAME}.pth", map_location=device))
else:
    # Without a checkpoint the metrics below come from an untrained model.
    print("model not found. Please train the model first")
# Always move the model: the inputs are moved to `device` in the loop below.
cnn_ctc_model.to(device)

# Directory containing the test images.
image_paths = Path("dataset/images/test")

evaluator = Evaluator(idx2char=IDX_CHAR)
cnn_ctc_model.eval()
plate_accuracies = []
char_accuracies = []
iou_scores = []

# Materialise the file list so that the progress message shows the real total.
test_images = list(image_paths.glob("*.jpg"))
total_images = len(test_images)

for i, image_path in enumerate(test_images):
    if i % 10 == 0:
        print(f"processing image {i+1}/{total_images}")
    plate_label_path = Path("dataset/labels_pdlpr/test/") / (image_path.stem + ".txt")

    with open(plate_label_path, "r", encoding="utf-8") as f:
        pdlpr_plate_str = f.readline().strip()

    # The file name stores the annotations in "-" separated fields: field 2
    # holds the two box corners ("x&y_x&y"), field 4 the plate character ids.
    fields = image_path.stem.split("-")
    corners = fields[2].split("_")
    x1, y1 = map(int, corners[0].split("&"))
    x2, y2 = map(int, corners[1].split("&"))
    true_box = [x1, y1, x2, y2]

    plate_id = [int(c) for c in fields[4].split("_")]

    # converting the index from the name to the index from the
    # unified vocabulary
    plate_id = target_to_index(plate_id)

    true_plate_idx = torch.tensor(plate_id, dtype=torch.long).to(device)

    image = Image.open(image_path).convert("RGB")
    result_detector = plate_detector(image_path, true_box)
    if result_detector is None:
        # Undetected plates are skipped, so they are not part of the metrics.
        print(f"image {i}: No plate detected")
        continue

    bbx, text = result_detector
    print(i, bbx, text)
    iou = compute_iou(bbx, true_box)
    print(f"image {i}: iou score {iou}")
    iou_scores.append(iou)
    x1, y1, x2, y2 = bbx
    cropped_image = image.crop((x1, y1, x2, y2))

    # Clear the current figure first: otherwise every iteration stacks one more
    # image on the same axes and memory grows with the size of the test set.
    plt.clf()
    plt.imshow(cropped_image, cmap="gray")
    plt.title("Cropped Plate")
    plt.axis("off")

    processed_image = preprocess(cropped_image).unsqueeze(0).to(device)
    with torch.no_grad():
        # computing the predictions with the cnn ctc model
        logits_model_output = cnn_ctc_model(processed_image)
        # The evaluator is reset so that compute() describes this image only.
        evaluator.reset()
        evaluator.update_baseline(logits_model_output, [true_plate_idx])
        metrics = evaluator.compute()

        char_acc = metrics["char_accuracy"]
        plate_acc = metrics["seq_accuracy"]
        char_accuracies.append(char_acc)
        plate_accuracies.append(plate_acc)
        print(f"  Char acc: {char_acc:.2f}, Seq acc: {plate_acc:.2f}\n")
        plate_prediction = evaluator.greedy_decode_idx(logits_model_output)[0]
        print(plate_prediction[:8])
        plate_string = index_to_target(plate_prediction[:8])
        print(f"predicted_plate: {''.join(plate_string)}, original plate: {pdlpr_plate_str}")

if not char_accuracies:
    raise RuntimeError("No plate was detected in any test image; pipeline metrics cannot be computed")

mean_char_acc = sum(char_accuracies) / len(char_accuracies)
mean_plate_acc = sum(plate_accuracies)/len(plate_accuracies)
mean_iou = sum(iou_scores)/len(iou_scores)
print(f"Pipeline test result plate accuracy: {mean_plate_acc:.4f}")
print(f"Pipeline test result char accuracy: {mean_char_acc:.4f}")
print(f"Pipeline test result iou score: {mean_iou:.4f}")

# Save the pipeline results, creating the output folder when it is missing.
os.makedirs("results", exist_ok=True)
with open(f"results/pipeline-baseline-test-{SAVE_NAME}.txt", "w") as f:
    f.write(f"Final testing plate accuracy: {mean_plate_acc:.4f}\n")
    f.write(f"Final testing character accuracy: {mean_char_acc:.4f}\n")
    f.write(f"Final testing iou score: {mean_iou:.4f}\n")