# DETECTION
In this section, we first import the necessary libraries. PyTorch is a tool used for creating and training neural networks. NumPy and Pandas are used to efficiently manage large datasets and perform numerical operations. Torchvision.transforms and PIL (Pillow) handle the tasks of loading and pre-processing images (sizing, normalizing). Finally, Matplotlib and Tqdm provide auxiliary functions such as visualization and progress bars.

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torchvision.transforms.functional as TF
from torchvision.ops import box_iou, nms
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.nn.functional as F
from sklearn.cluster import KMeans
import time
import copy
from tqdm.auto import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
import zipfile
import os
import random
import xml.etree.ElementTree as ET
import cv2
import warnings
warnings.filterwarnings('ignore')

In [None]:
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy and
            PyTorch (CPU and all CUDA devices).
    """
    # The augmentation code draws from the stdlib ``random`` module, so it has
    # to be seeded as well or the training batches differ between runs.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Give up cuDNN autotuning in exchange for deterministic kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Fix all RNGs once, before anything random happens in the notebook.
SEED = 42
set_seed(SEED)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'\n✓ Device: {device}')
if device.type == 'cuda':
    print(f'✓ GPU: {torch.cuda.get_device_name(0)}')


✓ Device: cuda
✓ GPU: Tesla T4


In [None]:
zip_path = '/content/neu-surface.zip'
extract_path = '/content/neu_dataset'

# Unpack only on the first run; later runs reuse the already extracted tree.
if not os.path.exists(extract_path):
    with zipfile.ZipFile(zip_path, mode='r') as archive:
        archive.extractall(path=extract_path)

# Root folder of the dataset inside the extracted archive.
dataset_base = os.path.join(extract_path, 'NEU-DET')

In object detection projects, labeling information is typically found in XML files in Pascal VOC format. This function (parse_voc_xml) reads these XML files and parses the structural information within them. For each file, it extracts the image name, its original width and height, and, most importantly, the class label and bounding box coordinates (xmin, ymin, xmax, ymax) of each defect in the image. This is the first step in transforming the original text data into a structured list that the model can understand.

In [None]:
def parse_voc_xml(xml_file):
    """Read one Pascal VOC annotation file into a plain dictionary.

    Args:
        xml_file: Path or file object accepted by ``ET.parse``.

    Returns:
        dict with the keys ``filename`` (text of ``<filename>``), ``width``
        and ``height`` (ints read from ``<size>``), ``boxes`` (one
        ``[xmin, ymin, xmax, ymax]`` list of ints per ``<object>``) and
        ``labels`` (the ``<name>`` text of each ``<object>``, same order).
    """
    root = ET.parse(xml_file).getroot()

    # Image-level metadata
    size_node = root.find('size')
    width = int(size_node.find('width').text)
    height = int(size_node.find('height').text)
    filename = root.find('filename').text

    # One entry per annotated object, in document order
    corner_tags = ('xmin', 'ymin', 'xmax', 'ymax')
    boxes, labels = [], []
    for obj in root.findall('object'):
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        boxes.append([int(bndbox.find(tag).text) for tag in corner_tags])
        labels.append(name)

    return {
        'filename': filename,
        'width': width,
        'height': height,
        'boxes': boxes,
        'labels': labels,
    }

# Smoke test: parse the first annotation file of the training split and show it.
dataset_base = '/content/neu_dataset/NEU-DET'
ann_dir = os.path.join(dataset_base, 'train', 'annotations')

sample_xml_files = [name for name in os.listdir(ann_dir) if name.endswith('.xml')]
if sample_xml_files:
    sample_xml = os.path.join(ann_dir, sample_xml_files[0])
    sample_annotation = parse_voc_xml(sample_xml)
    # One print call, same five lines of output.
    print("\n".join([
        f"  Filename: {sample_annotation['filename']}",
        f"  Image size: {sample_annotation['width']}x{sample_annotation['height']}",
        f"  Number of objects: {len(sample_annotation['boxes'])}",
        f"  Labels: {sample_annotation['labels']}",
        f"  Boxes: {sample_annotation['boxes']}",
    ]))

  Filename: rolled-in_scale_71.jpg
  Image size: 200x200
  Number of objects: 2
  Labels: ['rolled-in_scale', 'rolled-in_scale']
  Boxes: [[81, 1, 187, 82], [79, 142, 135, 171]]


The `load_detection_dataset` function loads the annotation data of one split into memory by scanning the split's directory structure, reading each XML file, and matching it to the corresponding image file. During this process, it identifies all unique defect classes in the dataset (derived from the annotation file names) and assigns an integer index to each class. It also skips samples that raise an error during loading, have no matching image file, or contain no valid labels. The result is a list of samples containing image paths, box coordinates, and class indices, ready for use in the training and validation sets.

In [None]:
def load_detection_dataset(base_path, split='train'):
    """Index one split of the dataset from its Pascal VOC annotation files.

    Annotations are read from ``<base_path>/<split>/annotations/*.xml`` and
    images are looked up in ``<base_path>/<split>/images/<class_name>/``.
    The class of a file is its name without the extension and without the
    last ``_``-separated token (``rolled-in_scale_71.xml`` ->
    ``rolled-in_scale``).

    Args:
        base_path: Dataset root directory.
        split: Name of the split sub-directory to load.

    Returns:
        ``(dataset, class_names)``. ``dataset`` is a list of dicts with the
        keys ``image_path``, ``boxes`` (FloatTensor ``[N, 4]`` in annotation
        coordinates), ``labels`` (LongTensor ``[N]``), ``width``, ``height``
        and ``class_name``; ``class_names`` is the sorted list of classes.
        Returns ``(None, None)`` when the annotation directory is missing.
    """
    annotations_path = os.path.join(base_path, split, 'annotations')
    images_path = os.path.join(base_path, split, 'images')

    if not os.path.exists(annotations_path):
        print(f"ERROR: {annotations_path} not found!")
        return None, None

    print(f"\n{split.upper()} set loading...")
    print(f"  Annotations: {annotations_path}")
    print(f"  Images: {images_path}")

    # os.listdir returns entries in arbitrary order; sort so that the sample
    # order (and therefore seeded shuffling) is reproducible.
    xml_files = sorted(f for f in os.listdir(annotations_path) if f.endswith('.xml'))
    print(f"  Found {len(xml_files)} XML files")

    def class_from_filename(xml_file):
        # "<class>_<id>.xml" -> "<class>"; the class itself may contain "_".
        return '_'.join(xml_file.replace('.xml', '').split('_')[:-1])

    class_names = sorted({class_from_filename(f) for f in xml_files})
    class_to_idx = {class_name: idx for idx, class_name in enumerate(class_names)}
    print(f"  Classes detected: {class_names}")

    dataset = []
    class_counts = {name: 0 for name in class_names}
    skipped = 0

    for xml_file in xml_files:
        xml_path = os.path.join(annotations_path, xml_file)

        # Best-effort loading: a broken file is reported and skipped instead
        # of aborting the whole split.
        try:
            ann = parse_voc_xml(xml_path)
            class_name = class_from_filename(xml_file)
            img_name = ann['filename']

            img_path = os.path.join(images_path, class_name, img_name)
            if not os.path.exists(img_path):
                # The <filename> extension may not match the file on disk.
                img_base = os.path.splitext(img_name)[0]
                img_path = None
                for ext in ['.bmp', '.jpg', '.png', '.jpeg']:
                    candidate = os.path.join(images_path, class_name, img_base + ext)
                    if os.path.exists(candidate):
                        img_path = candidate
                        break
                if img_path is None:
                    skipped += 1
                    continue

            # Filter boxes and labels together so both stay the same length
            # when an object carries a label that is not a known class.
            known = [
                (box, class_to_idx[label])
                for box, label in zip(ann['boxes'], ann['labels'])
                if label in class_to_idx
            ]
            if not known:
                skipped += 1
                continue

            dataset.append({
                'image_path': img_path,
                'boxes': torch.FloatTensor([box for box, _ in known]),  # [N, 4]
                'labels': torch.LongTensor([idx for _, idx in known]),  # [N]
                'width': ann['width'],
                'height': ann['height'],
                'class_name': class_name
            })
            class_counts[class_name] += 1

        except Exception as e:
            print(f"  Warning: Error loading {xml_file}: {e}")
            continue

    if skipped:
        print(f"  Skipped {skipped} annotation file(s) without a matching image or known label")

    print(f"\n  Loaded samples per class:")
    for class_name in class_names:
        print(f"    {class_name}: {class_counts[class_name]}")

    return dataset, class_names
# Load train and validation sets
train_dataset, class_names = load_detection_dataset(dataset_base, 'train')
val_dataset, _ = load_detection_dataset(dataset_base, 'validation')

# load_detection_dataset returns (None, None) for a missing split directory,
# so each length is only reported when that split was actually loaded.
if train_dataset is not None:
    print(f"\n✓ Train samples: {len(train_dataset)}")
    if val_dataset is not None:
        print(f"✓ Validation samples: {len(val_dataset)}")
    print(f"✓ Classes: {class_names}")


TRAIN set loading...
  Annotations: /content/neu_dataset/NEU-DET/train/annotations
  Images: /content/neu_dataset/NEU-DET/train/images
  Found 1439 XML files
  Classes detected: ['crazing', 'inclusion', 'patches', 'pitted_surface', 'rolled-in_scale', 'scratches']

  Loaded samples per class:
    crazing: 239
    inclusion: 240
    patches: 240
    pitted_surface: 240
    rolled-in_scale: 240
    scratches: 240

VALIDATION set loading...
  Annotations: /content/neu_dataset/NEU-DET/validation/annotations
  Images: /content/neu_dataset/NEU-DET/validation/images
  Found 361 XML files
  Classes detected: ['crazing', 'inclusion', 'patches', 'pitted_surface', 'rolled-in_scale', 'scratches']

  Loaded samples per class:
    crazing: 60
    inclusion: 60
    patches: 60
    pitted_surface: 60
    rolled-in_scale: 60
    scratches: 60

✓ Train samples: 1439
✓ Validation samples: 360
✓ Classes: ['crazing', 'inclusion', 'patches', 'pitted_surface', 'rolled-in_scale', 'scratches']


Understanding the characteristics of the dataset before starting the training is crucial for model selection. This analysis examines the total number of boxes, the average number of defects per image, and the distribution of these defects. Additionally, the number of boxes in each defect class (class imbalance) and statistics of the box areas are calculated. These statistics help us decide whether techniques such as class weighting are needed when some classes have too few samples for the model to learn them well.

In [None]:
def analyze_annotations(dataset, class_names):
    """Print box-count, class and box-area statistics for a loaded split.

    Args:
        dataset: List of sample dicts with a ``boxes`` tensor (``[N, 4]`` as
            ``xmin, ymin, xmax, ymax``) and a ``labels`` tensor of class
            indices.
        class_names: Class names; index ``i`` names label ``i``.

    Returns:
        None. All results are printed.
    """
    print(f"\nTotal images: {len(dataset)}")
    if not dataset:
        # Nothing to aggregate; min()/max() below would fail on empty input.
        print("Total boxes: 0")
        return

    boxes_per_image = []
    class_counts = {i: 0 for i in range(len(class_names))}
    box_sizes = []

    for sample in dataset:
        # reshape keeps the column indexing valid for samples without boxes.
        boxes = sample['boxes'].reshape(-1, 4)
        boxes_per_image.append(len(boxes))

        # Class distribution
        for label in sample['labels'].tolist():
            class_counts[label] += 1

        # Box areas
        widths = boxes[:, 2] - boxes[:, 0]
        heights = boxes[:, 3] - boxes[:, 1]
        box_sizes.extend((widths * heights).tolist())

    print(f"Total boxes: {sum(boxes_per_image)}")
    print(f"Boxes per image: {np.mean(boxes_per_image):.2f} ± {np.std(boxes_per_image):.2f}")
    print(f"Min boxes: {min(boxes_per_image)}, Max boxes: {max(boxes_per_image)}")

    print(f"\nClass distribution:")
    for i, class_name in enumerate(class_names):
        print(f"  {class_name}: {class_counts[i]} boxes")

    if not box_sizes:
        return

    print(f"\nBox sizes (area):")
    print(f"  Mean: {np.mean(box_sizes):.1f} pixels²")
    print(f"  Std: {np.std(box_sizes):.1f} pixels²")
    print(f"  Min: {min(box_sizes):.1f}, Max: {max(box_sizes):.1f}")

# Print box-count, class and box-area statistics for the training split.
analyze_annotations(train_dataset, class_names)


Total images: 1439
Total boxes: 3332
Boxes per image: 2.32 ± 1.28
Min boxes: 1, Max boxes: 9

Class distribution:
  crazing: 524 boxes
  inclusion: 852 boxes
  patches: 688 boxes
  pitted_surface: 345 boxes
  rolled-in_scale: 496 boxes
  scratches: 427 boxes

Box sizes (area):
  Mean: 6856.0 pixels²
  Std: 7633.7 pixels²
  Min: 108.0, Max: 39601.0


The NEUDetectionDataset class implements PyTorch's `Dataset` interface for loading and preprocessing the data. When called, the `__getitem__` method loads the image from disk and retrieves the corresponding box/label data. Images are resized to the specified input size (416x416), and box coordinates are normalized by the original image size to the range 0-1. Additionally, when the dataset is used for training, random data augmentation (scale-and-crop, horizontal/vertical flips, brightness, contrast, gamma and saturation adjustment, Gaussian blur) is applied to the images and boxes to improve the model's generalization capabilities.

In [None]:
class NEUDetectionDataset(Dataset):
    """Torch dataset that yields ``(image, boxes, labels)`` for detection.

    ``image`` is the RGB image resized to ``img_size`` x ``img_size``,
    converted to a tensor and normalized channel-wise. ``boxes`` holds
    ``[xmin, ymin, xmax, ymax]`` rows divided by the annotated image size and
    clamped to ``[0, 1]``. ``labels`` holds one class index per box.

    Args:
        dataset: List of sample dicts with the keys ``image_path``, ``boxes``,
            ``labels``, ``width`` and ``height``.
        class_names: List of class names.
        img_size: Side length of the square output image.
        augment: Apply random augmentation to every item when True.
    """

    def __init__(self, dataset, class_names, img_size=416, augment=False):
        self.dataset = dataset
        self.class_names = class_names
        self.img_size = img_size
        self.augment = augment
        self.num_classes = len(class_names)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sample = self.dataset[idx]

        # Load image; clone the targets so the stored sample is never mutated.
        image = Image.open(sample['image_path']).convert('RGB')
        boxes = sample['boxes'].clone()  # [N, 4]
        labels = sample['labels'].clone()  # [N]

        original_width = sample['width']
        original_height = sample['height']

        # Augmentation may drop boxes, so labels go through it as well.
        if self.augment:
            image, boxes, labels = self._augment(
                image, boxes, labels, original_width, original_height
            )

        # Resize image
        image = image.resize((self.img_size, self.img_size), Image.BILINEAR)

        # Normalize boxes to [0, 1] relative to ORIGINAL size (augmentation
        # keeps the image at its original size).
        if len(boxes) > 0:
            boxes[:, [0, 2]] /= original_width   # x coordinates
            boxes[:, [1, 3]] /= original_height  # y coordinates
            boxes = torch.clamp(boxes, 0, 1)

        # Convert to tensor and normalize each channel
        image = TF.to_tensor(image)
        image = TF.normalize(image, mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225])
        return image, boxes, labels

    def apply_augmentation(self, image, boxes, width, height):
        """Augment an image and its boxes; returns ``(image, boxes)``.

        Kept for callers that do not track labels. Boxes that become
        degenerate after cropping are removed, so callers that also hold
        labels should use ``_augment`` to keep both in sync.
        """
        image, boxes, _ = self._augment(image, boxes, None, width, height)
        return image, boxes

    def _augment(self, image, boxes, labels, width, height):
        """Randomly augment ``image`` and keep ``boxes``/``labels`` aligned.

        Args:
            image: PIL image of size ``width`` x ``height``.
            boxes: ``[N, 4]`` tensor in pixel coordinates of that image.
            labels: ``[N]`` tensor of class indices, or None.
            width: Image width in pixels.
            height: Image height in pixels.

        Returns:
            ``(image, boxes, labels)`` after augmentation.
        """
        # Random zoom-in: upscale by up to 10% and crop back to the original
        # size at a random offset.
        if random.random() > 0.4:
            scale_factor = random.uniform(1.0, 1.1)
            new_width = int(width * scale_factor)
            new_height = int(height * scale_factor)

            image_scaled = image.resize((new_width, new_height), Image.BILINEAR)

            crop_x = random.randint(0, new_width - width)
            crop_y = random.randint(0, new_height - height)
            image = image_scaled.crop((crop_x, crop_y, crop_x + width, crop_y + height))

            if len(boxes) > 0:
                boxes = boxes * scale_factor
                boxes[:, [0, 2]] -= crop_x
                boxes[:, [1, 3]] -= crop_y
                boxes, labels = self._clip_and_filter(boxes, labels, width, height)

        # Horizontal flip
        if random.random() > 0.5:
            image = TF.hflip(image)
            if len(boxes) > 0:
                boxes[:, [0, 2]] = width - boxes[:, [2, 0]]
        # Vertical flip
        if random.random() > 0.5:
            image = TF.vflip(image)
            if len(boxes) > 0:
                boxes[:, [1, 3]] = height - boxes[:, [3, 1]]
        # Brightness & contrast
        if random.random() > 0.3:
            image = TF.adjust_brightness(image, random.uniform(0.85, 1.15))
            image = TF.adjust_contrast(image, random.uniform(0.85, 1.15))
        # Gamma, blur and saturation
        if random.random() > 0.3:
            image = TF.adjust_gamma(image, gamma=random.uniform(0.8, 1.2))
        if random.random() > 0.3:
            image = TF.gaussian_blur(image, kernel_size=3)
        if random.random() > 0.3:
            image = TF.adjust_saturation(image, random.uniform(0.85, 1.15))

        return image, boxes, labels

    @staticmethod
    def _clip_and_filter(boxes, labels, width, height):
        """Clip boxes to the image and drop the ones thinner than 1 pixel.

        x coordinates are clipped to ``[0, width]`` and y coordinates to
        ``[0, height]``. ``labels`` (if given) is filtered with the same mask
        so it stays aligned with the returned boxes.
        """
        boxes = boxes.clone()
        boxes[:, [0, 2]] = boxes[:, [0, 2]].clamp(0, width)
        boxes[:, [1, 3]] = boxes[:, [1, 3]].clamp(0, height)
        keep = (boxes[:, 2] - boxes[:, 0] > 1) & (boxes[:, 3] - boxes[:, 1] > 1)
        boxes = boxes[keep]
        if labels is not None:
            labels = labels[keep]
        return boxes, labels


Deep learning models typically process data in batches, but each image in a batch might have a different number of defect boxes. The `detection_collate_fn` function solves this problem by stacking all images of a batch into one tensor while keeping the boxes and labels, whose lengths vary per image, as separate Python lists.

In [None]:
def detection_collate_fn(batch):
    """Collate ``(image, boxes, labels)`` samples into one batch.

    Images are stacked along a new leading dimension. Boxes and labels have a
    different length per image, so they are returned as plain lists with one
    entry per sample.

    Returns:
        ``(images, boxes, labels)``: a stacked image tensor and two lists.
    """
    # Unpack every sample once; each must be an (image, boxes, labels) triple.
    triples = [(image, box, label) for image, box, label in batch]

    images = torch.stack([triple[0] for triple in triples], dim=0)
    boxes = [triple[1] for triple in triples]
    labels = [triple[2] for triple in triples]
    return images, boxes, labels

In this step, using the NEUDetectionDataset class we defined, two separate sets are created from the actual data lists. For the train_det_dataset, augment=True is set, ensuring the model sees slightly modified versions of the images in each training round. For the val_det_dataset, augment=False is set; the images are not subjected to any random changes since a validation test is being performed. This distinction is used to accurately measure the model's true performance and generalization ability.

In [None]:
IMG_SIZE = 416

# Both splits share the class list and input size; only the training split
# is randomly augmented.
train_det_dataset = NEUDetectionDataset(
    train_dataset, class_names, img_size=IMG_SIZE, augment=True
)
val_det_dataset = NEUDetectionDataset(
    val_dataset, class_names, img_size=IMG_SIZE, augment=False
)

print(f"\n✓ Train Detection Dataset: {len(train_det_dataset)} images")
print(f"✓ Validation Detection Dataset: {len(val_det_dataset)} images")
print(f"✓ Image size: {IMG_SIZE}x{IMG_SIZE}")
print(f"✓ Augmentation: Horizontal flip, Vertical flip, Color jitter")


✓ Train Detection Dataset: 1439 images
✓ Validation Detection Dataset: 360 images
✓ Image size: 416x416
✓ Augmentation: Horizontal flip, Vertical flip, Color jitter


DataLoader objects are responsible for efficiently drawing batches from the dataset. Setting BATCH_SIZE=8 means the model will process 8 images at each step. For the training loader, shuffle=True presents the samples in a different order every epoch, which keeps the model from learning the order of the data. pin_memory=True (when a GPU is available) speeds up host-to-GPU transfers, and num_workers=0 loads the data in the main process because of environmental constraints. These loaders ensure the training loop receives its data quickly.

In [None]:
BATCH_SIZE = 8
NUM_WORKERS = 0

# Training batches are reshuffled every epoch.
train_det_loader = DataLoader(
    train_det_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
    collate_fn=detection_collate_fn,
    pin_memory=torch.cuda.is_available(),
)

# Validation batches keep the dataset order.
val_det_loader = DataLoader(
    val_det_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    collate_fn=detection_collate_fn,
    pin_memory=torch.cuda.is_available(),
)

print(f"\nTrain DataLoader:")
print(f"  - Batch size: {BATCH_SIZE}")
print(f"  - Total batches: {len(train_det_loader)}")
print(f"  - Augmentation: ON")

print(f"\nValidation DataLoader:")
print(f"  - Batch size: {BATCH_SIZE}")
print(f"  - Total batches: {len(val_det_loader)}")
print(f"  - Augmentation: OFF")


Train DataLoader:
  - Batch size: 8
  - Total batches: 180
  - Augmentation: ON

Validation DataLoader:
  - Batch size: 8
  - Total batches: 45
  - Augmentation: OFF


Before training the model, a test is performed to ensure the data pipeline is working correctly. The first batch is retrieved from the training loader using the `next(iter(train_det_loader))` command. The dimensions of the retrieved images are checked to ensure they are [8, 3, 416, 416] (batch, channel, height, width). Additionally, the box coordinates and label indices for each image within the batch are examined; the boxes should be normalized between 0 and 1. This step helps detect data formatting errors before training begins.

In [None]:
# Sanity check of the data pipeline: pull a single training batch.
images, boxes_list, labels_list = next(iter(train_det_loader))

print(f"  - Images shape: {images.shape}")
print(f"  - Number of images: {len(boxes_list)}")

# Show the targets of at most the first three images.
for i, image_boxes in enumerate(boxes_list[:3]):
    print(f"\n  Image {i+1}:")
    print(f"    - Number of objects: {len(image_boxes)}")
    if len(image_boxes) == 0:
        continue
    print(f"    - Box shape: {image_boxes.shape}")
    print(f"    - Labels: {labels_list[i].tolist()}")
    print(f"    - Boxes (normalized): {image_boxes[:2].tolist()}")  # First 2 boxes


  - Images shape: torch.Size([8, 3, 416, 416])
  - Number of images: 8

  Image 1:
    - Number of objects: 3
    - Box shape: torch.Size([3, 4])
    - Labels: [2, 2, 2]
    - Boxes (normalized): [[0.2199999988079071, 0.699999988079071, 0.4650000035762787, 0.9900000095367432], [0.7049999833106995, 0.6349999904632568, 0.9150000214576721, 0.8550000190734863]]

  Image 2:
    - Number of objects: 2
    - Box shape: torch.Size([2, 4])
    - Labels: [1, 1]
    - Boxes (normalized): [[0.2150000035762787, 0.1850000023841858, 0.38999998569488525, 0.7850000262260437], [0.8100000023841858, 0.3700000047683716, 0.925000011920929, 0.574999988079071]]

  Image 3:
    - Number of objects: 1
    - Box shape: torch.Size([1, 4])
    - Labels: [5]
    - Boxes (normalized): [[0.7482237815856934, 0.3466154932975769, 0.8174178600311279, 1.0]]


The DefectDetector model is essentially a convolutional neural network (CNN) architecture. The DetectionBackbone consists of successive convolution and pooling layers; it hierarchically extracts complex features such as defects and textures from the image. Because the first block combines a stride-2 convolution with max-pooling and every later block pools once more, this architecture reduces the 416x416 input to a 6x6 feature map (a 64x reduction). DetectionHead takes this condensed feature map and makes the final predictions necessary for object detection: box coordinates, object confidence score, and class probabilities. The model makes predictions based on multiple anchor boxes for each cell of the 6x6 grid.

In [None]:
class DetectionBackbone(nn.Module):
    """Five-stage convolutional feature extractor (3 -> 512 channels).

    Every stage ends with a 2x2 max-pool and the first stage also uses a
    stride-2 convolution, so the spatial size shrinks 64x overall: a 416x416
    input becomes a 6x6 map.
    """

    @staticmethod
    def _double_conv(in_ch, out_ch):
        """conv3x3 -> ReLU -> conv3x3 -> BN -> ReLU -> 2x2 max-pool."""
        return nn.Sequential(
            nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_ch, out_ch, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(out_ch), nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
        )

    def __init__(self):
        super().__init__()
        # Stem: 416 -> 208 (stride-2 conv) -> 104 (pool)
        self.block1 = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=5, stride=2, padding=2),
            nn.BatchNorm2d(32), nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
        )
        self.block2 = self._double_conv(32, 64)     # 104 -> 52
        self.block3 = self._double_conv(64, 128)    # 52 -> 26
        self.block4 = self._double_conv(128, 256)   # 26 -> 13
        self.dropout_4 = nn.Dropout(p=0.2)
        self.block5 = self._double_conv(256, 512)   # 13 -> 6

    def forward(self, x):
        # Spatial sizes for a 416x416 input: 104, 52, 26, 13 after blocks 1-4.
        for stage in (self.block1, self.block2, self.block3, self.block4):
            x = stage(x)
        x = self.dropout_4(x)
        return self.block5(x)  # 6x6 for a 416x416 input

class DetectionHead(nn.Module):
    """Convolutional head producing per-anchor predictions for each grid cell.

    Args:
        in_channels: Channels of the incoming feature map.
        num_classes: Number of classes; every anchor predicts
            ``5 + num_classes`` values.
        num_anchors: Number of anchors predicted per grid cell.
    """

    def __init__(self, in_channels=512, num_classes=6, num_anchors=6):
        super().__init__()
        self.num_classes = num_classes
        values_per_anchor = 5 + num_classes
        layers = [
            nn.Conv2d(in_channels, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256), nn.ReLU(inplace=True),
            nn.Dropout(p=0.2),
            nn.Conv2d(256, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128), nn.ReLU(inplace=True),
            # 1x1 projection to num_anchors * (5 + num_classes) channels
            nn.Conv2d(128, num_anchors * values_per_anchor, kernel_size=1),
        ]
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        """Return predictions shaped ``(B, A, H, W, 5 + num_classes)``."""
        raw = self.conv(x)
        batch, channels, grid_h, grid_w = raw.shape
        values_per_anchor = 5 + self.num_classes
        # Split the channel axis into (anchor, value) ...
        grouped = raw.view(
            batch, channels // values_per_anchor, values_per_anchor, grid_h, grid_w
        )
        # ... and move the value axis last.
        return grouped.permute(0, 1, 3, 4, 2).contiguous()

class DefectDetector(nn.Module):
    """Single-scale anchor-based detector: backbone followed by a head.

    Args:
        num_classes: Number of object classes.
        num_anchors: Number of anchors predicted per grid cell.
        anchors: Optional ``(num_anchors, 2)`` array-like of anchor
            ``(width, height)`` pairs. When omitted, ``num_anchors`` square
            anchors with sides spaced geometrically from 0.1 to 0.4 are used
            (``[0.1, 0.2, 0.4]`` for three anchors).

    Raises:
        ValueError: If ``anchors`` is given but is not ``(num_anchors, 2)``.
    """

    def __init__(self, num_classes, num_anchors=6, anchors=None):
        super().__init__()
        self.num_classes = num_classes
        self.num_anchors = num_anchors
        self.backbone = DetectionBackbone()
        self.detection_head = DetectionHead(in_channels=512, num_classes=num_classes, num_anchors=num_anchors)

        if anchors is not None:
            anchors = torch.as_tensor(anchors, dtype=torch.float32)
            # The head predicts exactly num_anchors boxes per cell, so a
            # different number of anchor rows cannot be decoded correctly.
            if tuple(anchors.shape) != (num_anchors, 2):
                raise ValueError(
                    f"anchors must have shape ({num_anchors}, 2), got {tuple(anchors.shape)}"
                )
        else:
            # One default anchor per predicted anchor slot.
            sides = 0.1 * torch.logspace(0, 2, steps=num_anchors, base=2)
            anchors = torch.stack([sides, sides], dim=1)
        self.anchors = anchors

        self._initialize_weights()

    def _initialize_weights(self):
        """Kaiming-normal conv weights, zero biases, unit BatchNorm scale."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Return raw head predictions, ``(B, A, H, W, 5 + num_classes)``."""
        feats = self.backbone(x)
        preds = self.detection_head(feats)
        return preds

An instance of the DefectDetector class model is created and moved to the device (GPU or CPU) designated for training. In this step, the model's structure is checked, and the total number of parameters and the number of trainable parameters are calculated. These numbers provide information about the model's complexity and memory/processing power requirements. For example, 6.19 million parameters indicates that the model is of medium size by deep learning standards.

In [None]:
# Build the detector and move it to the selected device.
model = DefectDetector(num_classes=len(class_names), num_anchors=6).to(device)

# Model summary: parameter counts and float32 size (4 bytes per parameter).
total_params = sum(map(torch.numel, model.parameters()))
trainable_params = sum(
    p.numel() for p in filter(lambda q: q.requires_grad, model.parameters())
)

print(f"  - Total parameters: {total_params:,}")
print(f"  - Trainable parameters: {trainable_params:,}")
print(f"  - Model size: {total_params * 4 / 1024**2:.2f} MB")

  - Total parameters: 6,190,722
  - Trainable parameters: 6,190,722
  - Model size: 23.62 MB


A dummy input (torch.randn(2, 3, 416, 416)) is used to check the shape of the model's output. This input is passed through the model in a single forward pass (in evaluation mode, without gradients). The dimensions of the output tensor are checked: [2, 6, 6, 6, 11]. These dimensions confirm that there are 2 images, 6 anchor boxes, a 6x6 grid, and 11 predicted values for each box (4 coordinates, 1 confidence score, 6 class scores). This test is important for checking that the model output matches what the loss function expects.

In [None]:
# Create dummy input
# Create dummy input
test_input = torch.randn(2, 3, 416, 416).to(device)
print(f"\nTest input shape: {test_input.shape}")

# Forward pass in eval mode without gradient tracking (shape check only)
model.eval()
with torch.no_grad():
    predictions = model(test_input)

print(f"Predictions shape: {predictions.shape}")
# The last axis holds 4 box values + 1 objectness score + the class scores.
print(f"  - Format: (batch, anchors, grid_h, grid_w, 5+classes)")
print(f"  - Batch: {predictions.shape[0]}")
print(f"  - Anchors: {predictions.shape[1]}")
print(f"  - Grid: {predictions.shape[2]}x{predictions.shape[3]}")
print(f"  - Predictions per cell: {predictions.shape[4]} (4 box + 1 obj + {len(class_names)} classes)")


Test input shape: torch.Size([2, 3, 416, 416])
Predictions shape: torch.Size([2, 6, 6, 6, 11])
  - Format: (batch, anchors, grid_h, grid_w, 4+classes)
  - Batch: 2
  - Anchors: 6
  - Grid: 6x6
  - Predictions per cell: 11 (4 box + 1 obj + 6 classes)


In YOLO-like detection models, the dimensions of predefined anchor boxes directly affect the model's performance. The get_anchors function collects the widths and heights of all ground-truth defect boxes in the training dataset and clusters them with the K-Means algorithm. The K=6 cluster centers are used as anchor boxes (width/height pairs) that best represent the shapes of the defects in the dataset.

In [None]:
def _normalized_box_sizes(dataset):
    """Return ``[width, height]`` of every box as a fraction of its image."""
    sizes = []
    for sample in dataset:
        boxes = sample['boxes'].reshape(-1, 4)
        # Boxes are stored in annotation pixels, so divide by the annotated
        # image size. Samples without size info are assumed to hold boxes
        # that are already normalized.
        img_w = sample.get('width', 1)
        img_h = sample.get('height', 1)
        widths = (boxes[:, 2] - boxes[:, 0]) / img_w
        heights = (boxes[:, 3] - boxes[:, 1]) / img_h
        sizes.extend(torch.stack([widths, heights], dim=1).tolist())
    return sizes


def get_anchors(dataset, k=6, img_size=416):
    """Cluster ground-truth box sizes into ``k`` anchors with K-Means.

    Args:
        dataset: List of sample dicts with ``boxes`` (``[N, 4]`` tensor as
            ``xmin, ymin, xmax, ymax``) and the image ``width``/``height``.
        k: Number of anchors (clusters).
        img_size: Model input size; clustering runs in this pixel space.

    Returns:
        FloatTensor ``[k, 2]`` of anchor ``(width, height)`` pairs expressed
        as fractions of the image size, the unit ``decode_predictions``
        applies to its grid-normalized box centers.
    """
    wh = np.asarray(_normalized_box_sizes(dataset), dtype=np.float64) * img_size
    kmeans = KMeans(n_clusters=k, random_state=0).fit(wh)
    anchors = kmeans.cluster_centers_ / img_size
    return torch.tensor(anchors, dtype=torch.float32)

def suggest_anchors(dataset, img_size=416):
    """Print box width/height statistics in model-input pixels.

    Args:
        dataset: List of sample dicts with ``boxes`` (``[N, 4]`` tensor as
            ``xmin, ymin, xmax, ymax``) and the image ``width``/``height``.
        img_size: Side length of the model input the sizes are scaled to.

    Returns:
        None. The statistics are printed.
    """
    widths, heights = [], []
    for sample in dataset:
        boxes = sample['boxes'].reshape(-1, 4)
        # Boxes are stored in annotation pixels: normalize by the annotated
        # image size first, then scale to the model input size. Samples
        # without size info are assumed to hold already normalized boxes.
        img_w = sample.get('width', 1)
        img_h = sample.get('height', 1)
        widths.extend(((boxes[:, 2] - boxes[:, 0]) / img_w * img_size).tolist())
        heights.extend(((boxes[:, 3] - boxes[:, 1]) / img_h * img_size).tolist())

    if not widths:
        # np.min / np.max raise on empty input.
        print("No boxes found.")
        return

    print("Mean width:", np.mean(widths), "Mean height:", np.mean(heights))
    print("Min width:", np.min(widths), "Max width:", np.max(widths))
    print("Min height:", np.min(heights), "Max height:", np.max(heights))

# Derive the anchors from the training boxes and report the box statistics.
anchors = get_anchors(train_dataset, k=6)
print("Optimized anchors:", anchors)

suggest_anchors(train_dataset)

# Re-create the detector so that it carries the data-driven anchors.
model = DefectDetector(
    num_classes=len(class_names),
    num_anchors=len(anchors),
    anchors=anchors,
).to(device)

Optimized anchors: tensor([[ 44.4576, 184.1897],
        [ 32.0082,  48.9360],
        [163.2500, 183.7593],
        [161.4216,  65.2648],
        [ 91.6648,  67.4757],
        [ 48.7271, 111.8719]])
Mean width: 27657.7575030012 Mean height: 40685.47418967587
Min width: 3328.0 Max width: 82784.0
Min height: 3744.0 Max height: 82784.0


In this step, the raw prediction values produced by the model in (tx, ty, tw, th) format are converted into real coordinates on the image. The decode_predictions function converts the predictions into normalized (x1, y1, x2, y2) coordinates using grid cell offsets and anchor dimensions with the help of sigmoid and exponential functions. It also multiplies the object confidence score by the highest class probability to obtain the final confidence score and keeps only the predictions above a defined threshold.

In [None]:
def decode_predictions(predictions, anchors, confidence_threshold=0.2):
    """Turn raw head outputs into per-image boxes, scores and labels.

    Args:
        predictions: Tensor ``(B, A, GH, GW, 5 + C)`` holding
            ``tx, ty, tw, th, objectness logit, class logits``.
        anchors: Tensor with one ``(width, height)`` row per anchor.
        confidence_threshold: Detections whose ``objectness * best class
            probability`` is not above this value are discarded.

    Returns:
        Three lists with one tensor per image: boxes ``[K, 4]`` as
        ``x1, y1, x2, y2`` in grid-normalized units, scores ``[K]`` and class
        indices ``[K]``. Images without detections get empty tensors.
    """
    _, num_anchors, grid_h, grid_w, _ = predictions.shape
    dev = predictions.device
    anchors = anchors.to(dev)

    # The cell indices are identical for every image and anchor.
    rows, cols = torch.meshgrid(
        torch.arange(grid_h, device=dev),
        torch.arange(grid_w, device=dev),
        indexing='ij',
    )
    col_offset = cols.float()
    row_offset = rows.float()

    all_boxes, all_scores, all_labels = [], [], []
    for image_pred in predictions:
        objectness = torch.sigmoid(image_pred[..., 4])
        best_prob, best_class = F.softmax(image_pred[..., 5:], dim=-1).max(dim=-1)
        confidence = objectness * best_prob

        kept = []
        for a in range(num_anchors):
            raw = image_pred[a]
            # Box centre: sigmoid offset inside the cell plus the cell index.
            cx = (torch.sigmoid(raw[..., 0]) + col_offset) / grid_w
            cy = (torch.sigmoid(raw[..., 1]) + row_offset) / grid_h
            # Box size: exponential scaling of the anchor size.
            bw = torch.exp(raw[..., 2]) * anchors[a, 0]
            bh = torch.exp(raw[..., 3]) * anchors[a, 1]

            corners = torch.stack(
                [cx - bw / 2, cy - bh / 2, cx + bw / 2, cy + bh / 2], dim=-1
            ).view(-1, 4)
            flat_conf = confidence[a].view(-1)
            flat_cls = best_class[a].view(-1)

            selected = flat_conf > confidence_threshold
            if selected.any():
                kept.append((corners[selected], flat_conf[selected], flat_cls[selected]))

        if kept:
            boxes, scores, labels = zip(*kept)
            all_boxes.append(torch.cat(boxes, dim=0))
            all_scores.append(torch.cat(scores, dim=0))
            all_labels.append(torch.cat(labels, dim=0))
        else:
            all_boxes.append(torch.zeros((0, 4), device=dev))
            all_scores.append(torch.zeros((0,), device=dev))
            all_labels.append(torch.zeros((0,), dtype=torch.long, device=dev))

    return all_boxes, all_scores, all_labels

def apply_nms(boxes, scores, labels, iou_threshold=0.5):
    """Run non-maximum suppression separately for every class label.

    Args:
        boxes: Tensor of candidate boxes, one row of four values per box
            (handed to ``nms`` unchanged).
        scores: Tensor with one confidence value per box.
        labels: Tensor with one class index per box.
        iou_threshold: Overlap threshold forwarded to ``nms``.

    Returns:
        Tuple ``(boxes, scores, labels)`` of the surviving detections,
        grouped by ascending class index. Empty input is returned as-is.
    """
    if len(boxes) == 0:
        return boxes, scores, labels

    kept_boxes, kept_scores, kept_labels = [], [], []
    for class_id in labels.unique(sorted=True):
        in_class = labels == class_id
        class_boxes = boxes[in_class]
        class_scores = scores[in_class]
        if class_boxes.numel() == 0:
            continue

        # Boxes of different classes never suppress each other.
        survivors = nms(class_boxes, class_scores, iou_threshold)
        kept_boxes.append(class_boxes[survivors])
        kept_scores.append(class_scores[survivors])
        kept_labels.append(
            torch.full(
                (len(survivors),),
                int(class_id.item()),
                dtype=torch.long,
                device=labels.device,
            )
        )

    if len(kept_boxes) == 0:
        target = labels.device
        return (
            torch.zeros((0, 4), device=target),
            torch.zeros((0,), device=target),
            torch.zeros((0,), dtype=torch.long, device=target),
        )
    return torch.cat(kept_boxes), torch.cat(kept_scores), torch.cat(kept_labels)


The DetectionLoss class measures the error between the values predicted by the model and the ground-truth labels. The total loss is a weighted sum of three main components: Box Loss, which uses MSE to penalise errors in the predicted box coordinates; Object Loss, which uses BCE to penalise errors in predicting whether a box contains an object; and Class Loss, which uses Cross-Entropy to penalise errors in predicting the defect type. The hyperparameters lambda_coord and lambda_noobj adjust the relative importance of the box term and the no-object term, while class weights in the Cross-Entropy term attempt to compensate for class imbalance.

In [None]:
class DetectionLoss(nn.Module):
    """Anchor-based detection loss: box MSE + objectness BCE + class CE.

    Predictions are laid out as ``(B, A, GH, GW, 5 + num_classes)`` where the
    last axis holds ``(tx, ty, tw, th)``, one objectness logit and the class
    logits. Every loss term uses ``reduction='sum'`` and the grand total is
    divided by the batch size.

    Args:
        num_classes: Number of class logits per anchor.
        anchors: ``(A, 2)`` tensor of anchor (width, height), in the same
            fractional units as the target boxes.
        lambda_coord: Multiplier applied to the box regression term.
        lambda_noobj: Multiplier applied to the objectness BCE of cells that
            have no assigned target.
        class_weights: Optional per-class weight tensor for cross-entropy.
    """

    def __init__(self, num_classes, anchors, lambda_coord=5.0, lambda_noobj=0.3, class_weights=None):
        super().__init__()
        self.num_classes = num_classes
        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj
        self.anchors = anchors
        self.mse_loss = nn.MSELoss(reduction='sum')
        self.bce_loss = nn.BCEWithLogitsLoss(reduction='sum')
        # The weights are kept on whatever device they arrive on and moved in
        # forward(); the constructor must not depend on a module-level
        # `device` global being defined.
        self.ce_loss = nn.CrossEntropyLoss(weight=class_weights, reduction='sum')

    def forward(self, predictions, targets_boxes, targets_labels):
        """Compute the batch loss.

        Args:
            predictions: ``(B, A, GH, GW, 5 + num_classes)`` raw network output.
            targets_boxes: Sequence of ``B`` tensors, each ``(T, 4)`` holding
                ``(x1, y1, x2, y2)`` as fractions of the image size.
            targets_labels: Sequence of ``B`` tensors, each ``(T,)`` with
                class indices.

        Returns:
            ``(total, parts)`` where ``total`` is the differentiable scalar
            loss and ``parts`` maps ``'total'``, ``'box'``, ``'obj'`` and
            ``'cls'`` to Python floats (each already divided by ``B``).
        """
        B, A, GH, GW, D = predictions.shape
        assert D == 5 + self.num_classes, f"Prediction last dim {D} != 5+{self.num_classes}"
        dev = predictions.device
        anchors = self.anchors.to(dev)

        # Keep the cross-entropy weights next to the logits.
        ce_weight = self.ce_loss.weight
        if ce_weight is not None and ce_weight.device != dev:
            self.ce_loss.to(dev)

        box_loss = torch.tensor(0.0, device=dev)
        obj_loss = torch.tensor(0.0, device=dev)
        cls_loss = torch.tensor(0.0, device=dev)

        for b in range(B):
            pred = predictions[b]  # (A, GH, GW, 5+C)
            obj_logits = pred[..., 4]
            t_boxes = targets_boxes[b].to(dev)
            t_labels = targets_labels[b].to(dev)

            if t_boxes.size(0) == 0:
                # No targets: every anchor cell is a negative.
                obj_loss = obj_loss + self.lambda_noobj * self.bce_loss(
                    obj_logits, torch.zeros_like(obj_logits)
                )
                continue

            # Box centres in grid units and the cell each one falls into.
            cx = ((t_boxes[:, 0] + t_boxes[:, 2]) / 2.0) * GW
            cy = ((t_boxes[:, 1] + t_boxes[:, 3]) / 2.0) * GH
            gj = cx.clamp(0, GW - 1).long()
            gi = cy.clamp(0, GH - 1).long()

            # Target sizes stay in image fractions, like the anchors.
            gt_w = t_boxes[:, 2] - t_boxes[:, 0]
            gt_h = t_boxes[:, 3] - t_boxes[:, 1]

            obj_mask = torch.zeros(A, GH, GW, device=dev, dtype=torch.bool)
            noobj_mask = torch.ones(A, GH, GW, device=dev, dtype=torch.bool)

            for t in range(t_boxes.size(0)):
                i, j = gi[t], gj[t]

                # Responsible anchor: smallest log-ratio distance in w and h.
                diff = (torch.log(gt_w[t] / (anchors[:, 0] + 1e-9)).abs()
                        + torch.log(gt_h[t] / (anchors[:, 1] + 1e-9)).abs())
                best_anchor = int(torch.argmin(diff))

                obj_mask[best_anchor, i, j] = True
                noobj_mask[best_anchor, i, j] = False

                pred_vec = pred[best_anchor, i, j]
                pred_box = pred_vec[:4]  # (tx, ty, tw, th)
                pred_cls = pred_vec[5:]
                target_cls = t_labels[t].long()

                # Centre offset inside the assigned cell vs. sigmoid(tx, ty).
                target_xy = torch.stack([cx[t] - j.float(), cy[t] - i.float()], dim=0)
                pred_xy = torch.sigmoid(pred_box[:2])

                # Width/height decoded as exp(t) * anchor, in image fractions.
                pred_wh = torch.stack(
                    [torch.exp(pred_box[2]) * anchors[best_anchor, 0],
                     torch.exp(pred_box[3]) * anchors[best_anchor, 1]],
                    dim=0,
                )
                target_wh = torch.stack([gt_w[t], gt_h[t]], dim=0)

                box_loss = box_loss + self.mse_loss(pred_xy, target_xy) + self.mse_loss(pred_wh, target_wh)
                cls_loss = cls_loss + self.ce_loss(pred_cls.unsqueeze(0), target_cls.unsqueeze(0))

            if obj_mask.any():
                positives = obj_logits[obj_mask]
                obj_loss = obj_loss + self.bce_loss(positives, torch.ones_like(positives))
            if noobj_mask.any():
                negatives = obj_logits[noobj_mask]
                obj_loss = obj_loss + self.lambda_noobj * self.bce_loss(negatives, torch.zeros_like(negatives))

        total = (self.lambda_coord * box_loss + obj_loss + cls_loss) / B
        return total, {
            'total': total.item(),
            'box': (box_loss / B).item(),
            'obj': (obj_loss / B).item(),
            'cls': (cls_loss / B).item(),
        }


The Optimizer (Adam) updates the model's weights using the gradients of the loss function. The Learning Rate (0.001) determines the size of the step taken in each update. The Scheduler (ReduceLROnPlateau) automatically reduces the learning rate (factor=0.5) if the validation loss does not improve for a certain number of epochs (patience=3), allowing the model to fine-tune. Finally, class weights are included in the Cross-Entropy loss so that the model focuses more on the less represented defect classes.

In [None]:
# Optimiser hyperparameters.
LEARNING_RATE = 0.001
WEIGHT_DECAY = 5e-4

# Adam over every model parameter.
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Halve the learning rate when the monitored value (lower is better) has not
# improved for more than `patience` scheduler steps; never go below 1e-6.
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, min_lr=1e-6)

# One cross-entropy weight per class index; the last two classes are up-weighted.
# NOTE(review): six entries are hard-coded - confirm this matches
# len(class_names) and the class ordering.
class_weights = torch.tensor([1.0, 1.0, 1.0, 1.0, 1.5, 2.0], dtype=torch.float32)

criterion = DetectionLoss(
    num_classes=len(class_names),
    anchors=model.anchors,
    class_weights=class_weights,
    lambda_coord=5.0,
    lambda_noobj=0.3,
)

The `train_one_epoch` function completes one training epoch by iterating through all batches in the training dataset. In each batch: images are moved to the GPU, optimizer gradients are reset, predictions are obtained from the model, the loss is calculated, gradients are propagated back (loss.backward()), and model weights are updated (optimizer.step()). This function tracks the model's learning progress by returning the average values of the training loss (total, box, object, class). The `validate` function measures the model's performance on the validation dataset, which it did not see during training. At this stage, the model is put into model.eval() mode and gradient calculation is disabled (torch.no_grad()), which saves memory and guarantees that no weights are updated. Just like in the training loop, the average loss (total, box, obj, cls) across all batches in the validation set is calculated. Comparing this value with the training loss indicates whether the model generalizes or has merely memorized the training data.

In [None]:
def train_one_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader`` and return mean loss terms.

    Args:
        model: Network called as ``model(images)``; put into train mode here.
        loader: Iterable yielding ``(images, boxes_list, labels_list)``.
        criterion: Callable returning ``(loss, parts)`` where ``parts`` has
            the keys ``'total'``, ``'box'``, ``'obj'`` and ``'cls'``; must
            expose ``num_classes``.
        optimizer: Optimiser stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Dict with the per-batch mean of ``'total'``, ``'box'``, ``'obj'``
        and ``'cls'``.

    Raises:
        ValueError: If ``loader`` yields no batches (previously surfaced as
            a bare ``ZeroDivisionError`` from the final averaging).
    """
    model.train()
    sums = {'total': 0.0, 'box': 0.0, 'obj': 0.0, 'cls': 0.0}
    n_batches = 0

    for images, boxes_list, labels_list in loader:
        images = images.to(device, non_blocking=True)
        boxes_list = [b.to(device) for b in boxes_list]
        labels_list = [l.to(device).long() for l in labels_list]

        optimizer.zero_grad()
        preds = model(images)
        D = preds.shape[-1]
        assert D == 5 + criterion.num_classes, f"pred last dim {D} != 5+{criterion.num_classes}"

        loss, d = criterion(preds, boxes_list, labels_list)
        loss.backward()
        # Cap the global gradient norm before the weight update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        optimizer.step()

        for key in sums:
            sums[key] += d[key]
        n_batches += 1

    if n_batches == 0:
        raise ValueError("train_one_epoch: loader yielded no batches")
    return {key: value / n_batches for key, value in sums.items()}

def validate(model, loader, criterion, device):
    """Evaluate ``criterion`` over ``loader`` without updating the model.

    Args:
        model: Network called as ``model(images)``; put into eval mode here.
        loader: Iterable yielding ``(images, boxes_list, labels_list)``.
        criterion: Callable returning ``(loss, parts)`` where ``parts`` has
            the keys ``'total'``, ``'box'``, ``'obj'`` and ``'cls'``; must
            expose ``num_classes``.
        device: Device the batch tensors are moved to.

    Returns:
        Dict with the per-batch mean of ``'total'``, ``'box'``, ``'obj'``
        and ``'cls'``.

    Raises:
        ValueError: If ``loader`` yields no batches (previously surfaced as
            a bare ``ZeroDivisionError`` from the final averaging).
    """
    model.eval()
    sums = {'total': 0.0, 'box': 0.0, 'obj': 0.0, 'cls': 0.0}
    n_batches = 0

    with torch.no_grad():
        for images, boxes_list, labels_list in loader:
            images = images.to(device, non_blocking=True)
            boxes_list = [b.to(device) for b in boxes_list]
            labels_list = [l.to(device).long() for l in labels_list]

            preds = model(images)
            D = preds.shape[-1]
            assert D == 5 + criterion.num_classes, f"pred last dim {D} != 5+{criterion.num_classes}"

            # Only the float breakdown is needed; no backward pass happens here.
            _, d = criterion(preds, boxes_list, labels_list)
            for key in sums:
                sums[key] += d[key]
            n_batches += 1

    if n_batches == 0:
        raise ValueError("validate: loader yielded no batches")
    return {key: value / n_batches for key, value in sums.items()}

# Module-level tracking state: lowest loss so far, the matching weights, and a
# counter of epochs without improvement.
# NOTE(review): train_detection_model initialises its own local best_loss and
# epochs_no_improve, so these globals look unused - confirm before removing.
best_loss, best_wts, epochs_no_improve = float('inf'), None, 0

The `train_detection_model` main loop runs the training and validation steps for up to the specified number of epochs. In each epoch, it monitors the validation loss and stores a copy of the model's weights (best_model_wts) whenever this loss is the lowest value recorded so far. The Early Stopping mechanism terminates the training loop early if the validation loss does not improve for `early_stopping_patience` consecutive epochs (10 here). This avoids unnecessary computation and limits overfitting of the model.

In [None]:
def train_detection_model(model, train_loader, val_loader, criterion,
                          optimizer, scheduler, num_epochs, device,
                          early_stopping_patience=10):
    """Train with per-epoch validation, LR scheduling and early stopping.

    Each epoch runs ``train_one_epoch`` then ``validate``, steps ``scheduler``
    on the validation total loss, records all loss terms and keeps a deep
    copy of the weights with the lowest validation total. The loop stops once
    ``early_stopping_patience`` epochs in a row bring no improvement.

    Returns:
        ``(model, history)``: the model reloaded with the best weights, and a
        dict of per-epoch lists keyed ``train_*`` / ``val_*`` for ``loss``,
        ``box_loss``, ``obj_loss`` and ``cls_loss``.
    """
    started_at = time.time()

    # (history key suffix, key inside the per-epoch loss dict)
    tracked = (('loss', 'total'), ('box_loss', 'box'), ('obj_loss', 'obj'), ('cls_loss', 'cls'))
    history = {f'{split}_{suffix}': [] for split in ('train', 'val') for suffix, _ in tracked}

    def summarise(tag, losses):
        # One report line per split, e.g. "Train Loss: ... (box: ..., ...)".
        return (f"{tag} Loss: {losses['total']:.4f} "
                f"(box: {losses['box']:.4f}, "
                f"obj: {losses['obj']:.4f}, "
                f"cls: {losses['cls']:.4f})")

    best_state = copy.deepcopy(model.state_dict())
    lowest_val = float('inf')
    stale_epochs = 0

    for epoch_idx in range(num_epochs):
        epoch_no = epoch_idx + 1
        print(f'\nEpoch {epoch_no}/{num_epochs}')
        print('-' * 70)

        train_losses = train_one_epoch(model, train_loader, criterion, optimizer, device)
        val_losses = validate(model, val_loader, criterion, device)
        val_total = val_losses['total']

        # The plateau scheduler is driven by the validation total.
        scheduler.step(val_total)

        for split, losses in (('train', train_losses), ('val', val_losses)):
            for suffix, key in tracked:
                history[f'{split}_{suffix}'].append(losses[key])

        print(summarise('Train', train_losses))
        print(summarise('Val', val_losses))
        print(f"LR: {optimizer.param_groups[0]['lr']:.6f}")

        if val_total < lowest_val:
            lowest_val = val_total
            best_state = copy.deepcopy(model.state_dict())
            stale_epochs = 0
            print(f"Best model updated! (Val Loss: {val_total:.4f})")
        else:
            stale_epochs += 1
            print(f"{stale_epochs}/{early_stopping_patience} - No improvement")

        # Checked on every epoch, not only after a stale one.
        if stale_epochs >= early_stopping_patience:
            print(f"\nEarly Stopping at epoch {epoch_no}")
            break

    elapsed = time.time() - started_at
    minutes, seconds = elapsed // 60, elapsed % 60
    print(f'\nTraining completed: {minutes:.0f}m {seconds:.0f}s')
    print(f'Best Val Loss: {lowest_val:.4f}')

    # Hand back the best checkpoint rather than the last epoch's weights.
    model.load_state_dict(best_state)
    return model, history

In [None]:
# Training budget: at most NUM_EPOCHS epochs, stopping early after
# EARLY_STOP_PATIENCE epochs without a lower validation loss.
NUM_EPOCHS = 75
EARLY_STOP_PATIENCE = 10

# Pass the constants through so the values cannot drift apart from the
# literals that used to be repeated in the call.
trained_model, history = train_detection_model(
    model=model,
    train_loader=train_det_loader,
    val_loader=val_det_loader,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=NUM_EPOCHS,
    device=device,
    early_stopping_patience=EARLY_STOP_PATIENCE,
)


Epoch 1/75
----------------------------------------------------------------------
Train Loss: 6847.7146 (box: 1365.9194, obj: 12.8226, cls: 5.2951)
Val Loss: 15.6434 (box: 1.1043, obj: 5.6879, cls: 4.4339)
LR: 0.001000
Best model updated! (Val Loss: 15.6434)

Epoch 2/75
----------------------------------------------------------------------
Train Loss: 13.6048 (box: 0.8284, obj: 5.5732, cls: 3.8897)
Val Loss: 13.8239 (box: 0.8293, obj: 5.4168, cls: 4.2607)
LR: 0.001000
Best model updated! (Val Loss: 13.8239)

Epoch 3/75
----------------------------------------------------------------------
Train Loss: 12.6317 (box: 0.7537, obj: 5.3871, cls: 3.4762)
Val Loss: 14.0645 (box: 0.8464, obj: 5.2295, cls: 4.6032)
LR: 0.001000
1/10 - No improvement

Epoch 4/75
----------------------------------------------------------------------
Train Loss: 12.1653 (box: 0.6925, obj: 5.2820, cls: 3.4209)
Val Loss: 13.7242 (box: 0.8761, obj: 5.3421, cls: 4.0014)
LR: 0.001000
Best model updated! (Val Loss: 13.7242)

IoU (Intersection over Union) is a fundamental metric in object detection that measures how well a predicted box overlaps the ground-truth box. This function calculates an IoU matrix by dividing the intersection area between every pair of boxes from two sets (boxes1 and boxes2) by their union area. A high IoU (0.5 and above) indicates a successful prediction and is a requirement for a detection to count as a True Positive in mAP calculations.

In [None]:
def box_iou(boxes1, boxes2):
    """Return the pairwise IoU matrix between two sets of boxes.

    Note: this definition replaces the ``box_iou`` imported from
    ``torchvision.ops`` at the top of the file.

    Args:
        boxes1: ``(N, 4)`` tensor of ``(x1, y1, x2, y2)`` boxes.
        boxes2: ``(M, 4)`` tensor of ``(x1, y1, x2, y2)`` boxes.

    Returns:
        ``(N, M)`` tensor where entry ``[i, j]`` is the IoU of ``boxes1[i]``
        and ``boxes2[j]``. Pairs whose union area is zero (both boxes
        degenerate) get 0 instead of NaN.
    """
    area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])

    # Broadcast (N, 1, 2) against (M, 2) to get every pair's overlap corners.
    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])
    rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])

    # Negative extents mean the boxes do not overlap on that axis.
    wh = (rb - lt).clamp(min=0)
    inter = wh[:, :, 0] * wh[:, :, 1]

    union = area1[:, None] + area2 - inter
    # 0/0 for two zero-area boxes would otherwise propagate NaN downstream.
    return torch.where(union > 0, inter / union, torch.zeros_like(inter))

The `calculate_ap_per_class` function calculates the Average Precision (AP) by estimating the area under the Precision-Recall curve for a single class. The model's predictions are ranked by confidence score, each one is marked as a True Positive (TP) or False Positive (FP), and cumulative Precision and Recall values are computed. This function calculates the AP value using the 11-point interpolation method (recall levels 0.0 to 1.0 in steps of 0.1).

In [None]:
def calculate_ap_per_class(pred_boxes, pred_scores, true_boxes, iou_threshold=0.5):
    """Compute 11-point interpolated AP for one class.

    Predictions are visited in descending score order. A prediction is a true
    positive when its highest-IoU ground-truth box reaches ``iou_threshold``
    and that box has not been claimed by an earlier prediction; otherwise it
    is a false positive.

    Returns:
        AP as a Python float in ``[0, 1]``; ``0.0`` when either input is empty.
    """
    n_pred, n_gt = len(pred_boxes), len(true_boxes)
    if n_pred == 0 or n_gt == 0:
        return 0.0

    ranking = torch.argsort(pred_scores, descending=True)
    overlaps = box_iou(pred_boxes[ranking], true_boxes)
    claimed = torch.zeros(n_gt, dtype=torch.bool)

    hits = []
    for row in overlaps:
        best_iou = row.max()
        gt_idx = int(row.argmax())
        is_hit = bool(best_iou >= iou_threshold) and not bool(claimed[gt_idx])
        if is_hit:
            claimed[gt_idx] = True
        hits.append(1 if is_hit else 0)

    # Running TP / FP counts along the ranked list.
    tp = torch.tensor(hits).cumsum(0)
    fp = torch.tensor([1 - h for h in hits]).cumsum(0)
    recalls = tp / max(n_gt, 1)
    precisions = torch.clamp(tp / (tp + fp + 1e-6), max=1.0)

    # 11-point interpolation: best precision at recall >= each level.
    ap = 0.0
    for level in torch.linspace(0, 1, 11):
        reached = recalls >= level
        if reached.any():
            best_precision = min(1.0, torch.max(precisions[reached]).item())
        else:
            best_precision = 0.0
        ap += best_precision / 11.0

    return min(1.0, ap)


The `evaluate_map` function calculates the mAP (Mean Average Precision) score, which is the final metric of the model. Model predictions are made for all images in the validation data loader, and Non-Maximum Suppression (NMS) is applied to these predictions. Then, AP is calculated for each defect class on every image that contains both predictions and ground-truth boxes of that class, these values are averaged per class, and the average of the per-class values is taken to obtain the mAP@0.5 score (64.16%).

In [None]:
def evaluate_map(model, dataloader, class_names, device, conf_threshold=0.2, iou_threshold=0.5,
                 nms_iou_threshold=0.5):
    """Evaluate per-class AP and their mean over ``dataloader``.

    For every image the decoded detections are filtered with class-wise NMS,
    then AP is computed per class on that image. Only image/class pairs that
    have both predictions and ground truth contribute; each class's AP is
    the mean over its contributing images, and the returned mAP is the mean
    over all classes (classes with no contributing image count as 0).

    Args:
        model: Detector exposing ``anchors``; put into eval mode here.
        dataloader: Iterable yielding ``(images, boxes_list, labels_list)``.
        class_names: Sequence of class names; position is the class index.
        device: Device the batch tensors are moved to.
        conf_threshold: Minimum confidence passed to ``decode_predictions``.
        iou_threshold: IoU needed for a detection to match a ground truth.
        nms_iou_threshold: IoU above which same-class detections suppress
            each other before scoring.

    Returns:
        ``(map_score, final_aps)``: the mean AP as a float and a dict mapping
        class name to its AP. Both are also printed.
    """
    model.eval()
    aps = {cls: [] for cls in class_names}

    with torch.no_grad():
        for images, boxes_list, labels_list in dataloader:
            images = images.to(device)
            preds = model(images)

            pred_boxes, pred_scores, pred_labels = decode_predictions(
                preds, model.anchors, confidence_threshold=conf_threshold
            )

            for i in range(len(images)):
                # Without NMS every duplicate box on the same defect would be
                # scored as a false positive.
                img_boxes, img_scores, img_labels = apply_nms(
                    pred_boxes[i], pred_scores[i], pred_labels[i], nms_iou_threshold
                )
                gt_boxes = boxes_list[i].to(device)
                gt_labels = labels_list[i].to(device)

                for cls_idx, cls_name in enumerate(class_names):
                    pm = img_labels == cls_idx
                    gm = gt_labels == cls_idx
                    if pm.any() and gm.any():
                        ap = calculate_ap_per_class(
                            img_boxes[pm], img_scores[pm], gt_boxes[gm], iou_threshold
                        )
                        aps[cls_name].append(ap)

    final_aps = {cls: (sum(vals) / len(vals) if len(vals) > 0 else 0.0) for cls, vals in aps.items()}
    # An empty class list yields 0.0 instead of a ZeroDivisionError.
    map_score = sum(final_aps.values()) / len(final_aps) if final_aps else 0.0

    print("AP per class:", {k: f"{v:.4f}" for k, v in final_aps.items()})
    print(f"mAP@{iou_threshold}: {map_score:.4f} ({map_score*100:.2f}%)")
    return map_score, final_aps

In [None]:
# mAP evaluation on the validation loader.
# evaluate_map prints the per-class AP table and the mAP line itself, so the
# results are not printed a second time here.
map_score, class_aps = evaluate_map(
    model=trained_model,
    dataloader=val_det_loader,
    class_names=class_names,
    device=device,
)

AP per class: {'crazing': '0.6025', 'inclusion': '0.7316', 'patches': '0.7721', 'pitted_surface': '0.8230', 'rolled-in_scale': '0.5617', 'scratches': '0.3589'}
mAP@0.5: 0.6416 (64.16%)
AP per class: {'crazing': '0.6025', 'inclusion': '0.7316', 'patches': '0.7721', 'pitted_surface': '0.8230', 'rolled-in_scale': '0.5617', 'scratches': '0.3589'}
mAP@0.5: 0.6416 (64.16%)
