Using pre-trained object detection models from pytorch  
Fine-tuned on our reduced coco-dataset (for fair comparison)

Reference: 

https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html

https://www.kaggle.com/code/yerramvarun/fine-tuning-faster-rcnn-using-pytorch

https://github.com/sovit-123/fasterrcnn-pytorch-training-pipeline/tree/main

In [1]:
from pycocotools.coco import COCO
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import cv2

from helper import print_log

In [2]:
train2017 = 'train2017'
val2017 = 'val2017'
ann_file = 'dataset/coco/annotations/instances_{}.json'

In [3]:
TOP_10_CATS_ID = set([1,  3, 62, 84, 44, 47, 67, 51, 10, 31])
CATS_NAMES = {
    1: 'person',
    3: 'car',
    62: 'chair',
    84: 'book',
    44: 'bottle',
    47: 'cup',
    67: 'dinning table',
    51: 'traffic light',
    10: 'bowl',
    31: 'handbag'
}
LABELS = [CATS_NAMES[id] for id in sorted(list(TOP_10_CATS_ID))]

In [4]:
coco_train = COCO(ann_file.format(train2017))
coco_val = COCO(ann_file.format(val2017))

loading annotations into memory...
Done (t=7.38s)
creating index...
index created!
loading annotations into memory...
Done (t=0.22s)
creating index...
index created!


In [5]:
def get_coco_images_and_labels(coco):

    # get all filenames
    img_ids_w_filename = {coco.dataset['images'][i]['id']: coco.dataset['images'][i]['file_name'] for i in range(len(coco.dataset['images']))}      # use dictionary for faster query

    # get all images
    img_ids = [coco.dataset['images'][i]['id'] for i in range(len(coco.dataset['images']))]

    # load labels for each imgs (as one img may have multiple labels)
    labels_per_imgs = []
    for i in range(len(img_ids)):
        labels_per_imgs.append(coco.loadAnns(coco.getAnnIds(imgIds=img_ids[i])))

    img_id_w_bb = []
    label_per_obj = []

    for labels in labels_per_imgs:
        for l in labels:
            img_id_w_bb.append((l['id'], l['image_id'], l['bbox']))
            label_per_obj.append(l['category_id'])

    return img_ids_w_filename, img_id_w_bb, label_per_obj

In [6]:
img_ids_w_filename_train, img_id_w_bb_train, label_per_obj_train = get_coco_images_and_labels(coco_train)
img_ids_w_filename_val, img_id_w_bb_val, label_per_obj_val = get_coco_images_and_labels(coco_val)

---

Dataset save/load

In [7]:
# load filtered dataset

import pickle

filtered_dataset_dir = Path('dataset/coco_top10_filtered_20250423')

with open(filtered_dataset_dir / 'img_id_w_bb_train_top10_v2.pkl', 'rb') as f:
    img_id_w_bb_train_top10_filtered = pickle.load(f)
# with open(filtered_dataset_dir / 'label_per_obj_train_top10_v2.pkl', 'rb') as f:
#     label_per_obj_train_top10_filtered = pickle.load(f)

with open(filtered_dataset_dir/ 'img_id_w_bb_val_top10.pkl', 'rb') as f:
    img_id_w_bb_val_top10 = pickle.load(f)
# with open(filtered_dataset_dir / 'label_per_obj_val_top10.pkl', 'rb') as f:
#     label_per_obj_val_top10 = pickle.load(f)

In [8]:
len(img_id_w_bb_train_top10_filtered)

62444

In [9]:
# create a set of all image ids
img_ids_train = list(set([img_id for _, img_id, _ in img_id_w_bb_train_top10_filtered]))
img_ids_val = list(set([img_id for _, img_id, _ in img_id_w_bb_val_top10]))

len(img_ids_train), len(img_ids_val)

(15843, 3559)

In [10]:
# for each item in img_ids_train, or img_ids_val, get the corresponding interested bbox

def get_img_id_w_bb(img_ids, coco_ds):
    img_id_w_bb = {}
    for img_id in img_ids:
        ann_ids = coco_ds.getAnnIds(imgIds=img_id, catIds=TOP_10_CATS_ID)
        anns = coco_ds.loadAnns(ann_ids)
        img_id_w_bb[img_id] = [
            (ann['id'], ann['category_id'], ann['bbox']) for ann in anns if ann['category_id'] in TOP_10_CATS_ID
        ]
    return img_id_w_bb

In [11]:
label_per_img_train_top10 = get_img_id_w_bb(img_ids_train, coco_train)
label_per_img_val_top10 = get_img_id_w_bb(img_ids_val, coco_val)

In [12]:
label_per_img_train_top10

{196608: [(1706882, 1, [0.0, 1.08, 325.75, 473.53])],
 131074: [(1174042, 31, [320.05, 248.05, 79.89, 83.07])],
 557059: [(1901023, 51, [4.97, 1.85, 475.03, 628.4])],
 458756: [(712616, 51, [77.49, 50.58, 486.46, 403.59])],
 262146: [(454022, 1, [220.99, 263.96, 143.49, 210.95])],
 294914: [(1706622, 1, [0.95, 344.28, 115.05, 72.25]),
  (2078769, 31, [0.0, 124.74, 197.6, 204.22])],
 229378: [(1486806, 44, [274.73, 31.23, 24.98, 73.26]),
  (1488835, 44, [256.43, 46.17, 18.21, 34.24]),
  (1489815, 44, [300.0, 32.21, 29.77, 74.42]),
  (1866505, 44, [314.86, 31.39, 19.38, 71.16]),
  (1869390, 44, [258.65, 72.41, 13.38, 31.5]),
  (1871797, 44, [235.7, 17.43, 23.94, 51.33]),
  (1873004, 44, [271.07, 18.81, 16.26, 81.87]),
  (1873016, 44, [216.78, 45.35, 22.69, 55.42]),
  (1873103, 44, [298.83, 34.35, 12.67, 20.59])],
 557065: [(1483058, 44, [457.73, 295.77, 29.74, 52.57])],
 163852: [(441863, 1, [1.91, 0.0, 535.14, 426.0]),
  (1173551, 31, [67.97, 88.07, 194.33, 323.57])],
 65554: [(1738804,

In [13]:
label_per_img_val_top10[458755]

[(186574, 1, [69.03, 37.75, 508.05, 435.78]),
 (203334, 1, [567.0, 1.27, 73.0, 86.86]),
 (1684285, 1, [590.02, 91.69, 49.62, 97.07]),
 (2206717, 1, [250.28, 87.33, 122.48, 132.42])]

In [14]:
len(img_id_w_bb_val_top10)

20312

In [15]:
img_id_w_bb_train_top10_filtered[0]

(82215, 309022, [55.5, 228.79, 7.09, 58.9])

In [16]:
img_ids_w_filename_train[309022]

'000000309022.jpg'

---

Create train-val-test set

Note that the split will be slightly different, due to the different strategy in handling images-to-bbox relationship

In [17]:
import torch

from torchvision.io import read_image
from torchvision.transforms import v2

class ReducedCOCODataset(torch.utils.data.Dataset):
    def __init__(self, img_ids, label_per_img:dict, img_ids_w_filename, coco_ds_name:str, transforms):
        self.img_ids = img_ids
        self.label_per_img = label_per_img
        self.img_ids_w_filename = img_ids_w_filename
        self.coco_ds_name = coco_ds_name
        self.transforms = transforms
        self.labels = ['background'] + LABELS

        assert self.coco_ds_name in ['train2017', 'val2017'], f"Invalid coco dataset name: {self.coco_ds_name}"

    def __getitem__(self, idx):
        img_id = self.img_ids[idx]
        img_name = self.img_ids_w_filename[img_id]

        # load image
        img_path = Path(f"dataset/coco/{self.coco_ds_name}/{img_name}")
        img = read_image(img_path)

        # load the corresponding bbox(es), and labels for the image
        l = self.label_per_img[img_id]
        bboxes = []
        _labels = []
        for (_, cat_id, bbox) in l:
            _labels.append(self.labels.index(CATS_NAMES[cat_id]))
            # convert bbox from [x1, y1, w, h] to [x1, y1, x2, y2]
            x1, y1, w, h = bbox
            x2 = x1 + w
            y2 = y1 + h
            bboxes.append([x1, y1, x2, y2])
         
        l_bboxes = len(bboxes)

        # bboxes to tensor
        bboxes = torch.tensor(bboxes, dtype=torch.float32)
        # area of bboxes
        area = (bboxes[:, 3] - bboxes[:, 1]) * (bboxes[:, 2] - bboxes[:, 0]) if l_bboxes > 0 else torch.as_tensor(bboxes, dtype=torch.float32)
        # No crowd instances
        iscrowd = torch.zeros((bboxes.shape[0],), dtype=torch.int64) if l_bboxes > 0 else torch.as_tensor(bboxes, dtype=torch.float32)
        # Labels to tensor
        labels = torch.as_tensor(_labels, dtype=torch.int64)

        target = {}
        target["boxes"] = bboxes
        target["labels"] = labels
        target['area'] = area
        target['iscrowd'] = iscrowd
        target['image_id'] = torch.tensor([img_id])

        if self.transforms:
            img_t, target = self.transforms(img, target)
        else:
            img_t = img

        return img_t, target
    
    def get_details_from_id(self, idx):
        img_id = self.img_ids[idx]
        img_name = self.img_ids_w_filename[img_id]

        # load image
        img_path = Path(f"dataset/coco/{self.coco_ds_name}/{img_name}")
        img = read_image(img_path)

        # load the corresponding bbox(es), and labels for the image
        l = self.label_per_img[img_id]
        bboxes = []
        _labels = []
        for (_, cat_id, bbox) in l:
            _labels.append(CATS_NAMES[cat_id])
            # convert bbox from [x1, y1, w, h] to [x1, y1, x2, y2]
            x1, y1, w, h = bbox
            x2 = x1 + w
            y2 = y1 + h
            bboxes.append([x1, y1, x2, y2])

        return img_path, img, bboxes, _labels
    
    def __len__(self):
        return len(self.img_ids)
        

In [18]:
# Since v0.15.0 torchvision provides new Transforms API to easily write data augmentation pipelines for Object Detection and Segmentation tasks.
# Let’s write some helper functions for data augmentation / transformation:

from torchvision.transforms import v2 as T

def get_transform(train):
    transforms = []
    # if train:
    #     transforms.append(T.RandomHorizontalFlip(0.5))
    transforms.append(T.ToDtype(torch.float, scale=True))
    transforms.append(T.ToPureTensor())
    return T.Compose(transforms)

In [19]:
# check dataset
dataset_traintest = ReducedCOCODataset(
    img_ids_train,
    label_per_img_train_top10,
    img_ids_w_filename_train,
    coco_ds_name='train2017',
    transforms=get_transform(train=True)
)
print('length of dataset = ', len(dataset_traintest), '\n')

# getting the image and target of the dataset
img, target = dataset_traintest[20]
print(img, '\n',target)

length of dataset =  15843 

tensor([[[0.5137, 0.5373, 0.5059,  ..., 0.6863, 0.6784, 0.6824],
         [0.5451, 0.5333, 0.5216,  ..., 0.7020, 0.7020, 0.6902],
         [0.5176, 0.5098, 0.5294,  ..., 0.7059, 0.7059, 0.6941],
         ...,
         [0.4039, 0.4235, 0.4275,  ..., 0.6667, 0.6549, 0.6392],
         [0.4510, 0.4078, 0.4039,  ..., 0.6627, 0.6549, 0.6431],
         [0.4078, 0.3765, 0.4431,  ..., 0.6588, 0.6588, 0.6510]],

        [[0.3922, 0.3922, 0.4314,  ..., 0.6431, 0.6431, 0.6471],
         [0.4118, 0.4431, 0.4471,  ..., 0.6549, 0.6588, 0.6549],
         [0.3961, 0.4039, 0.4235,  ..., 0.6667, 0.6627, 0.6471],
         ...,
         [0.3098, 0.3176, 0.3020,  ..., 0.5098, 0.5176, 0.5059],
         [0.2706, 0.3137, 0.2980,  ..., 0.5059, 0.5176, 0.5020],
         [0.2941, 0.3333, 0.2863,  ..., 0.5020, 0.5098, 0.4980]],

        [[0.2824, 0.2863, 0.2745,  ..., 0.6196, 0.6235, 0.6588],
         [0.2314, 0.2784, 0.3176,  ..., 0.6706, 0.6510, 0.6353],
         [0.3843, 0.3216, 0.3

In [20]:
dataset_traintest.labels, len(dataset_traintest.labels)

(['background',
  'person',
  'car',
  'bowl',
  'handbag',
  'bottle',
  'cup',
  'traffic light',
  'chair',
  'dinning table',
  'book'],
 11)

In [21]:
# validation set

dataset_val = ReducedCOCODataset(
    img_ids_val,
    label_per_img_val_top10,
    img_ids_w_filename_val,
    coco_ds_name='val2017',
    transforms=get_transform(train=False)
)
print('length of dataset = ', len(dataset_val), '\n')
# getting the image and target of the dataset
img, target = dataset_val[1]
print(img, '\n',target)

length of dataset =  3559 

tensor([[[0.2980, 0.2471, 0.2039,  ..., 0.0431, 0.0784, 0.0471],
         [0.3098, 0.2706, 0.2275,  ..., 0.0667, 0.0549, 0.0627],
         [0.2863, 0.1922, 0.2235,  ..., 0.0510, 0.0549, 0.0745],
         ...,
         [0.2431, 0.2510, 0.2863,  ..., 0.4588, 0.2431, 0.1255],
         [0.2902, 0.3020, 0.3608,  ..., 0.4314, 0.3647, 0.2118],
         [0.2588, 0.2431, 0.3098,  ..., 0.2431, 0.1843, 0.3294]],

        [[0.2588, 0.2118, 0.1490,  ..., 0.0510, 0.0549, 0.0627],
         [0.2980, 0.3098, 0.2235,  ..., 0.0784, 0.0510, 0.0667],
         [0.3294, 0.2078, 0.1569,  ..., 0.0706, 0.0588, 0.0627],
         ...,
         [0.2667, 0.3020, 0.2824,  ..., 0.4314, 0.1569, 0.1137],
         [0.2902, 0.2980, 0.3490,  ..., 0.3137, 0.2941, 0.1725],
         [0.1922, 0.2196, 0.3216,  ..., 0.0941, 0.1569, 0.2627]],

        [[0.2118, 0.1843, 0.1490,  ..., 0.1098, 0.1020, 0.0667],
         [0.2627, 0.2745, 0.2039,  ..., 0.1137, 0.0431, 0.0863],
         [0.2627, 0.1529, 0.12

---

---

Load the pre-trained faster rcnn network

In [22]:
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2, fasterrcnn_mobilenet_v3_large_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# using mobile net for faster training, while have decent accuracy
model = fasterrcnn_mobilenet_v3_large_fpn()

n_classes = len(dataset_traintest.labels)

# get number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features
# replace the pre-trained head with a new one
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, n_classes)

In [23]:
from torchinfo import summary

summary(model, input_size=(8, 3, 224, 224))

Layer (type:depth-idx)                                  Output Shape              Param #
FasterRCNN                                              [100, 4]                  --
├─GeneralizedRCNNTransform: 1-1                         [8, 3, 800, 800]          --
├─BackboneWithFPN: 1-2                                  [8, 256, 13, 13]          --
│    └─IntermediateLayerGetter: 2-1                     [8, 960, 25, 25]          --
│    │    └─Conv2dNormActivation: 3-1                   [8, 16, 400, 400]         (432)
│    │    └─InvertedResidual: 3-2                       [8, 16, 400, 400]         (400)
│    │    └─InvertedResidual: 3-3                       [8, 24, 200, 200]         (3,136)
│    │    └─InvertedResidual: 3-4                       [8, 24, 200, 200]         (4,104)
│    │    └─InvertedResidual: 3-5                       [8, 40, 100, 100]         (9,960)
│    │    └─InvertedResidual: 3-6                       [8, 40, 100, 100]         (20,432)
│    │    └─InvertedResidual: 3-7

Training

In [24]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [25]:
from sklearn.model_selection import train_test_split

BATCH_SIZE = 2

def collate_fn(batch):
  return tuple(zip(*batch))

# create train and validation set
train_indices, test_indices = train_test_split(list(range(len(dataset_traintest.img_ids))), test_size=0.2, random_state=42)
dataset_train = torch.utils.data.Subset(dataset_traintest, train_indices)
dataset_test = torch.utils.data.Subset(dataset_traintest, test_indices)
# create data loaders
data_loader_train = torch.utils.data.DataLoader(
    dataset_train,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    collate_fn=collate_fn,
)

data_loader_test = torch.utils.data.DataLoader(
    dataset_test,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
    collate_fn=collate_fn,
)

data_loader_valid = torch.utils.data.DataLoader(
    dataset_val,
    batch_size=1,
    shuffle=False,
    num_workers=4,
    collate_fn=collate_fn,
)

In [26]:
len(data_loader_train.dataset), len(data_loader_test.dataset), len(data_loader_valid.dataset)

(12674, 3169, 3559)

In [27]:
model.load_state_dict(torch.load('best_model.pth', weights_only=True))

<All keys matched successfully>

In [28]:
model.to(device)

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (0): Conv2dNormActivation(
        (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): FrozenBatchNorm2d(16, eps=1e-05)
        (2): Hardswish()
      )
      (1): InvertedResidual(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=16, bias=False)
            (1): FrozenBatchNorm2d(16, eps=1e-05)
            (2): ReLU(inplace=True)
          )
          (1): Conv2dNormActivation(
            (0): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d(16, eps=1e-05)
          )
        )
      )
      (2): InvertedResidual(
        (block):

In [29]:
N_EPOCHS = 3

# construct optimizer
params = [p for p in model.parameters() if p.requires_grad]
# optimizer = torch.optim.AdamW(params, lr=5e-3)
# following the tutorial first. Not sure if this is the best optimizer
optimizer = torch.optim.SGD(params, lr=0.001,
                        momentum=0.9,
                        weight_decay=0.0005)

# construct learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=N_EPOCHS, gamma=0.5)

In [30]:
from tqdm import tqdm
import matplotlib.pyplot as plt

'''
Function to train the model over one epoch.
'''
def train_one_epoch(model, optimizer, data_loader, device):
  train_loss_list = []

  tqdm_bar = tqdm(data_loader, total=len(data_loader))
  for idx, data in enumerate(tqdm_bar):
    optimizer.zero_grad()
    images, targets = data

    # print(targets)

    images = list(image.to(device) for image in images)
    targets = [{k: v.to(device) for k, v in t.items()} for t in targets]  # targets = {'boxes'=tensor, 'labels'=tensor}

    losses = model(images, targets)

    loss = sum(loss for loss in losses.values())
    loss_val = loss.item()
    train_loss_list.append(loss.detach().cpu().numpy())

    loss.backward()
    optimizer.step()

    tqdm_bar.set_description(desc=f"Training Loss: {loss_val:.3f}")

  return train_loss_list

In [31]:
'''
Function to validate the model

The losses output will be a dictionary with the following keys (and sample values):
{'loss_classifier': tensor(0.1611, device='cuda:0', grad_fn=<NllLossBackward0>),
 'loss_box_reg': tensor(0.1033, device='cuda:0', grad_fn=<DivBackward0>),
 'loss_objectness': tensor(0.1994, device='cuda:0',
        grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
 'loss_rpn_box_reg': tensor(0.0580, device='cuda:0', grad_fn=<DivBackward0>)}
'''

def evaluate(model, data_loader_test, device):
    val_loss_list = []

    tqdm_bar = tqdm(data_loader_test, total=len(data_loader_test))

    for i, data in enumerate(tqdm_bar):
        images, targets = data

        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        with torch.no_grad():
            losses = model(images, targets)

        loss = sum(loss for loss in losses.values())
        loss_val = loss.item()
        val_loss_list.append(loss_val)

        tqdm_bar.set_description(desc=f"Testing Loss: {loss:.4f}")
    return val_loss_list


In [32]:
'''
Function to plot training and valdiation losses and save them in `output_dir'
'''
def plot_loss(train_loss, valid_loss):
    figure_1, train_ax = plt.subplots()
    figure_2, valid_ax = plt.subplots()

    train_ax.plot(train_loss, color='blue')
    train_ax.set_xlabel('Iteration')
    train_ax.set_ylabel('Training Loss')

    valid_ax.plot(valid_loss, color='red')
    valid_ax.set_xlabel('Iteration')
    valid_ax.set_ylabel('Validation loss')

    # figure_1.savefig(f"{OUTPUT_DIR}/train_loss.png")
    # figure_2.savefig(f"{OUTPUT_DIR}/valid_loss.png")

In [32]:
from tqdm import tqdm

loss_dict = {'train_loss': [], 'test_loss': []}
best_model = None

for epoch in range(N_EPOCHS):
    print("----------Epoch {}----------".format(epoch+1))

    # Train the model for one epoch
    train_loss_list = train_one_epoch(model, optimizer, data_loader_train, device)
    loss_dict['train_loss'].extend(train_loss_list)

    lr_scheduler.step()

    # Run evaluation to get losses
    test_loss_list = evaluate(model, data_loader_test, device)
    loss_dict['test_loss'].extend(test_loss_list)

    # store the best model
    if best_model is None or min(test_loss_list) < min(loss_dict['test_loss']):
        best_model = model.state_dict()

    break

    # Svae the model ckpt after every epoch
    # ckpt_file_name = f"{OUTPUT_DIR}/epoch_{epoch+1}_model.pth"
    # torch.save({
    #     'epoch': epoch+1,
    #     'model_state_dict': model.state_dict(),
    #     'optimizer_state_dict': optimizer.state_dict(),
    #     'loss_dict': loss_dict
    # }, ckpt_file_name)


----------Epoch 1----------


Training Loss: 0.505: 100%|██████████| 6337/6337 [03:53<00:00, 27.12it/s]
Testing Loss: 0.1891: 100%|██████████| 1585/1585 [00:43<00:00, 36.24it/s]


In [33]:
# save the best model
torch.save(best_model, "best_model.pth")

In [53]:
_img, _target = dataset_val[0]

In [60]:
_target

{'boxes': tensor([[250.8200, 168.2600, 320.9300, 233.1400],
         [435.3500, 294.2300, 448.8100, 302.0400],
         [447.4400, 293.9100, 459.6000, 301.5600],
         [460.5900, 291.7100, 473.3400, 300.1600],
         [407.0700, 287.2500, 419.7200, 297.1100],
         [618.0600, 289.3100, 629.6600, 297.2600],
         [512.3000, 294.0700, 533.4800, 299.6400],
         [285.5500, 370.5600, 297.6200, 389.7700]]),
 'labels': tensor([1, 2, 2, 2, 2, 2, 2, 1]),
 'area': tensor([4548.7363,  105.1225,   93.0240,  107.7377,  124.7288,   92.2199,
          117.9727,  231.8647]),
 'iscrowd': tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 'image_id': tensor([532481])}

In [65]:
model.eval()

# _images = list(image.to(device) for image in images)
_targets = [{k: v.to(device) for k, v in t.items()} for t in [_target]]

prediction = model([_img.to(device)], _targets)

In [66]:
prediction

[{'boxes': tensor([[228.5561, 162.7497, 324.4900, 276.3049],
          [ 60.8669,  42.7334,  97.8022, 129.5400],
          [269.1109, 163.7460, 309.8487, 258.3119],
          [238.0428, 165.0763, 313.6662, 229.2963],
          [ 60.8862,  45.8456,  81.3751, 123.9119],
          [248.5014, 163.1061, 295.5199, 283.9343],
          [ 72.5282,  50.9944,  89.5937, 127.8398],
          [203.8169, 133.1695, 346.3714, 384.7503],
          [ 52.4172,  43.8782, 117.0351,  99.8453],
          [175.4869, 172.9174, 350.4914, 263.5399],
          [ 45.7011,  34.9295, 120.2405, 134.7366],
          [163.5344, 159.9768, 365.1031, 228.9554],
          [260.2890, 150.8506, 333.2132, 360.5005],
          [222.7567, 193.5051, 319.2958, 251.7576],
          [294.0408, 186.9751, 315.8139, 245.1031],
          [  0.0000,  44.0493, 103.7229, 124.7355]], device='cuda:0',
         grad_fn=<StackBackward0>),
  'labels': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2], device='cuda:0'),
  'scores': tensor

---

Evaluation result with train, test, validation

Focus on two major types of metrics
- acc; confusion matrix
- mAP


In [33]:
def evaluation_pred(model, data_loader, stage:str):
    model.eval()
    target_list = {
        'boxes': [],
        'labels': [],
    }
    pred_list = {
        'boxes': [],
        'labels': [],
        'scores': []
    }
    with torch.no_grad():
        tqdm_bar = tqdm(data_loader, total=len(data_loader))
        for idx, data in enumerate(tqdm_bar):
            images, targets = data

            target_list['boxes'].append(targets['boxes'])
            target_list['labels'].append(targets['labels'])

            # print(targets)

            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            predictions = model(images, targets)
            pred_list.extend(predictions)

            pred_list['boxes'].append(predictions['boxes'].detch().cpu().numpy())
            pred_list['labels'].append(predictions['labels'].detch().cpu().numpy())
            pred_list['scores'].append(predictions['scores'].detch().cpu().numpy())

    return target_list, pred_list

In [None]:
# compute acc, confusion matrix and classification report
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay

def compute_accuracy(y_true, y_pred):
    """Compute accuracy score"""
    return accuracy_score(y_true, y_pred)

def compute_classification_report(y_true, y_pred, labels):
    """Compute classification report"""
    return classification_report(y_true, y_pred, target_names=labels, zero_division=0)

def compute_confusion_matrix(y_true, y_pred, labels, save=False, save_path=None):
    """Compute confusion matrix"""
    cm_matrix = confusion_matrix(y_true, y_pred)

    cm_disp = ConfusionMatrixDisplay(confusion_matrix=cm_matrix, display_labels=labels)
    fig, ax = plt.subplots(figsize=(12,12))
    ax.set_title('Confusion Matrix')
    cm_disp.plot(ax=ax, cmap=plt.cm.Blues, xticks_rotation=90)

    if save:
        if save_path is None:
            raise ValueError("save_path must be provided if save is True")
        plt.savefig(save_path)

def compute_classification_metrics(target_list, pred_list, target_names:list[str]):
    # flatten the list of lists
    target_labels = [item for sublist in target_list['labels'] for item in sublist]
    pred_labels = [item for sublist in pred_list['labels'] for item in sublist]

    # compute accuracy
    acc = compute_accuracy(target_labels, pred_labels)
    print("Accuracy: ", acc)

    report = compute_classification_report(target_labels, pred_labels, target_names)
    print("Classification Report:\n", report)

    # compute confusion matrix
    compute_confusion_matrix(target_labels, pred_labels, target_names, save=False)

    return target_labels, pred_labels

In [35]:
# compute mAP

from torchmetrics.detection.mean_ap import MeanAveragePrecision

def compute_mAP(target_list, pred_list):
    # flatten the list of lists
    target_boxes = [item for sublist in target_list['boxes'] for item in sublist]
    pred_boxes = [item for sublist in pred_list['boxes'] for item in sublist]

    # convert to tensor
    target_boxes = torch.tensor(target_boxes)
    pred_boxes = torch.tensor(pred_boxes)

    # compute mAP
    metric = MeanAveragePrecision()
    metric.update(pred_boxes, target_boxes)
    mAP = metric.compute()

    return mAP

In [29]:
# test the evaluation methods with dataset_val
from tqdm import tqdm

model.eval()

tqdm_bar = tqdm(data_loader_valid, total=len(data_loader_valid))
target_list = []
pred_list = []

for idx, data in enumerate(tqdm_bar):
    images, targets = data

    l_images = len(images)

    _images = list(image.to(device) for image in images)
    _targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

    with torch.no_grad():       # must have, otherwise it will leak memory/explode ur memory
        _predictions = model(_images, _targets)

    predictions = [{k:v.to('cpu') for k, v in t.items()} for t in _predictions]

    for i in range(l_images):

        target_dict = {}
        pred_dict = {}

        target_dict['boxes'] = targets[i]['boxes']
        target_dict['labels'] = targets[i]['labels']
        pred_dict['boxes'] = predictions[i]['boxes']
        pred_dict['labels'] = predictions[i]['labels']
        pred_dict['scores'] = predictions[i]['scores']

        target_list.append(target_dict)
        pred_list.append(pred_dict)

    tqdm_bar.set_description(desc=f"Validation")

Validation: 100%|██████████| 3559/3559 [01:02<00:00, 56.56it/s]


In [30]:
target_list

[{'boxes': tensor([[250.8200, 168.2600, 320.9300, 233.1400],
          [435.3500, 294.2300, 448.8100, 302.0400],
          [447.4400, 293.9100, 459.6000, 301.5600],
          [460.5900, 291.7100, 473.3400, 300.1600],
          [407.0700, 287.2500, 419.7200, 297.1100],
          [618.0600, 289.3100, 629.6600, 297.2600],
          [512.3000, 294.0700, 533.4800, 299.6400],
          [285.5500, 370.5600, 297.6200, 389.7700]]),
  'labels': tensor([1, 2, 2, 2, 2, 2, 2, 1])},
 {'boxes': tensor([[ 69.0300,  37.7500, 577.0800, 473.5300],
          [567.0000,   1.2700, 640.0000,  88.1300],
          [590.0200,  91.6900, 639.6400, 188.7600],
          [250.2800,  87.3300, 372.7600, 219.7500]]),
  'labels': tensor([1, 1, 1, 1])},
 {'boxes': tensor([[2.4809e+02, 2.7000e-01, 3.0202e+02, 1.6638e+02],
          [3.7991e+02, 0.0000e+00, 4.4233e+02, 1.8059e+02],
          [0.0000e+00, 1.6200e+00, 6.3892e+02, 4.7405e+02],
          [0.0000e+00, 3.0849e+02, 2.7290e+02, 4.7892e+02],
          [8.4320e+01, 

In [37]:
from torchmetrics.detection.mean_ap import MeanAveragePrecision

# compute mAP
metric = MeanAveragePrecision()
metric.update(pred_list, target_list)
mAP = metric.compute()

In [38]:
mAP

{'map': tensor(0.0616),
 'map_50': tensor(0.1459),
 'map_75': tensor(0.0390),
 'map_small': tensor(0.0041),
 'map_medium': tensor(0.0600),
 'map_large': tensor(0.1433),
 'mar_1': tensor(0.0789),
 'mar_10': tensor(0.1464),
 'mar_100': tensor(0.1689),
 'mar_small': tensor(0.0275),
 'mar_medium': tensor(0.1665),
 'mar_large': tensor(0.3491),
 'map_per_class': tensor(-1.),
 'mar_100_per_class': tensor(-1.),
 'classes': tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=torch.int32)}