Using pre-trained image classification models from pytorch  
Fine-tuned on our reduced coco-dataset (for fair comparison)

Reference: 

https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html

In [1]:
from pycocotools.coco import COCO
import numpy as np
from pathlib import Path

from helper import print_log

In [2]:
train2017 = 'train2017'
val2017 = 'val2017'
ann_file = 'dataset/coco/annotations/instances_{}.json'

In [3]:
TOP_10_CATS_ID = set([1,  3, 62, 84, 44, 47, 67, 51, 10, 31])
CATS_NAMES = {
    1: 'person',
    3: 'car',
    62: 'chair',
    84: 'book',
    44: 'bottle',
    47: 'cup',
    67: 'dinning table',
    51: 'traffic light',
    10: 'bowl',
    31: 'handbag'
}
LABEL_LOGITS_MAPPING = {
    1: 0,
    3: 1,
    62: 2,
    84: 3,
    44: 4,
    47: 5,
    67: 6,
    51: 7,
    10: 8,
    31: 9
}
LOGITS_LABEL_MAPPING = {v:k for k, v in LABEL_LOGITS_MAPPING.items()}
LABELS = [CATS_NAMES[k] for k in LABEL_LOGITS_MAPPING.keys()]

In [4]:
coco_train = COCO(ann_file.format(train2017))
coco_val = COCO(ann_file.format(val2017))

loading annotations into memory...
Done (t=6.49s)
creating index...
index created!
loading annotations into memory...
Done (t=0.90s)
creating index...
index created!


In [5]:
def get_coco_images_and_labels(coco):

    # get all filenames
    img_ids_w_filename = {coco.dataset['images'][i]['id']: coco.dataset['images'][i]['file_name'] for i in range(len(coco.dataset['images']))}      # use dictionary for faster query

    # get all images
    img_ids = [coco.dataset['images'][i]['id'] for i in range(len(coco.dataset['images']))]

    # load labels for each imgs (as one img may have multiple labels)
    labels_per_imgs = []
    for i in range(len(img_ids)):
        labels_per_imgs.append(coco.loadAnns(coco.getAnnIds(imgIds=img_ids[i])))

    img_id_w_bb = []
    label_per_obj = []

    for labels in labels_per_imgs:
        for l in labels:
            img_id_w_bb.append((l['id'], l['image_id'], l['bbox']))
            label_per_obj.append(l['category_id'])

    return img_ids_w_filename, img_id_w_bb, label_per_obj

In [6]:
img_ids_w_filename_train, img_id_w_bb_train, label_per_obj_train = get_coco_images_and_labels(coco_train)
img_ids_w_filename_val, img_id_w_bb_val, label_per_obj_val = get_coco_images_and_labels(coco_val)

---

Dataset save/load

In [7]:
# load filtered dataset

import pickle

filtered_dataset_dir = Path('dataset/coco_top10_filtered_20250423')

with open(filtered_dataset_dir / 'img_id_w_bb_train_top10_v2.pkl', 'rb') as f:
    img_id_w_bb_train_top10_filtered = pickle.load(f)
with open(filtered_dataset_dir / 'label_per_obj_train_top10_v2.pkl', 'rb') as f:
    label_per_obj_train_top10_filtered = pickle.load(f)

with open(filtered_dataset_dir/ 'img_id_w_bb_val_top10.pkl', 'rb') as f:
    img_id_w_bb_val_top10 = pickle.load(f)
with open(filtered_dataset_dir / 'label_per_obj_val_top10.pkl', 'rb') as f:
    label_per_obj_val_top10 = pickle.load(f)

In [8]:
img_id_w_bb_train[0]

(151091, 391895, [359.17, 146.17, 112.45, 213.57])

In [9]:
len(img_id_w_bb_train_top10_filtered), len(label_per_obj_train_top10_filtered), len(img_id_w_bb_val_top10), len(label_per_obj_val_top10)

(62444, 62444, 20312, 20312)

In [10]:
# convert labels to logits
label_per_obj_train_top10_filtered_logits = np.array([LABEL_LOGITS_MAPPING[l] for l in label_per_obj_train_top10_filtered], dtype=np.int32)
label_per_obj_val_top10_logits = np.array([LABEL_LOGITS_MAPPING[l] for l in label_per_obj_val_top10], dtype=np.int32)

In [11]:
label_per_obj_train_top10_filtered[:20]

[44, 44, 44, 44, 51, 44, 44, 44, 44, 44, 44, 1, 1, 51, 47, 44, 47, 47, 47, 47]

In [12]:
label_per_obj_train_top10_filtered_logits[:20]

array([4, 4, 4, 4, 7, 4, 4, 4, 4, 4, 4, 0, 0, 7, 5, 4, 5, 5, 5, 5],
      dtype=int32)

---

Create train-val-test set

Note that the split will be slightly different, due to the different strategy in handling images-to-bbox relationship

In [13]:
import torch
from torchvision.transforms import v2 as T
from torchvision.io import read_image

import math

class ReducedCOCODataset(torch.utils.data.Dataset):
    def __init__(self, img_id_w_bb:list, label_per_obj:list, img_ids_w_filename, coco_ds_name:str, transforms):
        self.img_id_w_bb = img_id_w_bb
        self.label_per_obj = label_per_obj
        self.img_ids_w_filename = img_ids_w_filename
        self.coco_ds_name = coco_ds_name
        self.transforms = transforms
        self.labels = LABELS

        self._def_transform = None

        assert self.coco_ds_name in ['train2017', 'val2017'], f"Invalid coco dataset name: {self.coco_ds_name}"

    def __getitem__(self, idx):
        # get the image id
        ann_id, img_id, bbox = self.img_id_w_bb[idx]
        label = self.label_per_obj[idx]

        # load image
        img_name = self.img_ids_w_filename[img_id]
        img_path = Path(f"dataset/coco/{self.coco_ds_name}/{img_name}")
        img = read_image(img_path)

        # chop the image to the bbox
        x1, y1, w, h = bbox
        x1 = int(math.floor(x1))
        y1 = int(math.floor(y1))
        x2 = int(math.floor(x1 + w)) + 1
        y2 = int(math.floor(y1 + h)) + 1
        img = img[:, y1:y2, x1:x2]
        

        # any data augmentation (?)

        # apply transforms
        img_t, label = self._def_transform(img, label)

        return img_t, label
    
    def get_details_from_id(self, idx):
        ann_id, img_id, bbox = self.img_id_w_bb[idx]
        label = self.label_per_obj[idx]

        # load image
        img_name = self.img_ids_w_filename[img_id]
        img_path = Path(f"dataset/coco/{self.coco_ds_name}/{img_name}")
        img = read_image(img_path)       

        return img_path, img, bbox, label
    
    def set_def_transform(self, transform):
        self._def_transform = transform

    def __len__(self):
        return len(self.img_id_w_bb)
        

In [14]:
# check dataset
dataset_traintest = ReducedCOCODataset(
    img_id_w_bb_train_top10_filtered,
    label_per_obj_train_top10_filtered_logits,
    img_ids_w_filename_train,
    coco_ds_name='train2017',
    transforms=None
)
print('length of dataset = ', len(dataset_traintest), '\n')

length of dataset =  62444 



In [15]:
dataset_traintest.labels, len(dataset_traintest.labels)

(['person',
  'car',
  'chair',
  'book',
  'bottle',
  'cup',
  'dinning table',
  'traffic light',
  'bowl',
  'handbag'],
 10)

In [16]:
# validation set

dataset_val = ReducedCOCODataset(
    img_id_w_bb_val_top10,
    label_per_obj_val_top10_logits,
    img_ids_w_filename_val,
    coco_ds_name='val2017',
    transforms=None
)
print('length of dataset = ', len(dataset_val), '\n')
# getting the image and target of the dataset
# img, target = dataset_val[1]
# print(img, '\n',target)

length of dataset =  20312 



In [17]:
len(img_id_w_bb_val)

36781

In [18]:
len(label_per_obj_val_top10_logits)

20312

---

Model initialization

In [19]:
# script contents are moved to cnn_model_loader.py

---

Training loop with hyperparameter selection

we are looking for 
- batch size (16, 32, 64)
- learning rate (5e-4, 1e-4, 5e-5)

Total combinations: $3 \times 3 = 9$

In [20]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [21]:
from cnn_model_loader import ModelType, load_model, get_default_transforms

model_type = ModelType.EFFICIENTNET_B4

dataset_traintest.set_def_transform(
    get_default_transforms(model_type)
)

dataset_val.set_def_transform(
    get_default_transforms(model_type)
)

In [22]:
from itertools import product

BATCH_SIZE_GRID = [32]
LR_GRID = [1e-4, 5e-5, 1e-5]

MAX_EPOCHES = 15

hyperparam_combs = list(product(BATCH_SIZE_GRID, LR_GRID))
print('Total number of hyperparameter combinations: ', len(hyperparam_combs))

Total number of hyperparameter combinations:  3


In [23]:
from datetime import datetime

tdy = datetime.now()
top_model_dir = Path(f'models_coco/{str(model_type)}/{tdy.strftime("%Y%m%d-%H%M%S")}/')
if not top_model_dir.exists():
    top_model_dir.mkdir(parents=True)

In [24]:
from sklearn.model_selection import train_test_split

# create train and validation set
train_indices, test_indices = train_test_split(list(range(len(dataset_traintest.img_id_w_bb))), test_size=0.2, random_state=42)

dataset_train = torch.utils.data.Subset(dataset_traintest, train_indices)
dataset_test = torch.utils.data.Subset(dataset_traintest, test_indices)

In [25]:
def collate_fn(batch):
  return tuple(zip(*batch))

def build_data_loaders(batch_size):
    data_loader_train = torch.utils.data.DataLoader(
        dataset_train,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
        collate_fn=collate_fn,
    )

    data_loader_test = torch.utils.data.DataLoader(
        dataset_test,
        batch_size=batch_size,
        shuffle=False,
        num_workers=4,
        collate_fn=collate_fn,
    )

    data_loader_valid = torch.utils.data.DataLoader(
        dataset_val,
        batch_size=16,      # fixed for inference
        shuffle=False,
        num_workers=4,
        collate_fn=collate_fn,
    )

    return data_loader_train, data_loader_test, data_loader_valid

In [26]:
from tqdm import tqdm
import matplotlib.pyplot as plt

'''
Function to train the model over one epoch.
'''
def train_one_epoch(model, criterion, optimizer, data_loader):
  
    train_loss = 0.0
    train_corrects = 0

    labels_list = []
    preds_list = []

    tqdm_bar = tqdm(data_loader, total=len(data_loader))
    for idx, data in enumerate(tqdm_bar):
        inputs, labels = data

        optimizer.zero_grad()

        # forward pass
        inputs = torch.stack(inputs, dim=0).to(DEVICE)
        labels = torch.tensor(labels, dtype=torch.float64).type(torch.LongTensor).to(DEVICE)

        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        loss_val = loss.item()
        train_loss += loss_val
        train_corrects += (preds == labels).sum().item()

        labels_list.append(labels)
        preds_list.append(preds)
        
        tqdm_bar.set_description(desc=f"Training Loss: {loss_val:.3f}")

    train_loss /= len(data_loader)
    acc = float(train_corrects) / len(data_loader.dataset)
    labels_list = torch.cat(labels_list).cpu().numpy()
    preds_list = torch.cat(preds_list).cpu().numpy()
    print_log(f"Avg training Loss: {train_loss:.3f}; Accuracy: {acc:.3f}")

    return train_loss, acc, labels_list, preds_list

In [27]:
def evaluate(model, criterion, data_loader):
    test_loss = 0.0
    test_corrects = 0

    labels_list = []
    preds_list = []

    tqdm_bar = tqdm(data_loader, total=len(data_loader))

    for i, data in enumerate(tqdm_bar):
        inputs, labels = data

        inputs = torch.stack(inputs, dim=0).to(DEVICE)
        labels = torch.tensor(labels, dtype=torch.float64).type(torch.LongTensor).to(DEVICE)

        with torch.no_grad():
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)
            

        loss_val = loss.item()
        test_loss += loss_val
        test_corrects += (preds == labels).sum().item()

        labels_list.append(labels)
        preds_list.append(preds)

        tqdm_bar.set_description(desc=f"Testing Loss: {loss_val:.4f}")

    labels_list = torch.cat(labels_list).cpu().numpy()
    preds_list = torch.cat(preds_list).cpu().numpy()
    test_loss /= len(data_loader)
    acc = float(test_corrects) / len(data_loader.dataset)
    print_log(f"Avg testing Loss: {test_loss:.3f}; Accuracy: {acc:.3f}")
    
    return test_loss, acc, labels_list, preds_list

In [28]:
def evaluation_pred(model, data_loader, stage:str):
    
    labels_list = []
    preds_list =[]

    model.eval()
    
    tqdm_bar = tqdm(data_loader, total=len(data_loader))
    for i, data in enumerate(tqdm_bar):
        inputs, labels = data

        images = torch.stack(inputs, dim=0).to(DEVICE)
        labels = torch.tensor(labels, dtype=torch.float64).type(torch.LongTensor).to(DEVICE)

        with torch.no_grad():
            outputs = model(images)
            _, preds = torch.max(outputs, 1)

        labels_list.append(labels)
        preds_list.append(preds)

        tqdm_bar.set_description(f"Evaluating {stage}")

    labels_list = torch.cat(labels_list).cpu().numpy()
    preds_list = torch.cat(preds_list).cpu().numpy()
    
    return labels_list, preds_list

In [29]:
# compute acc, confusion matrix, classification report
from helper_evaluations import compute_accuracy, compute_f1_score, compute_balanced_accuracy, compute_classification_report, compute_confusion_matrix

def compute_classification_metrics(target_labels, pred_labels, target_names:list[str]):
    
    # compute accuracy
    acc = compute_accuracy(target_labels, pred_labels)
    print("Accuracy: ", acc)

    report = compute_classification_report(target_labels, pred_labels, target_names)
    print("Classification Report:\n", report)

    # compute confusion matrix
    compute_confusion_matrix(target_labels, pred_labels, target_names, save=False)

    return target_labels, pred_labels

def save_evaluations(y, y_pred, labels, model_dir, eval_stage=str):
    """Save the evaluation results
    
    eval_stage: str
        The stage of the evaluation. It can be 'train', 'test' or 'val.
    """
    # Save the accuracy score
    accuracy = compute_accuracy(y, y_pred)
    f1 = compute_f1_score(y, y_pred)
    balanced_accuracy = compute_balanced_accuracy(y, y_pred)
    print_log(f"Accuracy [{eval_stage}]: {accuracy}; Weighted F1 [{eval_stage}]: {f1}; Weighted Accuracy [{eval_stage}]: {balanced_accuracy}")
    # save the scores
    with open(model_dir / f'accuracy_{eval_stage}.txt', 'w') as f:
        f.write(f"Accuracy: {accuracy}\n")
        f.write(f"Weighted F1: {f1}\n")
        f.write(f"Weighted Accuracy: {balanced_accuracy}\n")

    # Save the classification report
    report = compute_classification_report(y, y_pred, labels)
    with open(model_dir / f'classification_report_{eval_stage}.txt', 'w') as f:
        f.write(report)

    # Save the confusion matrix
    cm_path = model_dir / f'confusion_matrix_{eval_stage}.png'
    compute_confusion_matrix(y, y_pred, labels, save=True, save_path=cm_path)

In [30]:
# define early stopper -> w/out the need to test epoch; just need to define the patience
# and maximum epoches for the model

# definition moved to early_stopper.py

from early_stopper import EarlyStopper

In [31]:
for batch_size, lr in hyperparam_combs:
    print_log(f'-' * 50)
    print_log(f"Batch size: {batch_size}, Learning rate: {lr}")
    print_log(f'-' * 50)

    model_dir = top_model_dir/f'bs_{batch_size}_lr_{lr}'
    if not model_dir.exists():
        model_dir.mkdir(parents=True)
        print_log(f"Model directory {model_dir} created")
    
    model_name = f"{str(model_type)}_bs_{batch_size}_lr_{lr}"

    if Path(model_dir/model_name).exists():
        print_log(f"Model {model_dir/model_name} already exists, skipping...")
        continue
    
    # create dataloaders
    data_loader_train, data_loader_test, data_loader_valid = build_data_loaders(batch_size)

    # init model
    model = load_model(model_type, out_features=len(LABELS))
    model.to(DEVICE)

    # construct optimizer, learning rate scheduler etc.
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(params, lr=lr)
    criterion = torch.nn.CrossEntropyLoss()
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99)       # reduce lr by 0.99 every epoch

    # early stopping
    # we monitor the testing accuracy
    early_stopper = EarlyStopper(patience=5, delta=0.005, minimize=False)

    loss_dict = {'train_loss': [], 'test_loss': []}
    acc_dict = {'train_acc': [], 'test_acc': []}

    # training
    for epoch in range(MAX_EPOCHES):
        model.train()
        print_log(f"Epoch {epoch+1}/{MAX_EPOCHES}")
        train_loss, train_acc, y_labels_train, y_pred_train = train_one_epoch(
            model,
            criterion,
            optimizer,
            data_loader_train,
        )
        loss_dict['train_loss'].append(train_loss)
        acc_dict['train_acc'].append(train_acc)

        lr_scheduler.step()

        model.eval()

        # run on test set for evaluation (get test set loss)
        with torch.no_grad():
            test_loss, test_acc, y_labels_test, y_pred_test = evaluate(
                model,
                criterion,
                data_loader_test,
            )
            loss_dict['test_loss'].append(test_loss)
            acc_dict['test_acc'].append(test_acc)

        early_stop = early_stopper.step(test_acc, epoch, model)

        if early_stop:
            print_log(f"Early stopping at epoch {epoch+1}")
            break

    print_log(f"Best epoch: {early_stopper.best_epoch+1}; Best Acc: {early_stopper.best_loss:.3f}")
    # get the best model
    best_model = early_stopper.get_best_model()
    best_epoch = early_stopper.best_epoch

    # save the best model
    model_name = model_name + f'_epoch_{best_epoch+1}.pth'
    torch.save(best_model.state_dict(), model_dir/model_name)
    print_log(f"Model saved to {model_dir/model_name}")

    # save the loss dict
    loss_dict_path = model_dir/'loss_dict.pkl'
    with open(loss_dict_path, 'wb') as f:
        pickle.dump(loss_dict, f)
    print_log(f"Loss dict saved to {loss_dict_path}")


    # evaluate the model on train, test and validation set
    best_model.eval()

    # train set
    # save_evaluations(y_labels_train, y_pred_train, LABELS, model_dir, eval_stage='train')
    # y_labels_train_path = model_dir / 'y_labels_train.pkl'
    # y_pred_train_path = model_dir / 'y_pred_train.pkl'
    # np.save(y_labels_train_path, y_labels_train)
    # np.save(y_pred_train_path, y_pred_train)

    y_labels_test, y_pred_test = evaluation_pred(best_model, data_loader_test, stage='test')
    save_evaluations(y_labels_test, y_pred_test, LABELS, model_dir, eval_stage='test')
    y_labels_test_path = model_dir / 'y_labels_test'
    y_pred_test_path = model_dir / 'y_pred_test'
    np.save(y_labels_test_path, y_labels_test)
    np.save(y_pred_test_path, y_pred_test)

    # save the model predictions
    y_labels_valid, y_pred_valid = evaluation_pred(best_model, data_loader_valid, stage='validation')
    save_evaluations(y_labels_valid, y_pred_valid, LABELS, model_dir, eval_stage='validation')
    y_labels_valid_path = model_dir / 'y_labels_valid'
    y_pred_valid_path = model_dir / 'y_pred_valid'
    np.save(y_labels_valid_path, y_labels_valid)
    np.save(y_pred_valid_path, y_pred_valid)

    print_log(f"Finished training for batch size: {batch_size}, learning rate: {lr}")
    print_log(f'-' * 50)

[2025-04-26 04:01:55:195] - --------------------------------------------------
[2025-04-26 04:01:55:195] - Batch size: 32, Learning rate: 0.0001
[2025-04-26 04:01:55:195] - --------------------------------------------------
[2025-04-26 04:01:55:195] - Model directory models_coco/efficientnet-b4/20250426-040154/bs_32_lr_0.0001 created
[2025-04-26 04:01:55:568] - Epoch 1/15


Training Loss: 1.836:   4%|▎         | 57/1562 [00:16<07:13,  3.47it/s]


KeyboardInterrupt: 

---

Load the pre-trained faster rcnn network

In [None]:
from torchvision.models.detection import fasterrcnn_mobilenet_v3_large_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# using mobile net for faster training, while have decent accuracy
model = fasterrcnn_mobilenet_v3_large_fpn()

n_classes = len(dataset_traintest.labels)

# get number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features
# replace the pre-trained head with a new one
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, n_classes)

In [None]:
from torchinfo import summary

summary(model, input_size=(8, 3, 224, 224))

Layer (type:depth-idx)                                  Output Shape              Param #
FasterRCNN                                              [100, 4]                  --
├─GeneralizedRCNNTransform: 1-1                         [8, 3, 800, 800]          --
├─BackboneWithFPN: 1-2                                  [8, 256, 13, 13]          --
│    └─IntermediateLayerGetter: 2-1                     [8, 960, 25, 25]          --
│    │    └─Conv2dNormActivation: 3-1                   [8, 16, 400, 400]         (432)
│    │    └─InvertedResidual: 3-2                       [8, 16, 400, 400]         (400)
│    │    └─InvertedResidual: 3-3                       [8, 24, 200, 200]         (3,136)
│    │    └─InvertedResidual: 3-4                       [8, 24, 200, 200]         (4,104)
│    │    └─InvertedResidual: 3-5                       [8, 40, 100, 100]         (9,960)
│    │    └─InvertedResidual: 3-6                       [8, 40, 100, 100]         (20,432)
│    │    └─InvertedResidual: 3-7

Training

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
BATCH_SIZE = 2

def collate_fn(batch):
  return tuple(zip(*batch))


dataset_train = torch.utils.data.Subset(dataset_traintest, train_indices)
dataset_test = torch.utils.data.Subset(dataset_traintest, test_indices)
# create data loaders
data_loader_train = torch.utils.data.DataLoader(
    dataset_train,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    collate_fn=collate_fn,
)

data_loader_test = torch.utils.data.DataLoader(
    dataset_test,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
    collate_fn=collate_fn,
)

data_loader_valid = torch.utils.data.DataLoader(
    dataset_val,
    batch_size=1,
    shuffle=False,
    num_workers=4,
    collate_fn=collate_fn,
)

In [None]:
len(data_loader_train.dataset), len(data_loader_test.dataset), len(data_loader_valid.dataset)

(12674, 3169, 3559)

In [None]:
model.load_state_dict(torch.load('best_model.pth', weights_only=True))

<All keys matched successfully>

In [None]:
model.to(device)

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (0): Conv2dNormActivation(
        (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): FrozenBatchNorm2d(16, eps=1e-05)
        (2): Hardswish()
      )
      (1): InvertedResidual(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=16, bias=False)
            (1): FrozenBatchNorm2d(16, eps=1e-05)
            (2): ReLU(inplace=True)
          )
          (1): Conv2dNormActivation(
            (0): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d(16, eps=1e-05)
          )
        )
      )
      (2): InvertedResidual(
        (block):

In [None]:
N_EPOCHS = 3

# construct optimizer
params = [p for p in model.parameters() if p.requires_grad]
# optimizer = torch.optim.AdamW(params, lr=5e-3)
# following the tutorial first. Not sure if this is the best optimizer
optimizer = torch.optim.SGD(params, lr=0.001,
                        momentum=0.9,
                        weight_decay=0.0005)

# construct learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=N_EPOCHS, gamma=0.5)

In [None]:
from tqdm import tqdm
import matplotlib.pyplot as plt

'''
Function to train the model over one epoch.
'''
def train_one_epoch(model, optimizer, data_loader, device):
  train_loss_list = []

  tqdm_bar = tqdm(data_loader, total=len(data_loader))
  for idx, data in enumerate(tqdm_bar):
    optimizer.zero_grad()
    images, targets = data

    # print(targets)

    images = list(image.to(device) for image in images)
    targets = [{k: v.to(device) for k, v in t.items()} for t in targets]  # targets = {'boxes'=tensor, 'labels'=tensor}

    losses = model(images, targets)

    loss = sum(loss for loss in losses.values())
    loss_val = loss.item()
    train_loss_list.append(loss.detach().cpu().numpy())

    loss.backward()
    optimizer.step()

    tqdm_bar.set_description(desc=f"Training Loss: {loss_val:.3f}")

  return train_loss_list

In [None]:
'''
Function to validate the model

The losses output will be a dictionary with the following keys (and sample values):
{'loss_classifier': tensor(0.1611, device='cuda:0', grad_fn=<NllLossBackward0>),
 'loss_box_reg': tensor(0.1033, device='cuda:0', grad_fn=<DivBackward0>),
 'loss_objectness': tensor(0.1994, device='cuda:0',
        grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
 'loss_rpn_box_reg': tensor(0.0580, device='cuda:0', grad_fn=<DivBackward0>)}
'''

def evaluate(model, data_loader_test, device):
    val_loss_list = []

    tqdm_bar = tqdm(data_loader_test, total=len(data_loader_test))

    for i, data in enumerate(tqdm_bar):
        images, targets = data

        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        with torch.no_grad():
            losses = model(images, targets)

        loss = sum(loss for loss in losses.values())
        loss_val = loss.item()
        val_loss_list.append(loss_val)

        tqdm_bar.set_description(desc=f"Testing Loss: {loss:.4f}")
    return val_loss_list


In [None]:
'''
Function to plot training and valdiation losses and save them in `output_dir'
'''
def plot_loss(train_loss, valid_loss):
    figure_1, train_ax = plt.subplots()
    figure_2, valid_ax = plt.subplots()

    train_ax.plot(train_loss, color='blue')
    train_ax.set_xlabel('Iteration')
    train_ax.set_ylabel('Training Loss')

    valid_ax.plot(valid_loss, color='red')
    valid_ax.set_xlabel('Iteration')
    valid_ax.set_ylabel('Validation loss')

    # figure_1.savefig(f"{OUTPUT_DIR}/train_loss.png")
    # figure_2.savefig(f"{OUTPUT_DIR}/valid_loss.png")

In [None]:
from tqdm import tqdm

loss_dict = {'train_loss': [], 'test_loss': []}
best_model = None

for epoch in range(N_EPOCHS):
    print("----------Epoch {}----------".format(epoch+1))

    # Train the model for one epoch
    train_loss_list = train_one_epoch(model, optimizer, data_loader_train, device)
    loss_dict['train_loss'].extend(train_loss_list)

    lr_scheduler.step()

    # Run evaluation to get losses
    test_loss_list = evaluate(model, data_loader_test, device)
    loss_dict['test_loss'].extend(test_loss_list)

    # store the best model
    if best_model is None or min(test_loss_list) < min(loss_dict['test_loss']):
        best_model = model.state_dict()

    break

    # Svae the model ckpt after every epoch
    # ckpt_file_name = f"{OUTPUT_DIR}/epoch_{epoch+1}_model.pth"
    # torch.save({
    #     'epoch': epoch+1,
    #     'model_state_dict': model.state_dict(),
    #     'optimizer_state_dict': optimizer.state_dict(),
    #     'loss_dict': loss_dict
    # }, ckpt_file_name)


----------Epoch 1----------


Training Loss: 0.505: 100%|██████████| 6337/6337 [03:53<00:00, 27.12it/s]
Testing Loss: 0.1891: 100%|██████████| 1585/1585 [00:43<00:00, 36.24it/s]


In [None]:
# save the best model
torch.save(best_model, "best_model.pth")

In [None]:
_img, _target = dataset_val[0]

In [None]:
_target

{'boxes': tensor([[250.8200, 168.2600, 320.9300, 233.1400],
         [435.3500, 294.2300, 448.8100, 302.0400],
         [447.4400, 293.9100, 459.6000, 301.5600],
         [460.5900, 291.7100, 473.3400, 300.1600],
         [407.0700, 287.2500, 419.7200, 297.1100],
         [618.0600, 289.3100, 629.6600, 297.2600],
         [512.3000, 294.0700, 533.4800, 299.6400],
         [285.5500, 370.5600, 297.6200, 389.7700]]),
 'labels': tensor([1, 2, 2, 2, 2, 2, 2, 1]),
 'area': tensor([4548.7363,  105.1225,   93.0240,  107.7377,  124.7288,   92.2199,
          117.9727,  231.8647]),
 'iscrowd': tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 'image_id': tensor([532481])}

In [None]:
model.eval()

# _images = list(image.to(device) for image in images)
_targets = [{k: v.to(device) for k, v in t.items()} for t in [_target]]

prediction = model([_img.to(device)], _targets)

In [None]:
prediction

[{'boxes': tensor([[228.5561, 162.7497, 324.4900, 276.3049],
          [ 60.8669,  42.7334,  97.8022, 129.5400],
          [269.1109, 163.7460, 309.8487, 258.3119],
          [238.0428, 165.0763, 313.6662, 229.2963],
          [ 60.8862,  45.8456,  81.3751, 123.9119],
          [248.5014, 163.1061, 295.5199, 283.9343],
          [ 72.5282,  50.9944,  89.5937, 127.8398],
          [203.8169, 133.1695, 346.3714, 384.7503],
          [ 52.4172,  43.8782, 117.0351,  99.8453],
          [175.4869, 172.9174, 350.4914, 263.5399],
          [ 45.7011,  34.9295, 120.2405, 134.7366],
          [163.5344, 159.9768, 365.1031, 228.9554],
          [260.2890, 150.8506, 333.2132, 360.5005],
          [222.7567, 193.5051, 319.2958, 251.7576],
          [294.0408, 186.9751, 315.8139, 245.1031],
          [  0.0000,  44.0493, 103.7229, 124.7355]], device='cuda:0',
         grad_fn=<StackBackward0>),
  'labels': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2], device='cuda:0'),
  'scores': tensor

---