First we need to download the data and unpack the zip file. The dataset is very large, so downloading takes quite some time.

In [1]:
import os
import zipfile
from xml.etree import ElementTree as et



# Fetch and unpack the HollywoodHeads dataset. Everything below is skipped
# when a 'data' directory already exists.
if not os.path.exists('data'):
    # Only download when no archive is present in the working directory.
    if not os.path.exists('HollywoodHeads.zip'):
        print('Downloading dataset, this might take a while')
        # IPython shell escape; needs the `wget` command to be available.
        !wget https://www.di.ens.fr/willow/research/headdetection/release/HollywoodHeads.zip
    with zipfile.ZipFile('HollywoodHeads.zip') as file:
        print('Unzipping dataset')
        file.extractall()
    # The rename relies on the archive unpacking into a 'HollywoodHeads' folder.
    os.rename('HollywoodHeads','data')
    # IPython shell escape: delete the archive once it has been extracted.
    !rm HollywoodHeads.zip

In [2]:
from functools import partial
import torch
from torch import nn
import torchvision
import torch.optim as optim
from torchvision.transforms import v2 as tf
from torchvision import tv_tensors
from torchvision.io import ImageReadMode
from torch.utils.data import DataLoader, Dataset
from torchvision.models.detection.retinanet import RetinaNet, RetinaNetHead
from torchvision.models.detection.anchor_utils import AnchorGenerator
from torchvision.models.detection import backbone_utils

class HollywoodHeadDataset(Dataset):
    """Detection dataset over the HollywoodHeads directory layout.

    The image names of a split are read from ``<root>/Splits/<mode>.txt``.
    Each name is resolved to ``<root>/JPEGImages/<name>.jpeg`` and
    ``<root>/Annotations/<name>.xml``.

    Args:
        root: Dataset root directory.
        transforms: Optional callable invoked as ``transforms(img, target)``
            and expected to return the transformed ``(img, target)`` pair.
        mode: One of ``'train'``, ``'test'`` or ``'val'`` (case-insensitive).
    """

    def __init__(self, root, transforms=None, mode='train') -> None:
        super().__init__()
        mode = mode.lower()
        assert mode in ['train', 'test', 'val'], f"unknown mode: {mode!r}"
        self.transforms = transforms
        self.root = root

        split_path = os.path.join(root, 'Splits', mode + '.txt')
        with open(split_path, 'r') as f:
            # Skip blank lines so a trailing newline cannot become an empty
            # image name that later fails to resolve to a file.
            self.imgs = [name for name in (line.strip() for line in f) if name]

        self.imgs_dir = os.path.join(root, 'JPEGImages')
        self.annot_dir = os.path.join(root, 'Annotations')

    def __len__(self):
        """Return the number of images listed in the split file."""
        return len(self.imgs)

    @staticmethod
    def _read_boxes(annot_file_path):
        """Parse an annotation XML into a float32 ``(N, 4)`` XYXY tensor.

        ``<object>`` entries without a ``<bndbox>`` child are ignored. The
        result is always two-dimensional, so an annotation without any box
        yields shape ``(0, 4)``.
        """
        rows = []
        for obj in et.parse(annot_file_path).getroot().findall('object'):
            bndbox = obj.find('bndbox')
            if bndbox is None:
                continue
            rows.append([float(bndbox.find(tag).text)
                         for tag in ('xmin', 'ymin', 'xmax', 'ymax')])
        # Without the reshape an empty list becomes a 1-D tensor, which
        # BoundingBoxes would expand to shape (1, 0) and report as one box.
        return torch.tensor(rows, dtype=torch.float32).reshape(-1, 4)

    def __getitem__(self, idx):
        """Load image ``idx`` together with its detection target.

        Returns:
            ``(img, target)`` where ``target`` holds ``"boxes"`` (XYXY
            ``BoundingBoxes``), ``"labels"`` (int64 ones, one per box) and
            ``"image_id"`` (one-element tensor containing ``idx``).
        """
        name = self.imgs[idx]
        image_path = os.path.join(self.imgs_dir, name + '.jpeg')
        annot_file_path = os.path.join(self.annot_dir, name + '.xml')

        img = tv_tensors.Image(torchvision.io.read_image(image_path, ImageReadMode.RGB))
        boxes = tv_tensors.BoundingBoxes(self._read_boxes(annot_file_path),
                                         format="XYXY",
                                         canvas_size=img.shape[-2:])

        target = {
            "boxes": boxes,
            # We only have one class, so every box gets label 1.
            "labels": torch.ones(boxes.shape[0], dtype=torch.int64),
            "image_id": torch.tensor([idx]),
        }

        if self.transforms is not None:
            img, target = self.transforms(img, target)
        return img, target

In [3]:
# Preprocessing shared by every split: resize to 255x255, wrap as an image
# tv_tensor, then convert the image dtype with the transform's defaults.
transforms = tf.Compose(
    [
        tf.Resize((255, 255)),
        tf.ToImage(),
        tf.ConvertImageDtype(),
    ]
)

# One dataset per split, built in train/test/val order with the same pipeline.
train_set, test_set, val_set = (
    HollywoodHeadDataset(root='data', mode=split, transforms=transforms)
    for split in ('train', 'test', 'val')
)

print(f"Length of train set = {len(train_set)}, test set = {len(test_set)}, validation set = {len(val_set)}")

Length of train set = 216719, test set = 1302, validation set = 6719


In [5]:
# Batch sizes: bs_train is used for the training loader, bs for the test and
# validation loaders.
bs_train=32
bs = 16

def collate_fn(batch):
    """Collate ``(image, target)`` samples into a detection batch.

    The images (element 0 of every sample) are stacked along a new leading
    dimension. The targets (element 1) are returned as a plain list in sample
    order, without being merged.
    """
    samples = list(batch)
    stacked = torch.stack([sample[0] for sample in samples], dim=0)
    targets = [sample[1] for sample in samples]
    return stacked, targets

# One shuffled loader per split. All of them use collate_fn, so targets stay
# a list of per-image dicts instead of being merged.
train_loader = DataLoader(
    dataset=train_set,
    batch_size=bs_train,
    shuffle=True,
    collate_fn=collate_fn,
)
test_loader = DataLoader(
    dataset=test_set,
    batch_size=bs,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    dataset=val_set,
    batch_size=bs,
    shuffle=True,
    collate_fn=collate_fn,
)

In [6]:
import time

def train_epoch(model, dataloader, optimizer, report_freq, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Callable as ``model(images, targets)`` returning a dict of
            loss tensors; their sum is back-propagated.
        dataloader: Iterable of ``(images, targets)`` batches, where
            ``targets`` is a list of dicts whose values support ``.to()``.
        optimizer: Optimizer stepping the model parameters.
        report_freq: Every ``report_freq``-th batch the mean loss of the
            batches since the previous report is printed.
        device: Device the images and target values are moved to.

    Returns:
        List with the summed loss of every batch, in iteration order.
    """
    epoch_loss = []
    running_loss = 0.0
    for j, (images, targets) in enumerate(dataloader):
        batch_starttime = time.time()
        optimizer.zero_grad(set_to_none=True)

        # Tensor.to() returns a new object instead of moving in place, so the
        # moved values have to be collected into fresh target dicts.
        targets = [{key: value.to(device) for key, value in target.items()}
                   for target in targets]

        batch_loss_dict = model(images.to(device), targets)
        batch_loss = sum(batch_loss_dict.values())
        batch_loss.backward()
        optimizer.step()

        # .item() may synchronise with the device, so read it only once.
        loss_value = batch_loss.item()
        epoch_loss.append(loss_value)
        running_loss += loss_value

        elapsed = time.time() - batch_starttime
        timing = f"{int(elapsed/60)} minutes {round(elapsed%60,2)} seconds"
        if j % report_freq == report_freq - 1:
            print(f"Batch {j+1} finished, time = {timing}, "
                  f"loss: {running_loss/report_freq}")
            running_loss = 0.0
        else:
            print(f"Batch {j+1} finished, time = {timing}")
    return epoch_loss

def train_model(model,train_loader, test_loader, optimizer,no_of_epochs, report_freq, device='cpu'):
    """Train ``model`` for several epochs and evaluate its loss after each one.

    Args:
        model: Detection model that returns a dict of losses when called as
            ``model(images, targets)`` in training mode.
        train_loader: Loader of ``(images, targets)`` training batches.
        test_loader: Loader of ``(images, targets)`` batches used for the
            per-epoch validation loss.
        optimizer: Optimizer stepping the model parameters.
        no_of_epochs: Number of passes over ``train_loader``.
        report_freq: Forwarded to ``train_epoch``.
        device: Device that batches are moved to; must match the model's.

    Returns:
        ``(train_loss, val_loss)``: the summed loss of every training batch
        and of every validation batch, concatenated over all epochs.
    """
    train_loss = []
    val_loss = []
    for epoch in range(no_of_epochs):
        epoch_starttime = time.time()
        print(f"START TRAINING FOR EPOCH {epoch + 1}:")
        model.train(True)
        epoch_loss = train_epoch(model, train_loader, optimizer, report_freq, device)
        # Extend exactly once per epoch so no training loss is duplicated.
        train_loss += epoch_loss

        elapsed = time.time() - epoch_starttime
        print(f"Training for epoch {epoch+1} done, time = "
              f"{int(elapsed/60)} minutes {round(elapsed%60,2)} seconds")

        # torchvision detection models return the loss dict only in training
        # mode (eval mode yields detections). Stay in training mode for the
        # validation pass, but switch batch-norm layers to eval so their
        # running statistics are not updated by validation data.
        for module in model.modules():
            if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
                module.eval()

        epoch_vloss = []
        with torch.no_grad():
            for batch_no, (vimages, vtargets) in enumerate(test_loader, start=1):
                vbatch_starttime = time.time()
                vtargets = [{key: value.to(device) for key, value in vtarget.items()}
                            for vtarget in vtargets]
                vloss_dict = model(vimages.to(device), vtargets)
                vloss = sum(vloss_dict.values())
                epoch_vloss.append(vloss.item())
                velapsed = time.time() - vbatch_starttime
                print(f"Completed validation for batch {batch_no}, time = "
                      f"{int(velapsed/60)} minutes {round(velapsed%60,2)} seconds")
        # Leave the model in eval mode after the validation pass.
        model.eval()

        val_loss += epoch_vloss
        # Guard against empty loaders instead of failing on a missing index.
        mean_vloss = sum(epoch_vloss) / len(epoch_vloss) if epoch_vloss else float('nan')
        last_train_loss = epoch_loss[-1] if epoch_loss else float('nan')

        elapsed = time.time() - epoch_starttime
        print(f"Validation for epoch {epoch+1} done, time = "
              f"{int(elapsed/60)} minutes {round(elapsed%60,2)} seconds, "
              f"LOSS train {last_train_loss}, val: {mean_vloss}")

    return train_loss, val_loss

In [8]:
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Three anchor scales per base size (x, x*2^(1/3), x*2^(2/3)), with the same
# four aspect ratios repeated for every size group.
anchor_sizes = tuple(
    (base, int(base * 2 ** (1.0 / 3)), int(base * 2 ** (2.0 / 3)))
    for base in [8, 16, 32, 64, 128]
)
aspect_ratios = ((0.25, 0.5, 1.0, 1.5),) * len(anchor_sizes)
anchor_generator = AnchorGenerator(sizes=anchor_sizes, aspect_ratios=aspect_ratios)

# ResNet-50 with default pretrained weights, wrapped into an FPN feature
# extractor over layers 2-4 plus the LastLevelP6P7 extra block.
trainable_backbone_layers = 3
backbone = backbone_utils._resnet_fpn_extractor(
    torchvision.models.resnet50(weights='DEFAULT'),
    trainable_layers=trainable_backbone_layers,
    returned_layers=[2, 3, 4],
    extra_blocks=torchvision.ops.feature_pyramid_network.LastLevelP6P7(2048, 256),
)
backbone.out_channels = 256

# Detection head using GroupNorm(32, ...) as its norm layer; the box
# regression loss type is switched to "giou".
head = RetinaNetHead(
    backbone.out_channels,
    anchor_generator.num_anchors_per_location()[0],
    num_classes=2,
    norm_layer=partial(nn.GroupNorm, 32),
)
head.regression_head._loss_type = "giou"

model = RetinaNet(
    backbone=backbone,
    num_classes=2,
    anchor_generator=anchor_generator,
    head=head,
)
model.to(device)

print(model)
 




RetinaNet(
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      

In [9]:
# Only optimise parameters that are not frozen.
params = [p for p in model.parameters() if p.requires_grad]
print(len(params))
optimizer = optim.SGD(params,lr=0.001,momentum=0.9)

no_of_epochs = 3
# Pass the device the model was moved to. train_model defaults to 'cpu',
# which would leave the batches on a different device than a CUDA model.
train_loss, test_loss = train_model(model, train_loader, test_loader,
                                    optimizer=optimizer,
                                    no_of_epochs=no_of_epochs,
                                    report_freq=1,
                                    device=device)

170
START TRAINING FOR EPOCH 1:




In [None]:
import random

# Draw the predicted boxes on ten randomly chosen validation images.
annot_dir = 'data/Annotations/'
img_dir = 'data/JPEGImages/'
imgs = val_set.imgs
model.eval()
# Inference only: no autograd graph is needed.
with torch.no_grad():
    for _ in range(10):
        img_name = random.choice(imgs)
        annot_filepath = os.path.join(annot_dir, img_name+'.xml')
        img_filepath = os.path.join(img_dir,img_name+'.jpeg')
        img = torchvision.io.read_image(img_filepath,ImageReadMode.RGB)
        img = tv_tensors.Image(img)

        # torchvision detection models take a list of float images scaled to
        # [0, 1] and return one prediction dict per image.
        model_input = img.to(device=device, dtype=torch.float32) / 255
        pred = model([model_input])
        # Drawing happens on the CPU copy of the original uint8 image.
        boxes = pred[0]["boxes"].cpu()

        img = torchvision.utils.draw_bounding_boxes(img, boxes)
        img = torchvision.transforms.ToPILImage()(img)
        img.show()