# Load The Libraries

In [7]:
import json
import math
import os
import sys

# Make the project's ./model/ directory importable before the local imports below.
sys.path.append("./model/")

from utils import *
from model.mutibox_loss import MultiBoxLoss
from model.metrics.metric import Metrics

import cv2
import PIL
import torch
import torch.backends.cudnn as cudnn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch import nn
from torch.utils.data import DataLoader, random_split
from torchvision.transforms import ToTensor, Compose, Resize, Normalize, Lambda

# Turn on autograd anomaly detection globally for everything run after this cell.
# NOTE(review): PyTorch documents this mode as a debugging aid that slows down
# execution — confirm it should stay enabled for full training runs.
torch.autograd.set_detect_anomaly(True)

ModuleNotFoundError: No module named 'cv2'

In [None]:
# Compute device shared by the rest of the notebook: CUDA when PyTorch can see a GPU, else the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Display the selected device as the cell's output (last expression) instead of print().
device

# Build the Model

## Build The Backbone

In [None]:
class backbone_vgg16(nn.Module):
    """VGG-16 feature extractor for SSD300.

    Splits torchvision's ImageNet-pretrained VGG-16 `features` stack in two and
    appends the dilated `conv_6` / 1x1 `conv_7` layers.

    Two pooling layers are adjusted so that a 300x300 input produces the map
    sizes the rest of the notebook asserts on (38x38 and 19x19):
      * pool3 (index 16) uses ``ceil_mode=True``  -> 75 becomes 38 instead of 37;
      * pool5 (index 30) becomes a 3x3, stride-1, padding-1 pool -> stays 19x19.

    Args:
        fine_tune: when False, every pretrained VGG parameter is frozen
            (``requires_grad = False``). `conv_6` and `conv_7` are created
            afterwards and therefore always stay trainable.
    """

    def __init__(self, fine_tune = True):
        super(backbone_vgg16, self).__init__()
        backbone = torchvision.models.vgg16(weights="IMAGENET1K_V1")

        if not fine_tune:
            for param in backbone.parameters():
                param.requires_grad = False

        # First child of torchvision's VGG is the `features` Sequential.
        feature_maps = list(list(backbone.children())[0].children())

        # Pooling layers hold no weights, so changing them keeps the pretrained state intact.
        feature_maps[16].ceil_mode = True
        feature_maps[30] = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)

        self.feature_1 = nn.Sequential(*feature_maps[:23])   # up to and including conv4_3 + ReLU
        self.feature_2 = nn.Sequential(*feature_maps[23:])   # pool4 ... pool5
        self.conv_6 = nn.Conv2d(512, 1024, kernel_size= (3, 3), padding= 6, dilation= 6)
        self.conv_7 = nn.Conv2d(1024, 1024, kernel_size= (1, 1))

    def forward(self, image):
        """Return ``(out_1, out_2)``: the `feature_1` output (512 channels) and the ReLU'd `conv_7` output (1024 channels)."""
        out_1 = self.feature_1(image)

        x = self.feature_2(out_1)
        x = F.relu(self.conv_6(x))
        out_2 = F.relu(self.conv_7(x))

        return out_1, out_2

## Build the Neck

In [None]:
class extra_feature_layers(nn.Module):
    """Auxiliary convolutions stacked on the backbone's 1024-channel output.

    Each stage is a 1x1 channel-reducing convolution followed by a 3x3
    convolution; the ReLU'd output of every 3x3 convolution is returned so the
    detection head can predict at several resolutions.
    """

    def __init__(self):
        super().__init__()

        # Stage 8: halves the spatial size (stride 2).
        self.conv8_1 = nn.Conv2d(1024, 256, kernel_size=1, padding=0)
        self.conv8_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)

        # Stage 9: halves the spatial size again (stride 2).
        self.conv9_1 = nn.Conv2d(512, 128, kernel_size=1, padding=0)
        self.conv9_2 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)

        # Stage 10: unpadded 3x3 trims one pixel from each border.
        self.conv10_1 = nn.Conv2d(256, 128, kernel_size=1, padding=0)
        self.conv10_2 = nn.Conv2d(128, 256, kernel_size=3, padding=0)

        # Stage 11: unpadded 3x3 again.
        self.conv11_1 = nn.Conv2d(256, 128, kernel_size=1, padding=0)
        self.conv11_2 = nn.Conv2d(128, 256, kernel_size=3, padding=0)

        self.weights_init()

    def forward(self, x):
        """Return the four stage outputs; shapes in the comments are for a (N, 1024, 19, 19) input."""
        relu = F.relu

        stage8 = relu(self.conv8_2(relu(self.conv8_1(x))))            # (N, 512, 10, 10)
        stage9 = relu(self.conv9_2(relu(self.conv9_1(stage8))))       # (N, 256, 5, 5)
        stage10 = relu(self.conv10_2(relu(self.conv10_1(stage9))))    # (N, 256, 3, 3)
        stage11 = relu(self.conv11_2(relu(self.conv11_1(stage10))))   # (N, 256, 1, 1)

        return stage8, stage9, stage10, stage11

    def weights_init(self):
        """Xavier-uniform weights and zero biases for every direct Conv2d child."""
        for layer in self.children():
            if not isinstance(layer, nn.Conv2d):
                continue
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

In [None]:
class L2_Norm(nn.Module):
    """Channel-wise L2 normalisation followed by a learnable per-channel rescale.

    Args:
        channels: number of channels of the input feature map.
        scale: initial value of every rescale factor.
    """

    def __init__(self, channels, scale):
        super(L2_Norm, self).__init__()

        self.channels = channels
        self.scale = scale
        self.eps = 1e-10  # keeps the division finite where a pixel is zero in every channel
        # Allocated uninitialised; reset_params() fills it with `scale`.
        self.rescale_factors = nn.Parameter(torch.empty(1, channels, 1, 1))

        self.reset_params()

    def reset_params(self):
        """(Re)initialise every rescale factor to `self.scale`."""
        nn.init.constant_(self.rescale_factors, self.scale)

    def forward(self, x):
        """Normalise `x` to unit L2 norm over dim 1, then multiply by the rescale factors."""
        norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
        x = x / norm
        out = x * self.rescale_factors
        return out

## Build The Head

In [None]:
class detection_layer(nn.Module):
    """SSD prediction head: one localisation and one classification conv per feature map.

    Args:
        n_classes: number of class scores predicted for every prior box.

    The input channel counts match the maps produced upstream: 512 / 1024 for
    the two backbone outputs and 512 / 256 / 256 / 256 for the extra layers.
    """

    def __init__(self, n_classes):
        super(detection_layer, self).__init__()

        self.n_classes = n_classes

        # Prior boxes per spatial location of each feature map.
        num_prior_boxes = {"backbone_1": 4, "backbone_2":6, "conv8_2":6, "conv9_2":6, "conv10_2":4, "conv11_2":4}

        # localizer heads (4 offsets per prior box)
        self.backbone_1_loc = nn.Conv2d(512, num_prior_boxes["backbone_1"]*4,  kernel_size= (3, 3), padding= 1)
        self.backbone_2_loc = nn.Conv2d(1024, num_prior_boxes["backbone_2"]*4,  kernel_size= (3, 3), padding= 1)
        self.conv8_2_loc = nn.Conv2d(512, num_prior_boxes["conv8_2"]*4,  kernel_size= (3, 3), padding= 1)
        self.conv9_2_loc = nn.Conv2d(256, num_prior_boxes["conv9_2"]*4,  kernel_size= (3, 3), padding= 1)
        self.conv10_2_loc = nn.Conv2d(256, num_prior_boxes["conv10_2"]*4,  kernel_size= (3, 3), padding= 1)
        self.conv11_2_loc = nn.Conv2d(256, num_prior_boxes["conv11_2"]*4,  kernel_size= (3, 3), padding= 1)

        # classifier heads (n_classes scores per prior box)
        self.backbone_1_cls = nn.Conv2d(512, num_prior_boxes["backbone_1"]*n_classes,  kernel_size= (3, 3), padding= 1)
        self.backbone_2_cls = nn.Conv2d(1024, num_prior_boxes["backbone_2"]*n_classes,  kernel_size= (3, 3), padding= 1)
        self.conv8_2_cls = nn.Conv2d(512, num_prior_boxes["conv8_2"]*n_classes,  kernel_size= (3, 3), padding= 1)
        self.conv9_2_cls = nn.Conv2d(256, num_prior_boxes["conv9_2"]*n_classes,  kernel_size= (3, 3), padding= 1)
        self.conv10_2_cls = nn.Conv2d(256, num_prior_boxes["conv10_2"]*n_classes,  kernel_size= (3, 3), padding= 1)
        self.conv11_2_cls = nn.Conv2d(256, num_prior_boxes["conv11_2"]*n_classes,  kernel_size= (3, 3), padding= 1)

        self.weights_init()

    @staticmethod
    def _predict(conv, feature_map, last_dim, expected_boxes):
        """Apply `conv` and reshape (N, C, H, W) -> (N, H*W*priors, last_dim), checking the box count."""
        batch_size = feature_map.size(0)
        out = conv(feature_map)
        # Channels last, so that .view groups the values belonging to one prior box.
        out = out.permute(0, 2, 3, 1).contiguous()
        out = out.view(batch_size, -1, last_dim)
        assert out.size(1) == expected_boxes
        return out

    def forward(self, backbone_1_out, backbone_2_out, conv8_2_out, conv9_2_out,
                conv10_2_out, conv11_2_out):
        """Predict box offsets and class scores for all 8732 prior boxes.

        Returns:
            locs_pred: (N, 8732, 4) box offsets.
            cls_pred: (N, 8732, n_classes) class scores (raw, no softmax applied here).
        """
        batch_size = backbone_1_out.size(0)

        # (head name prefix, feature map, number of prior boxes expected from that map)
        sources = [
            ("backbone_1", backbone_1_out, 5776),   # 38 * 38 * 4
            ("backbone_2", backbone_2_out, 2166),   # 19 * 19 * 6
            ("conv8_2", conv8_2_out, 600),          # 10 * 10 * 6
            ("conv9_2", conv9_2_out, 150),          # 5 * 5 * 6
            ("conv10_2", conv10_2_out, 36),         # 3 * 3 * 4
            ("conv11_2", conv11_2_out, 4),          # 1 * 1 * 4
        ]

        ### Bounding box prediction
        locs = [self._predict(getattr(self, name + "_loc"), fmap, 4, n_boxes)
                for name, fmap, n_boxes in sources]

        ### Classifiers
        scores = [self._predict(getattr(self, name + "_cls"), fmap, self.n_classes, n_boxes)
                  for name, fmap, n_boxes in sources]

        ### All predictions
        locs_pred = torch.cat(locs, dim= 1)    #(N, 8732, 4)
        assert locs_pred.size(0) == batch_size
        assert locs_pred.size(1) == 8732
        assert locs_pred.size(2) == 4

        cls_pred = torch.cat(scores, dim= 1)    #(N, 8732, classes)
        assert cls_pred.size(0) == batch_size
        assert cls_pred.size(1) == 8732
        assert cls_pred.size(2) == self.n_classes

        return locs_pred, cls_pred

    def weights_init(self):
        """Xavier-uniform weights and zero biases for every direct Conv2d child."""
        for c in self.children():
            if isinstance(c, nn.Conv2d):
                nn.init.xavier_uniform_(c.weight)
                nn.init.zeros_(c.bias)

In [None]:
class SSD(nn.Module):
    """SSD300 detector: VGG-16 backbone, extra feature layers and a multi-scale detection head.

    Args:
        n_classes: number of class scores predicted per prior box. `detect`
            never reports class index 0.

    Attributes:
        default_boxes: (8732, 4) prior boxes as fractional (cx, cy, w, h),
            placed on the notebook-level `device`.
    """

    def __init__(self, n_classes):
        super(SSD, self).__init__()

        self.n_classes = n_classes
        self.backbone = backbone_vgg16(fine_tune = True)
        self.neck = extra_feature_layers()
        self.head = detection_layer(n_classes)

        #Rescale factor for conv4_3, it is learned during back-prop
        self.L2_Norm = L2_Norm(channels= 512, scale= 20)

        #Prior boxes coordinate cx, cy, w, h
        self.default_boxes = self.create_default_boxes()

    def forward(self, image):
        """Run the full network and return ``(locs_pred, cls_pred)`` from the detection head."""
        backbone_out_1, backbone_out_2 = self.backbone(image)

        # The head must see the L2-normalised first backbone map, not the raw one.
        backbone_out_1_norm = self.L2_Norm(backbone_out_1)

        neck_out_1, neck_out_2, neck_out_3, neck_out_4 = self.neck(backbone_out_2)

        locs_pred, cls_pred = self.head(backbone_out_1_norm, backbone_out_2,
                                        neck_out_1, neck_out_2, neck_out_3, neck_out_4)    #(N, 8732, 4) #(N, 8732, classes)

        return locs_pred, cls_pred

    @staticmethod
    def _build_default_boxes():
        """Build the unclamped (8732, 4) prior-box tensor in (cx, cy, w, h) form on the CPU."""
        fmap_wh = {"conv4_3": 38, "conv7": 19, "conv8_2": 10, "conv9_2": 5,
                   "conv10_2": 3, "conv11_2": 1}

        scales = {"conv4_3": 0.1, "conv7": 0.2, "conv8_2": 0.375,
                  "conv9_2": 0.55, "conv10_2": 0.725, "conv11_2": 0.9}

        aspect_ratios= {"conv4_3": [1., 2., 0.5], "conv7": [1., 2., 3., 0.5, 0.3333],
                        "conv8_2": [1., 2., 3., 0.5, 0.3333],
                        "conv9_2": [1., 2., 3., 0.5, 0.3333],
                        "conv10_2": [1., 2., 0.5], "conv11_2": [1., 2., 0.5]}

        fmaps = list(fmap_wh.keys())

        default_boxes = []
        for k, fmap in enumerate(fmaps):
            size = fmap_wh[fmap]
            scale = scales[fmap]

            # Extra square box per location: geometric mean of this scale and the
            # next map's scale, or 1.0 for the last feature map.
            if k + 1 < len(fmaps):
                add_scale = math.sqrt(scale * scales[fmaps[k + 1]])
            else:
                add_scale = 1.

            for i in range(size):
                for j in range(size):
                    cx = (j + 0.5) / size
                    cy = (i + 0.5) / size

                    for ratio in aspect_ratios[fmap]:
                        default_boxes.append([cx, cy, scale * math.sqrt(ratio),
                                              scale / math.sqrt(ratio)]) #(cx, cy, w, h)

                        if ratio == 1:
                            default_boxes.append([cx, cy, add_scale, add_scale])

        return torch.tensor(default_boxes, dtype=torch.float32)

    def create_default_boxes(self):
        """Return the (8732, 4) prior boxes, clamped to [0, 1], on the notebook-level `device`."""
        default_boxes = self._build_default_boxes().to(device) #(8732, 4)
        default_boxes.clamp_(0, 1)
        assert default_boxes.size(0) == 8732
        assert default_boxes.size(1) == 4
        return default_boxes

    @staticmethod
    def _nms_keep_mask(overlap, max_overlap):
        """Greedy non-maximum suppression over boxes already sorted by descending score.

        Args:
            overlap: (n, n) pairwise overlap matrix of the sorted boxes.
            max_overlap: a box is dropped when its overlap with an earlier kept
                box is strictly greater than this value.

        Returns:
            Boolean tensor of length n, True for boxes that survive.
        """
        n_boxes = overlap.size(0)
        suppress = torch.zeros(n_boxes, dtype=torch.bool, device=overlap.device)

        for box_id in range(n_boxes):
            if suppress[box_id]:
                continue
            suppress = suppress | (overlap[box_id] > max_overlap)
            # A box always overlaps itself fully; it must not suppress itself.
            suppress[box_id] = False

        return ~suppress

    def detect(self, locs_pred, cls_pred, min_score, max_overlap, top_k):
        """Turn raw head outputs into per-image detections.

        Args:
            locs_pred: (N, 8732, 4) predicted box offsets.
            cls_pred: (N, 8732, n_classes) raw class scores; softmax is applied here.
            min_score: only boxes whose class probability is strictly above this are considered.
            max_overlap: overlap threshold used by non-maximum suppression.
            top_k: maximum number of detections kept per image.

        Returns:
            Three lists of length N (boxes, labels, scores), one tensor per image.
            An image with no detection gets the placeholder box [0, 0, 1, 1]
            with label 0 and score 0.

        Relies on `cxcy_to_xy`, `decode_bboxes` and `find_IoU` being in scope
        (presumably provided by `from utils import *`).
        """
        batch_size = locs_pred.size(0)    #N
        n_default_boxes = self.default_boxes.size(0)    #8732
        cls_pred = F.softmax(cls_pred, dim= 2)    #(N, 8732, n_classes)
        assert n_default_boxes == locs_pred.size(1) == cls_pred.size(1)

        out_device = locs_pred.device

        all_images_boxes = []
        all_images_labels = []
        all_images_scores = []

        for i in range(batch_size):
            #Decode object
            decoded_locs = cxcy_to_xy(decode_bboxes(locs_pred[i], self.default_boxes)) #(8732, 4)

            image_boxes = []
            image_labels = []
            image_scores = []

            #Check for each class (index 0 is skipped)
            for c in range(1, self.n_classes):
                class_scores = cls_pred[i][:, c]    #8732
                score_above_min_score = class_scores > min_score

                if score_above_min_score.sum().item() == 0:
                    continue

                class_scores = class_scores[score_above_min_score]    # <=8732
                class_decoded_locs = decoded_locs[score_above_min_score] # <=8732

                #Sort pred boxes and scores by scores
                class_scores, sort_id = class_scores.sort(dim= 0, descending= True)
                class_decoded_locs = class_decoded_locs[sort_id]

                #Find overlap between pred locs
                overlap = find_IoU(class_decoded_locs, class_decoded_locs)

                #Apply NMS
                keep = self._nms_keep_mask(overlap, max_overlap)

                # Store only unsuppressed boxes for this class
                image_boxes.append(class_decoded_locs[keep])
                image_labels.append(torch.full((int(keep.sum().item()),), c,
                                               dtype=torch.long, device=out_device))
                image_scores.append(class_scores[keep])

            if len(image_boxes) == 0:
                image_boxes.append(torch.tensor([[0., 0., 1., 1.]], device=out_device))
                image_labels.append(torch.tensor([0], dtype=torch.long, device=out_device))
                image_scores.append(torch.tensor([0.], device=out_device))

            #Concat into single tensors
            image_boxes = torch.cat(image_boxes, dim= 0)    #(n_objects, 4)
            image_labels = torch.cat(image_labels, dim=0)  # (n_objects)
            image_scores = torch.cat(image_scores, dim=0)  # (n_objects)
            n_objects = image_scores.size(0)

            #Keep only the top k objects
            if n_objects > top_k:
                image_scores, sort_index = image_scores.sort(dim=0, descending=True)
                image_scores = image_scores[:top_k]  # (top_k)
                image_boxes = image_boxes[sort_index][:top_k]  # (top_k, 4)
                image_labels = image_labels[sort_index][:top_k]  # (top_k)

            all_images_boxes.append(image_boxes)
            all_images_labels.append(image_labels)
            all_images_scores.append(image_scores)

        return all_images_boxes, all_images_labels, all_images_scores

# Load Training Data

In [9]:
! git clone https://github.com/PythonAPI/pycocotools/coco.py

Cloning into 'coco.py'...
remote: Not Found
fatal: repository 'https://github.com/PythonAPI/pycocotools/coco.py/' not found


In [3]:
# --- Run configuration -------------------------------------------------------
# Data / model sizing
BATCH_SIZE = 8
NUM_CLASSES = 101   # NOTE(review): confirm this matches the label ids of the COCO annotations plus background

# Optimiser settings
LEARNING_RATE = 1e-3
MOMENTUM = 0.9

# Training length and early stopping
EPOCH = 400
EARLY_STOP_PATIENCE = 20
EARLY_STOP_THRESHOLD = 1e-5

In [4]:
from torchvision.datasets import CocoDetection

# Image folders and instance-annotation files of the COCO-2017 splits (paths relative to the notebook).
train_data_dir = 'coco-2017/train2017'
train_ann_file = 'coco-2017/annotations_trainval2017/annotations/instances_train2017.json'
val_data_dir = 'coco-2017/val2017'
# NOTE(review): the train and val annotation files are read from different
# directories — confirm both locations exist on disk.
val_ann_file = 'coco-2017/annotations/instances_val2017.json'

In [5]:
from torchvision.transforms import ToTensor, Compose, Resize, Normalize

# The detection head asserts the SSD300 prior-box count (8732), which only holds for 300x300 inputs.
INPUT_SIZE = (300, 300)

# Image pipeline: resize -> tensor -> per-channel normalisation (mean, std).
# NOTE(review): this transform is applied to the image only; the COCO box
# annotations stay in original-image pixels and must be rescaled/normalised
# wherever targets are encoded — confirm.
transform = Compose([Resize(INPUT_SIZE),
                     ToTensor(),
                     Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

In [7]:
def detection_collate(batch):
    """Collate ``(image, annotations)`` samples without stacking them.

    Each image carries its own number of annotations, so the default collate
    (which requires equally sized elements) cannot batch the targets.

    Returns:
        ``(images, targets)``: two tuples, each with one entry per sample.
        The training loop has to stack the images itself (e.g. with
        ``torch.stack``) once they share a common size.
    """
    images, targets = zip(*batch)
    return images, targets


train_dataset = CocoDetection(root=train_data_dir, annFile=train_ann_file, transform=transform)
test_dataset = CocoDetection(root=val_data_dir, annFile=val_ann_file, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                          collate_fn=detection_collate)
val_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False,
                        collate_fn=detection_collate)

ModuleNotFoundError: No module named 'pycocotools'

In [11]:
from pycocotools.coco import COCO

ModuleNotFoundError: No module named 'pycocotools'

# Training