In [1]:
# from model import SSD300, ResNet, Loss
import torch
import pandas as pd
import numpy as np 
import os

import os
import sys
import os.path

import random
import numpy as np

import torch
import torch.utils.data as data
import torchvision.transforms as transforms

from PIL import Image, ImageOps

import os
import argparse
import itertools

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn

import torchvision
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/gulvarol/grocerydataset/master/annotations.csv",
                 names=["names", "x", "y", "w", "h", "class"])

df['class'].replace(df["class"].unique(), '1')

path = "./dataset/ShelfImages/"
tr_path = os.path.join(path, "train/")
ts_path = os.path.join(path, "test/")


In [3]:
import itertools

class DataEncoder:
    def __init__(self):
        '''Compute default box sizes with scale and aspect transform.'''
        scale = 300.
        steps = [s / scale for s in (8, 16, 32, 64, 100, 300)]
        sizes = [s / scale for s in (30, 60, 111, 162, 213, 264, 315)]
        aspect_ratios = ((2,), (2,3), (2,3), (2,3), (2,), (2,))
        feature_map_sizes = (38, 19, 10, 5, 3, 1)

        num_layers = len(feature_map_sizes)

        boxes = []


        mean = []
        x = 0

        for i in range(num_layers):
            fmsize = feature_map_sizes[i]

            for h,w in itertools.product(range(fmsize), repeat=2):
                cx = (w+0.5)*steps[i]
                cy = (h+0.5)*steps[i]
                bbox = [cx,cy, 0.109, 0.12]
                boxes.append(bbox)
        
        self.default_boxes = torch.Tensor(boxes)



    def iou(self, box1, box2):
        '''Compute the intersection over union of two set of boxes, each box is [x1,y1,x2,y2].

        Args:
          box1: (tensor) bounding boxes, sized [N,4].
          box2: (tensor) bounding boxes, sized [M,4].

        Return:
          (tensor) iou, sized [N,M].
        '''
        N = box1.size(0)
        M = box2.size(0)

        lt = torch.max(
            box1[:,:2].unsqueeze(1).expand(N,M,2),  # [N,2] -> [N,1,2] -> [N,M,2]
            box2[:,:2].unsqueeze(0).expand(N,M,2),  # [M,2] -> [1,M,2] -> [N,M,2]
        )

        rb = torch.min(
            box1[:,2:].unsqueeze(1).expand(N,M,2),  # [N,2] -> [N,1,2] -> [N,M,2]
            box2[:,2:].unsqueeze(0).expand(N,M,2),  # [M,2] -> [1,M,2] -> [N,M,2]
        )

        wh = rb - lt  # [N,M,2]
        wh[wh<0] = 0  # clip at 0
        inter = wh[:,:,0] * wh[:,:,1]  # [N,M]

        area1 = (box1[:,2]-box1[:,0]) * (box1[:,3]-box1[:,1])  # [N,]
        area2 = (box2[:,2]-box2[:,0]) * (box2[:,3]-box2[:,1])  # [M,]
        area1 = area1.unsqueeze(1).expand_as(inter)  # [N,] -> [N,1] -> [N,M]
        area2 = area2.unsqueeze(0).expand_as(inter)  # [M,] -> [1,M] -> [N,M]

        iou = inter / (area1 + area2 - inter)
        return iou

    def encode(self, boxes, classes, threshold=0.5):
        '''Transform target bounding boxes and class labels to SSD boxes and classes.

        Match each object box to all the default boxes, pick the ones with the
        Jaccard-Index > 0.5:
            Jaccard(A,B) = AB / (A+B-AB)

        Args:
          boxes: (tensor) object bounding boxes (xmin,ymin,xmax,ymax) of a image, sized [#obj, 4].
          classes: (tensor) object class labels of a image, sized [#obj,].
          threshold: (float) Jaccard index threshold

        Returns:
          boxes: (tensor) bounding boxes, sized [#obj, 8732, 4].
          classes: (tensor) class labels, sized [8732,]
        '''
#         print(f"Default box shape: {self.default_boxes.shape}")
        default_boxes = self.default_boxes
        num_default_boxes = default_boxes.size(0)
        num_objs = boxes.size(0)
        
        
        iou = self.iou(  # [#obj,8732]
            boxes,
            torch.cat([default_boxes[:,:2] - default_boxes[:,2:]/2,
                       default_boxes[:,:2] + default_boxes[:,2:]/2], 1)
        )

#         print(f'iou shape: {iou.shape}')

        
        iou, max_idx = iou.max(0)  # [1,8732]

        max_idx.squeeze_(0)        # [8732,]
        iou.squeeze_(0)            # [8732,]
        
#         print(f'iou sq shape: {iou.shape}')

        
        boxes = boxes[max_idx]     # [8732,4]
        

#         print(f'def box shape: {boxes.shape}')
         
        variances = [0.1, 0.2]
        cxcy = (boxes[:,:2] + boxes[:,2:])/2 - default_boxes[:,:2]  # [8732,2]
        cxcy /= variances[0] * default_boxes[:,2:]
        wh = (boxes[:,2:] - boxes[:,:2]) / default_boxes[:,2:]      # [8732,2]
        wh = torch.log(wh) / variances[1]
        loc = torch.cat([cxcy, wh], 1)  # [8732,4]

        conf = 1 + classes[max_idx]   # [8732,], background class = 0
        conf[iou<threshold] = 0       # background
        return loc, conf

    def nms(self, bboxes, scores, threshold=0.5, mode='union'):
        '''Non maximum suppression.

        Args:
          bboxes: (tensor) bounding boxes, sized [N,4].
          scores: (tensor) bbox scores, sized [N,].
          threshold: (float) overlap threshold.
          mode: (str) 'union' or 'min'.

        Returns:
          keep: (tensor) selected indices.

        Ref:
          https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py
        '''
        x1 = bboxes[:,0]
        y1 = bboxes[:,1]
        x2 = bboxes[:,2]
        y2 = bboxes[:,3]

        areas = (x2-x1) * (y2-y1)
        _, order = scores.sort(0, descending=True)

        keep = []
        while order.numel() > 0:
            i = order[0]
            keep.append(i)

            if order.numel() == 1:
                break

            xx1 = x1[order[1:]].clamp(min=x1[i])
            yy1 = y1[order[1:]].clamp(min=y1[i])
            xx2 = x2[order[1:]].clamp(max=x2[i])
            yy2 = y2[order[1:]].clamp(max=y2[i])

            w = (xx2-xx1).clamp(min=0)
            h = (yy2-yy1).clamp(min=0)
            inter = w*h

            if mode == 'union':
                ovr = inter / (areas[i] + areas[order[1:]] - inter)
            elif mode == 'min':
                ovr = inter / areas[order[1:]].clamp(max=areas[i])
            else:
                raise TypeError('Unknown nms mode: %s.' % mode)

            ids = (ovr<=threshold).nonzero().squeeze()
            if ids.numel() == 0:
                break
            order = order[ids+1]
        return torch.LongTensor(keep)

    def decode(self, loc, conf):
        '''Transform predicted loc/conf back to real bbox locations and class labels.

        Args:
          loc: (tensor) predicted loc, sized [8732,4].
          conf: (tensor) predicted conf, sized [8732,21].

        Returns:
          boxes: (tensor) bbox locations, sized [#obj, 4].
          labels: (tensor) class labels, sized [#obj,1].
        '''
        variances = [0.1, 0.2]
        wh = torch.exp(loc[:,2:]*variances[1]) * self.default_boxes[:,2:]
        cxcy = loc[:,:2] * variances[0] * self.default_boxes[:,2:] + self.default_boxes[:,:2]
        boxes = torch.cat([cxcy-wh/2, cxcy+wh/2], 1)  # [8732,4]

        max_conf, labels = conf.max(1)  # [8732,1]
        ids = labels.squeeze(1).nonzero().squeeze(1)  # [#boxes,]

        keep = self.nms(boxes[ids], max_conf[ids].squeeze(1))
        return boxes[ids][keep], labels[ids][keep], max_conf[ids][keep]


In [4]:
class ListDataset(data.Dataset):
    img_size = 300

    def __init__(self, root=None, df=None, train=None, transform=None):
        '''
        Args:
          root: (str) ditectory to images.
          list_file: (str) path to index file.
          train: (boolean) train or test.
          transform: ([transforms]) image transforms.
        '''
        self.root = root
        
        
        self.train = train
        self.transform = transform

        self.fnames = []
        self.boxes = []
        self.labels = []

        self.data_encoder = DataEncoder()
        self.df = df
        self.bbox_cols = ["x", "y","w","h"]
        self.image_list = os.listdir(root) 
        self.num_samples = len(self.image_list)
        

        for img in self.image_list:
            
            self.fnames.append(img)
            split_df = df[df["names"]==img]
            cls_ls, bbox = split_df["class"].values, split_df[self.bbox_cols].values      
            self.boxes.append(torch.Tensor(bbox))
            self.labels.append(torch.LongTensor(cls_ls))

    def __getitem__(self, idx):
        '''Load a image, and encode its bbox locations and class labels.

        Args:
          idx: (int) image index.

        Returns:
          img: (tensor) image tensor.
          loc_target: (tensor) location targets, sized [8732,4].
          conf_target: (tensor) label targets, sized [8732,].
        '''
        # Load image and bbox locations.
        fname = self.fnames[idx]
        img = Image.open(os.path.join(self.root, fname))
        boxes = self.boxes[idx].clone()
        labels = self.labels[idx]

        # Data augmentation while training.
        if self.train:
            img, boxes = self.random_flip(img, boxes)
            img, boxes, labels = self.random_crop(img, boxes, labels)

        # Scale bbox locaitons to [0,1].
        w,h = img.size
        boxes /= torch.Tensor([w,h,w,h]).expand_as(boxes)

        img = img.resize((self.img_size,self.img_size))
        img = self.transform(img)
        
        
        # Encode loc & conf targets.
        loc_target, conf_target = self.data_encoder.encode(boxes, labels)
#         print(f'box shape: {boxes.shape} lable shape: {labels.shape}')

        return img, loc_target, conf_target

    def random_flip(self, img, boxes):
        '''Randomly flip the image and adjust the bbox locations.

        For bbox (xmin, ymin, xmax, ymax), the flipped bbox is:
        (w-xmax, ymin, w-xmin, ymax).

        Args:
          img: (PIL.Image) image.
          boxes: (tensor) bbox locations, sized [#obj, 4].

        Returns:
          img: (PIL.Image) randomly flipped image.
          boxes: (tensor) randomly flipped bbox locations, sized [#obj, 4].
        '''
        if random.random() < 0.5:
            img = img.transpose(Image.FLIP_LEFT_RIGHT)
            w = img.width
            xmin = w - boxes[:,2]
            xmax = w - boxes[:,0]
            boxes[:,0] = xmin
            boxes[:,2] = xmax
        return img, boxes

    def random_crop(self, img, boxes, labels):
        '''Randomly crop the image and adjust the bbox locations.

        For more details, see 'Chapter2.2: Data augmentation' of the paper.

        Args:
          img: (PIL.Image) image.
          boxes: (tensor) bbox locations, sized [#obj, 4].
          labels: (tensor) bbox labels, sized [#obj,].

        Returns:
          img: (PIL.Image) cropped image.
          selected_boxes: (tensor) selected bbox locations.
          labels: (tensor) selected bbox labels.
        '''
        imw, imh = img.size
        while True:
            min_iou = random.choice([None, 0.1, 0.3, 0.5, 0.7, 0.9])
            if min_iou is None:
                return img, boxes, labels

            for _ in range(100):
                w = random.randrange(int(0.1*imw), imw)
                h = random.randrange(int(0.1*imh), imh)

                if h > 2*w or w > 2*h:
                    continue

                x = random.randrange(imw - w)
                y = random.randrange(imh - h)
                roi = torch.Tensor([[x, y, x+w, y+h]])

                center = (boxes[:,:2] + boxes[:,2:]) / 2  # [N,2]
                roi2 = roi.expand(len(center), 4)  # [N,4]
                mask = (center > roi2[:,:2]) & (center < roi2[:,2:])  # [N,2]
                mask = mask[:,0] & mask[:,1]  #[N,]
                if not mask.any():
                    continue

                selected_boxes = boxes.index_select(0, mask.nonzero().squeeze(1))

                iou = self.data_encoder.iou(selected_boxes, roi)
                if iou.min() < min_iou:
                    continue

                img = img.crop((x, y, x+w, y+h))
                selected_boxes[:,0].add_(-x).clamp_(min=0, max=w)
                selected_boxes[:,1].add_(-y).clamp_(min=0, max=h)
                selected_boxes[:,2].add_(-x).clamp_(min=0, max=w)
                selected_boxes[:,3].add_(-y).clamp_(min=0, max=h)
                return img, selected_boxes, labels[mask]

    def __len__(self):
        return self.num_samples


In [5]:
from ssd import SSD300
# from multibox_loss import MultiBoxLoss

from torch.autograd import Variable


use_cuda = torch.cuda.is_available()
best_loss = float('inf')  # best test loss
start_epoch = 0  # start from epoch 0 or last epoch

# Data
print('==> Preparing data..')
transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))])

trainset = ListDataset(root="./dataset/ShelfImages/train/", df=df, train=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4, shuffle=True, num_workers=4)

testset = ListDataset(root='./dataset/ShelfImages/test/', df = df, train=False, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=4, shuffle=False, num_workers=4)

==> Preparing data..


In [19]:

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F

from torch.autograd import Variable


class MultiBoxLoss(nn.Module):
    num_classes = 2

    def __init__(self):
        super(MultiBoxLoss, self).__init__()
        self.handle_nan_loss = None

    def cross_entropy_loss(self, x, y):
        '''Cross entropy loss w/o averaging across all samples.

        Args:
          x: (tensor) sized [N,D].
          y: (tensor) sized [N,].

        Return:
          (tensor) cross entroy loss, sized [N,].
        '''
        xmax = x.data.max()
        log_sum_exp = torch.log(torch.sum(torch.exp(x-xmax), 1)) + xmax
        return log_sum_exp - x.gather(1, y.view(-1,1))

    def test_cross_entropy_loss(self):
        a = Variable(torch.randn(10,4))
        b = Variable(torch.ones(10).long())
        loss = self.cross_entropy_loss(a,b)
#         print(loss.mean())
#         print(F.cross_entropy(a,b))

    def hard_negative_mining(self, conf_loss, pos):
        '''Return negative indices that is 3x the number as postive indices.

        Args:
          conf_loss: (tensor) cross entroy loss between conf_preds and conf_targets, sized [N*1940,].
          pos: (tensor) positive(matched) box indices, sized [N,1940].

        Return:
          (tensor) negative indices, sized [N,1940].
        '''
        batch_size, num_boxes = pos.size()

        conf_loss[pos] = 0  # set pos boxes = 0, the rest are neg conf_loss
        conf_loss = conf_loss.view(batch_size, -1)  # [N,1940]

        _,idx = conf_loss.sort(1, descending=True)  # sort by neg conf_loss
        _,rank = idx.sort(1)  # [N,1940]

        num_pos = pos.long().sum(1)  # [N,1]
        num_neg = torch.clamp(3*num_pos, max=num_boxes-1)  # [N,1]

        neg = rank < num_neg.expand_as(rank)  # [N,1940]
        return neg

    def forward(self, loc_preds, loc_targets, conf_preds, conf_targets):
        '''Compute loss between (loc_preds, loc_targets) and (conf_preds, conf_targets).

        Args:
          loc_preds: (tensor) predicted locations, sized [batch_size, 1940, 4].
          loc_targets: (tensor) encoded target locations, sized [batch_size, 1940, 4].
          conf_preds: (tensor) predicted class confidences, sized [batch_size, 1940, num_classes].
          conf_targets: (tensor) encoded target classes, sized [batch_size, 1940].

        loss:
          (tensor) loss = SmoothL1Loss(loc_preds, loc_targets) + CrossEntropyLoss(conf_preds, conf_targets).
        '''
        flag = 1
        batch_size, num_boxes, _ = loc_preds.size()

        pos = conf_targets>0  # [N,1940], pos means the box matched.
        num_matched_boxes = pos.data.long().sum()
#         print(f'num of matched boxes: {num_matched_boxes}')

        if num_matched_boxes == 0:
            return self.handle_nan_loss
#         else:

        ################################################################
        # loc_loss = SmoothL1Loss(pos_loc_preds, pos_loc_targets)
        ################################################################
        pos_mask = pos.unsqueeze(2).expand_as(loc_preds)    # [N,1940,4]
        pos_loc_preds = loc_preds[pos_mask].view(-1,4)      # [#pos,4]
        pos_loc_targets = loc_targets[pos_mask].view(-1,4)  # [#pos,4]
        loc_loss = F.smooth_l1_loss(pos_loc_preds, pos_loc_targets, size_average=False)
        loc_loss/= num_matched_boxes
        flag = 1
#         print(f'Location loss: {loc_loss}')
        self.handle_nan_loss = loc_loss
#        
        return loc_loss


In [20]:
net = SSD300()

lr = 1e-3
use_cuda = False

criterion = MultiBoxLoss()

# if use_cuda:
#     net = torch.nn.DataParallel(net, device_ids=[0,1,2,3,4,5,6,7])
#     net.cuda()
#     cudnn.benchmark = True

optimizer = optim.SGD(net.parameters(), 
                      lr=lr,
                      momentum=0.9,
                      weight_decay=1e-4
                     )

In [21]:
net.train()
train_loss = 0
for batch_idx, (images, loc_targets, conf_targets) in enumerate(trainloader):
    if use_cuda:
        images = images.cuda()
        loc_targets = loc_targets.cuda()
        conf_targets = conf_targets.cuda()
        
    
    images = Variable(images)
    
    loc_targets = Variable(loc_targets)
    
    conf_targets = Variable(conf_targets)
        
    

    optimizer.zero_grad()
    
    loc_preds, conf_preds = net(images)
    
#     try:

    loss = Variable(criterion(loc_preds, loc_targets, 
                                  conf_preds, conf_targets),requires_grad =True)
    loss.backward()

    optimizer.step()

    train_loss += loss.data
    print('%.3f %.3f' % (loss.data, train_loss/(batch_idx+1)))

#     except:
#         print('Fix the loss function')


torch.Size([4, 512, 38, 38])
9.823 9.823
torch.Size([4, 512, 38, 38])
9.906 9.864
torch.Size([4, 512, 38, 38])
nan nan
torch.Size([4, 512, 38, 38])
10.955 nan
torch.Size([4, 512, 38, 38])
9.916 nan
torch.Size([4, 512, 38, 38])
9.527 nan
torch.Size([4, 512, 38, 38])
9.560 nan
torch.Size([4, 512, 38, 38])
9.887 nan
torch.Size([4, 512, 38, 38])
nan nan
torch.Size([4, 512, 38, 38])
6.556 nan
torch.Size([4, 512, 38, 38])
10.405 nan
torch.Size([4, 512, 38, 38])
9.889 nan
torch.Size([4, 512, 38, 38])
nan nan
torch.Size([4, 512, 38, 38])
10.047 nan
torch.Size([4, 512, 38, 38])
8.675 nan
torch.Size([4, 512, 38, 38])
7.908 nan
torch.Size([4, 512, 38, 38])
9.224 nan
torch.Size([4, 512, 38, 38])
nan nan
torch.Size([4, 512, 38, 38])
3.124 nan
torch.Size([4, 512, 38, 38])
9.460 nan
torch.Size([4, 512, 38, 38])
nan nan
torch.Size([4, 512, 38, 38])
10.228 nan
torch.Size([4, 512, 38, 38])
9.279 nan
torch.Size([4, 512, 38, 38])
nan nan
torch.Size([4, 512, 38, 38])
8.463 nan
torch.Size([4, 512, 38, 38])


KeyboardInterrupt: 

NameError: name 'nan' is not defined