In [None]:
# Run this cell before the lab!
# It will download the PascalVOC dataset (400 MB) and
# pre-computed representations of images (450 MB)

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os.path as op

import tarfile
try:
    from urllib.request import urlretrieve
except ImportError:  # Python 2 compat
    from urllib import urlretrieve


# Remote location and local names of the PascalVOC 2007 archive.
URL_VOC = "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar"
FILE_VOC = "VOCtrainval_06-Nov-2007.tar"
FOLDER_VOC = "VOCdevkit"

# Download only when the archive is not already on disk, so that re-running
# this cell is cheap.
if not op.exists(FILE_VOC):
    print('Downloading from %s to %s...' % (URL_VOC, FILE_VOC))
    urlretrieve(URL_VOC, './' + FILE_VOC)

# Extract only when the target folder is missing.
if not op.exists(FOLDER_VOC):
    print('Extracting %s...' % FILE_VOC)
    # The context manager closes the archive even if extraction fails half-way.
    with tarfile.open(FILE_VOC) as tar:
        tar.extractall()

# Remote location and local name of the pre-computed image representations.
URL_REPRESENTATIONS = "https://github.com/m2dsupsdlclass/lectures-labs/releases/download/0.1/voc_representaions.h5"
FILE_REPRESENTATIONS = "voc_representaions.h5"

if not op.exists(FILE_REPRESENTATIONS):
    print('Downloading from %s to %s...' % (URL_REPRESENTATIONS, FILE_REPRESENTATIONS))
    urlretrieve(URL_REPRESENTATIONS, './' + FILE_REPRESENTATIONS)

# Classification and Localisation model

The objective is to build and train a classification and localisation network. This exercise will showcase the flexibility of Deep Learning with several heterogeneous outputs (bounding boxes and classes).

The model is in two parts:
- Representations from pre-trained ResNet50 network `shape = (7, 7, 2048)`
- A simplified model which outputs 
  - classes (5 classes)
  - bounding box coordinates

## Loading images and annotations

We will be using Pascal VOC 2007, a dataset widely used in detection and segmentation (http://host.robots.ox.ac.uk/pascal/VOC/). To lower the memory footprint and training time, we'll only use 5 classes: "dog", "cat", "bus", "car", "aeroplane". Here are the first steps:
- Load the annotations file from pascalVOC and parse it (xml file)
- keep only the annotations we're interested in, and containing a single object
- Pre-compute ResNet conv5c from the corresponding images


In [None]:
import numpy as np
from lxml import etree
import os

# Parse the xml annotation file and retrieve the path to image, its size and annotations
def extract_xml_annotation(filename):
    """Parse one annotation XML file into a plain dictionary.

    Parameters
    ----------
    filename : path of the XML annotation file to parse.

    Returns
    -------
    dict with three keys:
      - "size": tuple (width, height) of ints read from the //width and
        //height elements,
      - "filename": text of the /filename element,
      - "objects": list with one single-entry dict per /object element,
        mapping the object's name to [xmin, ymin, xmax, ymax] (ints).
    """
    tree = etree.parse(filename)

    def corner(node, tag):
        # Integer value of one bounding-box coordinate of an <object> node.
        return int(node.find("bndbox/" + tag).text)

    boxes_by_name = []
    for node in tree.findall("/object"):
        box = [corner(node, tag) for tag in ("xmin", "ymin", "xmax", "ymax")]
        boxes_by_name.append({node.find("name").text: box})

    width = int(tree.find("//width").text)
    height = int(tree.find("//height").text)
    return {
        "size": (width, height),
        "filename": tree.find("/filename").text,
        "objects": boxes_by_name,
    }

In [None]:
# Filters annotations keeping only those we are interested in
# We only keep images in which there is a single item
annotations = []

# Classes we keep, and the two-way mapping between class name and index.
filters = ["dog", "cat", "bus", "car", "aeroplane"]
idx2labels = {k:v for (k,v) in enumerate(filters)}
labels2idx = {v:k for k,v in idx2labels.items()}

annotation_folder = "VOCdevkit/VOC2007/Annotations/"

# os.listdir returns entries in arbitrary, filesystem-dependent order. Later
# cells index `annotations` and the pre-computed `reprs` with the same integer,
# so the order must be deterministic: iterate over the sorted listing.
# NOTE(review): confirm the pre-computed representations were generated from
# the same (sorted) file order.
for annotation_file in sorted(os.listdir(annotation_folder)):
    # Skip anything that is not an annotation (e.g. hidden OS files).
    if not annotation_file.endswith(".xml"):
        continue
    annotation = extract_xml_annotation(annotation_folder + annotation_file)
    # keep only labels we're interested in
    kept_objects = [obj for obj in annotation["objects"]
                    if list(obj.keys())[0] in filters]
    # Keep only if there's a single object in the image
    if len(kept_objects) == 1:
        annotation["class"] = list(kept_objects[0].keys())[0]
        annotation["bbox"] = list(kept_objects[0].values())[0]
        annotation.pop("objects")
        annotations.append(annotation)

In [None]:
# Quick summary of the filtered dataset: how many single-object images were
# kept, what one annotation looks like, and the index -> label mapping.
print("number of images/annotations:", len(annotations))
print("\nexample annotation[0]:\n", annotations[0])
print("\ncorrespondance between indexes and labels:", idx2labels)

## Pre-computing representations

Load a headless pre-trained ResNet50:
- load a headless ResNet from Keras and remove the AveragePooling layer

In [None]:
from keras.applications.resnet50 import ResNet50
from keras.models import Model

# TODO

In [None]:
# %load solutions/load_pretrained.py
model = ResNet50(include_top=False)
# Named `resnet_input` rather than `input` so that the Python builtin `input`
# is not shadowed for the rest of the notebook.
resnet_input = model.layers[0].input

# Remove the average pooling layer
# NOTE(review): this assumes the pooling layer is the last layer of the
# headless model, so that layers[-2] is the last convolutional block output --
# confirm against the installed Keras version.
output = model.layers[-2].output
headless_conv = Model(input = resnet_input, output = output)


### predict on a batch of images

The `predict_batch` function is defined as follows:
- open each image, and resize them to `img_size`
- stack them as a batch tensor of shape `(batch, img_size_x, img_size_y, 3)`
- preprocess the batch and make a forward pass with the model

In [None]:
from scipy.misc import imread, imresize
from keras.applications.imagenet_utils import preprocess_input

def predict_batch(model, img_batch_path, img_size=None):
    """Run `model` on a batch of images read from disk.

    Parameters
    ----------
    model : object exposing a `predict(x=...)` method.
    img_batch_path : iterable of image paths, each read with `imread`.
    img_size : optional size passed to `imresize`. When falsy the images
        keep their native size and must therefore all share the same shape.

    Returns
    -------
    Whatever `model.predict` returns for the preprocessed batch.

    Raises
    ------
    ValueError
        If no path is given, or if the images cannot be stacked into a single
        array because their shapes differ.
    """
    img_list = []

    for im_path in img_batch_path:
        img = imread(im_path)
        if img_size:
            img = imresize(img, img_size)
        img_list.append(img.astype('float32'))

    if not img_list:
        raise ValueError('img_batch_path must contain at least one image path.')

    try:
        img_batch = np.stack(img_list, axis=0)
    except ValueError:
        # np.stack refuses arrays of different shapes.
        raise ValueError('when img_size is None, images'
                ' in image_paths must have the same shapes.')

    # Feed the *preprocessed* batch to the network; the original code computed
    # it and then passed the un-preprocessed array instead.
    batch = preprocess_input(img_batch)
    return model.predict(x = batch)


In [None]:
# Test our model on a single image resized to (1000, 224).
# NOTE(review): "dog.jpg" is not fetched by the setup cell at the top of the
# notebook; presumably it ships alongside the notebook -- confirm.
output = predict_batch(headless_conv, ["dog.jpg"], (1000, 224))
print("output shape", output.shape)

# The output size is (batch_size, 1000/32 = 31.25 rounded up to 32, 224/32 = 7, 2048)

### Compute representations on all images in our annotations

Computing representations for all images may take some time, so they were pre-computed and saved in `voc_representaions.h5`

This was achieved through the `compute_representations.py` script, you're welcome to use it if needed.

Otherwise, load the pre-trained representations in h5 format using the following:

In [None]:
import h5py

# Load pre-calculated representations
# `[:]` reads the whole dataset into memory, so the file can be closed right
# away; the context manager closes it even if the read fails.
with h5py.File('voc_representaions.h5','r') as h5f:
    reprs = h5f['reprs'][:]

## Building ground truth from annotation

We cannot directly use the annotation dictionary as ground truth in our model. 

We will build the `y_true` tensor that will be compared to the output of the model

#### boxes
- The image is resized to a fixed 224x224, so the box coordinates need to be rescaled accordingly
- We have to convert the top-left and bottom-right coordinates `(x1, y1, x2, y2)` to center, width, height `(xc,yc,w,h)`

#### classes
- the class labels are mapped to corresponding indexes

In [None]:
# Side length (in pixels) that box coordinates are rescaled to; the markdown
# above states that images are resized to a fixed 224x224.
img_resize = 224
# Number of classes: one per entry of the label -> index mapping.
num_classes = len(labels2idx.keys())

# NOTE(review): `box_size` is neither a parameter nor defined anywhere in the
# visible notebook, so calling this helper raises NameError unless it is
# defined elsewhere. No visible cell calls this helper.
def box_center(x,y):
    # Returns the pair (box_size/2 + x * box_size, box_size/2 + y * box_size);
    # presumably the center of grid cell (x, y) for square cells of side
    # `box_size` -- TODO confirm.
    return (box_size/2 + x * box_size, box_size/2 + y * box_size)

def convert_to_ground_truth(annotations):
    """Turn annotation dicts into the two target arrays used for training.

    Each annotation must provide "class" (a key of `labels2idx`), "bbox"
    ([x1, y1, x2, y2]) and "size" ((width, height)).

    Returns a tuple `(classes, boxes)`:
      - classes: one row per annotation, one-hot over `num_classes`,
      - boxes: one row per annotation, `[cx, cy, w, h]` expressed in the
        `img_resize` x `img_resize` coordinate system.
    """
    one_hot_rows = []
    box_rows = []
    for annotation in annotations:
        # One-hot encode the class label.
        one_hot = np.zeros((num_classes))
        one_hot[labels2idx[annotation["class"]]] = 1.0

        bbox = annotation["bbox"]
        width, height = annotation["size"][0], annotation["size"][1]

        # Rescale the corners from the original image size to the resized one.
        x1 = bbox[0]*img_resize/width
        y1 = bbox[1]*img_resize/height
        x2 = bbox[2]*img_resize/width
        y2 = bbox[3]*img_resize/height

        # Corner representation -> (center, width, height).
        box_rows.append(np.array([(x2 + x1)/2, (y2 + y1)/2, x2 - x1, y2 - y1]))
        one_hot_rows.append(one_hot)

    # stack everything into two big np tensors
    return np.stack(one_hot_rows, axis=0), np.stack(box_rows, axis=0)

In [None]:
# Build the one-hot class array and the (cx, cy, w, h) box array for every annotation.
classes, boxes = convert_to_ground_truth(annotations)

In [None]:
# Inspect the shapes of the two ground-truth arrays and their first two rows.
print("classes and boxes shapes:", classes.shape, boxes.shape)
print("\n classes examples:\n" , classes[0:2])
print("\n boxes examples:\n" , boxes[0:2])

### Interpreting output of model

Interpreting the output of the model is going from the output tensors to a set of classes (with confidence) and boxes coordinates. It corresponds to reverting the previous process.


In [None]:
def interpret_output(cls, boxes, img_size=(500,333)):
    """Convert one model output back into a human-readable annotation.

    Parameters
    ----------
    cls : per-class scores; the highest one gives the predicted class.
    boxes : `[cx, cy, w, h]` in the `img_resize` coordinate system.
    img_size : (width, height) of the original image the box is mapped to.

    Returns
    -------
    dict with "class" (label looked up in `idx2labels`), "confidence" (score
    of that class) and "bbox" ([x1, y1, x2, y2] as ints in `img_size`
    coordinates).
    """
    best = np.argmax(cls)
    center_x, center_y = boxes[0], boxes[1]
    width, height = boxes[2], boxes[3]

    # Corners in the resized frame, clipped to [0, img_resize].
    left = max(0, center_x - width/2)
    top = max(0, center_y - height/2)
    right = min(img_resize, center_x + width/2)
    bottom = min(img_resize, center_y + height/2)

    # Scale x coordinates by the image width (axis 0 of img_size) and y
    # coordinates by its height (axis 1), truncating to ints.
    scaled = [int(corner * img_size[axis] / img_resize)
              for corner, axis in ((left, 0), (top, 1), (right, 0), (bottom, 1))]

    return {"class": idx2labels[best], "confidence": cls[best], "bbox": scaled}
    

In [None]:
# sanity check: interpret the classes and boxes tensors

# Feeding the ground-truth arrays of example 1 back through interpret_output
# should give back the class and (up to the int() truncation) the bounding
# box of the annotation printed on the first line.
print(annotations[1])
output = interpret_output(classes[1], boxes[1], img_size=annotations[1]["size"])
print(output)

### Intersection over Union

In order to assess the quality of our model, we will monitor the IoU between ground truth box and predicted box. 
The following function computes the IoU:

In [None]:
def iou(boxA, boxB):
    """Intersection over Union of two axis-aligned boxes.

    Both boxes are `[x1, y1, x2, y2]` sequences with inclusive coordinates
    (hence the `+ 1` when computing widths and heights).

    Returns a float: 0.0 when the boxes do not overlap, 1.0 when they are
    identical.
    """
    # find the intersecting box coordinates
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    # compute the area of intersection rectangle; each side is clamped at 0
    # so that disjoint boxes yield an empty intersection (without the clamp,
    # two negative sides multiply into a bogus positive area)
    interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)

    # compute the area of each box
    boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)

    # compute the intersection over union by taking the intersection
    # area and dividing it by the sum of areas - the intersection area
    return interArea / float(boxAArea + boxBArea - interArea)
    

In [None]:
# Sanity check: IoU between the original box of example 1 and the box that
# was reconstructed into `output` by the interpret_output sanity check;
# it is expected to be close to 1.
print("iou:",iou(annotations[1]["bbox"], output["bbox"]))

### Classification and Localisation model

A two headed model for classification and localisation

In [None]:
import keras.backend as K

def masked_mse(y_true, y_pred):
    """Squared error averaged over the last axis, ignoring zero targets.

    Positions where `y_true` equals 0 contribute 0 to the sum; the mean is
    still taken over the full length of the last axis.
    """
    nonzero_mask = K.cast(K.not_equal(y_true, 0.), "float32")
    squared_error = K.square(y_pred - y_true)
    return K.mean(squared_error * nonzero_mask, axis=-1)

In [None]:
from keras.objectives import mean_squared_error, binary_crossentropy
from keras.layers import Input, Convolution2D, Dropout, GlobalAveragePooling2D, Flatten, Dense, GlobalMaxPooling2D
from keras.models import Model

def classif_and_loc_stupid(num_classes):
    """Build and compile the baseline two-headed model.

    The input is a (7, 7, 2048) representation. It is globally average-pooled,
    passed through dropout, then fed to two dense heads:
      - "head_classes": `num_classes` softmax outputs,
      - "head_boxes": 4 linear outputs for the box.

    Returns the compiled Keras model (adam optimizer, class loss weighted 1.
    and box MSE weighted 0.01).
    """
    model_input = Input(shape=(7,7,2048))
    
    # rather stupid version: we average all spatial information
    x = GlobalAveragePooling2D()(model_input)
    x = Dropout(0.2)(x)
    head_classes = Dense(num_classes, activation="softmax", name="head_classes")(x)
    head_boxes = Dense(4, name="head_boxes")(x)
    
    model = Model(model_input, output = [head_classes, head_boxes], name="resnet_loc")
    # The class head is a softmax over mutually exclusive one-hot targets, so
    # the matching loss is categorical cross-entropy (binary cross-entropy
    # would treat the 5 outputs as independent yes/no problems).
    model.compile(optimizer="adam", loss=["categorical_crossentropy", "mse"], 
                  loss_weights=[1., 0.01]) 
    return model

In [None]:
# Instantiate the baseline two-headed model for the 5 kept classes.
# Note: this rebinds `model`, which previously referred to the ResNet50 network.
model = classif_and_loc_stupid(5)

In [None]:
# Debugging the model: select a few examples 
# and test the model (before training)
# Check that shapes correspond to ground truth!

# Use the first `num` representations together with their matching targets.
num = 64
inputs = reprs[0:num]
out_cls, out_boxes = classes[0:num], boxes[0:num]
print("input batch shape:", inputs.shape, "\nground truth batch shapes:", out_cls.shape, out_boxes.shape)
# The model has two heads, so `out` holds two arrays: out[0] for the classes
# head and out[1] for the boxes head.
out = model.predict(x=inputs)
print("\nmodel output shapes:", out[0].shape, out[1].shape)

In [None]:
# Debugging the model: check whether the loss decreases, 
# and eventually if we are able to overfit on these few examples
# NOTE(review): `nb_epoch` is the Keras 1 spelling of this argument; confirm
# it matches the installed Keras version.
history = model.fit(x = inputs, y=[out_cls, out_boxes], batch_size=10, nb_epoch=10)

In [None]:
import matplotlib.pyplot as plt
# Plot the log of each training loss per epoch; curves are labelled so the
# figure can be read on its own.
plt.plot(np.log(history.history["head_boxes_loss"]), label="head_boxes_loss")
plt.plot(np.log(history.history["head_classes_loss"]), label="head_classes_loss")
plt.plot(np.log(history.history["loss"]), label="loss")
plt.xlabel("epoch")
plt.ylabel("log(loss)")
plt.legend()
plt.show();

### Displaying images and bounding box

In order to display our annotations, we build the function `plot_annotations` as follows:
- display the image
- display the predicted annotation and the ground truth bounding boxes and classes on top of it

The `display` function:
- takes a single index and computes the result of the model
- interpret the output of the model as a bounding box
- calls the `plot_annotations` function

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

def patch(axis, bbox, display_txt, color):
    """Draw `bbox` on `axis` as an unfilled rectangle with a text label.

    `bbox` is `[x1, y1, x2, y2]`; the label `display_txt` is placed at
    (x1, y1) on a half-transparent background of the same `color`.
    """
    left, top = bbox[0], bbox[1]
    width = bbox[2]-left+1
    height = bbox[3]-top+1
    rectangle = plt.Rectangle((left, top), width, height,
                              fill=False, edgecolor=color, linewidth=2)
    axis.add_patch(rectangle)
    axis.text(left, top, display_txt, bbox={'facecolor':color, 'alpha':0.5})
    
def plot_annotations(img_path, annotation=None, ground_truth=None):
    """Show the image at `img_path` with optional boxes drawn on top.

    `ground_truth` (dict with "class" and "bbox") is drawn in red with a
    "gt " prefix; `annotation` (dict with "class", "bbox" and "confidence")
    is drawn in blue, prefixed by its confidence with two decimals.
    """
    plt.imshow(imread(img_path))
    axes = plt.gca()

    if ground_truth:
        patch(axes, ground_truth["bbox"], "gt " + ground_truth["class"], "red")

    if annotation:
        label = '{:0.2f} '.format(annotation['confidence']) + annotation["class"]
        patch(axes, annotation["bbox"], label, "blue")

    plt.axis('off')
    plt.show()

def display(index, ground_truth=True):
    """Predict on example `index` and plot the result over its image.

    The prediction of the global `model` on `reprs[index]` is interpreted
    with the image's original size, then drawn by `plot_annotations`; the
    stored annotation is drawn as well unless `ground_truth` is falsy.
    """
    # Add a leading batch axis: the model expects a batch of representations.
    heads = model.predict(reprs[index][np.newaxis,])
    target = annotations[index]
    prediction = interpret_output(heads[0][0], heads[1][0], img_size=target["size"])
    image_path = "VOCdevkit/VOC2007/JPEGImages/" + target["filename"]
    if ground_truth:
        plot_annotations(image_path, prediction, target)
    else:
        plot_annotations(image_path, prediction, None)

In [None]:
# Display one of the images on which we trained the model
# (index 13 lies within the first 64 examples used for the debugging fit)
display(13)

In [None]:
# Display an image we didn't train the model on
# (index 194 lies outside the first 64 examples used for the debugging fit)
display(194)

### Computing Accuracy

For each example `(class_true, bbox_true)`, we consider the prediction positive if and only if:
- the argmax of `output_class` of the model is `class_true`
- the IoU between the `output_bbox` and the `bbox_true` is above a threshold (usually `0.5`)

The accuracy of a model is then the number of positives divided by the total number of examples.

The following functions compute the class accuracy, iou average and global accuracy:

In [None]:
# Compute class accuracy, iou average and global accuracy
def accuracy_and_iou(preds, trues, threshold = 0.5):
    """Compare predictions with ground truth, pair by pair.

    `preds` and `trues` are sequences of dicts with "class" and "bbox".

    Returns a tuple of three ratios over `len(preds)`:
      - fraction of pairs whose classes match,
      - mean IoU between predicted and true boxes,
      - fraction of pairs whose classes match AND whose IoU is strictly
        above `threshold`.
    """
    class_hits, joint_hits, iou_total = 0, 0, 0
    for pred, true in zip(preds, trues):
        overlap = iou(pred["bbox"], true["bbox"])
        iou_total = iou_total + overlap
        if pred["class"] == true["class"]:
            class_hits = class_hits + 1
            if overlap > threshold:
                joint_hits = joint_hits + 1
    total = len(preds)
    return class_hits/total, iou_total/total, joint_hits/total

In [None]:
# Compute the previous function on the whole train / test set
def compute_acc(train=True):
    """Print class accuracy, mean IoU and combined accuracy on one split.

    Parameters
    ----------
    train : when True evaluate the training part of the data, otherwise the
        held-out test part.

    The split matches the one used by the training cells: the last
    `n // 10` examples are the test set, everything before is the train set.
    Reads the notebook globals `model`, `reprs` and `annotations`.
    """
    total = len(annotations)
    # Same formula as the train/test split cell (n - n // 10), so that train
    # and test never overlap and agree with what the model was fitted on.
    split = total - total // 10
    if train:
        beg, end = 0, split
        txt = "train"
    else:
        beg, end = split, total
        txt = "test"

    res = model.predict(reprs[beg:end])
    outputs = []
    for offset, (pred_cls, pred_box) in enumerate(zip(res[0], res[1])):
        # `offset` is relative to the slice; shift by `beg` to read the size
        # of the image the prediction actually belongs to.
        output = interpret_output(pred_cls, pred_box,
                                  img_size=annotations[beg + offset]["size"])
        outputs.append(output)

    # `mean_iou` rather than `iou`, to avoid shadowing the iou() function.
    acc, mean_iou, valid = accuracy_and_iou(outputs, annotations[beg:end], threshold=0.5)

    print(txt + ' acc: {:0.3f}, mean iou: {:0.3f}, acc_valid: {:0.3f}'.format(acc, mean_iou, valid) )

In [None]:
# Report the metrics on both splits for the model as fitted so far
# (at this point only on the 64 debugging examples).
compute_acc(train=True)
compute_acc(train=False)

### Training on the whole dataset

We split our dataset into a train and a test dataset

Then train the model on the whole training set

In [None]:
#Keep last examples for test
# One tenth of the examples (integer division) is held out; the split is
# positional, so the test set is the tail of `reprs`, `classes` and `boxes`.
test_num = reprs.shape[0] // 10
train_num = reprs.shape[0] - test_num
test_inputs = reprs[train_num:]
test_cls, test_boxes = classes[train_num:], boxes[train_num:]
print(train_num)

In [None]:
batch_size = 32
# Everything before the held-out tail is used for training.
inputs = reprs[0:train_num]
out_cls, out_boxes = classes[0:train_num], boxes[0:train_num]

# Note: `model` is the instance already fitted on the debugging examples, so
# this continues its training rather than starting from fresh weights.
# The held-out examples are passed as validation data.
history = model.fit(x = inputs, y=[out_cls, out_boxes], 
                    validation_data=(test_inputs, [test_cls, test_boxes]), 
                    batch_size=batch_size, nb_epoch=10)

In [None]:
# Report the metrics on both splits after training on the whole training set.
compute_acc(train=True)
compute_acc(train=False)

### Build a better model

**Exercise**
Use any tool at your disposal to build a better model:
- Dropout
- Convolution2D, Dense, with activations functions
- Flatten, GlobalAveragePooling2D, GlobalMaxPooling2D, etc.

Notes:
- Be careful not to add layers with too many parameters, as you only have ~1200 training samples
- Feel free to modify hyperparameters: learning rate, optimizers, loss_weights

**Bonus**
- Add data augmentation: 
  - flip images
  - add random crops before resizing

In [None]:
def classif_and_loc(num_classes):
    """Exercise template: build and compile a better two-headed model.

    The body must define `head_classes` and `head_boxes` from `model_input`
    before the model is assembled; as given, the function raises NameError
    until the TODO is completed.

    Returns the compiled Keras model.
    """
    model_input = Input(shape=(7,7,2048))
    
    # TODO: build `head_classes` (class probabilities over `num_classes`,
    # e.g. a softmax Dense layer) and `head_boxes` (4 box outputs)
    
    model = Model(model_input, output = [head_classes, head_boxes], name="resnet_loc")
    # Categorical cross-entropy matches a one-hot, mutually exclusive class
    # target. The box loss weight uses a float literal so that it cannot
    # collapse to 0 under integer division.
    model.compile(optimizer="adam", loss=["categorical_crossentropy", "mse"], 
                  loss_weights=[1., 1. / (224*224)]) 
    return model

In [None]:
# Build, train and compute accuracy
# Rebinds `model`, so compute_acc below evaluates this new model.
model = classif_and_loc(5)

# Reuses the `inputs` / `out_*` training arrays and the `test_*` held-out
# arrays prepared in the earlier training cells.
history = model.fit(x = inputs, y=[out_cls, out_boxes], 
                    validation_data=(test_inputs, [test_cls, test_boxes]), 
                    batch_size=batch_size, nb_epoch=30)

compute_acc(train=True)
compute_acc(train=False)

In [None]:
# %load solutions/classif_and_loc.py