# Single-shot detection model

The objective is to build and train a localisation network. This exercise will showcase the flexibility of Deep Learning with several heterogeneous outputs (bounding boxes and classes).

The model is in two parts:
- Representations from pre-trained ResNet50 network `shape = (7, 7, 2048)`
- A simplified ssd (single shot detection) model which outputs 
  - classes (dogs / cats / background)
  - bounding box coordinates

## Loading images and annotations

We will be using PASCAL VOC 2007, a dataset widely used in detection and segmentation (http://host.robots.ox.ac.uk/pascal/VOC/). To lower the memory footprint and training time, we'll only use 2 classes: cat and dog. Here are the first steps:
- Download the pascalVOC in the present folder
- Load the annotations file from pascalVOC and parse it (xml file), keeping only cats and dogs
- Pre-compute resnet representations from the corresponding images


In [None]:
import numpy as np
from lxml import etree
import os

# Parse the xml annotation file and retrieve the path to image, its size and annotations
def extract_xml_annotation(filename):
    """Parse one annotation XML file into a plain dictionary.

    Args:
        filename: path of the XML file, handed to ``etree.parse``.

    Returns:
        A dict with three keys:
        ``"size"`` -- ``(width, height)`` read from the ``width`` and
        ``height`` elements, as ints;
        ``"filename"`` -- text of the ``filename`` element;
        ``"objects"`` -- one single-entry dict ``{name: [xmin, ymin, xmax,
        ymax]}`` per ``object`` element, coordinates as ints.
    """
    tree = etree.parse(filename)
    object_nodes = tree.findall("/object")
    width = int(tree.find("//width").text)
    height = int(tree.find("//height").text)
    image_name = tree.find("/filename").text

    boxes = []
    for node in object_nodes:
        label = node.find("name").text
        corners = [int(node.find("bndbox/" + tag).text)
                   for tag in ("xmin", "ymin", "xmax", "ymax")]
        boxes.append({label: corners})

    return {"size": (width, height), "filename": image_name, "objects": boxes}

In [None]:
# Build `annotations`: one entry per annotation file that contains at least
# one object whose class is listed in `filters`; objects of other classes are
# dropped from the kept entries.
# NOTE(review): os.listdir returns files in arbitrary order, while `reprs`
# is later indexed in parallel with `annotations` — confirm the order used
# here matches the one used when the representations were computed.
annotations = []

filters = ["dog", "cat"]
for file in os.listdir("VOCdevkit/VOC2007/Annotations/"):
    annotation = extract_xml_annotation("VOCdevkit/VOC2007/Annotations/" +file)
    new_objects = []
    for obj in annotation["objects"]:
        # the class name is the first key of the object dict
        if list(obj.keys())[0] in filters:
            new_objects.append(obj)
    if len(new_objects)>0:
        annotation["objects"] = new_objects
        annotations.append(annotation)

In [None]:
# Build `annotations2`: keep only the files that contain exactly one object
# among the classes in `filters`, and flatten that object into two keys,
# "class" (its name) and "bbox" (its coordinates), replacing "objects".
# NOTE: this rebinds the module-level `filters` list.
annotations2 = []

filters = ["dog", "cat", "bus", "car", "aeroplane"]
for file in os.listdir("VOCdevkit/VOC2007/Annotations/"):
    annotation = extract_xml_annotation("VOCdevkit/VOC2007/Annotations/" +file)
    new_objects = []
    for obj in annotation["objects"]:
        # the class name is the first key of the object dict
        if list(obj.keys())[0] in filters:
            new_objects.append(obj)
    if len(new_objects)==1:
        annotation["class"] = list(new_objects[0].keys())[0]
        annotation["bbox"] = list(new_objects[0].values())[0]
        annotation.pop("objects")
        annotations2.append(annotation)

In [None]:
# Quick inspection: number of kept single-object annotations and the first one.
print(len(annotations2))
print(annotations2[0])

## Pre-computing representations

Load a headless pre-trained ResNet50. There are a few ways you can do it:
- using the previous ResNet_fc and removing the last two layers (Convolution and Softmax)
- loading a headless ResNet from Keras and removing the AveragePooling layer 

In [None]:
from scipy.misc import imread, imresize

def predict_batch(model, img_batch_path, img_size=None):
    """Load images from disk, preprocess them and run ``model`` on the batch.

    Args:
        model: object exposing ``predict(x=...)``.
        img_batch_path: iterable of image file paths, read with ``imread``.
        img_size: optional size forwarded to ``imresize``; when falsy the
            images are kept at their native size.

    Returns:
        The value returned by ``model.predict`` for the preprocessed batch.

    Raises:
        ValueError: if the loaded images cannot be stacked into one array
            (different shapes, or an empty list of paths).
    """
    img_list = []

    for im_path in img_batch_path:
        img = imread(im_path)
        if img_size:
            img = imresize(img, img_size)
        img_list.append(img.astype('float32'))

    try:
        img_batch = np.stack(img_list, axis=0)
    except ValueError as err:
        # Only stacking failures are translated; anything else propagates.
        raise ValueError('could not build the image batch: when img_size is '
                         'None, all images in img_batch_path must have the '
                         'same shape (and the list must not be empty).') from err

    # Feed the preprocessed batch to the model. The original passed
    # `img_batch` and discarded the return value of preprocess_input, which
    # only works if preprocess_input happens to modify its argument in place.
    batch = preprocess_input(img_batch)
    return model.predict(x=batch)


In [None]:
from keras.applications.resnet50 import ResNet50
from keras.models import Model
from keras.applications.imagenet_utils import preprocess_input

# Pre-trained ResNet50 without its classification head.
model = ResNet50(include_top=False)
# NOTE(review): `input` shadows the Python builtin of the same name from here on.
input = model.layers[0].input

# Remove the average pooling layer!
# (the output of the second-to-last layer is used; this assumes the last layer
# is the pooling one — TODO confirm for the installed Keras version)
output = model.layers[-2].output
headless_conv = Model(input = input, output = output)

In [None]:
#test the model
# NOTE(review): the (1000, 224) size is non-square, presumably to check that
# the headless network accepts arbitrary input sizes — confirm intent.
output = predict_batch(headless_conv, ["dog.jpg"], (1000, 224))
output.shape

### compute representations on all images in our annotations

In [None]:
def compute_representations(annotations):
    """Compute the headless-ResNet representation of every annotated image.

    Images are read from ``VOCdevkit/VOC2007/JPEGImages/``, resized to
    (224, 224) and pushed through ``headless_conv`` in batches of
    ``batch_size`` via ``predict_batch``.

    Args:
        annotations: sequence of dicts with a ``"filename"`` key.

    Returns:
        The per-batch outputs stacked with ``np.vstack``, in the same order
        as ``annotations``.
    """
    batch_size = 32
    batches = []

    # Stepping by batch_size never yields an empty trailing batch. The
    # original `len // 32 + 1` loop did when len was a multiple of 32, which
    # made predict_batch fail on an empty list.
    for a_idx, batch_bgn in enumerate(range(0, len(annotations), batch_size)):
        img_names = ["VOCdevkit/VOC2007/JPEGImages/" + annotation["filename"]
                     for annotation in annotations[batch_bgn:batch_bgn + batch_size]]
        batch = predict_batch(headless_conv, img_names, (224, 224))
        batches.append(batch)
        print("batch " +str(a_idx) + " prepared")
    return np.vstack(batches)

In [None]:
# Computes representations (warning this may take some time!)
# reprs = compute_representations(annotations)
#import h5py

# Serialize representations
#h5f = h5py.File('representaions.h5', 'w')
#h5f.create_dataset('reprs', data=reprs)
#h5f.close()

### Loading serialized representations

- The representations won't be fine-tuned, so we may save them so that we won't have to recompute them each time
- to store and retrieve large data files, we use the HDF5 (`.h5`) file format through h5py, as below:

In [None]:
import h5py

# Load pre-calculated representations
# Read the whole 'reprs' dataset into memory. The `with` block closes the
# file even if reading fails.
with h5py.File('representaions.h5','r') as h5f:
    reprs = h5f['reprs'][:]

## Building ground truth from annotation

Our goal is to build the `y_true` tensor that will be compared to the output of the model
- the image is resized to a fixed 224x224, so the box coordinates need to be rescaled accordingly
- What are the output sizes of the model, for such a size of input image?

#### classes
- Convert an annotation format to tensor for classes:
 - each annotated object will be mapped to a single position in the `(7, 7)` grid
 - the class labels are mapped to `'background':0, 'cat': 1, 'dog': 2`
 
#### boxes
- Convert an annotation format to tensor for boxes:
 - each annotated object has a default box around the position of the object on the `(7, 7)` grid
 - the coordinates of the box represent the following:
   - horizontal offset of center (between the default box and the ground truth box)
   - vertical offset of center (between the default box and the ground truth box)
   - difference of width (between the default box 32 and the ground truth box)
   - difference of height (between the default box 32 and the ground truth box)
 

In [None]:
# Class name -> class channel index, and the reverse lookup.
label2idx = dict(cat=1, dog=2)
idx2label = dict((index, name) for name, index in label2idx.items())

In [None]:
# Geometry of the detection grid: a square image of `img_resize` pixels is
# split into `grid_size` x `grid_size` cells of `box_size` pixels each.
img_resize = 224
grid_size = 7
box_size = img_resize // grid_size

def box_center(x,y):
    """Return the pixel coordinates of the centre of grid cell ``(x, y)``."""
    half_cell = box_size / 2
    return (half_cell + x * box_size, half_cell + y * box_size)

def convert_to_ground_truth(annotations):
    """Convert annotations into class and box target tensors.

    For every object, the box is rescaled from the original image size to
    ``img_resize`` pixels, the grid cell containing its centre is selected,
    and that cell receives a 1.0 in the object's class channel plus four box
    values: centre offsets from the cell centre (in cell units) and
    ``1 - w / box_size`` and ``1 - h / box_size``.

    Args:
        annotations: sequence of dicts with ``"size"`` and ``"objects"`` keys;
            each object is a dict ``{class_name: [x1, y1, x2, y2]}`` whose
            class name is a key of ``label2idx``.

    Returns:
        ``(classes, boxes)``: arrays of shape ``(N, grid_size, grid_size, 3)``
        and ``(N, grid_size, grid_size, 4)``.

    NOTE(review): the first grid axis is indexed by the horizontal cell index
    and the second by the vertical one — confirm this matches the axis order
    of the representations fed to the model.
    """
    all_boxes = []
    all_cls = []
    last_cell = grid_size - 1
    for annotation in annotations:
        cls = np.zeros((grid_size,grid_size,3))
        boxes = np.zeros((grid_size,grid_size,4))
        size = annotation["size"]
        for obj in annotation["objects"]:
            for k, v in obj.items():
                lbl = label2idx[k]
                # box corners in the resized (img_resize x img_resize) image
                x1,y1,x2,y2 = (v[0]*img_resize/size[0], v[1]*img_resize/size[1],
                               v[2]*img_resize/size[0], v[3]*img_resize/size[1])
                c = ((x2 + x1)/2, (y2 + y1)/2)
                w = x2 - x1
                h = y2 - y1
                # Clamp so that a centre lying exactly on the far image edge
                # lands in the last cell instead of indexing outside the grid.
                cx_id = min(last_cell, max(0, int(c[0] / box_size)))
                cy_id = min(last_cell, max(0, int(c[1] / box_size)))
                cls[cx_id,cy_id,lbl] = 1.0
                b_center = box_center(cx_id, cy_id)
                # box_size replaces the previously hard-coded 32
                boxes[cx_id,cy_id,0] = (c[0] - b_center[0]) / box_size
                boxes[cx_id,cy_id,1] = (c[1] - b_center[1]) / box_size
                boxes[cx_id,cy_id,2] = 1 - w / box_size
                boxes[cx_id,cy_id,3] = 1 - h / box_size
        all_boxes.append(boxes)
        all_cls.append(cls)
    return np.stack(all_cls, axis=0), np.stack(all_boxes, axis=0)

In [None]:
# Grid geometry and label maps for the single-tensor (YOLO-style) targets.
img_resize = 224
grid_size = 7
box_size = img_resize // grid_size
dim_output = 7 # number of bounding box * (4+1) + number of classes
label2idx = dict(cat=0, dog=1)
idx2label = dict((index, name) for name, index in label2idx.items())

def box_center(x,y):
    """Return the pixel coordinates of the centre of grid cell ``(x, y)``."""
    half_cell = box_size / 2
    return (half_cell + x * box_size, half_cell + y * box_size)

def convert_to_yolo_ground_truth(annotations):
    """Convert annotations into a single YOLO-style target tensor.

    Channel layout of the last axis: 0-1 centre offsets from the cell centre
    (in cell units), 2-3 ``1 - w / box_size`` and ``1 - h / box_size``,
    4 object confidence, ``5 + label2idx[name]`` one-hot class.

    Args:
        annotations: sequence of dicts with ``"size"`` and ``"objects"`` keys;
            each object is a dict ``{class_name: [x1, y1, x2, y2]}`` whose
            class name is a key of ``label2idx``.

    Returns:
        Array of shape ``(N, grid_size, grid_size, dim_output)``.

    NOTE(review): the first grid axis is indexed by the horizontal cell index
    and the second by the vertical one — confirm this matches the axis order
    of the representations fed to the model.
    """
    all_outputs = []
    last_cell = grid_size - 1

    for annotation in annotations:
        output = np.zeros((grid_size,grid_size,dim_output))
        size = annotation["size"]
        for obj in annotation["objects"]:
            for k, v in obj.items():
                lbl = label2idx[k]

                # coordinates of the gt box in the resized image
                x1,y1,x2,y2 = (v[0]*img_resize/size[0], v[1]*img_resize/size[1],
                               v[2]*img_resize/size[0], v[3]*img_resize/size[1])

                # center
                c = ((x2 + x1)/2, (y2 + y1)/2)
                w = x2 - x1
                h = y2 - y1

                # Grid cell containing the centre, clamped so that a centre
                # lying exactly on the far image edge stays inside the grid.
                cx_id = min(last_cell, max(0, int(c[0] / box_size)))
                cy_id = min(last_cell, max(0, int(c[1] / box_size)))

                # assign confidence for that box and label
                output[cx_id, cy_id, 4] = 1.0
                output[cx_id, cy_id, 5 + lbl] = 1.0

                # center and width (box_size replaces the hard-coded 32)
                b_center = box_center(cx_id, cy_id)
                output[cx_id,cy_id,0] = (c[0] - b_center[0]) / box_size
                output[cx_id,cy_id,1] = (c[1] - b_center[1]) / box_size
                output[cx_id,cy_id,2] = 1 - w / box_size
                output[cx_id,cy_id,3] = 1 - h / box_size
        all_outputs.append(output)
    return np.stack(all_outputs, axis=0)

In [None]:
# `yolo_gt` is used as a single array (`.shape`, slicing), so it must come
# from the YOLO converter; convert_to_ground_truth returns a 2-tuple.
yolo_gt = convert_to_yolo_ground_truth(annotations)

In [None]:
# The two-tensor targets come from convert_to_ground_truth, which returns
# (classes, boxes); the YOLO converter returns one array and cannot be
# unpacked this way.
# NOTE(review): this relies on the `label2idx` currently in scope; the
# mapping that starts at 1 is the one that leaves channel 0 for background.
classes, boxes = convert_to_ground_truth(annotations)

In [None]:
# Shapes of the two-tensor targets, then of the single-tensor YOLO target.
print("classes and boxes shapes:", classes.shape, boxes.shape)
print("yolo ground truth shape:", yolo_gt.shape)


### Interpreting output of model

Interpreting the output of the model is going from the output tensors to a set of classes (with confidence) and boxes coordinates. It corresponds to reverting the previous process.


In [None]:
def interpret_output(cls, boxes, threshold=0.7, img_size=(500,333)):
    """Turn class and box tensors for one image back into a list of detections.

    Every (cell, class) entry of ``cls`` above ``threshold`` yields one
    detection, except class index 0 which is skipped. The box is decoded from
    the cell centre and the four values stored in ``boxes`` for that cell,
    clipped to the resized image, then rescaled to ``img_size``.

    Args:
        cls: class scores indexed as ``cls[x, y, class]``.
        boxes: box values indexed as ``boxes[x, y, 0..3]``.
        threshold: minimum class score for a detection.
        img_size: ``(width, height)`` of the original image.

    Returns:
        List of dicts ``{class_name: [x1, y1, x2, y2], "confidence": score}``
        with integer box coordinates.
    """
    hits = np.nonzero(cls > threshold)
    detections = []
    for n, (x, y) in enumerate(zip(hits[0], hits[1])):
        classname = hits[2][n]
        if classname == 0:
            continue
        # decode centre and size in resized-image pixels
        cx = x * box_size + box_size/2 + boxes[x,y,0] * box_size
        cy = y * box_size + box_size/2 + boxes[x,y,1] * box_size
        w = box_size * (1 - boxes[x,y,2])
        h = box_size * (1 - boxes[x,y,3])
        # clip to the resized image
        left = max(0, cx - w/2)
        top = max(0, cy - h/2)
        right = min(img_resize, cx + w/2)
        bottom = min(img_resize, cy + h/2)
        # rescale to the original image size
        fullsize_box = [int(left * img_size[0] / img_resize),
                        int(top * img_size[1] / img_resize),
                        int(right * img_size[0] / img_resize),
                        int(bottom * img_size[1] / img_resize)]
        #todo check bounds in image
        detections.append({idx2label[classname]: fullsize_box,
                           "confidence":cls[x,y, classname]})
    return detections
    

In [None]:
def interpret_yolo_output(yolo_output, threshold=0.7, img_size=(500,333)):
    """Turn a YOLO-style tensor for one image back into a list of detections.

    The class probabilities (channels 5 onwards) are multiplied by the
    confidence (channel 4). Each grid cell whose best resulting score exceeds
    ``threshold`` yields exactly one detection, labelled with its best class.
    The box is decoded from channels 0-3, clipped to the resized image and
    rescaled to ``img_size``.

    Args:
        yolo_output: array indexed as ``yolo_output[x, y, channel]``.
        threshold: minimum (confidence * class probability) for a detection.
        img_size: ``(width, height)`` of the original image.

    Returns:
        List of dicts ``{class_name: [x1, y1, x2, y2], "confidence": score}``
        with integer box coordinates.
    """
    proba_classes = np.multiply(yolo_output[:,:,5:], yolo_output[:,:,4:5])
    # Reduce over the class axis BEFORE thresholding: the original thresholded
    # every (cell, class) pair and then took the argmax, so a cell where
    # several classes passed the threshold was emitted several times.
    best_class = np.argmax(proba_classes, axis=-1)
    best_proba = np.max(proba_classes, axis=-1)
    xs, ys = np.nonzero(best_proba > threshold)
    output = []
    for x, y in zip(xs, ys):
        classname = best_class[x, y]
        confidence = best_proba[x, y]
        boxes_raw_cx = x * box_size + box_size/2
        boxes_raw_cy = y * box_size + box_size/2
        cx = boxes_raw_cx + yolo_output[x,y,0] * box_size
        cy = boxes_raw_cy + yolo_output[x,y,1] * box_size
        w = box_size * (1 - yolo_output[x,y,2])
        h = box_size * (1 - yolo_output[x,y,3])
        small_box = [max(0, cx - w/2), max(0, cy - h/2),
                     min(img_resize, cx + w/2), min(img_resize, cy + h/2)]
        fullsize_box = [int(small_box[0] * img_size[0] / img_resize), int(small_box[1] * img_size[1] / img_resize),
                        int(small_box[2] * img_size[0] / img_resize), int(small_box[3] * img_size[1] / img_resize)]
        #todo check bounds in image
        output.append({idx2label[classname]: fullsize_box, "confidence":confidence})
    return output
    

In [None]:
# sanity check: convert annotation to ground truth, then interpret the tensors
# NOTE: `iou` and `match_pred_and_gt` must already be defined when this runs.
# NOTE(review): the "dog" key assumes the first object of annotations[1] is a dog.
print(annotations[1])
# img_size must come from the same annotation as the tensor being decoded
output = interpret_yolo_output(yolo_gt[1], threshold=0.7, img_size=annotations[1]["size"])
print(output)
print("iou: "+str(iou(annotations[1]["objects"][0]["dog"], output[0]["dog"])))
match_pred_and_gt(output, annotations[1])

In [None]:
def iou(boxA, boxB):
    """Return the intersection-over-union of two boxes.

    Boxes are ``[x1, y1, x2, y2]`` with inclusive pixel coordinates, hence
    the ``+ 1`` when computing widths and heights.

    Args:
        boxA: first box.
        boxB: second box.

    Returns:
        IoU as a float in ``[0, 1]``; ``0.0`` when the boxes do not overlap.
    """
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    # Clamp each side at 0: for disjoint boxes the raw differences are
    # negative, and their product would be a bogus positive (or negative)
    # intersection area.
    interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)

    # compute the area of each box
    boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)

    # intersection divided by union (sum of areas minus the intersection)
    return interArea / float(boxAArea + boxBArea - interArea)
    
    
def match_iou(listA, listB):
    """Greedily pair each box of ``listA`` with its best unused box of ``listB``.

    Boxes of ``listA`` are processed in order; each one takes the not yet
    matched box of ``listB`` with the highest IoU (first one on ties).
    Matching stops producing pairs once ``listB`` is exhausted.

    Args:
        listA: list of boxes.
        listB: list of boxes.

    Returns:
        List of ``(index_in_listA, index_in_listB)`` tuples.
    """
    used = set()
    pairs = []
    for ix, box_a in enumerate(listA):
        # Keep the real listB index next to each score. The original took the
        # index inside the filtered score list, which is not an index into
        # listB and allowed the same listB box to be matched twice.
        candidates = [(iou(box_a, box_b), iy)
                      for iy, box_b in enumerate(listB) if iy not in used]
        if not candidates:
            continue
        _, best_iy = max(candidates, key=lambda cand: cand[0])
        used.add(best_iy)
        pairs.append((ix, best_iy))
    return pairs

def match_pred_and_gt(prediction, ground_truth):
    """Match predicted and ground-truth boxes class by class.

    Args:
        prediction: list of dicts ``{class_name: box, "confidence": score}``;
            the ``"confidence"`` entry is ignored.
        ground_truth: dict with an ``"objects"`` list of ``{class_name: box}``.

    Returns:
        Dict mapping each class name to ``(pairs, ious)``, where ``pairs`` is
        the result of ``match_iou(gt_boxes, predicted_boxes)`` and ``ious``
        holds the IoU of every pair.
    """
    per_class = {}
    for gt_obj in ground_truth["objects"]:
        for label, box in gt_obj.items():
            per_class.setdefault(label, ([], []))[0].append(box)
    for pred_obj in prediction:
        for label, box in pred_obj.items():
            if label == "confidence":
                continue
            per_class.setdefault(label, ([], []))[1].append(box)

    matched = {}
    for label, (gt_boxes, pred_boxes) in per_class.items():
        pairs = match_iou(gt_boxes, pred_boxes)
        scores = [iou(gt_boxes[g], pred_boxes[p]) for g, p in pairs]
        matched[label] = (pairs, scores)
    return matched

In [None]:
# sanity check: convert annotation to ground truth, then interpret the tensors
# NOTE(review): the "dog" key assumes the first object of annotations[0] is a dog.
# print the annotation that is actually checked below (index 0)
print(annotations[0])
output = interpret_output(classes[0], boxes[0], threshold=0.7, img_size=annotations[0]["size"])
print(output)
print("iou: "+str(iou(annotations[0]["objects"][0]["dog"], output[0]["dog"])))
match_pred_and_gt(output, annotations[0])

### Single-shot model

A very straightforward single-shot detection model, much like YOLO

In [None]:
import keras.backend as K

def masked_mse(y_true, y_pred):
    """Mean squared error over the last axis, ignoring zero-valued targets.

    Entries where ``y_true`` equals 0 contribute 0 to the sum, but the mean
    is still taken over the full length of the last axis.
    """
    mask = K.cast(K.not_equal(y_true, 0.), "float32")
    squared_error = K.square(y_pred - y_true)
    return K.mean(squared_error * mask, axis=-1)

In [None]:
import keras.backend as K

def yolo_loss(y_true, y_pred):
    """YOLO-style loss masked by the ground-truth object confidence.

    Both tensors are indexed with four axes; the last axis holds 4 box
    values (0:4), the confidence (4:5) and the class scores (5:).

    Returns:
        ``5 * bbox_loss + conf_loss + 0.5 * conf_loss_noobj + class_loss``,
        each term being a mean over the last axis.
    """
    # object: cells whose ground-truth confidence is non-zero
    obj = K.cast(K.not_equal(y_true[:,:,:,4:5], 0.), "float32")

    # no_object: cells whose ground-truth confidence is not 1
    noobj = K.cast(K.not_equal(y_true[:,:,:,4:5], 1.), "float32")

    # 0:4 covers all four box values (the original 0:3 slice dropped the last one)
    bbox_loss = K.mean(K.square(y_pred[:,:,:,0:4] - y_true[:,:,:,0:4]) * obj, axis=-1)
    conf_loss = K.mean(K.square(y_pred[:,:,:,4:5] - y_true[:,:,:,4:5]) * obj, axis=-1)
    conf_loss_noobj =  K.mean(K.square(y_pred[:,:,:,4:5] - y_true[:,:,:,4:5]) * noobj, axis=-1)
    class_loss = K.mean(K.square(y_pred[:,:,:,5:] - y_true[:,:,:,5:]) * obj, axis=-1)

    return bbox_loss * 5 + conf_loss + conf_loss_noobj * 0.5 + class_loss

In [None]:
def iou_K(coords, coords_pred):
    """Return a float 0/1 mask marking, along axis 2, the best-IoU positions.

    Each input is indexed with four axes, the last one being read as
    ``[center_x, center_y, width, height]``. The IoU between the two sets of
    boxes is computed element-wise; the mask is 1.0 where the IoU equals its
    maximum along axis 2 and 0.0 elsewhere. The two arguments play symmetric
    roles in the IoU.

    NOTE(review): the IoU is computed directly on the values passed in —
    confirm they are real centres/sizes rather than encoded offsets, and that
    axis 2 is the intended axis for the maximum.
    NOTE(review): the union can be 0 when both boxes have zero area, which
    makes the division undefined — confirm whether an epsilon is wanted.
    """
    # predictions (the original read `coords` here and never used coords_pred)
    centers = coords_pred[:,:,:,0:2]
    wh = coords_pred[:,:,:,2:4]
    upleft = centers - (wh * .5) # [batch, S, S, 2]
    botright  = centers + (wh * .5) # [batch, S, S, 2]
    area_pred = wh[:,:,:,0:1] * wh[:,:,:,1:2]

    # true (the original reused the prediction corners and area here, so the
    # IoU compared a box with itself)
    true_centers = coords[:,:,:,0:2]
    true_wh = coords[:,:,:,2:4]
    true_upleft = true_centers - (true_wh * .5) # [batch, S, S, 2]
    true_botright  = true_centers + (true_wh * .5) # [batch, S, S, 2]
    true_area = true_wh[:,:,:,0:1] * true_wh[:,:,:,1:2]

    # calculate the intersection areas
    intersect_upleft   = K.maximum(upleft, true_upleft)
    intersect_botright = K.minimum(botright , true_botright)
    intersect_wh = intersect_botright - intersect_upleft
    intersect_wh = K.maximum(intersect_wh, 0.0)
    intersect = intersect_wh[:,:,:,0:1] * intersect_wh[:,:,:,1:2]

    # calculate the best IOU, set 0.0 confidence for worse boxes
    iou_scores = intersect / (true_area + area_pred - intersect)
    best_box = K.equal(iou_scores, K.max(iou_scores, axis=2, keepdims=True))
    best_box = K.cast(best_box, "float32")
    return best_box

def yolo_loss(y_true, y_pred):
    """YOLO-style loss masked by the best-IoU mask returned by ``iou_K``.

    Both tensors are indexed with four axes; the last axis holds 4 box
    values (0:4), the confidence (4:5) and the class scores (5:).

    Returns:
        ``5 * bbox_loss + conf_loss + 0.1 * conf_loss_noobj + class_loss``,
        each term being a mean over the last axis.
    """
    # Predicted values
    coords = y_pred[:, :, :, 0:4]
    conf = y_pred[:, :, :, 4:5]
    classes = y_pred[:, :, :, 5:]

    # Ground-truth values. The original sliced these from y_pred as well, so
    # the mask was computed by comparing the prediction with itself.
    true_coords = y_true[:, :, :, 0:4]
    true_conf = y_true[:, :, :, 4:5]
    true_classes = y_true[:, :, :, 5:]

    best_box = iou_K(coords, true_coords)
    noobj_box = (1. - best_box) # no object

    bbox_loss = K.mean(K.square(coords - true_coords) * best_box, axis=-1)
    conf_loss = K.mean(K.square(conf - true_conf) * best_box, axis=-1)
    conf_loss_noobj =  K.mean(K.square(conf - true_conf) * noobj_box, axis=-1)
    class_loss = K.mean(K.square(classes - true_classes) * best_box, axis=-1)

    return bbox_loss * 5 + conf_loss + conf_loss_noobj * 0.1 + class_loss

In [None]:
from keras.objectives import mean_squared_error, binary_crossentropy
from keras.layers import Input, Convolution2D, Dropout
from keras.models import Model
from keras.optimizers import RMSprop

def ssd_model(num_classes, num_boxes):
    """Build and compile the two-headed detection model.

    The input has 2048 channels and unspecified spatial size. After a 0.3
    dropout, two 1x1 convolutions produce the outputs: ``classes``
    (``num_classes`` sigmoid channels) and ``boxes`` (``4 * num_boxes``
    linear channels). The model is compiled with Adam, binary cross-entropy
    for the classes and ``masked_mse`` for the boxes, weighted 1.0 and 0.4.

    Returns:
        The compiled model, named ``"resnet_ssd"``.
    """
    features = Input(shape=(None,None,2048))
    dropped = Dropout(0.3)(features)

    # Alternative kept from the original notebook: a linear 'classes'
    # convolution on the raw input followed by SoftmaxMap(axis=-1).
    class_head = Convolution2D(num_classes, 1, 1, activation='sigmoid',
                               name='classes')(dropped)
    box_head = Convolution2D(4*num_boxes, 1, 1, name='boxes')(dropped)

    detector = Model(features, output = [class_head, box_head], name="resnet_ssd")
    detector.compile(optimizer="adam",
                     loss=[binary_crossentropy, masked_mse],
                     loss_weights=[1., 0.4])
    return detector

In [None]:
def yolo(num_boxes, num_classes):
    """Build and compile the single-output YOLO-style model.

    The input has 2048 channels and unspecified spatial size. A 1x1
    convolution with 128 ReLU channels (``conv_hidden``) is followed by a
    linear 1x1 convolution (``boxes``) with ``5 * num_boxes + num_classes``
    channels. The model is compiled with RMSprop and ``yolo_loss``.

    Returns:
        The compiled model, named ``"resnet_ssd"``.
    """
    features = Input(shape=(None,None,2048))
    hidden = Convolution2D(128, 1, 1, activation='relu', name='conv_hidden')(features)
    # (a Dropout(0.3) on `hidden` was disabled in the original notebook)
    n_channels = 5*num_boxes + num_classes
    prediction = Convolution2D(n_channels, 1, 1, name='boxes')(hidden)

    detector = Model(features, output = prediction, name="resnet_ssd")
    optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    detector.compile(optimizer=optimizer, loss=yolo_loss)
    return detector

In [None]:
# YOLO-style model: 1 box per cell and 2 classes, i.e. 5*1 + 2 = 7 output channels.
model = yolo(1, 2)

In [None]:
# Two-headed model: 3 class channels and 1 box per cell.
# NOTE: this rebinds `model`, replacing any model created by an earlier cell.
model = ssd_model(3, 1)

In [None]:
# Overfit the current model on the first `num` examples, using the
# single-tensor `yolo_gt` targets.
num = 64
batch_size = 10
inputs = reprs[0:num]
outputs = yolo_gt[0:num]
print(inputs.shape, outputs.shape)
# prediction made before training, kept in `out` for inspection
out = model.predict(x=inputs)
print(out.shape)
model.fit(inputs, outputs, batch_size=batch_size, nb_epoch=50)


In [None]:
# Display the slice out[0][4] of the stored prediction.
# NOTE(review): `plt` must have been imported before this cell runs.
plt.imshow(out[0][4])

In [None]:
#Sanity check: overfit on a batch of annotation
# (two-output variant: the targets are the list [classes, boxes])
num = 64
batch_size = 10
inputs = reprs[0:num]
outputs = [classes[0:num], boxes[0:num]]
print(inputs.shape, outputs[0].shape, outputs[1].shape)
# prediction made before training, kept in `out` for inspection
out = model.predict(x=inputs)
print(out[0].shape, out[1].shape)
model.fit(inputs, outputs, batch_size=batch_size, nb_epoch=50)

#model.fit(inputs, outputs, batch_size=1)

In [None]:
import matplotlib.pyplot as plt

def plot_annotations(img_path, annotation):
    """Show an image with labelled boxes drawn on top of it.

    Args:
        img_path: path of the image, read with ``imread``.
        annotation: list of dicts holding one ``{class_name: [x1, y1, x2,
            y2]}`` entry and optionally a ``"confidence"`` entry. Entries
            with a confidence are drawn in green and prefixed with the score;
            the others are drawn in red and prefixed with ``"gt "``.
    """
    plt.imshow(imread(img_path))
    axes = plt.gca()
    for entry in annotation:
        is_prediction = 'confidence' in entry
        color = "green" if is_prediction else "red"
        prefix = '{:0.2f} '.format(entry['confidence']) if is_prediction else "gt "

        # "error" is the sentinel meaning no class entry was found
        label = "error"
        for key, value in entry.items():
            if key != 'confidence':
                label, rect = key, value
        if label == "error":
            print("error!")
            continue

        origin = (rect[0], rect[1])
        width = rect[2] - rect[0] + 1
        height = rect[3] - rect[1] + 1
        axes.add_patch(plt.Rectangle(origin, width, height,
                                     fill=False, edgecolor=color, linewidth=2))
        axes.text(rect[0], rect[1], prefix + label,
                  bbox={'facecolor':color, 'alpha':0.5})
        plt.axis('off')
    plt.show()
    

In [None]:
def display_ground_truth(index):
    """Plot the annotated boxes of ``annotations[index]`` over its image."""
    img_path = "VOCdevkit/VOC2007/JPEGImages/" + annotations[index]["filename"]
    plot_annotations(img_path, annotations[index]["objects"])

def display_yolo_prediction(index, threshold=0.5):
    """Plot the detections decoded from the single-tensor model output.

    Runs ``model`` on ``reprs[index]``, decodes the result with
    ``interpret_yolo_output`` and draws it over the corresponding image.
    """
    features = reprs[index][np.newaxis,]
    prediction = model.predict(features)
    detections = interpret_yolo_output(prediction[0], threshold=threshold,
                                       img_size=annotations[index]["size"])
    img_path = "VOCdevkit/VOC2007/JPEGImages/" + annotations[index]["filename"]
    plot_annotations(img_path, detections)
    
def display_prediction(index, threshold=0.5):
    """Plot the detections decoded from the two-output model.

    Runs ``model`` on ``reprs[index]``, decodes the first element of each of
    the two outputs with ``interpret_output`` and draws the result over the
    corresponding image.
    """
    features = reprs[index][np.newaxis,]
    prediction = model.predict(features)
    detections = interpret_output(prediction[0][0], prediction[1][0],
                                  threshold=threshold,
                                  img_size=annotations[index]["size"])
    img_path = "VOCdevkit/VOC2007/JPEGImages/" + annotations[index]["filename"]
    plot_annotations(img_path, detections)

def display_both(index, threshold=0.5):
    """Plot decoded detections and ground-truth boxes on the same image.

    The detections come from ``interpret_yolo_output`` applied to the model
    output for ``reprs[index]``; the ground truth is
    ``annotations[index]["objects"]``.
    """
    features = reprs[index][np.newaxis,]
    prediction = model.predict(features)
    detections = interpret_yolo_output(prediction[0], threshold=threshold,
                                       img_size=annotations[index]["size"])
    img_path = "VOCdevkit/VOC2007/JPEGImages/" + annotations[index]["filename"]
    plot_annotations(img_path, detections + annotations[index]["objects"])

In [None]:
# Inspect the raw prediction for example 1.
# NOTE(review): assumes the single-tensor layout where channel 4 is the
# confidence and channels 5 and 6 are the two class scores — confirm.
out = model.predict(reprs[1][np.newaxis,])
print(np.max(out[0][:,:,6]*out[0][:,:,4]), np.max(out[0][:,:,4]), np.max(out[0][:,:,5]), np.max(out[0][:,:,6]))
plt.imshow(out[0][:,:,5]);

In [None]:
# Show predictions and ground truth for the 7th example from the end,
# with a very low detection threshold.
display_both(-7,threshold=0.01)

In [None]:
#Keep last examples for test
# one tenth of the examples (rounded down) is held out
test_num = reprs.shape[0] // 10
train_num = reprs.shape[0] - test_num
print(train_num)

In [None]:
# Train on the first `train_num` examples with the single-tensor targets.
batch_size = 32
inputs_train = reprs[0:train_num]
outputs_train = yolo_gt[0:train_num]
model.fit(inputs_train, outputs_train, batch_size=batch_size, nb_epoch=50)

In [None]:
# Print the accuracy on the train split, then on the test split.
# NOTE: `compute_acc` must already be defined when this cell runs.
compute_acc(train=True)
compute_acc(train=False)

In [None]:
# Show predictions and ground truth for example 36.
display_both(36, threshold=0.1)

In [None]:
# Unpack the prediction for example 28 into two outputs and show their shapes.
# NOTE(review): presumably requires `model` to be the two-output model — confirm.
out_cls, out_box = model.predict(reprs[28][np.newaxis,])
out_cls.shape, out_box.shape

In [None]:
def accuracy(pred, truth, threshold = 0.1):
    """Fraction of objects that are matched with an IoU above ``threshold``.

    For every image and every class in ``label2idx``, predicted and true
    boxes are paired with ``match_iou``; a pair counts as valid when its IoU
    exceeds ``threshold``. The denominator adds, for each image, the larger
    of the number of predictions and the number of ground-truth objects.

    Args:
        pred: one list of detection dicts per image.
        truth: one list of ground-truth object dicts per image.
        threshold: minimum IoU for a pair to count as valid.

    Returns:
        ``count_valid / count_total``, or ``0.0`` when there is nothing to
        score (no images, or neither predictions nor objects).
    """
    count_valid = 0
    count_total = 0
    for p, t in zip(pred, truth):
        for k in label2idx:
            pred_coords = [dic[k] for dic in p if k in dic]
            true_coords = [dic[k] for dic in t if k in dic]
            if not pred_coords or not true_coords:
                continue
            for i_pred, i_true in match_iou(pred_coords, true_coords):
                if iou(pred_coords[i_pred], true_coords[i_true]) > threshold:
                    count_valid += 1
        count_total += max(len(t), len(p))
    # Avoid a ZeroDivisionError when nothing was predicted and nothing was
    # annotated.
    if count_total == 0:
        return 0.0
    return count_valid / count_total
    
        
def mAP():
    """Placeholder for a mean-average-precision metric; always returns 0."""
    return 0

In [None]:
def compute_acc(train=True):
    """Print the detection accuracy of ``model`` on the train or test split.

    The first nine tenths of ``annotations`` form the train split and the
    rest the test split. Predictions on ``reprs`` are decoded with
    ``interpret_yolo_output`` (threshold 0.2) and scored with ``accuracy``
    (IoU threshold 0.5).

    Args:
        train: score the train split when true, the test split otherwise.

    NOTE(review): the boundary used here, (9 * len(annotations)) // 10, may
    differ by one from a split computed as len - len // 10 — confirm which
    one the training code uses.
    """
    # The original train branch used 9 * len(annotations) without the
    # division by 10, which made the train split cover every example.
    split = (9 * len(annotations)) // 10
    if train:
        beg, end = 0, split
        txt = "train"
    else:
        beg, end = split, len(annotations)
        txt = "test"
    res = model.predict(reprs[beg:end])
    outputs = []
    for index, r in enumerate(res):
        # `index` is relative to the slice, so offset it by `beg`
        # (the original `annotations[]` was a syntax error)
        output = interpret_yolo_output(r, threshold=0.2,
                                       img_size=annotations[beg + index]["size"])
        outputs.append(output)
    # score all decoded images, not only the last one
    acc = accuracy(outputs, [ann["objects"] for ann in annotations[beg:end]], threshold=0.5)

    print(txt + ' accuracy: {:0.3f}'.format(acc))

In [None]:
# Print the accuracy on the train split.
compute_acc(train=True)

In [None]:
# Display the raw annotation of example 36.
annotations[36]

In [None]:
# Plot the ground-truth boxes of example `index` over its image.
index = 36
plot_annotations("VOCdevkit/VOC2007/JPEGImages/" + annotations[index]["filename"], 
                     annotations[index]["objects"])