## Yolo Darknet-19 model

We are using the YOLOv2 Darknet-19 architecture for forward propagation, as we believe it offers the best tradeoff between predictive performance and computational cost. Further, given that this model is intended to be used for onboard processing, we reduce its complexity by having it learn only a single bounding-box shape, which fixes the number of output channels at 29 - thus the output shape will be 19x19x29.

In [1]:
import numpy as np
import tensorflow as tf
from keras import backend as K
from sklearn.utils import shuffle

import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
# Load the saved image tensors and their label encodings, then report
# each array's shape as a quick sanity check.
images = np.load("../data/kwf/images.npy")
encodings = np.load("../data/kwf/encodings.npy")
for loaded in (images, encodings):
    print(loaded.shape)

(2, 608, 608, 3)
(2, 19, 19, 29)


## Model architecture

### Model Forward Prop

In [3]:
# Placeholder values for input X,y data
def get_placeholders(x_h,x_w,x_c,y_h,y_w,y_c):
    """
    Create the float32 graph placeholders for the input images and labels.

    x_h: Height for x input
    x_w: Width for x input
    x_c: Channels for x input
    y_h: Height for y input
    y_w: Width for y input
    y_c: Channels for y input

    Returns the tuple (X, y). The leading dimension of both placeholders is
    None, so the batch size is left open.
    """
    image_shape = (None, x_h, x_w, x_c)
    label_shape = (None, y_h, y_w, y_c)
    return (
        tf.placeholder(tf.float32, name="X", shape=image_shape),
        tf.placeholder(tf.float32, name="y", shape=label_shape),
    )

In [4]:
# Defining constant layer for 2d convolution, batch norm, and activation
def conv(the_input,layer,f,ks,training=False):
    """
    Conv2d -> batch norm -> leaky ReLU building block.

    the_input: the layer which will be used as input in conv layer
    layer: specifies the layer number for naming sections of graph
        (ops are named "Z<layer>", "Bn<layer>" and "A<layer>")
    f (filters): the number of filters to be used for conv layer
    ks (kernel_size): kernel size for conv2d layer
    training: Python bool or boolean tensor forwarded to batch normalization.
        False (the default, and the previous hard-wired behaviour) normalizes
        with the moving statistics; True normalizes with the statistics of
        the current batch. When training with True, the ops collected in
        tf.GraphKeys.UPDATE_OPS must be run with the train op so the moving
        statistics get updated.
    Note - conv2d layers all use "same" padding with a stride of 1

    Returns the activated output tensor.
    """
    suffix = str(layer)
    Z = tf.layers.conv2d(
        the_input,
        filters=f,
        kernel_size=[ks, ks],
        strides=(1, 1),
        padding="same",
        name="Z" + suffix,
        # Fixed seed keeps the weight initialization reproducible across runs
        kernel_initializer=tf.contrib.layers.xavier_initializer(seed=0),
    )
    Bn = tf.layers.batch_normalization(Z, training=training, name="Bn" + suffix)
    A = tf.nn.leaky_relu(Bn, alpha=0.1, name="A" + suffix)
    return A

In [5]:
# Building the forward pass based on Darknet-19
# Note - forward pass will use leaky_relu
def forward_pass(X):
    """
    Run the Darknet-19 style forward pass and return the raw prediction map.

    X: input tensor; it is reshaped to (-1, 608, 608, 3) before the first
       convolution.

    Eighteen conv/batch-norm/leaky-relu blocks (see conv) are interleaved with
    five 2x2 max-pools of stride 2, which take the spatial size from 608x608
    down to 19x19. A final 1x1 convolution with no batch norm and a linear
    activation produces 29 output channels.

    Returns the (None, 19, 19, 29) output tensor of the final "S19" layer.
    """
    # (layer number, filters, kernel size, max-pool after this layer?)
    layer_specs = [
        (1, 32, 3, True),      # pool -> 304x304
        (2, 64, 3, True),      # pool -> 152x152
        (3, 128, 3, False),
        (4, 64, 1, False),
        (5, 128, 3, True),     # pool -> 76x76
        (6, 256, 3, False),
        (7, 128, 1, False),
        (8, 256, 3, True),     # pool -> 38x38
        (9, 512, 3, False),
        (10, 256, 1, False),
        (11, 512, 3, False),
        (12, 256, 1, False),
        (13, 512, 3, True),    # pool -> 19x19
        (14, 1024, 3, False),
        (15, 512, 1, False),
        (16, 1024, 3, False),
        (17, 512, 1, False),
        (18, 1024, 3, False),  # Darknet-19 uses 1024 filters here (was mistyped as 2014)
    ]

    net = tf.reshape(X, [-1, 608, 608, 3])  # Input shape of images
    for layer, filters, kernel_size, pool_after in layer_specs:
        net = conv(net, layer, filters, kernel_size)
        if pool_after:
            net = tf.layers.max_pooling2d(
                net,
                pool_size=[2, 2],
                strides=2,
                padding="valid",
                name="P" + str(layer),
            )

    # Final layer - no batch norm, linear activation
    S19 = tf.layers.conv2d(
        net,
        filters=29,
        kernel_size=[1, 1],
        strides=(1, 1),
        padding="valid",
        name="S19",
        activation=None,
    )
    return S19

In [6]:
# Smoke test for forward prop: feed a single all-zero image through the
# network and print the shape of the resulting prediction map.
tf.reset_default_graph()
X, y = get_placeholders(608, 608, 3, 19, 19, 29)
Z19 = forward_pass(X)
ax = np.zeros((1, 608, 608, 3))

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    aZ = sess.run(Z19, feed_dict={X: ax})
    print("Z shape:", str(aZ.shape))

Z shape: (1, 19, 19, 29)


### Model Cost

In [16]:
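# NOTE: intended to heavily penalize negative predictions for height and width;
# the version below only computes the confidence term for cells containing an object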
def cost_function(Z,y,coord=5,noobj=0.25):
    """
    Squared error on channel 0 (the confidence value), restricted to the
    cells whose label confidence is greater than 0.

    Z - shape (None,19,19,29), network output
    y - shape (None,19,19,29), label encoding
    coord, noobj - accepted but not used by the current implementation

    Returns a 4-tuple:
        (predicted confidences, label confidences,
         element-wise squared error, sum of the squared error)
    where the first three are 1-D tensors over the selected cells.
    """
    label_conf = y[:, :, :, 0:1]
    pred_conf = Z[:, :, :, 0:1]

    has_object = label_conf > 0
    no_object = label_conf < 1  # built but not consumed by anything below yet

    predicted = tf.boolean_mask(pred_conf, has_object)
    target = tf.boolean_mask(label_conf, has_object)
    squared_error = tf.square(predicted - target)
    total_error = tf.reduce_sum(squared_error)
    return (predicted, target, squared_error, total_error)

In [17]:
# Testing the cost function: evaluate every returned piece on the first
# image/encoding pair, each given a leading batch dimension of 1.
tf.reset_default_graph()
ax = images[0].reshape((1, 608, 608, 3))
ay = encodings[0].reshape((1, 19, 19, 29))

X, y = get_placeholders(608, 608, 3, 19, 19, 29)
Z = forward_pass(X)
cost = cost_function(Z, y)

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    acost = sess.run(cost, feed_dict={X: ax, y: ay})
    # cost_function returns four tensors; print them in order
    for piece in acost:
        print(piece)

[1.1310066e-04 1.3068070e-04 1.1917770e-04 1.0849191e-04 1.2012606e-04
 1.3822227e-04 7.3459953e-05 3.7159654e-05]
[1. 1. 1. 1. 1. 1. 1. 1.]
[0.99977374 0.9997387  0.9997617  0.99978304 0.9997598  0.99972355
 0.99985313 0.99992573]
7.9983196
