#### Note - This model is actually used for this project and uses the Darknet-19 forward prop outlined in the YOLO_v2 paper.

## YOLO_v2 inspired model

- This model takes in input images of shape (448,448,3) and produces output of shape (14,14,14). In looking at the output of my k-means clustering step in my minibatch data cleaning notebook, I decided to have 2 (1,1,7) encodings such that the model can learn the difference between the dimensions of open and closed palms.

- I will be using stochastic gradient descent with adam optimization and an interatively decreasing learning rate as promoted by the YOLO_v2 model.

- This model is essentially the Darknet-19 model, which I chose over the Darknet-53 resnet model strictly to decrease the time needed for model training.

In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
from keras import backend as K
import matplotlib.pyplot as plt
import latex
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
# Loading data
X = np.load("../../data/dinorunner/images_448.npy")
y = np.load("../../data/dinorunner/encodings_448.npy")
print(X.shape)
print(y.shape)

(396, 448, 448, 3)
(396, 14, 14, 7)


In [3]:
# Creating testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=1)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(376, 448, 448, 3)
(376, 14, 14, 7)
(20, 448, 448, 3)
(20, 14, 14, 7)


### Tensorflow Placeholders

In [4]:
# Placeholder values for input X,y data
def get_placeholders(x_h,x_w,x_c,y_h,y_w,y_c):
    """
    x_h: Height for x input 
    x_w: Width for x input
    x_c: Channels for x input
    y_h: Height for y input
    y_w: Width for y input
    y_c: Channels for y input
    """
    X = tf.placeholder(tf.float32, name="X", shape=(None,x_h,x_w,x_c))
    y = tf.placeholder(tf.float32, name="y", shape=(None,y_h,y_w,y_c))
    return X,y

In [5]:
# Testing placeholders
tf.reset_default_graph()
with tf.Session() as sess:
    X,y = get_placeholders(448,448,3,14,14,7)
    print("X shape:",X.shape)
    print("y shape:",y.shape)

X shape: (?, 448, 448, 3)
y shape: (?, 14, 14, 7)


### Tensorflow Forward Propagation

In [6]:
# Defining constant layer for 2d convolution, batch norm, and activation
def conv(the_input,layer,f,ks):
    """
    the_input: the layer which will be used as input in conv layer
    layer: specifies the layer number for naming sections of graph
    f (filters): the number of filters to be used for conv layer
    ks (kernel_size): kernel size for conv2d layer
    Note - conv2d layers all use padding
    """
    layer = str(layer)
    Z = tf.layers.conv2d(the_input,filters=f,kernel_size=[ks,ks],strides=(1,1),padding="same",name="Z"+layer,kernel_initializer=tf.contrib.layers.xavier_initializer(seed=0))
    Bn = tf.layers.batch_normalization(Z,name="Bn"+layer)
    A = tf.nn.leaky_relu(Bn,alpha=0.1,name="A"+layer)
    return A

In [7]:
# Building the forward pass based on Darket-19
# Note - forward pass will use leaky_relu
def forward_pass(X):
    input_layer = tf.reshape(X,[-1,448,448,3]) # Input shape of images
    S1 = conv(input_layer,1,32,3)
    P1 = tf.layers.max_pooling2d(S1,pool_size=[2,2],strides=2,padding="valid",name="P1") # 224x224
    S2 = conv(P1,2,64,3)
    P2 = tf.layers.max_pooling2d(S2,pool_size=[2,2],strides=2,padding="valid",name="P2") # 112x112
    S3 = conv(P2,3,128,3)
    S4 = conv(S3,4,64,1)
    S5 = conv(S4,5,128,3)
    P5 = tf.layers.max_pooling2d(S5,pool_size=[2,2],strides=2,padding="valid",name="P5") # 56x56
    S6 = conv(P5,6,256,3)
    S7 = conv(S6,7,128,1)
    S8 = conv(S7,8,256,3)
    P8 = tf.layers.max_pooling2d(S8,pool_size=[2,2],strides=2,padding="valid",name="P8") # 28x28
    S9 = conv(P8,9,512,3)
    S10 = conv(S9,10,256,1)
    S11 = conv(S10,11,512,3)
    S12 = conv(S11,12,256,1)
    S13 = conv(S12,13,512,3)
    P13 = tf.layers.max_pooling2d(S13,pool_size=[2,2],strides=2,padding="valid",name="P13") #14x14
    S14 = conv(P13,14,1024,3)
    S15 = conv(S14,15,512,1)
    S16 = conv(S15,16,1024,3)
    S17 = conv(S16,17,512,1)
    S18 = conv(S17,18,2014,3)
    # Final layer - no batch norm, linear activation
    S19 = tf.layers.conv2d(S18,filters=14,kernel_size=[1,1],strides=(1,1),padding="valid",name="S19",activation=None)
    return S19

In [8]:
# Testing forward prop
tf.reset_default_graph()
with tf.Session() as sess:
    np.random.seed(1)
    X,y = get_placeholders(448,448,3,14,14,7)
    Z19 = forward_pass(X) # Computation graph
    init = tf.global_variables_initializer()
    sess.run(init)
    aZ = sess.run(Z19,feed_dict={X:np.random.randn(1,448,448,3),y:np.random.randn(1,14,14,7)})
    print("Z shape:", str(aZ.shape))

Z shape: (1, 14, 14, 14)


### Tensorflow Cost Function

Using the predicted box [shape(1,1,7)] with the highest IoU score to determine the cost for a given image.

Note - I deviate from the YOLO function function in that for all of the cells that don't hold the actual prediction for the bounding box, I penalize ALL of the confidence predictions (2 per cell in this case). I also penalize the prediction that is not used for the cell that holds the actual prediction. In order to compensate for this change and not greatly increase the cost from just wrong confidence scores I decrease the lambda<sub>noobj</sub> from 0.5 to 0.25. This is due to the fact that I am factoring in 2 times the previous number of confidence scores.

In [9]:
# Returns the sum of all of the confidence scores corresponding w/ cells that don't have prediction
def get_false_conf(Z,mask):
    conf_1 = tf.boolean_mask(Z[:,:,:,0:1],mask)
    conf_2 = tf.boolean_mask(Z[:,:,:,7:8],mask)
    conf_sum = tf.add(tf.reduce_sum(conf_1),tf.reduce_sum(conf_2))
    return conf_sum

In [10]:
# Returns the values with a specific mask applied to it
def get_box_values(box,mask):
    """
    Index:
    0: confidence there is an object in cell, 1: mid_x, 2: mid_y, 
    3: width, 4: length, 5: prob_open_palm, 6: prob_close_palm
    """
    confidence = tf.boolean_mask(box[:,:,:,0:1],mask)
    mid_x = tf.boolean_mask(box[:,:,:,1:2],mask)
    mid_y = tf.boolean_mask(box[:,:,:,2:3],mask)
    width = tf.boolean_mask(box[:,:,:,3:4],mask)
    height = tf.boolean_mask(box[:,:,:,4:5],mask)
    prob_open = tf.boolean_mask(box[:,:,:,5:6],mask)
    prob_closed = tf.boolean_mask(box[:,:,:,6:7],mask)
    box = {"co":confidence, "mx":mid_x,"my":mid_y,"w":width,"h":height,"d":prob_open,"c":prob_closed}
    return box

In [11]:
# Returns the IoU
def get_iou(box1, box2):
    """
    box1 - coordinates (x1, y1, x2, y2)
    box2 - coordinates (x1, y1, x2, y2)
    """
    xi1 = tf.maximum(box1["x1"], box2["x1"])
    yi1 = tf.maximum(box1["y1"], box2["y1"])
    xi2 = tf.minimum(box1["x2"], box2["x2"])
    yi2 = tf.minimum(box1["y2"], box2["y2"])
    inter_area = tf.maximum((xi2 - xi1),0) * tf.maximum((yi2 - yi1),0) # no neg bounding box

    box1_area = (box1["x2"]-box1["x1"]) * (box1["y2"] - box1["y1"])
    box2_area = (box2["x2"]-box2["x1"]) * (box2["y2"] - box2["y1"])
    union_area = box1_area + box2_area - inter_area
    iou = inter_area / union_area
    
    return tf.reshape(iou,[])

In [12]:
# Returns the (x1,y1),(x2,y2) coordinates for each bounding box as a dict
def get_xy(box_v):
    mid_x = box_v["mx"]
    mid_y = box_v["my"]
    width = box_v["w"]
    height = box_v["h"]
    width = width * 448
    height = height * 448
    mid_x = mid_x * 32 + 224
    mid_y = mid_y * 32 + 224
    
    x1 = mid_x - (1/2*width)
    x2 = mid_x + (1/2*width)
    y1 = mid_y - (1/2*height)
    y2 = mid_y + (1/2*height)
    
    box_xy = {"x1":x1,"x2":x2,"y1":y1,"y2":y2}
    return box_xy

In [13]:
# This returns the information associated with the bounding box prediction with the max IoU
def get_max_iou(box1, box2, y, mask):
    """
    b1_v: bounding box values associated with box1
    b2_v: bounding box values associated with box2
    y_v: bounding box values associated with y
    """
    b1_v = get_box_values(box1,mask)
    b2_v = get_box_values(box2,mask)
    y_v = get_box_values(y,mask)
    # These new coordinates will be used to get the IoU
    b1_xy = get_xy(b1_v)
    b2_xy = get_xy(b2_v)
    y_xy = get_xy(y_v)
    # Getting the Iou for each bounding box prediction
    b1_iou = get_iou(b1_xy, y_xy)
    b2_iou = get_iou(b2_xy, y_xy)
    # Comparing the ious to determine which guess is the ground truth prediction
    def b1(): return b1_v, b2_v
    def b2(): return b2_v, b1_v
    highest_iou_values, lowest_values = tf.cond(tf.less(b1_iou,b2_iou), b2, b1)
    return highest_iou_values, lowest_values

In [14]:
# Stochastic gradient descent so input of shape (1,14,14,14)
# Heavily penalizes negative predictions for height and weight
# Bounding box prediction based on guess with the highest IoU
def cost_function(Z,y,coord=5,noobj=0.25):
    """
    Z - shape (?,14,14,14)
    y - shape (?,14,14,7)
    """
    c_mask_true = y[:,:,:,0:1] > 0 # which cell has the encoding
    c_mask_false = y[:,:,:,0:1] < 1
    
    box1 = Z[:,:,:,0:7] # First guess
    box2 = Z[:,:,:,7:14] # Second guess
    y_v = get_box_values(y,c_mask_true) # values for y for cell with object
    m_v,lowest_values = get_max_iou(box1,box2,y,c_mask_true) # values for highest IoU guess
    false_conf_score = get_false_conf(Z,c_mask_false) # sum of false confidence predictions
    false_conf_score = false_conf_score + tf.reshape(lowest_values["co"],[])
    
    # penalizing width,length predictions if negative
    m_v["w"] = tf.sqrt(tf.maximum(m_v["w"],0.0))
    m_v["l"] = tf.sqrt(tf.maximum(m_v["h"],0.0))
    
    y_v["w"] = tf.sqrt(y_v["w"])
    y_v["l"] = tf.sqrt(y_v["h"])
    
    # correspond to individual summations of the cost function:
    part1 = coord * tf.reduce_sum(tf.square(y_v["mx"]-m_v["mx"])+tf.square(y_v["my"]-m_v["my"]))
    part2 = coord * tf.reduce_sum(tf.square(y_v["w"]-m_v["w"])+tf.square(y_v["l"]-m_v["l"]))
    part3 = tf.reduce_sum(tf.square(y_v["co"]-m_v["co"]))
    part4 = noobj * false_conf_score
    part5 = tf.reduce_sum(tf.add(tf.square(y_v["d"]-m_v["d"]),tf.square(y_v["c"]-m_v["c"])))
    total_cost = part1 + part2 + part3 + part4 + part5

    return total_cost

In [15]:
# Testing cost function
# predicted cost w/ rounding error is 1.6396
ay = np.zeros((1,14,14,7))
ay[0,0,0,:] = np.array([1,0.5,0.5,0.25,0.25,1,0]) # top left corner
az = np.zeros((1,14,14,14))
az[0,0,0,0:7] = np.array([0.8,0.35,0.35,0.2,0.2,0.8,0.2]) # PRED 1
az[0,0,0,7:] = np.array([0.5,0.4,0.4,0.22,0.22,0.8,0.2]) # PRED 2
az[0,0,1,0] = 1
az[0,1,0,0] = 1
az[0,1,0,7] = 1
az[0,2,0,7] = 1

with tf.Session() as sess:
    y = tf.placeholder(tf.float32,shape=(None,14,14,7))
    Z = tf.placeholder(tf.float32,shape=(None,14,14,14))
    aCost = cost_function(Z,y)
    init = tf.global_variables_initializer()
    sess.run(init)
    tot = sess.run(aCost,feed_dict={Z:az,y:ay})
    print(tot)

1.6395843


### Tensorflow Model

In [20]:
# Building and training YOLO model using stochastic gradient descent
def model(X_train,y_train,lr=0.001,num_epochs=50,print_cost=True):
    tf.reset_default_graph() # resetting graph
    tf.set_random_seed(1)
    seed=0
    costs=[]
    x_h = X_train[0].shape[0]
    x_w = X_train[0].shape[1]
    x_c = X_train[0].shape[2]
    y_h = y_train[0].shape[0]
    y_w = y_train[0].shape[1]
    y_c = y_train[0].shape[2]
    m = X_train.shape[0]
    
    X,y = get_placeholders(x_h,x_w,x_c,y_h,y_w,y_c)
    Z = forward_pass(X)
    cost = cost_function(Z,y)
    optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(cost)
    
    init = tf.global_variables_initializer()
    saver = tf.train.Saver() # to save/load model
    with tf.Session() as sess:
        # Loading saved model
        #saver = tf.train.import_meta_graph("../../data/dinorunner/models_stochastic/yolo_model.ckpt.meta")
        #saver.restore(sess, "../../data/dinorunner/models_stochastic/yolo_model.ckpt")
        sess.run(init) # DON'T RUN INIT IF LOADING MODEL
        
        for epoch in range(num_epochs):
            running_cost = 0
            # shuffling training set order each iteration
            seed += 1
            X_train = shuffle(X_train, random_state=seed) 
            y_train = shuffle(y_train, random_state=seed)
            
            for i in range(X_train.shape[0]):
                aX = X_train[i]
                aX.shape = (1,448,448,3)
                aY = y_train[i]
                aY.shape = (1,14,14,7)
                _,temp_cost = sess.run([optimizer,cost], feed_dict={X:aX,y:aY})
                running_cost += temp_cost
                
            costs.append(running_cost)
            if print_cost and epoch % 1 == 0:
                print("Cost at epoch {}: {}".format(epoch+1,running_cost))
                
        loc = saver.save(sess, "../../data/dinorunner/models_stochastic/yolo_model.ckpt")
        return costs

In [21]:
print(X_test.shape)
print(y_test.shape)

(20, 448, 448, 3)
(20, 14, 14, 7)


In [22]:
acosts = model(X_test,y_test,lr=0.001,num_epochs=50,print_cost=True)

Cost at epoch 1: 2265074.9516203403
Cost at epoch 2: 31078782.278289795
Cost at epoch 3: 4848.251556396484
Cost at epoch 4: 1090557.9871902466
Cost at epoch 5: -38567.39768600464
Cost at epoch 6: 23804281.07373047


KeyboardInterrupt: 