In [92]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [93]:
import tensorflow as tf
import numpy as np

Use tf.dataset to get images and sort out all the directory stuff

Import pre-trained model and get the necessary weights

In [94]:
import os
data_root = 'gdrive/My Drive/ML/Projects/CSGO_aim/images'


In [95]:
BUFFER_SIZE = 500
BATCH_SIZE = 8
IMG_WIDTH = 448
IMG_HEIGHT = 448

In [96]:
def load(image_file):
  # height, width, channels
  image = tf.io.read_file(image_file)
  image = tf.image.decode_jpeg(image)

  input_image = tf.cast(image, tf.float32)

  return input_image

def resize(input_image, height, width):
  input_image = tf.image.resize(input_image, [height, width], 
                                method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
  return input_image

def normalize(input_image):
  input_image = (input_image / 127.5) - 1
  return input_image

def load_image_train(image_file):

  input_image = load(image_file)
  input_image = resize(input_image, IMG_WIDTH, IMG_HEIGHT)
  input_image = normalize(input_image)

  return input_image

In [97]:
def convert_label(label):
  label = list(map(int, label))
  # x coordinate
  # 640 is training image width
  label[0] = IMG_HEIGHT * label[0]/640 
  label[1] = IMG_WIDTH * label[1]/640
  # width and height rescaling
  label[2] = IMG_WIDTH * (label[2]/640) 
  label[3] = IMG_HEIGHT * (label[3]/640)

  label = list(map(int, label))

  return label

In [98]:
import os

imgs = []
bounds = []

for n, image_file in enumerate(os.listdir('gdrive/My Drive/ML/Projects/CSGO_aim/images/')):
  count = 10

  imgs.append(load_image_train(os.path.join('gdrive/My Drive/ML/Projects/CSGO_aim/images', image_file)))

  line_no = 'images\\' + image_file

  fr = open('gdrive/My Drive/ML/Projects/CSGO_aim/train.txt', 'r')

  ll = []

  for _, line in enumerate(fr):
    if line.split()[0] == str(line_no):
      for label in line.split()[1:]:
        count -= 1
        bounds.append(convert_label(label.split(',')))

  # invalid bounding box, padding
  for l in range(count):
    bounds.append([-1, -1, -1, -1, -1])
  
  #bounds.append(ll)

In [99]:
np.array(bounds).shape

(4340, 5)

In [100]:
new_bounds = []
for i in bounds:
  for j in i:
      new_bounds.append(j)

In [101]:
np.array(new_bounds).shape

(21700,)

In [102]:
new_bounds = np.array(new_bounds).reshape(n + 1, 10, 5)

In [103]:
tuples = []
for i, img in enumerate(imgs):
  #tuples.append((img, bounds[i]))
  tuples.append((img, new_bounds[i, :, :]))

In [104]:
tuples[0][1]

array([[205, 121, 270, 266,   1],
       [ -1,  -1,  -1,  -1,  -1],
       [ -1,  -1,  -1,  -1,  -1],
       [ -1,  -1,  -1,  -1,  -1],
       [ -1,  -1,  -1,  -1,  -1],
       [ -1,  -1,  -1,  -1,  -1],
       [ -1,  -1,  -1,  -1,  -1],
       [ -1,  -1,  -1,  -1,  -1],
       [ -1,  -1,  -1,  -1,  -1],
       [ -1,  -1,  -1,  -1,  -1]])

In [105]:
train_dataset = tf.data.Dataset.from_tensor_slices((imgs, new_bounds))
train_dataset = train_dataset.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE)

In [106]:
'''#image_generator = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1/255)
#image_data = image_generator.flow_from_directory(data_root, target_size=IMAGE_SHAPE)
#train_dataset = tf.data.Dataset.list_files('gdrive/My Drive/ML/Projects/CSGO_aim/images' + '/*.jpg')
#train_dataset = tf.data.Dataset.from_tensor_slices(tf.data.dataset.)

train_dataset = tf.data.Dataset.list_files('gdrive/My Drive/ML/Projects/CSGO_aim/images' + '/*.jpg')
train_dataset = train_dataset.map(load_image_train, num_parallel_calls=tf.data.experimental.AUTOTUNE)
train_dataset = train_dataset.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE)

print(os.listdir('gdrive/My Drive/ML/Projects/CSGO_aim/images'))'''

"#image_generator = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1/255)\n#image_data = image_generator.flow_from_directory(data_root, target_size=IMAGE_SHAPE)\n#train_dataset = tf.data.Dataset.list_files('gdrive/My Drive/ML/Projects/CSGO_aim/images' + '/*.jpg')\n#train_dataset = tf.data.Dataset.from_tensor_slices(tf.data.dataset.)\n\ntrain_dataset = tf.data.Dataset.list_files('gdrive/My Drive/ML/Projects/CSGO_aim/images' + '/*.jpg')\ntrain_dataset = train_dataset.map(load_image_train, num_parallel_calls=tf.data.experimental.AUTOTUNE)\ntrain_dataset = train_dataset.shuffle(BUFFER_SIZE)\ntrain_dataset = train_dataset.batch(BATCH_SIZE)\n\nprint(os.listdir('gdrive/My Drive/ML/Projects/CSGO_aim/images'))"

In [107]:
IMAGE_SHAPE = (448, 448)
B = 2 # 2 bounding boxes per grid cell
C = 1 
s = 3

model = None

if(os.path.exists('tensorflow:Assets written to: gdrive/My Drive/ML/Projects/CSGO_aim/save/training_1')):
  model = tf.keras.models.load_model("gdrive/My Drive/ML/Projects/CSGO_aim/save/training_1/cp.ckpt")
else:
  inputs = tf.keras.Input(shape=IMAGE_SHAPE + (3, ))
  x = tf.keras.layers.SeparableConv2D(3, 2, strides=(1, 1), padding='same')(inputs)
  x = tf.keras.layers.MaxPool2D(pool_size=(2, 2))(x)
  x = tf.keras.applications.MobileNetV2(#input_shape=IMAGE_SHAPE + (3, ),
                                                include_top=False, weights='imagenet')(x)
  x = tf.keras.layers.Conv2D(64, 3, strides=(2, 2))(x)

  # When applied on top of a Conv2D layer, this is basically a 1x1 feature detector
  # with 2048 feature maps, except the feature detectors are not constrained (since 
  # Conv2D layers impose more constraints)
  x = tf.keras.layers.Dense(256)(x)
  # B is the # of bounding boxes per grid cell, 5 is the amount of things we predict, 
  # x, y, w, h, and confidence
  output_layer_1 = tf.keras.layers.Dense(B * 5, activation='linear')(x)
  # and C is # of classes -- just 1 due to binary
  output_layer_2 = tf.keras.layers.Dense(C, activation='sigmoid')(x)

  model = tf.keras.Model(inputs=inputs, outputs=[output_layer_1, output_layer_2])



In [108]:
for layer in model.layers:
  print(layer.name)
model.get_layer('mobilenetv2_1.00_224').trainable = False

input_7
separable_conv2d_3
max_pooling2d_3
mobilenetv2_1.00_224
conv2d_3
dense_9
dense_10
dense_11


In [109]:
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 448, 448, 3) 0                                            
__________________________________________________________________________________________________
separable_conv2d_3 (SeparableCo (None, 448, 448, 3)  24          input_7[0][0]                    
__________________________________________________________________________________________________
max_pooling2d_3 (MaxPooling2D)  (None, 224, 224, 3)  0           separable_conv2d_3[0][0]         
__________________________________________________________________________________________________
mobilenetv2_1.00_224 (Model)    multiple             2257984     max_pooling2d_3[0][0]            
____________________________________________________________________________________________

In [110]:
sample_imgs = train_dataset.take(1)

for X, y in sample_imgs:

  print(np.array(model(X)[0]).shape)
  print(np.array(model(X)[1]).shape)
  print(np.array(y).shape)
  print(y)
  for item in X:
    print(np.array(item).shape)
    break

(8, 3, 3, 10)
(8, 3, 3, 1)
(8, 10, 5)
tf.Tensor(
[[[245 131 289 232   1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]]

 [[255 111 279 162   0]
  [113  75 178 191   0]
  [ 58 106  82 137   0]
  [ 95 106 121 151   0]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]]

 [[205  93 227 128   0]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]]

 [[213 119 281 201   1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -1  -1  -1  -1]
  [ -1  -

We have an architecture which outputs a 7x7x11 grids, with the grids determining the bounding box coordinates, and the classification determining what class the object is



In [111]:
# In an sxs grid, this identifies which grid is responsible; only one bounding box per grid is responsible
# label expectation: [(x, y, w, h)] unnormalized
# returns list of responsible grids
def get_responsible_grids(input_img, labels, s): 
  responsible_grids = []

  for obj in labels:
    # x and y are the center of the object
    sw = obj[0] // (IMG_WIDTH / s) 
    sh = obj[1] // (IMG_HEIGHT / s) 
    responsible_grids.append((int(sw), int(sh)))
    
  return responsible_grids


In [112]:
get_responsible_grids(0, [(300, 300, 100, 100)], 3)

[(2, 2)]

The ground truth needs to be made in such a way that bounding boxes which are not responsible are not penalized (the gradient does not affect them). This can be done by watching only the parameters which yield the "responsible" grids.  

The hard part is figuring out how to implement selective gradients for only the parameters which yield responsible boxes. Based on the loss function: 

*   coordinate error and width/height error are only penalized for the RESPONSIBLE bounding box if an object's center appears in the grid
*   Confidence error is weighted more heavily if an object appears in the image (and should be 0 if the object does not appear in the image; furthermore, both boxes take the loss for confidence)
*   The classifier is only penalized if an object appears in the grid




Penalizing confidence: check if the grid which the bounding box belongs to is responsible - if not, then scale the confidence loss down by lambda 
If the grid is responsible, find which of the bounding boxes is responsible by calculating the intersection over union

Found an IOU utility function here: https://gist.github.com/meyerjo/dd3533edc97c81258898f60d8978eddc

In [113]:
# Intersection over union
# The parameters are not width and height, but rather the upper left and 
# lower right coords
def bb_intersection_over_union(boxA, boxB):
    # determine the (x, y)-coordinates of the intersection rectangle
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    # compute the area of intersection rectangle
    interArea = abs(max((xB - xA, 0)) * max((yB - yA), 0))
    if interArea == 0:
        return 0
    # compute the area of both the prediction and ground-truth
    # rectangles
    boxAArea = abs((boxA[2] - boxA[0]) * (boxA[3] - boxA[1]))
    boxBArea = abs((boxB[2] - boxB[0]) * (boxB[3] - boxB[1]))

    # compute the intersection over union by taking the intersection
    # area and dividing it by the sum of prediction + ground-truth
    # areas - the interesection area
    iou = interArea / float(boxAArea + boxBArea - interArea)

    # return the intersection over union value
    return iou

In [114]:
bb_intersection_over_union([0, 0, 200, 200], [100, 100, 300, 300])

0.14285714285714285

We need to create a function which normalizes the width and height prediction of the network between 0 and 1 so that all components equally contribute to the loss function. 

This function takes in a label, and outputs x-y from 0 to 1 in the range of the grid cell, and a width and height relative to the entire image.

In [115]:
# Given label NOT IN A LIST FORM
def normalize_label(label, s):
  responsible_grid = get_responsible_grids(0, [label], s)[0]
  
  x = label[0]
  y = label[1]
  w = label[2]
  h = label[3]

  x = (x % (IMG_WIDTH // s)) / (IMG_WIDTH // s)
  y = (y % (IMG_WIDTH // s)) / (IMG_HEIGHT // s)

  w /= IMG_WIDTH
  h /= IMG_HEIGHT

  return x, y, w, h

In [116]:
normalize_label((64,64,50,50), 7)

(0.0, 0.0, 0.11160714285714286, 0.11160714285714286)

For each label in the input image, figure out which grid is responsible and then figure out which bounding box is responsible

One of the hard parts in this was normalizing the box coordinates for when the grid contains the bounding box

A lot of issues here. I am only processing one image at a time in order to make the code simpler. However, tensorflow functions require whatever they are processing to have some shape which also accounts for the batch size. I mitigate most of that here by indexing in order to compute the majority of the loss function. When calculating binary crossentropy, I have to convert label[-1] in to batch tensor form (None, ...) as well as my predictions since I am directly using a tensorflow function.

In [117]:
loss_object = tf.keras.losses.BinaryCrossentropy()

# Simple sxs grid with x, y, w, h, confidence level, and class for each image
# The ground truth needs to be made in such a way that bounding boxes which 
# are not responsible are not penalized, and same with the classification

def compute_loss(predictions_1, predictions_2, input_img, labels, s):
  #ground_truth_output_1 = np.empty(shape=(None, s, s, B * 5))
  #ground_truth_output_2 = np.empty(shape=(None, s, s, C))
  
  # All bounding boxes in the grid

  # contains responsible bounding box
  bounding_boxes = []
  indexes = []
  max_ious = []
  responsible_box = None
  loss = 0

  lambda_coord = 5
  lambda_noobj = 0.5

  # for each label
  for label in labels: 
    if label[0] == -1:
      return loss
    # for each grid
    max_iou = 0 

    for i in range(s):
      for j in range(s):
        # If this is a responsible grid/ an object is in this grid
        wh_loss = 0
        xy_loss = 0
        confidence_loss = 0
        classification_loss = 0

        if get_responsible_grids(input_img, [label], s)[0] == (i, j): #) != -1:
          # for each bounding box
          for k in range(B):
            bb = predictions_1[0, i, j, k*5:(k+1)*5]
            #bounding_boxes.append(bb)

            x1, y1 = bb[0] - bb[2] // 2, bb[1] - bb[3] // 2  
            x2, y2 = bb[0] + bb[2] // 2, bb[1] + bb[3] // 2  
            box1 = (x1, y1, x2, y2)
            labelx1, labely1 = label[0] - label[2] // 2, label[1] - label[3] // 2  
            labelx2, labely2 = label[0] + label[2] // 2, label[1] + label[3] // 2 


            #predicted_confidence = bb[(k+1)*5-1] # predicted confidence
            predicted_confidence = bb[(0+1)*5-1]

            true_confidence_iou = bb_intersection_over_union((x1, y1, x2, y2), (labelx1, labely1, labelx2, labely2))
            
            if true_confidence_iou >= max_iou:
              bounding_boxes.append((i, j, k))

              label = normalize_label(label, s)
              label = tf.convert_to_tensor(label, dtype=tf.float32)

              # xy coordinate squared difference (centered)
              xy_loss = lambda_coord * ((bb[0] - label[0])**2 + (bb[1] - label[1])**2)

              # in order to take the square root
              sqrt_w = tf.clip_by_value(bb[2], 0, 100)
              sqrt_h = tf.clip_by_value(bb[3], 0, 100)

              #wh_loss = lambda_coord * ((tf.math.sqrt(bb[2]) - tf.math.sqrt(label[2]))**2 
              #                         + (tf.math.sqrt(bb[3]) - tf.math.sqrt(label[3]))**2)
              wh_loss = lambda_coord * (sqrt_w - tf.math.sqrt(label[2]))**2 + (sqrt_h - tf.math.sqrt(label[3]))**2
              # Confidence of responsible boxes
              confidence_loss = (true_confidence_iou - predicted_confidence) ** 2

          # classification loss
          classification_loss = loss_object(tf.convert_to_tensor([label[-1]]), 
                                            tf.convert_to_tensor([predictions_2[0, i, j, 0]]))
        else: 
           # for each bounding box
          for k in range(B):
            bb = predictions_1[0, i, j, k*5:(k+1)*5]
            #bounding_boxes.append(bb)

            x1, y1 = bb[0] - bb[2] // 2, bb[1] - bb[3] // 2  
            x2, y2 = bb[0] + bb[2] // 2, bb[1] + bb[3] // 2  
            box1 = (x1, y1, x2, y2)
            labelx1, labely1 = label[0] - label[2] // 2, label[1] - label[3] // 2  
            labelx2, labely2 = label[0] + label[2] // 2, label[1] + label[3] // 2 

  

            #predicted_confidence = bb[(k+1)*5-1] # predicted confidence
            predicted_confidence = bb[(0+1)*5-1]

            true_confidence_iou = bb_intersection_over_union((x1, y1, x2, y2), (labelx1, labely1, labelx2, labely2))
            
            if true_confidence_iou >= max_iou:
              bounding_boxes.append((i, j, k))

              label = normalize_label(label, s)
              label = tf.convert_to_tensor(label, dtype=tf.float32)

              # Confidence of responsible boxes
              confidence_loss = lambda_noobj * ((true_confidence_iou - predicted_confidence) ** 2)

          # classification loss
          classification_loss = 0
  
        # only one box is responsible for confidence
        loss += (confidence_loss + xy_loss + wh_loss + classification_loss)
  return loss

In [118]:
for sample in train_dataset.take(1):
  outputs = model(sample)
  # label must be passed in as a list of tuples
  loss = compute_loss(outputs[0], outputs[1], 0, [[50, 50, 100, 100, 0]], s)

print(loss)

tf.Tensor(75.35901, shape=(), dtype=float32)


In [119]:
checkpoint_path = "gdrive/My Drive/ML/Projects/CSGO_aim/save/training_1/cp.ckpt"

cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, save_weights_only=True, verbose=1)


def train(steps):
  optimizer = tf.keras.optimizers.Adam(1e-4)
 
  for n in range(steps):
    with tf.GradientTape(persistent=True) as tape:
      for img_batch, labels in train_dataset.take(1):
        loss = 0
        for img, label in zip(img_batch, labels):
          img = tf.expand_dims(img, 0)
          #with tf.GradientTape() as tape: 
          outputs = model(img)
          loss += compute_loss(outputs[0], outputs[1], img, label.numpy(), s)
    
    gradients = tape.gradient(loss, model.trainable_variables)
    gradients = [grad if grad is not None else tf.zeros_like(var)
      for var, grad in zip(model.trainable_variables, gradients)]
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    if n % 50 == 0:
      model.save(checkpoint_path)

In [120]:
train(2000)

INFO:tensorflow:Assets written to: gdrive/My Drive/ML/Projects/CSGO_aim/save/training_1/cp.ckpt/assets
INFO:tensorflow:Assets written to: gdrive/My Drive/ML/Projects/CSGO_aim/save/training_1/cp.ckpt/assets
INFO:tensorflow:Assets written to: gdrive/My Drive/ML/Projects/CSGO_aim/save/training_1/cp.ckpt/assets
INFO:tensorflow:Assets written to: gdrive/My Drive/ML/Projects/CSGO_aim/save/training_1/cp.ckpt/assets
INFO:tensorflow:Assets written to: gdrive/My Drive/ML/Projects/CSGO_aim/save/training_1/cp.ckpt/assets
INFO:tensorflow:Assets written to: gdrive/My Drive/ML/Projects/CSGO_aim/save/training_1/cp.ckpt/assets
INFO:tensorflow:Assets written to: gdrive/My Drive/ML/Projects/CSGO_aim/save/training_1/cp.ckpt/assets
INFO:tensorflow:Assets written to: gdrive/My Drive/ML/Projects/CSGO_aim/save/training_1/cp.ckpt/assets
INFO:tensorflow:Assets written to: gdrive/My Drive/ML/Projects/CSGO_aim/save/training_1/cp.ckpt/assets
INFO:tensorflow:Assets written to: gdrive/My Drive/ML/Projects/CSGO_aim/s

In [121]:
for img_batch, labels in train_dataset.take(1):

  img = img_batch[0]
  
  img = tf.expand_dims(img, 0)
  output_1, output_2 = model(img)

  # print(output_1[0][0][0][:5]) # bounding box #1 5 predictions
  # print(output_2[0][1][2][0])

  for row in range(s):
    for col in range(s):
      # Confident that there is an object in there
      if output_1[0][row][col][4] > 0.5:
        print('Model is confident', output_1[0][row][col][4])
        model_coords = output_1[0][row][col][:4]
        true_y = row * (IMG_HEIGHT // s) + (tf.clip_by_value(model_coords[0], 0, 1) ** 2) * (IMG_HEIGHT // s)
        true_x = col * (IMG_WIDTH // s) + (tf.clip_by_value(model_coords[1], 0, 1) ** 2) * (IMG_WIDTH // s) 
        print(true_x, true_y)
        #print(output_1[0][row][col][:4])

  print(labels[0].numpy())

  #print(get_responsible_grids(0, labels[0].numpy(), s))

  #print(labels[0].numpy())



Model is confident tf.Tensor(2.7508774, shape=(), dtype=float32)
tf.Tensor(149.0, shape=(), dtype=float32) tf.Tensor(149.0, shape=(), dtype=float32)
Model is confident tf.Tensor(2.1648984, shape=(), dtype=float32)
tf.Tensor(298.0, shape=(), dtype=float32) tf.Tensor(0.0, shape=(), dtype=float32)
Model is confident tf.Tensor(2.3041263, shape=(), dtype=float32)
tf.Tensor(298.0, shape=(), dtype=float32) tf.Tensor(0.0, shape=(), dtype=float32)
Model is confident tf.Tensor(1.5411993, shape=(), dtype=float32)
tf.Tensor(0.0, shape=(), dtype=float32) tf.Tensor(298.0, shape=(), dtype=float32)
[[279 137 320 191   0]
 [ -1  -1  -1  -1  -1]
 [ -1  -1  -1  -1  -1]
 [ -1  -1  -1  -1  -1]
 [ -1  -1  -1  -1  -1]
 [ -1  -1  -1  -1  -1]
 [ -1  -1  -1  -1  -1]
 [ -1  -1  -1  -1  -1]
 [ -1  -1  -1  -1  -1]
 [ -1  -1  -1  -1  -1]]


In [122]:
import cv2

img = cv2.imread('gdrive/My Drive/ML/Projects/CSGO_aim/images/1.jpg')
img.dtype

dtype('uint8')