<a href="https://colab.research.google.com/github/pirate2580/asl_classification/blob/main/YOLOTraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import numpy as np
import tensorflow as tf

from google.colab import drive

# Mount Google Drive so the project folder is reachable from the Colab VM.
drive.mount('/content/drive')

# Use the absolute mount path: a relative path only resolves while the working
# directory is still /content, so re-running this cell after the first chdir
# would otherwise fail with FileNotFoundError.
os.chdir("/content/drive/My Drive/ASL_project")
os.listdir()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


['asl_alphabet_train',
 'trainx.npy',
 'trainy.npy',
 'YOLO_predictions.ipynb',
 'train_data',
 'valx.npy',
 'valy.npy',
 'testx.npy',
 'testy.npy',
 'yolo_resnet_50.h5',
 'YOLOASL.ipynb']

In [None]:
# Class names and their integer indices (A -> 0, ..., Z -> 25).
alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
labels = {letter: index for index, letter in enumerate(alphabet)}

# Detection-grid configuration.
B = 1                                # no. of bounding boxes
N_CLASSES = 26                       # number of classes
H = W = 224                          # input image height / width in pixels
SCREEN_HEIGHT, SCREEN_WIDTH = H, W
SPLIT_SIZE = H // 32                 # S=7

# Training configuration.
N_EPOCHS = 135
BATCH_SIZE = 32



In [None]:
# One image file and one label file per letter, e.g. train_data/a.npy and
# train_data/a_lab.npy.
in_file_names = [f'train_data/{letter.lower()}.npy' for letter in alphabet]
out_file_names = [f'train_data/{letter.lower()}_lab.npy' for letter in alphabet]

# Read every per-letter array into memory.
in_data_list = [np.load(path) for path in in_file_names]
out_data_list = [np.load(path) for path in out_file_names]

# Join the per-letter arrays along the example axis, in alphabet order.
train_images = np.concatenate(in_data_list, axis=0)
train_labels = np.concatenate(out_data_list, axis=0)

# Held-out examples used to track the loss across epochs.
val_images = np.load('valx.npy')
val_labels = np.load('valy.npy')

# Total number of training examples.
N_EXAMPLES = len(train_images)

In [None]:
def find_relative_bboxes(labels, screen_width=None, screen_height=None):
  """
  Convert the pixel-space boxes stored in label rows into normalized
  centre/size boxes.

  Columns 1..4 of every row are read as (xmin, ymin, width, height).

  Args:
    labels: 2-D array with one row per example; only columns 1..4 are read.
    screen_width: image width used for normalization. Defaults to the
      notebook-level SCREEN_WIDTH.
    screen_height: image height used for normalization. Defaults to the
      notebook-level SCREEN_HEIGHT.

  Returns:
    Array of shape (n_examples, 4) holding
    [x_centre / width, y_centre / height, box_width / width, box_height / height]
    for each row. An empty input yields an array of shape (0, 4).
  """
  if screen_width is None:
    screen_width = SCREEN_WIDTH
  if screen_height is None:
    screen_height = SCREEN_HEIGHT

  labels = np.asarray(labels)
  if labels.size == 0:
    # Nothing to convert; keep the (n, 4) contract instead of failing.
    return np.zeros((0, 4))

  xmin = labels[:, 1]
  ymin = labels[:, 2]
  xmax = xmin + labels[:, 3]
  ymax = ymin + labels[:, 4]

  # All rows are processed at once instead of one Python iteration per row.
  return np.stack(
      [
          (xmin + xmax) / (2 * screen_width),
          (ymin + ymax) / (2 * screen_height),
          (xmax - xmin) / screen_width,
          (ymax - ymin) / screen_height,
      ],
      axis = 1,
  )

In [None]:
# Normalized centre/size boxes for the training and validation labels.
train_bboxes, val_bboxes = map(find_relative_bboxes, (train_labels, val_labels))

In [None]:
def generate_output(bounding_boxes, labels):
  """
  Build the per-example grid target consumed by the loss.

  Each example contributes exactly one box. The grid cell containing the box
  centre receives [1, x_offset, y_offset, width, height, class values...];
  every other cell stays zero.

  Args:
    bounding_boxes: array of shape (n_examples, 4) with normalized
      [x_centre, y_centre, width, height] rows.
    labels: array with one row per example; columns 5 onward are copied into
      the class slots of the target.

  Returns:
    tf.float64 tensor of shape
    (n_examples, SPLIT_SIZE, SPLIT_SIZE, N_CLASSES + 5 * B).
    The first grid axis is indexed by the x cell and the second by the y cell.
  """
  bounding_boxes = np.asarray(bounding_boxes)
  labels = np.asarray(labels)
  n_examples = bounding_boxes.shape[0]

  # N_EXAMPLESx7x7x31 tensor
  output_label = np.zeros((n_examples, SPLIT_SIZE, SPLIT_SIZE, N_CLASSES + 5 * B))

  # Box centres expressed in grid units.
  grid_xy = bounding_boxes[:, 0:2] * SPLIT_SIZE

  # Cell that owns each centre, clamped so centres on or past the image edge
  # still land inside the grid.
  cell_ij = np.clip(np.trunc(grid_xy), 0, SPLIT_SIZE - 1).astype(int)
  example = np.arange(n_examples)
  i, j = cell_ij[:, 0], cell_ij[:, 1]

  output_label[example, i, j, 0] = 1.
  # Offset is measured from the assigned cell so that (cell + offset) always
  # reconstructs the centre, including when the cell index had to be clamped.
  # For centres inside the image this equals the fractional part.
  output_label[example, i, j, 1:3] = grid_xy - cell_ij
  output_label[example, i, j, 3:5] = bounding_boxes[:, 2:4]

  # copies the class values of each label row into the owning cell
  output_label[example, i, j, 5:] = labels[:, 5:]

  return tf.convert_to_tensor(output_label, tf.float64)

In [None]:
# Grid targets for the training and validation sets, built from the normalized
# boxes and the label rows.
train_output = generate_output(train_bboxes, train_labels)
val_output = generate_output(val_bboxes, val_labels)

In [None]:
# Create TensorFlow Datasets from NumPy array and TensorFlow tensor
images_dataset = tf.data.Dataset.from_tensor_slices(train_images) # for all n images
labels_dataset = tf.data.Dataset.from_tensor_slices(generate_output(train_bboxes, train_labels)) # generates label for 1 image

val_images_dataset = tf.data.Dataset.from_tensor_slices(val_images) # for all n images
val_labels_dataset = tf.data.Dataset.from_tensor_slices(generate_output(val_bboxes, val_labels)) # generates label for 1 image

# Combine the two datasets into a single dataset
train_dataset = tf.data.Dataset.zip((images_dataset, labels_dataset))
val_dataset = tf.data.Dataset.zip((val_images_dataset, val_labels_dataset))

train_dataset = (
    train_dataset.
    batch(BATCH_SIZE).
    prefetch(tf.data.AUTOTUNE)
)

val_dataset = (
    val_dataset.
    batch(BATCH_SIZE).
    prefetch(tf.data.AUTOTUNE)
)

In [None]:
NUM_FILTERS = 512  # filters in each head Conv2D layer and units in the hidden Dense layer
OUTPUT_DIM = N_CLASSES + 5*B  # values per grid cell: 5 per box plus one per class

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50, MobileNet
from tensorflow.keras.layers import Conv2D, BatchNormalization, LeakyReLU, Flatten, Dense, Reshape, Dropout

In [None]:
# MobileNet backbone with ImageNet weights and no classification top, so it
# emits a spatial feature map for the detection head.
base_model = MobileNet(
    input_shape = (H, W, 3),
    include_top = False,
    weights = 'imagenet',
)
# Fine-tune the backbone together with the head.
base_model.trainable = True

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet/mobilenet_1_0_224_tf_no_top.h5


In [None]:
# YOLO-like model built on a MobileNet backbone pretrained on ImageNet dataset
model = tf.keras.Sequential([
    base_model,

    # Convolutional head: four 3x3 convolutions at the backbone's resolution.
    Conv2D(NUM_FILTERS, (3,3), padding = 'same', kernel_initializer = 'he_normal'),
    BatchNormalization(),
    LeakyReLU(alpha = 0.1),

    Conv2D(NUM_FILTERS, (3,3), padding = 'same', kernel_initializer = 'he_normal'),
    BatchNormalization(),
    LeakyReLU(alpha = 0.1),

    Conv2D(NUM_FILTERS, (3,3), padding = 'same', kernel_initializer = 'he_normal'),
    BatchNormalization(),
    LeakyReLU(alpha = 0.1),

    Conv2D(NUM_FILTERS, (3,3), padding = 'same', kernel_initializer = 'he_normal'),
    LeakyReLU(alpha = 0.1),

    Flatten(),

    # Fully connected head.
    Dense(NUM_FILTERS, kernel_initializer = 'he_normal'),
    BatchNormalization(),
    LeakyReLU(alpha = 0.1),

    Dropout(0.4),
    Dense(SPLIT_SIZE * SPLIT_SIZE * OUTPUT_DIM, activation = 'sigmoid'),
    # No Dropout after the output layer: during training it would zero a
    # fraction of the predictions and scale the rest by 1 / (1 - rate), so the
    # loss would be computed on corrupted predictions. Dropout has no weights,
    # so omitting it does not affect saving or loading weights.
    Reshape((SPLIT_SIZE, SPLIT_SIZE, OUTPUT_DIM)),

])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 mobilenet_1.00_224 (Functi  (None, 7, 7, 1024)        3228864   
 onal)                                                           
                                                                 
 conv2d (Conv2D)             (None, 7, 7, 512)         4719104   
                                                                 
 batch_normalization (Batch  (None, 7, 7, 512)         2048      
 Normalization)                                                  
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 7, 7, 512)         0         
                                                                 
 conv2d_1 (Conv2D)           (None, 7, 7, 512)         2359808   
                                                                 
 batch_normalization_1 (Bat  (None, 7, 7, 512)         2

In [None]:
def difference(x, y):
  """Return the sum over all elements of the squared difference (y - x)."""
  delta = y - x
  squared = tf.square(delta)
  return tf.reduce_sum(squared)

In [None]:
def compute_IOU(boxes1, boxes2):
  """
  Intersection over union of two sets of boxes given as tensors.

  The last axis of each input is read as [x_centre, y_centre, width, height].
  The result drops that axis and is clipped to the range [0, 1].
  """
  def to_corners(boxes):
    # [x_centre, y_centre, w, h] -> [x_min, y_min, x_max, y_max]
    x_centre, y_centre = boxes[..., 0], boxes[..., 1]
    half_w = boxes[..., 2] / 2.0
    half_h = boxes[..., 3] / 2.0
    return tf.stack(
        [x_centre - half_w, y_centre - half_h, x_centre + half_w, y_centre + half_h],
        axis = -1,
    )

  corners1 = to_corners(boxes1)
  corners2 = to_corners(boxes2)

  # Overlap rectangle: the larger of the min corners, the smaller of the max corners.
  top_left = tf.maximum(corners1[..., :2], corners2[..., :2])
  bottom_right = tf.minimum(corners1[..., 2:], corners2[..., 2:])

  # Negative extents mean the boxes do not overlap on that axis.
  overlap = tf.maximum(0.0, bottom_right - top_left)
  inter_area = overlap[..., 0] * overlap[..., 1]

  area1 = boxes1[..., 2] * boxes1[..., 3]
  area2 = boxes2[..., 2] * boxes2[..., 3]

  # Lower bound on the union avoids division by zero for degenerate boxes.
  union_area = tf.maximum(area1 + area2 - inter_area, 1e-10)
  return tf.clip_by_value(inter_area / union_area, 0.0, 1.0)

In [None]:
def yolo_loss(y_true, y_pred):
  '''
  YOLO-style sum-of-squares loss for a grid with a single box per cell.

  y_true = 7x7x31 where 31 is from [p, bx, by, bw, bh, 26 classes...]
  y_pred = 7x7x31 where 31 is from [p, bx, by, bw, bh, 26 classes...]

  Both inputs carry a leading batch axis. A cell "contains an object" when
  its target confidence p equals 1.

  The loss is the sum over the whole batch of:
    * object loss     - squared error between predicted confidence and 1 in
                        cells that contain an object
    * no-object loss  - squared predicted confidence in all other cells,
                        weighted by lambda_no_obj
    * box loss        - squared error on the centre offsets plus squared error
                        on the square roots of the sizes, in object cells,
                        weighted by lambda_coord
    * class loss      - squared error on the class values in object cells

  Returns a scalar float32 tensor.

  With one box per cell there is no "responsible box" to choose, so the
  prediction of the only box is used directly. The earlier version derived an
  index from an IOU cast to int; that IOU compared the target with itself, and
  an index of 1 pointed past the end of a size-1 axis (an error on CPU, silent
  zeros on GPU).
  '''
  # Note: parameters have been changed from the original paper from 0.5, 5 to
  # the values below
  # this is because the model tends to make errors in class predictions too often
  # lowering the parameter values for no object and coordinate allows more focus on
  # class error
  lambda_no_obj = 0.01
  lambda_coord = 0.5

  # Keeps the gradient of sqrt finite when a size is exactly 0.
  eps = 1e-6

  # Work in a single dtype so targets and predictions can be subtracted.
  y_pred = tf.cast(y_pred, dtype = tf.float32)
  y_true = tf.cast(y_true, dtype = tf.float32)

  target = y_true[..., 0]
  obj_idx = tf.where(tf.equal(target, 1))      # cells that contain an object
  no_obj_idx = tf.where(tf.equal(target, 0))   # cells that do not

  # Rows of [p, bx, by, bw, bh, classes...] for the object cells only.
  true_obj = tf.gather_nd(y_true, obj_idx)
  pred_obj = tf.gather_nd(y_pred, obj_idx)

  # Object loss: predicted confidence should be 1 where an object exists.
  obj_conf = pred_obj[..., 0]
  object_loss = difference(obj_conf, tf.ones_like(obj_conf))

  # No object loss: predicted confidence should be 0 everywhere else.
  no_obj_conf = tf.gather_nd(y_pred[..., 0], no_obj_idx)
  no_object_loss = difference(no_obj_conf, tf.zeros_like(no_obj_conf))

  # Object class loss (channels 5 onward hold the class values for one box).
  class_loss = difference(pred_obj[..., 5:], true_obj[..., 5:])

  # Object bounding box loss: centre offsets directly, sizes through sqrt.
  centre_loss = difference(pred_obj[..., 1:3], true_obj[..., 1:3])
  size_loss = difference(
      tf.math.sqrt(tf.math.abs(pred_obj[..., 3:5]) + eps),
      tf.math.sqrt(tf.math.abs(true_obj[..., 3:5]) + eps),
  )
  box_loss = centre_loss + size_loss

  loss = object_loss + (lambda_no_obj * no_object_loss) + (lambda_coord * box_loss) + class_loss
  return loss

In [None]:
# Where the best weights found during training are written.
checkpoint_filepath = '/content/drive/MyDrive/ASL_project/yolo_mobilenet.h5'

# Save weights only, and only when the validation loss reaches a new minimum.
callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_filepath,
    monitor = 'val_loss',
    mode = 'min',
    save_best_only = True,
    save_weights_only = True,
)


In [None]:
def scheduler(epoch, lr):
  """
  Step learning-rate schedule keyed on the epoch index.

  Returns 5e-5 before epoch 10, 1e-5 before epoch 20 and 1e-6 afterwards.
  The incoming `lr` is ignored.
  """
  # (exclusive upper epoch bound, learning rate) pairs, checked in order.
  steps = ((10, 5e-5), (20, 1e-5))
  for upper_bound, rate in steps:
    if epoch < upper_bound:
      return rate
  return 1e-6

In [None]:
# Keras callback that asks scheduler(epoch, lr) for the learning rate of each epoch.
lr_callback = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [None]:
# Adam with the same starting rate that scheduler() returns for the first epochs;
# the LearningRateScheduler callback sets the rate per epoch during fit().
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(
    loss = yolo_loss,
    optimizer = optimizer
)

In [None]:
# Repeats the Drive mount performed at the top of the notebook, so the
# checkpoint path under /content/drive is available before training starts.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Train with the learning-rate schedule and best-weights checkpointing.
# NOTE(review): the epoch count is hard-coded to 30 here; the N_EPOCHS constant
# defined with the other settings is not used.
history = model.fit(
    train_dataset,
    validation_data = val_dataset,
    verbose = 1,
    epochs = 30,
    callbacks = [lr_callback, callback]
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
# Load the held-out test set and push it through the same preprocessing as the
# training data, printing each shape as a sanity check.
test_images = np.load('testx.npy')
test_labels = np.load('testy.npy')
print(test_images.shape)
print(test_labels.shape)
# Normalized centre/size boxes, one row per test example.
test_bboxes = find_relative_bboxes(test_labels)
print(test_bboxes.shape)
# Grid targets in the layout the loss expects.
test_output = generate_output(test_bboxes, test_labels)
print(test_output.shape)

(100, 224, 224, 3)
(100, 31)
(100, 4)
(100, 7, 7, 31)


In [None]:
# Rebuild the network and restore the checkpointed weights. The layers that
# hold weights must match the training model for the weights to load.
base_model = tf.keras.applications.MobileNet(
    weights = 'imagenet',
    input_shape = (H, W, 3),
    include_top = False
)
base_model.trainable = True

model = tf.keras.Sequential([
    base_model,

    # Convolutional head: four 3x3 convolutions at the backbone's resolution.
    Conv2D(NUM_FILTERS, (3,3), padding = 'same', kernel_initializer = 'he_normal'),
    BatchNormalization(),
    LeakyReLU(alpha = 0.1),

    Conv2D(NUM_FILTERS, (3,3), padding = 'same', kernel_initializer = 'he_normal'),
    BatchNormalization(),
    LeakyReLU(alpha = 0.1),

    Conv2D(NUM_FILTERS, (3,3), padding = 'same', kernel_initializer = 'he_normal'),
    BatchNormalization(),
    LeakyReLU(alpha = 0.1),

    Conv2D(NUM_FILTERS, (3,3), padding = 'same', kernel_initializer = 'he_normal'),
    LeakyReLU(alpha = 0.1),

    Flatten(),

    # Fully connected head.
    Dense(NUM_FILTERS, kernel_initializer = 'he_normal'),
    BatchNormalization(),
    LeakyReLU(alpha = 0.1),

    Dropout(0.5),
    Dense(SPLIT_SIZE * SPLIT_SIZE * OUTPUT_DIM, activation = 'sigmoid'),
    # No Dropout after the output layer: during training it would zero a
    # fraction of the predictions and scale the rest by 1 / (1 - rate), so the
    # loss would be computed on corrupted predictions. Dropout has no weights,
    # so omitting it does not affect loading the saved weights.
    Reshape((SPLIT_SIZE, SPLIT_SIZE, OUTPUT_DIM)),

])

# Restore the best weights written by the ModelCheckpoint callback.
model.load_weights('/content/drive/MyDrive/ASL_project/yolo_mobilenet.h5')



# Recompile with a lower learning rate than the initial training run used.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(
    loss = yolo_loss,
    optimizer = optimizer,
)
