**We mount our data from our Google Drive repository.**

In [None]:
# Make the Google Drive contents available under /content/drive.
from google.colab import drive

drive.mount(
    "/content/drive",
    force_remount=True,
)

Mounted at /content/drive


**Load the modules we need.**

In [None]:
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape
from keras.layers.convolutional import Conv3D, MaxPooling3D,Conv2D,AveragePooling2D,AveragePooling3D
from keras.layers import Dense, GlobalAveragePooling3D,GlobalAveragePooling2D
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler,ReduceLROnPlateau
from keras.optimizers import SGD, RMSprop, Adadelta
from keras.utils import np_utils, generic_utils
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional_recurrent import ConvLSTM2D

import theano
import os
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import cv2
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn import preprocessing

# Image specification.
# NOTE: the loading cells call cv2.resize(frame, (img_rows, img_cols)); OpenCV's dsize
# is (width, height), so frames end up 57 pixels high and 125 pixels wide
# (see the printed array shape (18, 16, 57, 125, 3)).
img_rows = 125
img_cols = 57
# Number of frames per clip; used as the first dimension of the model's input_shape.
patch_size = 16

# Our gestures

We only use 6 gestures so that training does not take forever.
Note that the order is important, since the labels are assigned according to it.

In [None]:
# Gesture classes. The position of a name in this tuple is used as its integer label.
gestures = (
    "Rolling Hand Backward",
    "Rolling Hand Forward",
    "Stop",
    "Swiping Left",
    "Swiping Right",
    "No Gesture",
)

number_gestures = len(gestures)

**The following cells are probably of no use anymore, since we are now able to load the Jester set and do so later on.**

Here we just use the few gesture videos we produced ourselves. They are not of much use anymore since there are only about 18 of them.

In [None]:
from tqdm import tqdm

# Number of videos found for each gesture (same order as `gestures`).
gestures_count = np.zeros(number_gestures, dtype = int)

X_tr = []           # one array of RGB frames per video
# Load all self-recorded videos, gesture by gesture. The samples therefore end up
# grouped by gesture, which the label-creation cell relies on.
for gesture_index in range(number_gestures):
  ls_path = os.path.join("/content/drive/MyDrive/generated_data/", gestures[gesture_index])
  listing = os.listdir(ls_path)

  for ls in tqdm(listing):
    video_path = os.path.join(ls_path, ls)

    frames = []
    # Only the first 16 frame files (in sorted file-name order) are used.
    for imgs in sorted(os.listdir(video_path))[:16]:
      img = os.path.join(video_path, imgs)
      frame = cv2.imread(img)
      if frame is None:
        # cv2.imread reports an unreadable file by returning None instead of raising,
        # which would otherwise surface as an obscure error inside cv2.resize.
        raise IOError("Could not read image file: " + img)
      frame = cv2.resize(frame, (img_rows, img_cols), interpolation=cv2.INTER_AREA)
      # OpenCV loads images as BGR; convert to RGB.
      frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    # The previous version rolled the first three axes three times in a row, which
    # puts every axis back where it started, so the frames are stored as they are.
    X_tr.append(np.array(frames))
    gestures_count[gesture_index] += 1

100%|██████████| 3/3 [00:19<00:00,  6.45s/it]
100%|██████████| 4/4 [00:25<00:00,  6.32s/it]
100%|██████████| 2/2 [00:13<00:00,  6.55s/it]
100%|██████████| 3/3 [00:19<00:00,  6.36s/it]
100%|██████████| 4/4 [00:25<00:00,  6.26s/it]
100%|██████████| 2/2 [00:12<00:00,  6.13s/it]


In [None]:
# Combine the per-video frame arrays into one array.
X_tr_array = np.array(X_tr)
# Axes: (num_samples, frames per video, image height, image width, colour channels)
print(X_tr_array.shape)

num_samples = X_tr_array.shape[0]

(18, 16, 57, 125, 3)


In [None]:
# One integer label per sample. The videos were appended gesture by gesture, so the
# label vector is simply gesture index i repeated gestures_count[i] times.
label = np.zeros(num_samples, dtype=int)
label[:gestures_count.sum()] = np.repeat(np.arange(number_gestures), gestures_count)
print(label)



[0 0 0 1 1 1 1 2 2 3 3 3 4 4 4 4 5 5]


In [None]:
img_depth = 16     # frames per video
train_data = [X_tr_array, label]

X_train, y_train = train_data
print('X_Train shape:', X_train.shape)

# Copy the frames into a float array, one video at a time.
train_set = np.zeros((num_samples, img_depth, img_cols, img_rows, 3))
for sample_idx in range(num_samples):
    train_set[sample_idx, ...] = X_train[sample_idx]

patch_size = 16    # img_depth or number of frames used for each video

print(train_set.shape, 'train samples')


X_Train shape: (18, 16, 57, 125, 3)
(18, 16, 57, 125, 3) train samples


In [None]:
# One-hot encode the integer labels (one column per gesture).
Y_train = np_utils.to_categorical(y_train, num_classes=number_gestures)
print(Y_train)

[[1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]]


In [None]:
# Pre-processing: centre the pixel values around 0, then scale by the largest value.
train_set = train_set.astype('float32')

pixel_mean = np.mean(train_set)
print(pixel_mean)
train_set -= pixel_mean

pixel_peak = np.max(train_set)
print(pixel_peak)
train_set /= pixel_peak

112.09022
142.90979


# Our Model

The model we use is already pretrained on 600 videos provided by the Jester dataset.
The dense layers are the part we work with; each hidden dense layer is followed by a dropout layer. Everything before them just preprocesses the images.

The input for each prediction has to be 16 pictures stacked in the way shown below.

The code at the end lets you choose which layers are trainable by setting the attribute `layer.trainable = True`.

Furthermore, it is possible to change the dropout rate, since we noticed that the pretrained model has a rather high dropout rate.

In [None]:
import keras
from keras import regularizers

# ---- Settings --------------------------------------------------------------
weight_decay = 0.005
l2 = keras.regularizers.l2
patch_size = 16      # frames per input clip

# Names of the dense layers, so that a single one can be picked for training.
name_layer_1 = "dense_01"
name_layer_2 = "dense_02"
name_layer_3 = "dense_03"

# Names of the dropout layers, so that their rate can be tweaked.
name_dropout_1 = "dropout_01"
name_dropout_2 = "dropout_02"

# ---- Architecture (high resolution network) --------------------------------
# (filters, kernel size, pool size) of every Conv3D + MaxPooling3D stage, in order.
conv_stages = [
    (4,  (3, 7, 7), (1, 2, 2)),
    (8,  (3, 5, 5), (2, 2, 2)),
    (32, (3, 5, 5), (1, 1, 2)),
    (64, (3, 3, 5), (1, 2, 2)),
]

model = Sequential()
for stage_index, (n_filters, kernel, pool) in enumerate(conv_stages):
    # Only the very first layer declares the input shape.
    if stage_index == 0:
        first_layer_args = {"input_shape": (patch_size, img_cols, img_rows, 3)}
    else:
        first_layer_args = {}
    model.add(Conv3D(n_filters, kernel,
                     activation='relu', bias_initializer='ones',
                     **first_layer_args))
    model.add(MaxPooling3D(pool_size=pool))

model.add(Flatten())

# Two hidden dense layers, each followed by dropout.
for units, dense_name, dropout_name in (
    (512, name_layer_1, name_dropout_1),
    (256, name_layer_2, name_dropout_2),
):
    model.add(Dense(units, activation='relu', bias_initializer='ones', name=dense_name))
    model.add(Dropout(0.5, name=dropout_name))

# Output layer: one unit per gesture, turned into probabilities by softmax.
model.add(Dense(number_gestures, kernel_initializer='normal', name=name_layer_3))
model.add(Activation('softmax'))

# ---- Pretrained weights ------------------------------------------------------
weight_path = r"/content/drive/MyDrive/WS_Gesture-Recognition-with-3DCNN/save_model/3DCNN_HRN_300_6_jester"
model.load_weights(weight_path)

# ---- Trainable layers --------------------------------------------------------
# Freeze every layer first ...
for layer in model.layers:
    layer.trainable = False

# ... then unfreeze the one layer that should be trained.
## CHOOSE THE LAYER YOU WANT TO TRAIN (name_layer_x)
training_layer = model.get_layer(name_layer_1)
training_layer.trainable = True

# ---- Dropout rate ------------------------------------------------------------
# To change a dropout rate, pick the layer (name_dropout_1 or name_dropout_2) and set its rate.
dropout_layer = model.get_layer(name_dropout_1)
dropout_layer.rate = 0.5

model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv3d (Conv3D)              (None, 14, 51, 119, 4)    1768      
_________________________________________________________________
max_pooling3d (MaxPooling3D) (None, 14, 25, 59, 4)     0         
_________________________________________________________________
conv3d_1 (Conv3D)            (None, 12, 21, 55, 8)     2408      
_________________________________________________________________
max_pooling3d_1 (MaxPooling3 (None, 6, 10, 27, 8)      0         
_________________________________________________________________
conv3d_2 (Conv3D)            (None, 4, 6, 23, 32)      19232     
_________________________________________________________________
max_pooling3d_2 (MaxPooling3 (None, 4, 6, 11, 32)      0         
_________________________________________________________________
conv3d_3 (Conv3D)            (None, 2, 4, 7, 64)       9

This cell will be removed since we do not use it. It sets up early stopping, which ends the training when the validation loss no longer improves.

In [None]:
# Early-stopping callback on the validation loss; it only takes effect if it is
# passed to model.fit via `callbacks=`.
earlystop = EarlyStopping(
    monitor='val_loss',
    patience=50,
    verbose=1,
)

**Compile the model**

In [None]:
# Candidate optimizers; only `sgd` is handed to compile() below.
# `learning_rate` replaces the deprecated `lr` argument (Keras warned:
# "The `lr` argument is deprecated, use `learning_rate` instead.").
sgd = SGD(learning_rate=0.001,  momentum=0.9, nesterov=False)
rms = RMSprop(decay=1e-6)
ada = Adadelta(learning_rate=0.1,decay=1e-6)
model.compile(loss='categorical_crossentropy', 
              optimizer=sgd,
              #optimizer=ada,
              metrics=['acc'])




  "The `lr` argument is deprecated, use `learning_rate` instead.")


# Split the train_set into Training and Testing

In [None]:
# Hold out 20 % of the samples for validation; the fixed random_state keeps the split reproducible.
split = train_test_split(
    train_set,
    Y_train,
    test_size=0.2,
    random_state=20,
)
X_train_new, X_val_new, y_train_new, y_val_new = split

print(X_train_new.shape)
print(type(X_train_new))

(14, 16, 57, 125, 3)
<class 'numpy.ndarray'>


# Save the weights
Here we can save and reload the weights produced by our training.
The first block is for saving, the second block is for loading.

In [None]:
# THIS BOX IS TO SAVE OUR WEIGHTS
# The weights are written to the first unused slot "0", "1", ..., "99" inside save_path.
save_path = "/content/drive/MyDrive/WS_Gesture-Recognition-with-3DCNN/save_model/trained"
for i in range(100):
  save_path_new = os.path.join(save_path, str(i))       # We go through each possible weight-file-name
  # A slot counts as used if the bare file exists, or if "<slot>.index" exists.
  # NOTE(review): with tf.keras, save_weights() on a path without an .h5 suffix writes
  # TensorFlow-checkpoint files ("<path>.index", "<path>.data-*") and no file at <path>
  # itself, so checking only the bare path would overwrite slot 0 on every run.
  # Confirm against the Keras version in use.
  slot_taken = os.path.isfile(save_path_new) or os.path.isfile(save_path_new + ".index")
  if not slot_taken:
    model.save_weights(save_path_new)
    print("Saved weights to", save_path_new)
    break
else:
  # Previously this case was silent and the weights were simply not saved.
  print("All 100 weight slots in", save_path, "are in use; nothing was saved.")

In [None]:
# RELOAD OUR LATEST WEIGHTS BY RUNNING THIS BOX:
# The saving cell writes slots "0" .. "99"; we search from the highest slot number
# downwards and load the first one found. Slot 0 is included (it was skipped before),
# and slot 100 is still checked so that no previously recognised name is lost.
save_path = "/content/drive/MyDrive/WS_Gesture-Recognition-with-3DCNN/save_model/trained"
for i in range(100, -1, -1):
  save_path_new = os.path.join(save_path, str(i))   # We go through each possible weight-file-name BACKWARDS
  # NOTE(review): with tf.keras, weights saved to a path without an .h5 suffix are
  # stored as "<path>.index" / "<path>.data-*", so "<path>.index" is checked as well.
  # Confirm against the Keras version in use.
  if os.path.isfile(save_path_new) or os.path.isfile(save_path_new + ".index"):
    model.load_weights(save_path_new)
    print("Loaded weights from", save_path_new)
    break
else:
  # Make it visible that the model still carries its previous weights.
  print("No saved weights found in", save_path, "- nothing was loaded.")

# Will remove this as well, if everything works
This is the little data-set we used in the beginning

In [None]:
# CREATE THE TEST-SET X_val FROM JESTER

# Imported here as well: otherwise this cell only runs if the earlier loading cell
# (the only other place where tqdm is imported) has been executed before.
from tqdm import tqdm

max_videos = 9       # How many videos are loaded into X_val

X_val = []           # one array of RGB frames per video
ls_path = os.path.join("/content/drive/MyDrive/", "training_samples7")
listing = os.listdir(ls_path)

# Only the first max_videos folders are visited, instead of walking through the
# whole listing and skipping everything after the ninth video.
for ls in tqdm(listing[:max_videos]):
  video_path = os.path.join(ls_path, ls)

  frames = []
  # Only the first 16 frame files (in sorted file-name order) are used.
  for imgs in sorted(os.listdir(video_path))[:16]:
    img = os.path.join(video_path, imgs)
    frame = cv2.imread(img)
    if frame is None:
      # cv2.imread reports an unreadable file by returning None instead of raising.
      raise IOError("Could not read image file: " + img)
    frame = cv2.resize(frame, (img_rows, img_cols), interpolation=cv2.INTER_AREA)
    # OpenCV loads images as BGR; convert to RGB.
    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

  # The previous triple np.rollaxis put every axis back in its original place,
  # so the frames are stored as they are.
  X_val.append(np.array(frames))


X_val_array = np.array(X_val)   # convert the frames read into array



NameError: ignored

Here you can predict some gestures

In [None]:
# Class probabilities for the first 9 clips; the predicted gesture is the most probable class.
# NOTE(review): train_set is mean-centred and scaled before training, whereas X_val_array
# is passed here as raw pixel values - confirm this matches how the loaded weights were trained.
test_pred = model.predict(X_val_array[:9])
result = test_pred.argmax(axis=1)
print(result)

[2 1 4 2 2 1 3 4 4]


# Validation

Here we load the videos we produced ourselves. These are about 10*6 Videos. We wanted to produce videos for each gesture in the same scenario, so we could see the problems of our model better.

In [None]:
# CREATE THE VALIDATION SET
# Val will be in the form:
# 1_geste_0, 2_geste_0,..., 1_geste_1, 2_geste_1,...,1_geste_2,2_geste2,... 
Val = []
gesture_begin = 1 # We take 16 pictures from gesture_begin onwards

ls_path = os.path.join("/content/drive/MyDrive", "validation")
listing = os.listdir(ls_path)
# Count how many video-scenarios there are.
# NOTE(review): the paths below assume the scenario folders are named "1", "2", ..., N - confirm.
number_scenes = len(listing)
Val_y = np.zeros(6*number_scenes)       # Labels; we expect 6 videos in each video-scenario
for i in range(6):                      # Go through each gesture
    for j in range(number_scenes):        # Go through each scene
      Val_y[i*number_scenes + j] = i      # Gesture after gesture, so the label is the gesture index
      video_path = os.path.join(ls_path, str(j+1), str(j+1) + "_" + "geste" + "_" + str(i)) # These are the videos

      frames = []
      # Take the 16 frame files starting at index gesture_begin (sorted file-name order).
      for imgs in sorted(os.listdir(video_path))[gesture_begin:gesture_begin + 16]:
        img = os.path.join(video_path, imgs)
        frame = cv2.imread(img)
        if frame is None:
          # cv2.imread reports an unreadable file by returning None instead of raising.
          raise IOError("Could not read image file: " + img)
        frame = cv2.resize(frame, (img_rows, img_cols), interpolation=cv2.INTER_AREA)
        # OpenCV loads images as BGR; convert to RGB.
        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

      # The previous triple np.rollaxis put every axis back in its original place,
      # so the frames are stored as they are.
      Val.append(np.array(frames))



Val_array = np.array(Val)   # convert the frames read into array

# Produce the prediction-array

Every row stands for one gesture, every column for one scenario (same person, same background, same shirt).

So the perfect prediction would be:

[[ 0 0 0 0 0 0...]

[1 1 1 1 1 1 ...]

[2 2 2 2 2 2 ...]

...

[5 5 5 5 5 5 ...] ]

In [None]:
# Predict every validation clip and arrange the predicted classes as
# (gesture, scenario): row i holds the predictions for gesture i.
# NOTE(review): Val_array holds raw pixel values, whereas train_set is mean-centred
# and scaled before training - confirm this matches how the weights were trained.
val_pred = model.predict(Val_array)
result_val = val_pred.argmax(axis=1).reshape(6, number_scenes)
print(result_val)

[[0 1 4 0 3 0 1 1]
 [0 1 4 0 1 4 1 1]
 [2 2 4 2 0 3 3 1]
 [0 0 4 3 3 3 2 3]
 [4 4 4 3 4 4 2 1]
 [2 4 2 4 5 4 1 5]]


# Check, which ones are correct
In this block, we create the correct prediction matrix and compare with the prediction made by our model.

A 1 stands for a correct prediction, a 0 for a false prediction.

In [None]:
# Compare the predictions with the expected matrix (row i filled with gesture index i).
# TODO: tidy this up
a = np.ones(number_scenes)
A = np.outer(np.arange(6), a)

# 1 where the prediction is correct, 0 where it is wrong.
check_matrix = (A == result_val).astype(int)
print(check_matrix)
print("Sum of all correct predictions is", np.sum(check_matrix), "of", number_scenes*number_gestures)


[[1 0 0 1 0 1 0 0]
 [0 1 0 0 1 0 1 1]
 [1 1 0 1 0 0 0 0]
 [0 0 0 1 1 1 0 1]
 [1 1 1 0 1 1 0 0]
 [0 0 0 0 1 0 0 1]]
Sum of all correct predictions is 21 of 48


# Create the best beginning

Since we only load 16 images per video, part of each video is cut off. In this cell we determine experimentally how many images to skip at the beginning of the videos we created ourselves, so that the model makes the best predictions for this data set.

In [None]:
# THIS CODE-BLOCK IS TO FIND OUT THE BEST GESTURE_BEGIN
# For every candidate start frame we rebuild the validation set, predict it and
# count the correct predictions.
# Val will be in the form:
# 1_geste_0, 2_geste_0,..., 1_geste_1, 2_geste_1,...,1_geste_2,2_geste2,... 

first_first_frame = 0
last_first_frame = 12   # exclusive: start frames first_first_frame .. last_first_frame-1 are tried
count = np.zeros(last_first_frame-first_first_frame)

ls_path = os.path.join("/content/drive/MyDrive", "validation")
listing = os.listdir(ls_path)
number_scenes = len(listing)            # Count, how many video-scenarios there are


def _load_validation_clip(video_path, first_frame, n_frames=16):
  """Load consecutive frames of one video as an array of RGB images.

  The frame files in video_path are taken in sorted file-name order, starting at
  index first_frame; at most n_frames of them are read and resized.

  Raises:
    IOError: if OpenCV cannot read one of the frame files.
  """
  frames = []
  for imgs in sorted(os.listdir(video_path))[first_frame:first_frame + n_frames]:
    img = os.path.join(video_path, imgs)
    frame = cv2.imread(img)
    if frame is None:
      # cv2.imread reports an unreadable file by returning None instead of raising.
      raise IOError("Could not read image file: " + img)
    frame = cv2.resize(frame, (img_rows, img_cols), interpolation=cv2.INTER_AREA)
    # OpenCV loads images as BGR; convert to RGB.
    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
  # The previous triple np.rollaxis put every axis back in its original place,
  # so the frames are returned as they are.
  return np.array(frames)


# Labels and the expected prediction matrix do not depend on gesture_begin,
# so they are built once instead of in every iteration.
Val_y = np.repeat(np.arange(6.0), number_scenes)   # We expect 6 videos in each video-scenario
a = np.ones(number_scenes)
A = np.outer(np.arange(6), a)                      # row i is filled with gesture index i

for gesture_begin in range(first_first_frame, last_first_frame):
  Val = []
  for i in range(6):                      # Go through each gesture
    for j in range(number_scenes):        # Go through each scene
      video_path = os.path.join(ls_path, str(j+1), str(j+1) + "_" + "geste" + "_" + str(i)) # These are the videos
      Val.append(_load_validation_clip(video_path, gesture_begin))

  Val_array = np.array(Val)   # convert the frames read into array
  val_pred = model.predict(Val_array)
  result_val = np.argmax(val_pred, axis=1).reshape(6, number_scenes)

  # 1 where the prediction is correct, 0 where it is wrong.
  check_matrix = (A == result_val) * 1
  count[gesture_begin-first_first_frame] = np.sum(check_matrix)

print(count)
print("Best begin seems to be", np.argmax(count)+first_first_frame)
print("So we have", np.max(count), "correct predictions")


[19. 21. 20. 19. 19. 19. 18. 17. 14. 16. 14. 10.]
Best begin seems to be 1
So we have 21.0 correct predictions


# Create our training set

Here we load the labels of our Jester training videos from a CSV file.

In [None]:

def get_labels_from_csv(csv_path):
  """Read (video id, gesture index) pairs from a label CSV file.

  The first row of the file is treated as a header and skipped. Every following
  row whose second column is one of the six known gesture names yields one entry;
  empty rows, rows with fewer than two columns and rows with another gesture
  name are ignored.

  Args:
    csv_path: path of the CSV file; column 0 holds the video id, column 1 the
      gesture name.

  Returns:
    A list of [video_id, gesture_index] lists in file order, where video_id is
    the string from column 0 and gesture_index is the position of the gesture
    name in the list below.
  """
  # Local import so the function does not depend on a later cell having imported csv.
  import csv

  gestures = ["Rolling Hand Backward", "Rolling Hand Forward", "Stop Sign", "Swiping Left", "Swiping Right", "No gesture"]

  labels = []
  # newline="" is what the csv module expects for files handed to csv.reader.
  with open(csv_path, newline="") as csv_file:
    csv_reader = csv.reader(csv_file)
    next(csv_reader, None)   # skip the header row (no error if the file is empty)
    for line in csv_reader:
      # Rows with fewer than two columns used to raise an IndexError on line[1].
      if len(line) > 1 and line[1] in gestures:
        labels.append([line[0], gestures.index(line[1])])
  return labels


In the following code block, we take the extracted data files from Jester and load them into the variable X_val.

The labels for our data are obtained with the function get_labels_from_csv provided in the cell above.

In [None]:
# CREATE THE Training-SET X_val FROM JESTER

import csv
X_val = []           # one array of RGB frames per video
Y_val = []           # the gesture index of each video
ls_path = "/content/drive/MyDrive/training_samples7"
# Each entry of ls is [video folder name, gesture index].
ls = get_labels_from_csv("/content/drive/MyDrive/training_samples8train.csv")

num_samples = len(ls)
for folder_name, gesture_label in ls:
  video_path = os.path.join(ls_path, folder_name)

  frames = []
  # Only the first 16 frame files (in sorted file-name order) are used.
  for imgs in sorted(os.listdir(video_path))[:16]:
    img = os.path.join(video_path, imgs)
    frame = cv2.imread(img)
    if frame is None:
      # cv2.imread reports an unreadable file by returning None instead of raising.
      raise IOError("Could not read image file: " + img)
    frame = cv2.resize(frame,(img_rows,img_cols),interpolation=cv2.INTER_AREA)
    # OpenCV loads images as BGR; convert to RGB.
    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

  # The previous triple np.rollaxis put every axis back in its original place,
  # so the frames are stored as they are.
  X_val.append(np.array(frames))
  Y_val.append(gesture_label)


X_val_array = np.array(X_val)   # convert the frames read into array
Y_val_array = np.array(Y_val)

In [None]:

# --- Assemble the training arrays -------------------------------------------
img_depth = 16     # frames per video
train_data = [X_val_array, Y_val_array]

X_train, y_train = train_data

# Copy the frames into a float array, one video at a time.
train_set = np.zeros((num_samples, img_depth, img_cols, img_rows, 3))
for sample_idx in range(num_samples):
    train_set[sample_idx, ...] = X_train[sample_idx]

patch_size = 16    # img_depth or number of frames used for each video


# --- One-hot encode the labels ----------------------------------------------
Y_train = np_utils.to_categorical(y_train, num_classes=number_gestures)


# --- Pre-processing: centre around 0, then scale by the largest value --------
train_set = train_set.astype('float32')

pixel_mean = np.mean(train_set)
print(pixel_mean)
train_set -= pixel_mean

pixel_peak = np.max(train_set)
print(pixel_peak)
train_set /= pixel_peak


# --- Split into training and validation data (10 % held out) -----------------
split = train_test_split(train_set, Y_train, test_size=0.1, random_state=20)
X_train_new, X_val_new, y_train_new, y_val_new = split
print(X_train_new.shape)

108.3591
146.6409
(1019, 16, 57, 125, 3)


# Train the model

In [None]:
batch_size = 200
nb_epoch = 100

# Learning-rate schedule: multiply the learning rate by `factor` when val_loss has
# not improved for `patience` epochs, but never go below min_lr.
# min_lr uses 2**4 (= 16); the former `2^4` is bitwise XOR in Python and evaluates to 6.
# NOTE(review): lr_reducer is not passed to model.fit below (there is no `callbacks=`
# argument), so it currently has no effect on training. If it is meant to be used,
# also reconsider factor=0.00001, which drops the learning rate straight to min_lr.
lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.00001, 
                               cooldown=0, patience=2, min_lr=0.005/(2**4))

# Train on the training split and evaluate on the held-out split after every epoch.
hist = model.fit(
    X_train_new,
    y_train_new,
    validation_data=(X_val_new,y_val_new),
    batch_size=batch_size,
    epochs = nb_epoch,
    shuffle=True,
    )


1/6 [====>.........................] - ETA: 2:48 - loss: 0.7480 - acc: 0.6950

In [None]:
# Loss per epoch on the training and validation data.
training_loss = hist.history['loss']
val_loss = hist.history['val_loss']

fig, ax = plt.subplots()
ax.plot(training_loss, label="training_loss")
ax.plot(val_loss, label="validation_loss")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.set_title("Learning Curve")
ax.legend(loc='best')
plt.show()

In [None]:
# Accuracy per epoch on the training and validation data.
training_acc = hist.history['acc']
val_acc = hist.history['val_acc']

fig, ax = plt.subplots()
ax.plot(training_acc, label="training_accuracy")
ax.plot(val_acc, label="validation_accuracy")
ax.set_xlabel("Epochs")
ax.set_ylabel("accuracy")
ax.set_title("Learning Curve")
ax.legend(loc='best')
plt.show()