# 1. Imports

In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
from math import inf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input
from tensorflow.keras.callbacks import TensorBoard
import json
import shutil
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

2022-12-16 23:41:18.957048: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/electric/anaconda3/envs/ASL_SA/lib/python3.8/site-packages/cv2/../../lib64:
2022-12-16 23:41:18.957084: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# 2. Data preprocessing

## 2.1. Extracting videos of signs

In [10]:
#Load the json file
json_file = 'WLASL_v0.3.json'
if os.path.exists(json_file) == False:
    raise('The json file does not exist!')
with open(json_file) as f:
    signs = json.load(f)

#Check the original videos dir exits
old_dir = 'videos'
if os.path.exists(old_dir) == False:
    raise('The original videos directory does not exist!')
    
#create the labeled parent dir if it does not exist
new_dir = 'labeledVids'
if os.path.exists(new_dir) == False:
    os.mkdir(new_dir)

#loop thru different signs
missing_vids = 0
for sign in signs:

    #extract sign name
    sign_name = sign['gloss']

    #Directory of the sign
    sign_dir = os.path.join(new_dir, sign_name)

    #Create a subdir of sign_name under the new videos directory, if it does not exist
    if os.path.exists(os.path.join(new_dir, sign_name)) == False:
        os.mkdir(os.path.join(new_dir, sign_name))

    #loop thru vids and copy them from old_dir/ to new/sign_name/
    for vid in sign['instances']:

        #video old and new paths
        vid_name = vid['video_id'] + '.mp4'
        vid_old_path = os.path.join(old_dir, vid_name)
        vid_new_path = os.path.join(new_dir, sign_name, vid_name)

        #check if video exists in old path
        if os.path.exists(vid_old_path) == False:
            missing_vids += 1
            continue

        #check if video exists in the new path
        if os.path.exists(vid_new_path):
            continue

        #Add the video to the new dir if it exists in the old and not in the new
        shutil.copyfile(vid_old_path, vid_new_path)

#remove signs that have less than 4 videos
for sign in os.listdir(new_dir):
    sign_dir = os.path.join(new_dir, sign)
    if len(os.listdir(sign_dir)) < 4:
        os.rmdir(sign_dir)

## 2.2. Statistics about data

In [11]:
mn, mx = inf, -inf
f20t40 = 0
f25t35 = 0
totalVids = 0
frames = []

# loop over every sign.
for sign in os.listdir(new_dir):

    # directory of the sign in the local os.
    sign_dir = os.path.join(new_dir, sign)

    # loop over every video for a given sign.
    for vid in os.listdir(sign_dir):

        # path of the vid.
        vid_path = os.path.join(sign_dir, vid)

        # open video resource.
        cap = cv2.VideoCapture(vid_path)

        # if opened successfully.
        if cap.isOpened():

            # increment number of total videos.
            totalVids += 1

            # min and max number of frames.
            mn = min(mn, int(cap. get(cv2. CAP_PROP_FRAME_COUNT)))
            mx = max(mx, int(cap. get(cv2. CAP_PROP_FRAME_COUNT)))

            # counting number of frames in a given range.
            if 20<=cap.get(cv2. CAP_PROP_FRAME_COUNT) and cap.get(cv2. CAP_PROP_FRAME_COUNT)<=40: f20t40 += 1
            if 25<=cap.get(cv2. CAP_PROP_FRAME_COUNT) and cap.get(cv2. CAP_PROP_FRAME_COUNT)<=35: f25t35 += 1

            # frames distribution of all videos.
            frames.append(cap.get(cv2. CAP_PROP_FRAME_COUNT))

        # release the video resource.
        cap.release()

# display results.
print(f'The minimum number of frames a video has = {mn} frames')
print(f'The maximum number of frames a video has = {mx} frames')
print('')
print(f'Number of videos containing 20 to 40 frames = {f20t40}')
print(f'Number of videos containing 25 to 35 frames = {f25t35}')

The minimum number of frames a video has = 16 frames
The maximum number of frames a video has = 195 frames

Number of videos containing 20 to 40 frames = 537
Number of videos containing 25 to 35 frames = 320


# 2. Extracting Keypoints using mediapipe Holistic
The goal here is to extract and save keypoints for later use. This saves computation time during the LSTM model as it won't have to extract the keypoints for each frame itself.

## 2.1. Mediapipe Holistic functions

In [2]:
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                  # Image is no longer writeable
    results = model.process(image)                 # Make prediction
    image.flags.writeable = True                   # Image is now writeable 
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR COVERSION RGB 2 BGR
    return image, results

def draw_landmarks(image, results):
    # Draw face connections
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS, 
                             mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1), 
                             mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)
                             ) 
    # Draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             ) 
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             ) 
    # Draw right hand connections  
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             )

def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh])

# 2.2. Keypoints extraction

In [9]:
# directory from which to read videos of signs.
signs_dir = 'labeledVids'
if os.path.exists(signs_dir) == False:
    raise('Directory of videos does not exist!')

# directory to save the extracted keypoints
kps_dir = 'labeledKeypoints'
if os.path.exists(kps_dir) == False:
    os.mkdir(kps_dir)

# set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    
    # for every available sign
    for sign in os.listdir(signs_dir):

        # directory of the current sign
        sign_dir = os.path.join(signs_dir, sign)

        # keypoints directory of the sign
        kp_sign_dir = os.path.join(kps_dir, sign)
        if os.path.exists(kp_sign_dir) == False:
            os.mkdir(kp_sign_dir)


        # for every video sequence in the sign directory
        for vid in os.listdir(sign_dir):

            # path of the vid relative to the parent directory of signs_dir
            vid_path = os.path.join(sign_dir, vid)

            # keypoints directory of the vid
            kp_vid_dir = os.path.join(kp_sign_dir, vid)
            if os.path.exists(kp_vid_dir) == False:
                os.mkdir(kp_vid_dir)
            
            # capturing the video
            cap = cv2.VideoCapture(vid_path)

            if cap.isOpened() == False:
                raise('could not capture video')

            # counter of the frames
            frame_num = 0

            # for each frame in the video
            while cap.isOpened():

                # Read feed
                ret, frame = cap.read()
                if ret == False:
                    break

                # increment frame number
                frame_num += 1

                # Make detections
                image, results = mediapipe_detection(frame, holistic)
                
                # save keypoints to save computation time in the future.
                keypoints = extract_keypoints(results)
                npy_path = os.path.join(kp_vid_dir, str(frame_num))
                np.save(npy_path, keypoints)
            
            # release the video capture
            cap.release()

    cv2.destroyAllWindows()

[h264 @ 0x557f661ced80] Invalid NAL unit size (745 > 472).
[h264 @ 0x557f661ced80] Error splitting the input into NAL units.
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x557f6657c8c0] stream 1, offset 0x3b468: partial file
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x557f6657c8c0] stream 1, offset 0x3b7d3: partial file
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x557f6657c8c0] stream 1, offset 0x3c9b9: partial file
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x557f6657c8c0] stream 1, offset 0x3cd00: partial file
[h264 @ 0x557f65bcf580] Invalid NAL unit size (71678 > 10776).
[h264 @ 0x557f65bcf580] Error splitting the input into NAL units.
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x557f6653d6c0] stream 1, offset 0x2a27a7: partial file


# 3. The LSTM model

## 3.1. Model inputs

In [2]:
# encoding of signs
signs_dir = 'labeledKeypoints2'
signs = os.listdir(signs_dir)
sign_map = {sign:num for num, sign in enumerate(signs)}

# generating inputs
vids, labels = [], []
for sign in os.listdir(signs_dir):
    
    sign_dir = os.path.join(signs_dir, sign)

    for vid in os.listdir(sign_dir):
        window = []
        vid_dir = os.path.join(sign_dir, vid)

        for frame in os.listdir(vid_dir):
            frame_path = os.path.join(vid_dir, frame)
            res = np.load(frame_path)
            window.append(res)

        vids.append(window)
        labels.append(sign_map[sign])

In [3]:
X = pad_sequences(vids, dtype='float')
y = to_categorical(labels).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)

## 3.2. Model architecture

In [4]:
#for saving logs
log_dir = os.path.join('logs')
if os.path.exists(log_dir) == False:
    os.mkdir(log_dir)
tb_callback = TensorBoard(log_dir=log_dir)

# model
model = Sequential()
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(None,1662)))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(len(sign_map), activation='softmax'))

2022-12-16 23:41:46.124918: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
2022-12-16 23:41:46.124951: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.
2022-12-16 23:41:46.133201: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2022-12-16 23:41:46.185973: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.
2022-12-16 23:41:46.197261: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-12-16 23:41:46.212916: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-12-16 23:41:46.212948: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (predator): /proc/driver/nvidia/version does not exist
2022-12-16 23:41:46.21365

In [5]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, None, 64)          442112    
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 128)         98816     
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 108)               3564      
Total params: 600,140
Trainable params: 600,140
Non-trainable params: 0
__________________________________________________

## 3.3. Model training

In [6]:
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])
model.fit(X_train, y_train, epochs=100, callbacks=[tb_callback])

2022-12-16 23:41:56.598279: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 882462168 exceeds 10% of free system memory.
2022-12-16 23:41:57.106504: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2022-12-16 23:41:57.131429: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2199995000 Hz


Epoch 1/100
 1/29 [>.............................] - ETA: 1:14 - loss: 4.6756 - categorical_accuracy: 0.0000e+00

2022-12-16 23:41:59.921614: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
2022-12-16 23:41:59.921665: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.


 2/29 [=>............................] - ETA: 9s - loss: 16.1799 - categorical_accuracy: 0.0000e+00 

2022-12-16 23:42:00.206582: I tensorflow/core/profiler/lib/profiler_session.cc:71] Profiler session collecting data.
2022-12-16 23:42:00.306279: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.
2022-12-16 23:42:00.455009: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: logs/train/plugins/profile/2022_12_16_23_42_00
2022-12-16 23:42:00.540196: I tensorflow/core/profiler/rpc/client/save_profile.cc:143] Dumped gzipped tool data for trace.json.gz to logs/train/plugins/profile/2022_12_16_23_42_00/predator.trace.json.gz
2022-12-16 23:42:00.589829: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: logs/train/plugins/profile/2022_12_16_23_42_00
2022-12-16 23:42:00.589933: I tensorflow/core/profiler/rpc/client/save_profile.cc:143] Dumped gzipped tool data for memory_profile.json.gz to logs/train/plugins/profile/2022_12_16_23_42_00/predator.memory_profile.json.gz
2022-12-16 23:42:00.591336: I tensorflo

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<tensorflow.python.keras.callbacks.History at 0x7f32e848dfd0>

## 3.4. Model Evaluation

In [7]:
y_predicted = model.predict(X_test)
y_true = np.argmax(y_test, axis=1).tolist()
y_predicted = np.argmax(y_predicted, axis=1).tolist()

### 3.4.1. Confusion matrix

In [8]:
multilabel_confusion_matrix(y_true, y_predicted)

array([[[ 0., 48.],
        [ 0.,  0.]],

       [[47.,  0.],
        [ 1.,  0.]],

       [[47.,  0.],
        [ 1.,  0.]],

       [[47.,  0.],
        [ 1.,  0.]],

       [[47.,  0.],
        [ 1.,  0.]],

       [[47.,  0.],
        [ 1.,  0.]],

       [[47.,  0.],
        [ 1.,  0.]],

       [[47.,  0.],
        [ 1.,  0.]],

       [[47.,  0.],
        [ 1.,  0.]],

       [[47.,  0.],
        [ 1.,  0.]],

       [[47.,  0.],
        [ 1.,  0.]],

       [[47.,  0.],
        [ 1.,  0.]],

       [[47.,  0.],
        [ 1.,  0.]],

       [[47.,  0.],
        [ 1.,  0.]],

       [[45.,  0.],
        [ 3.,  0.]],

       [[46.,  0.],
        [ 2.,  0.]],

       [[47.,  0.],
        [ 1.,  0.]],

       [[47.,  0.],
        [ 1.,  0.]],

       [[46.,  0.],
        [ 2.,  0.]],

       [[47.,  0.],
        [ 1.,  0.]],

       [[47.,  0.],
        [ 1.,  0.]],

       [[47.,  0.],
        [ 1.,  0.]],

       [[47.,  0.],
        [ 1.,  0.]],

       [[47.,  0.],
        [ 1., 

### 3.4.2. Accuracy

In [10]:
accuracy_score(y_true, y_predicted)

0.0