In [3]:
!pip install opencv-python numpy tensorflow scikit-learn matplotlib larq wandb albumentations

Collecting albumentations
  Obtaining dependency information for albumentations from https://files.pythonhosted.org/packages/40/01/4202bd81ab337dca5693d7d1cb25c8e9041d97762aee738a24382ff9af2f/albumentations-1.4.3-py3-none-any.whl.metadata
  Downloading albumentations-1.4.3-py3-none-any.whl.metadata (37 kB)
Collecting scikit-image>=0.21.0 (from albumentations)
  Obtaining dependency information for scikit-image>=0.21.0 from https://files.pythonhosted.org/packages/0e/6e/cae83e24d1c62aacb8facb9e3325d3b9454f3374d42ead5e6caae4753048/scikit_image-0.23.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading scikit_image-0.23.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting typing-extensions>=3.6.6 (from tensorflow)
  Obtaining dependency information for typing-extensions>=3.6.6 from https://files.pythonhosted.org/packages/01/f3/936e209267d6ef7510322191003885de524fc48d1b43269810cd589ceaf5/typing_extensions-4.11.0-py3-none-any.

In [4]:
#get all the required dependencies of the project
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import random
import wandb

from typing import Tuple
from collections import Counter
import albumentations as A
from tqdm import tqdm
from typing import List
import json
import tensorflow.keras as keras 
import tensorflow as tf

2024-04-14 23:03:10.288223: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-14 23:03:10.288319: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-14 23:03:10.288367: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-14 23:03:10.298668: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
def frame_diff(prev_frame, current_frame, next_frame):
    if prev_frame is None or current_frame is None or next_frame is None:
        return None
    
    #standarized frame.
    prev_frame = cv2.resize(prev_frame, (640, 480))
    current_frame = cv2.resize(current_frame, (640, 480))
    next_frame = cv2.resize(next_frame, (640, 480))
    
    #getting the absolute difference between current frame and next frame.
    diff1 = cv2.absdiff(next_frame, current_frame)
    
    #absolute diff between previous and current frame.
    diff2 = cv2.absdiff(current_frame, prev_frame)
    
    #bitwise AND operation to obtain common region of motion
    motion_diff = cv2.bitwise_and(diff1, diff2)
    
    return motion_diff

In [6]:
#code that loops through folders to get actions
import os
import numpy as np

# Path to the dataset folder containing subfolders for each action
dataset_folder = "asl_dataset_word/archive (1)/extracted_videos_asl"  # Update this to your dataset directory

# Path for exported data (not used for video processing here, but kept for structure)
DATA_PATH = os.path.join('FD_FLAT_ASL_DATA') 

# Ensure DATA_PATH exists
if not os.path.exists(DATA_PATH):
    os.makedirs(DATA_PATH)

# List to hold the names of actions based on folder names in the dataset_folder
actions = []

# Populate actions list with the names of directories in DATA_PATH
for entry in os.listdir(DATA_PATH):
    if os.path.isdir(os.path.join(DATA_PATH, entry)):
        actions.append(entry)

# Convert the list of actions to a numpy array (optional, depending on further use)
actions = np.array(actions)

# Print detected actions
print(f"Detected actions: {actions}")

Detected actions: ['about' 'accident' 'africa' 'again' 'all' 'always' 'animal' 'apple'
 'approve' 'argue' 'arrive' 'baby' 'back' 'backpack' 'bad' 'bake'
 'balance' 'ball' 'banana' 'bar' 'basketball' 'bath' 'bathroom' 'beard'
 'because' 'bed' 'before' 'behind' 'bird' 'birthday' 'black' 'blanket'
 'blue' 'book' 'bowling' 'boy' 'bring' 'brother' 'brown' 'business' 'but'
 'buy' 'call' 'can' 'candy' 'careful' 'cat' 'catch' 'center' 'cereal'
 'chair' 'champion' 'change' 'chat' 'cheat' 'check' 'cheese' 'children'
 'christmas' 'city' 'class' 'clock' 'close' 'clothes' 'coffee' 'cold'
 'college' 'color' 'computer' 'convince' 'cook' 'cool' 'copy' 'corn'
 'cough']


In [7]:
actions.shape

(75,)

## Preprocess data - create labels and features

In [8]:
#train_test is a function to split dataset into training and testing set
from sklearn.model_selection import train_test_split

#Converts class vectors to binary class matrix for categorial crossentropy
from tensorflow.keras.utils import to_categorical

In [9]:
#Create Label Map
label_map = {label:num for num, label in enumerate(actions)}

In [10]:
label_map

{'about': 0,
 'accident': 1,
 'africa': 2,
 'again': 3,
 'all': 4,
 'always': 5,
 'animal': 6,
 'apple': 7,
 'approve': 8,
 'argue': 9,
 'arrive': 10,
 'baby': 11,
 'back': 12,
 'backpack': 13,
 'bad': 14,
 'bake': 15,
 'balance': 16,
 'ball': 17,
 'banana': 18,
 'bar': 19,
 'basketball': 20,
 'bath': 21,
 'bathroom': 22,
 'beard': 23,
 'because': 24,
 'bed': 25,
 'before': 26,
 'behind': 27,
 'bird': 28,
 'birthday': 29,
 'black': 30,
 'blanket': 31,
 'blue': 32,
 'book': 33,
 'bowling': 34,
 'boy': 35,
 'bring': 36,
 'brother': 37,
 'brown': 38,
 'business': 39,
 'but': 40,
 'buy': 41,
 'call': 42,
 'can': 43,
 'candy': 44,
 'careful': 45,
 'cat': 46,
 'catch': 47,
 'center': 48,
 'cereal': 49,
 'chair': 50,
 'champion': 51,
 'change': 52,
 'chat': 53,
 'cheat': 54,
 'check': 55,
 'cheese': 56,
 'children': 57,
 'christmas': 58,
 'city': 59,
 'class': 60,
 'clock': 61,
 'close': 62,
 'clothes': 63,
 'coffee': 64,
 'cold': 65,
 'college': 66,
 'color': 67,
 'computer': 68,
 'convince'

In [11]:
#Padding for uniform data - USE THIS!
DATA_PATH = "FD_FLAT_ASL_DATA"
sequences, labels = [], []
sequence_counter = 0  # Initialize a counter for the number of sequences processed

# Assuming a predefined maximum sequence length (adjust as needed)
max_sequence_length = 150  # Change this to your maximum length requirement

for action in actions:
    action_path = os.path.join(DATA_PATH, action)
    for file_name in os.listdir(action_path):
        if file_name.endswith('.npy'):

            fd_path = os.path.join(action_path, file_name)
            fd_data = np.load(fd_path)
            
            # Check if the loaded sequence length is shorter than max_sequence_length and pad if necessary
            if fd_data.shape[0] < max_sequence_length:
                # Calculate the padding amount needed
                padding_length = max_sequence_length - fd_data.shape[0]
                # Pad with zeros - assuming fd_data is a 2D array; adjust padding shape as necessary
                padding = np.zeros((padding_length, *fd_data.shape[1:]))
                fd_data = np.vstack((fd_data, padding))
            
            sequences.append(fd_data)
            labels.append(label_map[action])  # Map the label

print(f"Total sequences processed: {sequence_counter}")

Total sequences processed: 0


In [12]:
fd_data.shape

#note each fd_data time has different length - apply normalization and zero-padding to get rid of the problem. Then
np.array(sequences).shape

(676, 150, 100, 100)

In [13]:
np.array(labels).shape

(676,)

In [14]:
X = np.array(sequences)
X.shape

(676, 150, 100, 100)

In [15]:
#from sklearn.preprocessing import LabelEncoder
#label_encoder = LabelEncoder()
#integer_encoded = label_encoder.fit_transform(labels)
y = to_categorical(labels).astype(int)
#y = to_categorical(integer_encoded)

In [16]:
np.array(labels).shape

(676,)

In [17]:
y

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])

In [18]:
#Here, split into train, test, and dev (validate) dataset
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

# First, split into temp training and test sets
X_temp, X_dev, y_temp, y_dev = train_test_split(X, y, test_size=0.05)

# Then, split the temp training set into final training and dev (validation) sets
X_train, X_test, y_train, y_test = train_test_split(X_temp, y_temp, test_size=0.05)

In [19]:
#look at the distribution of test and dev dataset
X_train.shape, X_test.shape, X_dev.shape

((609, 150, 100, 100), (33, 150, 100, 100), (34, 150, 100, 100))

## Build and Train CNN!
Simple architecture using LeNet

In [28]:
#import required libraries
import tensorflow as tf
import wandb

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, LeakyReLU
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras import backend as K
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Conv2D, MaxPooling2D , Flatten, BatchNormalization, Reshape
from wandb.keras import WandbMetricsLogger
from tensorflow.keras.callbacks import Callback
#from tensorflow.keras.callbacks import TensorBoard

#import larq as lq larq uses tensorflow 2.3

In [29]:
#Port LeNet CNN Architecture from MHI version of the project
FD_FLAT_SHAPE = (150,100,100) 
#150,10000
model_leNet = Sequential()
#Block 1
model_leNet.add(Conv2D(filters=32, kernel_size=(5,5), padding='same', activation='relu', input_shape=FD_FLAT_SHAPE))
model_leNet.add(BatchNormalization())
model_leNet.add(Conv2D(filters=48, kernel_size=(5,5), padding='same', activation='relu'))
model_leNet.add(BatchNormalization())
model_leNet.add(MaxPooling2D(pool_size=(2,2), strides = (2,2)))

#Block 2
model_leNet.add(Conv2D(filters=32, kernel_size=(5,5), padding='same', activation='relu'))
model_leNet.add(BatchNormalization())
model_leNet.add(Conv2D(filters=48, kernel_size=(5,5), padding='same', activation='relu'))
model_leNet.add(BatchNormalization())
model_leNet.add(MaxPooling2D(pool_size=(2,2), strides = (2,2)))

#Block 3
model_leNet.add(Conv2D(filters=32, kernel_size=(5,5), padding='same', activation='relu'))
model_leNet.add(BatchNormalization())
model_leNet.add(Conv2D(filters=48, kernel_size=(5,5), padding='same', activation='relu'))

# Flatten the output and reshape for LSTM
model_leNet.add(Reshape((150, -1)))  # Reshape to (batch_size, 150, features_per_timestep)
# Add LSTM layer
# No return_sequences if you want the output from the last timestep only
model_leNet.add(LSTM(units=64, activation='relu', return_sequences=False))
model_leNet.add(Flatten())
model_leNet.add(Dense(units=actions.shape[0], activation='softmax'))

"""
model_leNet.add(BatchNormalization())
model_leNet.add(MaxPooling2D(pool_size=(2,2), strides = (2,2)))

#Block 4
model_leNet.add(Conv2D(filters=32, kernel_size=(5,5), padding='same', activation='relu'))
model_leNet.add(BatchNormalization())
model_leNet.add(Conv2D(filters=48, kernel_size=(5,5), padding='same', activation='relu'))
model_leNet.add(BatchNormalization())
model_leNet.add(MaxPooling2D(pool_size=(2,2), strides = (2,2)))

model_leNet.add(Flatten())
model_leNet.add(Dense(128, activation='relu'))
model_leNet.add(Dense(units=actions.shape[0], activation='softmax'))
#model_leNet.add(Dense(len(train_ds.unique_labels), activation='softmax'))
"""

model_leNet.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_30 (Conv2D)          (None, 150, 100, 32)      80032     
                                                                 
 batch_normalization_29 (Ba  (None, 150, 100, 32)      128       
 tchNormalization)                                               
                                                                 
 conv2d_31 (Conv2D)          (None, 150, 100, 48)      38448     
                                                                 
 batch_normalization_30 (Ba  (None, 150, 100, 48)      192       
 tchNormalization)                                               
                                                                 
 max_pooling2d_14 (MaxPooli  (None, 75, 50, 48)        0         
 ng2D)                                                           
                                                      

In [30]:
#Fixed learning rate: adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0003) #0.001 can be changed.

#learning rate decay for Adam
initial_learning_rate = 0.0003  # Starting learning rate
decay_steps = 100000           # After how many steps to apply decay
decay_rate = 0.96              # Decay rate
staircase = True               # Apply decay in a staircase fashion

lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps,
    decay_rate,
    staircase=staircase
)

adam_opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule) 

In [31]:

#code previously
model_leNet.compile(optimizer=adam_opt, loss='categorical_crossentropy', metrics=['categorical_accuracy'])
#Train the model - MobileNetV3Small as pretrained model. Tensorboard callback.
model_leNet.fit(X_train, y_train, epochs=1000, validation_data = (X_dev, y_dev), batch_size = 8)
#gradient clipping for adam and L2 regularization for LSTM. -> look at the code prev.


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000

KeyboardInterrupt: 

In [None]:
#save model
model_leNet.save('FD_first_leNet.keras')