# Import dependencies

In [1]:
import os
import numpy as np
import cv2
from matplotlib import pyplot as plt
import mediapipe as mp

from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

# Capture keypoints

In [2]:
# Short aliases for the MediaPipe Holistic solution and the MediaPipe drawing helpers;
# both are used by the detection, drawing and capture functions below.
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

In [3]:
def mediapipe_detection(image,model):
    """Run `model` on a BGR frame and return `(results, image)`.

    The frame is converted BGR -> RGB, flagged read-only while `model.process`
    runs, made writeable again and converted back RGB -> BGR. The returned image
    is the result of that second conversion, not the object that was passed in.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # The model only gets a read-only view of the frame.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    return results, cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)

In [4]:
def draw_landmarks(image,results):
    """Draw the face, left-hand, right-hand and pose landmarks of `results` onto `image`.

    Each group is drawn with `mp_drawing.draw_landmarks`, in that order, using one
    DrawingSpec for the landmark points and one for the connections.
    """
    # (results attribute, mp_holistic connection set, point colour, line colour, size)
    # `size` is used for both thickness and circle_radius.
    layers = (
        ('face_landmarks', 'FACEMESH_CONTOURS', (200,0,50), (160,0,0), 1),
        ('left_hand_landmarks', 'HAND_CONNECTIONS', (0,0,255), (0,255,0), 2),
        ('right_hand_landmarks', 'HAND_CONNECTIONS', (0,0,255), (0,255,0), 2),
        ('pose_landmarks', 'POSE_CONNECTIONS', (0,0,255), (0,255,0), 2),
    )

    for landmark_attr, connection_attr, point_color, line_color, size in layers:
        mp_drawing.draw_landmarks(
            image,
            getattr(results, landmark_attr),
            getattr(mp_holistic, connection_attr),
            mp_drawing.DrawingSpec(color=point_color, thickness=size, circle_radius=size),
            mp_drawing.DrawingSpec(color=line_color, thickness=size, circle_radius=size),
        )

In [5]:
def render():     # Not really needed, only for understanding purposes
    """Show the webcam feed with holistic landmarks drawn until 'q' is pressed.

    Opens camera 0, runs every frame through `mediapipe_detection`, draws the
    landmarks and displays the frame in a window named 'Video'.

    Returns
    -------
    The detection results of the last processed frame, or None when no frame
    could be processed (camera not opened or the first read failed).
    """
    results = None  # keeps the return well-defined when the loop never runs
    camera = cv2.VideoCapture(0)
    try:
        with mp_holistic.Holistic(min_detection_confidence = 0.7, min_tracking_confidence = 0.7) as holistic:
            while camera.isOpened():
                success, frame = camera.read()
                if not success:
                    # No frame was delivered; passing it on would fail inside cvtColor.
                    break

                results, image = mediapipe_detection(frame,holistic)

                draw_landmarks(image, results)

                cv2.imshow('Video',image)

                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break
    finally:
        # Always free the camera and close the window, even if processing raised.
        camera.release()
        cv2.destroyAllWindows()
    return results

In [50]:
#results = render()

# Extract Keypoints

In [5]:
def extract_keypoints(results):
    """Flatten the landmarks in `results` into one 1-D feature vector.

    The vector is the concatenation, in this order, of the pose landmarks
    (x, y, z, visibility each), the face landmarks, the left-hand landmarks and
    the right-hand landmarks (x, y, z each). A group whose attribute on `results`
    is falsy is replaced by zeros of length 33*4, 468*3, 21*3 and 21*3 respectively.
    """
    def flat_or_zeros(group, fields, fallback_size):
        # A missing group becomes a zero block.
        if not group:
            return np.zeros(fallback_size)
        rows = [[getattr(point, name) for name in fields] for point in group.landmark]
        return np.array(rows).flatten()

    xyz = ('x', 'y', 'z')
    segments = (
        flat_or_zeros(results.pose_landmarks, xyz + ('visibility',), 33*4),
        flat_or_zeros(results.face_landmarks, xyz, 468*3),
        flat_or_zeros(results.left_hand_landmarks, xyz, 21*3),
        flat_or_zeros(results.right_hand_landmarks, xyz, 21*3),
    )
    return np.concatenate(segments)

# Folder Setup

In [6]:
# Larger label list kept for reference only: the triple-quoted string below is a bare
# expression that is never assigned, so it has no effect on `actions`.
"""actions = np.array(['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S',
                    'T','U','V','W','X','Y','Z','HELLO','HEY','_I','MY','NAME','YOU','YOUR','HOW ARE YOU',
                    'I\'M FINE','NICE','TO MEET YOU','WHERE','FROM','LIKE','WORK','MOVIE','WHAT',
                    'WHAT TIME IS IT','WASHROOM','MEANING','SORRY','BAD','LOVE','WITH','WANT TO',
                    'FAVOURITE','SIGN','PLEASE','THANK YOU','SEE YOU LATER','GOOD','MORNING','AFTERNOON',
                    'NIGHT','TODAY','GO','COME','EXCUSE ME','GOOD BYE','TAKE CARE','FOR WATCHING','HAVE',
                    'DAY','YES','NO','KNOW','LITTLE','EVERYONE','0','1','2','3','4','5','6','7','8','9']) """

# Labels actually used by the rest of the notebook; the position in this array is the class index.
actions = np.array(['HELLO','HEY','THANK YOU','GOOD','MORNING'])

# Root folder of the saved keypoint arrays (DATA/<action>/<video_no>/<frame_no>.npy).
DATA_PATH = os.path.join('DATA')
# Recordings per action, and frames per recording, used when creating folders and loading data.
no_of_videos = 30
no_of_frames = 30

In [9]:
def setup_folder(actions, data_path=None, videos_per_action=None):
    """Create one folder per recording: `<data_path>/<action>/<video_no>`.

    Safe to call repeatedly: folders that already exist are left untouched.

    Parameters
    ----------
    actions : iterable of str
        Action labels; each gets its own sub-folder.
    data_path : str, optional
        Root folder. Defaults to the module-level `DATA_PATH`.
    videos_per_action : int, optional
        Number of numbered folders per action. Defaults to the module-level
        `no_of_videos`.

    A folder that cannot be created is reported on stdout (with its path and the
    OS error) and skipped; the remaining folders are still attempted.
    """
    root = DATA_PATH if data_path is None else data_path
    count = no_of_videos if videos_per_action is None else videos_per_action

    for action in actions:
        for video_no in range(count):
            folder = os.path.join(root, action, str(video_no))
            try:
                # exist_ok: re-running the cell must not report existing folders as errors.
                os.makedirs(folder, exist_ok=True)
            except OSError as error:
                print("Cannot create directory {}: {}".format(folder, error))

In [10]:
# Create the per-action, per-video folders under DATA_PATH before recording.
setup_folder(actions)

# Data collection

In [16]:
def data_collect(actions, no_of_videos=30, no_of_frames=30):
    """Record keypoint arrays from the webcam for every action.

    For each action, `no_of_videos` recordings of `no_of_frames` frames are
    captured from camera 0. Every frame is run through `mediapipe_detection`,
    shown with its landmarks and a status overlay, and its keypoint vector is
    saved to `DATA_PATH/<action>/<video_no>/<frame_no>.npy`.

    Parameters
    ----------
    actions : iterable of str
        Action labels to record, in order.
    no_of_videos : int, optional
        Recordings per action (default 30).
    no_of_frames : int, optional
        Frames per recording (default 30).

    Pressing 'q' aborts the whole collection, not just the current recording.

    Raises
    ------
    RuntimeError
        If the camera stops delivering frames; continuing would leave gaps in
        the recorded data.
    """
    window_name = 'Data Collection...'
    camera = cv2.VideoCapture(0)
    try:
        with mp_holistic.Holistic(min_detection_confidence = 0.5, min_tracking_confidence = 0.5) as holistic:
            for action in actions:
                for video_no in range(no_of_videos):
                    video_dir = os.path.join(DATA_PATH,action,str(video_no))
                    # Make sure the target folder exists even if setup_folder was not run.
                    os.makedirs(video_dir, exist_ok=True)

                    for frame_no in range(no_of_frames):
                        success, frame = camera.read()
                        if not success:
                            raise RuntimeError(
                                "Could not read a frame from the camera while recording "
                                "'{}' video {} frame {}".format(action, video_no, frame_no))

                        results, image = mediapipe_detection(frame,holistic)

                        draw_landmarks(image, results)

                        first_frame = frame_no == 0
                        if first_frame:
                            # Banner marking the start of a new recording.
                            cv2.putText(image, 'Start Recording...',(150,250),cv2.FONT_HERSHEY_SIMPLEX,
                                       1,(0,255,0),1,cv2.LINE_AA)
                        cv2.putText(image,'Collecting data for: {}  video no: {}'.format(action,video_no),
                                    (15,20),cv2.FONT_HERSHEY_SIMPLEX, 1, (255,0,0),1,cv2.LINE_AA)
                        cv2.imshow(window_name,image)
                        if first_frame:
                            # Extra GUI tick so the banner frame is shown before continuing.
                            cv2.waitKey(10)

                        np.save(os.path.join(video_dir,str(frame_no)), extract_keypoints(results))

                        if cv2.waitKey(10) & 0xFF == ord('q'):
                            # Leave all three loops; the finally block does the cleanup.
                            return
    finally:
        # Always free the camera and close the window, even on error or early exit.
        camera.release()
        cv2.destroyAllWindows()

In [17]:
# Record the keypoint dataset from the webcam for every label in `actions`.
data_collect(actions)

# Data preprocessing

In [7]:
# Map each action label to its integer class index (its position in `actions`).
label_map = dict(zip(actions, range(len(actions))))

In [8]:
# Display the label -> class index mapping.
label_map

{'HELLO': 0, 'HEY': 1, 'THANK YOU': 2, 'GOOD': 3, 'MORNING': 4}

In [52]:
def preprocess_data(actions):
    """Load the saved keypoint frames and build model inputs and one-hot labels.

    For every action and every video index below `no_of_videos`, loads
    `DATA_PATH/<action>/<video_no>/<frame_no>.npy` for each frame index below
    `no_of_frames`.

    Returns
    -------
    (videos, labels)
        `videos` is the array built from the per-video lists of frame arrays;
        `labels` is `to_categorical` applied to `label_map[action]` for each
        video, cast to int.
    """
    videos = []
    labels = []

    for action in actions:
        for video_no in range(no_of_videos):
            frame_files = (
                os.path.join(DATA_PATH, action, str(video_no), "{}.npy".format(frame_no))
                for frame_no in range(no_of_frames)
            )
            videos.append([np.load(frame_file) for frame_file in frame_files])
            labels.append(label_map[action])

    stacked_videos = np.array(videos)
    one_hot_labels = to_categorical(np.array(labels)).astype(int)
    return stacked_videos, one_hot_labels

In [53]:
# x: stacked keypoint sequences, y: one-hot labels (one row per video).
x, y = preprocess_data(actions)

In [55]:
# Sanity check of the input tensor: (videos, frames per video, keypoints per frame).
x.shape

(150, 30, 1662)

In [56]:
def data_partition(x,y,test_size=0.05,random_state=None):
    """Split `x` and `y` into train and test parts with `train_test_split`.

    Parameters
    ----------
    x, y : array-like
        Samples and their labels, aligned on the first axis.
    test_size : float or int, optional
        Forwarded to `train_test_split` (default 0.05, the value this notebook
        has always used).
    random_state : int or None, optional
        Forwarded to `train_test_split`. The default None keeps the original
        behaviour (a different split on every call); pass an int to make the
        split reproducible.

    Returns
    -------
    (x_train, x_test, y_train, y_test)
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state)
    return x_train, x_test, y_train, y_test

In [57]:
 # Hold out the test split; the remaining samples are used for training.
 x_train, x_test, y_train, y_test = data_partition(x,y)

# TensorBoard Callback setup

In [21]:
def tb_setup():
    """Return a TensorBoard callback that writes its logs to the 'Logs' directory."""
    return TensorBoard(log_dir=os.path.join('Logs'))

In [22]:
# Callback passed to model.fit so training metrics are logged for TensorBoard.
tb_callback = tb_setup()

In [23]:
# Display the callback object to confirm it was created.
tb_callback

<keras.callbacks.TensorBoard at 0x2b8968e1b80>

# Create Neural Network Architecture

In [9]:
def setup_neuralnet(actions):
    """Build the (uncompiled) sequence classifier.

    The network is three stacked LSTM layers (64, 128, 64 units, ReLU) over inputs
    of shape (30, 1662), followed by Dense layers of 64 and 32 units (ReLU) and a
    softmax output with one unit per entry of `actions`.
    """
    class_count = actions.shape[0]
    layer_stack = [
        # Only the last LSTM collapses the sequence to a single vector.
        LSTM(64, return_sequences=True, activation='relu', input_shape=(30,1662)),
        LSTM(128, return_sequences=True, activation='relu'),
        LSTM(64, return_sequences=False, activation='relu'),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(class_count, activation='softmax'),
    ]
    return Sequential(layer_stack)

In [10]:
# Build the network with one output unit per label in `actions`.
model = setup_neuralnet(actions)

# Compile the model

In [11]:
# Categorical cross-entropy pairs with the one-hot labels from preprocess_data
# and the softmax output layer of the model.
model.compile(optimizer = 'Adam', loss = 'categorical_crossentropy', metrics = ['categorical_accuracy'])

# Train the model

In [61]:
# Train on the training split with TensorBoard logging. epochs=1000 is only an upper
# bound: the run recorded below was stopped by hand (KeyboardInterrupt) around epoch 172.
model.fit(x_train, y_train, epochs = 1000,callbacks = [tb_callback])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

Epoch 146/1000
Epoch 147/1000
Epoch 148/1000
Epoch 149/1000
Epoch 150/1000
Epoch 151/1000
Epoch 152/1000
Epoch 153/1000
Epoch 154/1000
Epoch 155/1000
Epoch 156/1000
Epoch 157/1000
Epoch 158/1000
Epoch 159/1000
Epoch 160/1000
Epoch 161/1000
Epoch 162/1000
Epoch 163/1000
Epoch 164/1000
Epoch 165/1000
Epoch 166/1000
Epoch 167/1000
Epoch 168/1000
Epoch 169/1000
Epoch 170/1000
Epoch 171/1000
Epoch 172/1000

KeyboardInterrupt: 

# Testing for predictions

In [12]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 30, 64)            442112    
                                                                 
 lstm_1 (LSTM)               (None, 30, 128)           98816     
                                                                 
 lstm_2 (LSTM)               (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 5)                 165       
                                                                 
Total params: 596,741
Trainable params: 596,741
Non-trai

In [63]:
# Model outputs for every sample in the held-out test split (one row per sample).
res = model.predict(x_test)

In [71]:
# Predicted label for test sample 3: the action with the highest model output.
actions[np.argmax(res[3])]

'THANK YOU'

In [72]:
# True label for the same test sample, decoded from its one-hot row.
actions[np.argmax(y_test[3])]

'THANK YOU'

# Saving the trained model

In [73]:
# Persist the trained model to model.h5 in the working directory.
model.save('model.h5')

In [12]:
# Reload the weights stored in model.h5 into the existing `model` object;
# `model` must already have been built (see setup_neuralnet) before this cell runs.
model.load_weights('model.h5')

# Evaluating the performance using Confusion Matrix

In [14]:
 from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [15]:
# Model outputs for the test split. Needs `x_test` from the data-partition cell;
# the NameError recorded below presumably comes from running this cell in a kernel
# where that cell had not been executed yet.
yhat = model.predict(x_test)

NameError: name 'x_test' is not defined

In [76]:
# Convert the one-hot targets and the model outputs to class-index lists.
# The predictions are recomputed from `x_test` instead of being read from `yhat`:
# this cell stores the label list back into `yhat`, so reading the old value would
# make a second run of the cell fail on an already-converted list.
ytrue = np.argmax(y_test,axis=1).tolist()
yhat = np.argmax(model.predict(x_test),axis=1).tolist()

In [77]:
# Per-class 2x2 confusion matrices for the test split.
multilabel_confusion_matrix(ytrue,yhat)

array([[[6, 0],
        [0, 2]],

       [[3, 1],
        [0, 4]],

       [[7, 0],
        [1, 0]],

       [[7, 0],
        [0, 1]]], dtype=int64)

In [78]:
# Fraction of test samples whose predicted class index equals the true one.
accuracy_score(ytrue,yhat)

0.875

# Real-time prediction

In [13]:
def real_time_detection(model,actions,threshold=0.4,sequence_length=30,max_words=5):
    """Recognise signs live from the webcam and overlay the recent predictions.

    Keeps a sliding window of the last `sequence_length` keypoint vectors. While
    a hand is detected and the window is full, the window is passed to
    `model.predict`; when the highest output exceeds `threshold`, the matching
    entry of `actions` is appended to the displayed sentence unless it repeats
    the previous word. Runs until 'q' is pressed, the camera closes, or a frame
    cannot be read.

    Parameters
    ----------
    model
        Object whose `predict` accepts a batch of shape
        (1, sequence_length, n_keypoints) and returns one score row per sample.
    actions
        Sequence of labels, indexable by class index.
    threshold : float, optional
        Minimum top score for a prediction to be accepted (default 0.4).
    sequence_length : int, optional
        Number of consecutive frames fed to the model (default 30; must match
        the sequence length the model was built for).
    max_words : int, optional
        Number of most recent words kept on screen (default 5).
    """
    video = []
    sentence = []
    camera = cv2.VideoCapture(0)
    try:
        with mp_holistic.Holistic(min_detection_confidence = 0.7, min_tracking_confidence = 0.7) as holistic:
            while camera.isOpened():
                success, frame = camera.read()
                if not success:
                    # No frame was delivered; passing it on would fail inside cvtColor.
                    break

                results, image = mediapipe_detection(frame,holistic)

                draw_landmarks(image, results)

                video.append(extract_keypoints(results))
                video = video[-sequence_length:]

                hand_visible = results.left_hand_landmarks or results.right_hand_landmarks
                if hand_visible and len(video) == sequence_length:
                    res = model.predict(np.expand_dims(video,axis=0))[0]
                    best = int(np.argmax(res))  # computed once instead of per use

                    if res[best] > threshold:
                        word = actions[best]
                        # Skip immediate repeats so a held sign is not added every frame.
                        if not sentence or sentence[-1] != word:
                            sentence.append(word)
                        sentence = sentence[-max_words:]

                # The banner spans the actual frame width instead of assuming 640 px.
                cv2.rectangle(image,(0,0),(image.shape[1],40),(245,116,17),-1)

                cv2.putText(image, ' '.join(sentence),(3,30), cv2.FONT_HERSHEY_SIMPLEX,1,(255,255,255),
                               2,cv2.LINE_AA)

                cv2.imshow('Sign Language Detection',image)

                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break
    finally:
        # Always free the camera and close the window, even if prediction raised.
        camera.release()
        cv2.destroyAllWindows()

In [15]:
# Start live recognition from the webcam; press 'q' in the video window to stop.
real_time_detection(model,actions)