# 1 - Import dependencies

In [1]:
import numpy as np
import cv2 
import os
import mediapipe as mp
import tensorflow as tf 
from tensorflow import keras
import matplotlib.pyplot as plt
import time
import pandas as pd

2022-05-23 21:46:08.231782: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/mina/.local/lib/python3.8/site-packages/cv2/../../lib64:
2022-05-23 21:46:08.231799: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Shortcuts to the MediaPipe holistic solution and its drawing helpers.
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

# Presumably the total landmark counts of MediaPipe's hand and pose models
# (TODO confirm); neither name is referenced in the cells shown here.
num_hand_marks = 21
num_pose_marks = 33


# Landmark indices that are kept from each detection group. The rows follow the
# order used by draw_updated_styled / extract_keypoints:
# row 0 -> pose, row 1 -> left hand, row 2 -> right hand.
# 9 + 11 + 11 = 31 landmarks, i.e. 62 values per frame once x and y are flattened.
pose_selected_landmarks = [
    [0,2,5,11,13,15,12,14,16],
    [0,2,4,5,8,9,12,13,16,17,20],
    [0,2,4,5,8,9,12,13,16,17,20],
]

def draw_updated_styled(image,results):
    """Draw the selected pose and hand landmarks onto ``image`` in place.

    Args:
        image: BGR frame with shape (rows, cols, channels); it is modified directly.
        results: MediaPipe holistic result exposing ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.

    Only the indices listed in ``pose_selected_landmarks`` are drawn. Groups that
    were not detected, and individual landmarks that cannot be mapped to a pixel,
    are skipped.
    """
    image_rows, image_cols, _ = image.shape

    detected_groups = [
        results.pose_landmarks,
        results.left_hand_landmarks,
        results.right_hand_landmarks,
    ]

    for group, selected in zip(detected_groups, pose_selected_landmarks):
        if not group:
            continue
        landmarks = group.landmark
        for idx in selected:
            point = landmarks[idx]
            landmark_px = mp_drawing._normalized_to_pixel_coordinates(
                point.x, point.y, image_cols, image_rows)
            # The helper returns None when a coordinate lies outside [0, 1]
            # (landmark off-screen); cv2.circle would raise on a None centre.
            if landmark_px is None:
                continue
            cv2.circle(image, landmark_px, 2, (0,0,255), 4)
                
def extract_keypoints(results):
    """Flatten the selected pose / left-hand / right-hand landmarks into one vector.

    For every group the (x, y) pairs of the indices in
    ``pose_selected_landmarks`` are laid out as x0, y0, x1, y1, ...; a group
    that was not detected contributes zeros of the same length. The three
    parts are concatenated in the order pose, left hand, right hand.

    Args:
        results: MediaPipe holistic result object.

    Returns:
        1-D numpy array of keypoint coordinates.
    """
    groups = (
        results.pose_landmarks,
        results.left_hand_landmarks,
        results.right_hand_landmarks,
    )

    parts = []
    for detected, wanted in zip(groups, pose_selected_landmarks):
        if detected:
            marks = detected.landmark
            coords = np.array([[marks[i].x, marks[i].y] for i in wanted]).flatten()
        else:
            # Missing group: keep the slot so the vector length stays constant
            coords = np.zeros(len(wanted) * 2)
        parts.append(coords)

    return np.concatenate(parts)



# Run the holistic model on a BGR frame; return the frame and the detection results
def mediapipe_detection(image,model):
    """Process one BGR frame with ``model`` and return ``(bgr_image, results)``.

    The frame is converted to RGB before ``model.process`` is called and
    converted back to BGR afterwards.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Mark the buffer read-only while the model works on it
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True
    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, results
    
    


def draw_landmark_from_array(image,keyPoints):
    """Draw keypoints stored as a flat ``x0, y0, x1, y1, ...`` sequence onto ``image``.

    Args:
        image: BGR frame with shape (rows, cols, channels); it is modified in place.
        keyPoints: flat sequence of normalized coordinates (fractions of the
            image width / height).

    Pairs where either coordinate is exactly 0 are treated as "missing" and
    skipped, as are pairs that cannot be mapped to a pixel position.
    """
    image_rows, image_cols, _ = image.shape

    for i in range(len(keyPoints)//2):
        x = keyPoints[i*2]
        y = keyPoints[i*2+1]
        if x == 0 or y == 0:
            continue
        landmark_px = mp_drawing._normalized_to_pixel_coordinates(
            x, y, image_cols, image_rows)
        # The helper returns None for coordinates outside [0, 1] (e.g. an
        # extrapolated or off-screen point); cv2.circle would raise on None.
        if landmark_px is None:
            continue
        cv2.circle(image, landmark_px, 2, (0,0,255), 4)

                


In [3]:
# Sign labels; a label's position in this list is its class index.
actions = [
    'one', 'you', 'teacher', 'girl', 'tomorrow',
    'mom', 'look', 'crazy', 'walk', 'agree',
    'family', 'friends', 'get_to_know', 'help', 'how_are_you',
    'love', 'street', 'teach_me', 'thank_yuo', 'want',
]

# label -> class index lookup
dic = {label: position for position, label in enumerate(actions)}

print(dic)

{'one': 0, 'you': 1, 'teacher': 2, 'girl': 3, 'tomorrow': 4, 'mom': 5, 'look': 6, 'crazy': 7, 'walk': 8, 'agree': 9, 'family': 10, 'friends': 11, 'get_to_know': 12, 'help': 13, 'how_are_you': 14, 'love': 15, 'street': 16, 'teach_me': 17, 'thank_yuo': 18, 'want': 19}


# 4 - Build the model and load the trained weights

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense,Input,Dropout
from tensorflow.keras.models import Model

# Location of the weights file that is loaded into the model further down.
best_model_file_name = os.path.join("weights","best.h5")



In [9]:
def final_model():
    """Build and compile the stacked-LSTM sign classifier.

    Input shape is (16, 62): 16 time steps of 62 features, which matches the
    window length ``n_frames`` and the 2 x 31 coordinates produced by
    ``extract_keypoints``. The output is a softmax over ``len(actions)`` classes.

    Returns:
        A compiled Keras ``Model`` (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    inputs = Input(shape=(16,62))

    # (units, return_sequences) per LSTM; only the last one collapses the time axis
    lstm_stack = [(64, True), (128, True), (256, True), (96, False)]
    features = inputs
    for units, keep_sequence in lstm_stack:
        features = LSTM(units, return_sequences=keep_sequence, activation="relu")(features)

    features = Dense(64, activation="relu")(features)
    probabilities = Dense(len(actions), activation="softmax")(features)

    classifier = Model(inputs=inputs, outputs=probabilities)
    classifier.compile(optimizer="Adam", loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return classifier



# Build the network once and print its layer summary.
model = final_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 16, 62)]          0         
                                                                 
 lstm_8 (LSTM)               (None, 16, 64)            32512     
                                                                 
 lstm_9 (LSTM)               (None, 16, 128)           98816     
                                                                 
 lstm_10 (LSTM)              (None, 16, 256)           394240    
                                                                 
 lstm_11 (LSTM)              (None, 96)                135552    
                                                                 
 dense_4 (Dense)             (None, 64)                6208      
                                                                 
 dense_5 (Dense)             (None, 20)                1300  

In [10]:
# Restore the saved weights into the freshly built model
model.load_weights(best_model_file_name)

# 5 - test in real time

In [11]:
import numpy as np
import cv2 
import os
import mediapipe as mp
import tensorflow as tf 
from tensorflow import keras
import matplotlib.pyplot as plt
import time
import pandas as pd

In [12]:

def view_probability(res, actions, image):
    """Draw the top prediction (or "not signing") in a banner at the top of ``image``.

    Args:
        res: per-class probabilities as a list or numpy array, or any other
            value (e.g. ``None``) when there is no prediction to show.
        actions: class labels, indexed like ``res``.
        image: BGR frame; it is drawn on in place.

    Returns:
        The same ``image`` object, after drawing.
    """
    width = image.shape[1]

    # isinstance also accepts subclasses; an empty container has no maximum,
    # so it is reported as "not signing" instead of making np.argmax raise.
    if isinstance(res, (list, np.ndarray)) and len(res) > 0:
        max_prob_index = int(np.argmax(res))
        max_prob = res[max_prob_index]
        text = f'{actions[max_prob_index]} - {max_prob:.3f}'
    else:
        text = "not signing"

    # Filled banner across the top 40 pixels, then the label on top of it
    cv2.rectangle(image, (0,0), (width, 40), (0,255,0), -1)
    cv2.putText(image, text, (0, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)

    return image



def view_sentence(sentence,image):
    """Draw the words of ``sentence`` in a banner along the bottom of ``image``.

    Args:
        sentence: sequence of strings; they are joined with single spaces.
        image: BGR frame with shape (rows, cols, channels); drawn on in place.

    Returns:
        The same ``image`` object, after drawing.
    """
    rows, cols, _ = image.shape
    caption = ' '.join(sentence)

    # Filled banner over the bottom 40 pixels
    banner_top = rows - 40
    cv2.rectangle(image, (0, banner_top), (cols, rows), (255, 0, 0), -1)

    text_origin = (10, rows - 10)
    cv2.putText(image, caption, text_origin,
                cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
    return image


def compare_frames(prev_frame,current_frame,threshold,enabled=False):
    """Report whether ``current_frame`` differs enough from ``prev_frame``.

    Args:
        prev_frame: previous frame as a numpy array, or anything else (e.g.
            ``None``) when there is no previous frame yet.
        current_frame: current frame, same shape and dtype as ``prev_frame``.
        threshold: the summed absolute pixel difference must exceed this value.
        enabled: when False (the default) the check is bypassed and True is
            always returned, which is what the previous unconditional
            ``return True`` did. Pass True to run the actual comparison.

    Returns:
        True when the check is disabled or the frames differ by more than
        ``threshold``; False otherwise, including when there is no previous frame.
    """
    if not enabled:
        return True

    if isinstance(prev_frame, np.ndarray):
        diff = cv2.absdiff(prev_frame, current_frame)
        return bool(diff.sum() > threshold)

    return False
    
def evaluate_list_comparisons(s, enabled=False):
    """Decide whether the recent frame comparisons justify running a prediction.

    Args:
        s: iterable of 0/1 flags from the most recent frame comparisons.
        enabled: when False (the default) the gate is bypassed and True is
            always returned, which is what the previous unconditional
            ``return True`` did. Pass True to require at least two set flags.

    Returns:
        True when the gate is disabled or ``sum(s) >= 2``; False otherwise.
    """
    if not enabled:
        return True
    return sum(s) >= 2

In [14]:
# 1. New detection variables
# Rolling window of per-frame keypoint vectors; starts with one all-zero frame of 62 values.
sequence = [np.zeros(62)]
# Recognised words shown in the bottom banner (the loop keeps at most the last 5).
sentence = []
# Class index predicted for each full window, in order of arrival.
predictions = []
# Minimum predicted probability for a word to be appended to `sentence`.
threshold = 0.5
# Number of frames in one model input window.
n_frames = 16


def fill_keypoints(array):
    """Replace zero entries by linear extrapolation from the two preceding frames.

    From the third frame on, every value equal to 0 becomes
    ``2 * previous - one_before_previous`` at the same position. Frames are
    processed in order, so a value filled in one frame feeds the next.

    Note: only the outer container is copied via ``array.copy()``. When
    ``array`` is a list of frames, the frames themselves are updated in place
    and are shared with the input.

    Args:
        array: sequence of equally long frames (each an indexable, mutable
            sequence of numbers).

    Returns:
        The copied container holding the filled frames.
    """
    filled = array.copy()
    for pos in range(2, len(filled)):
        older, previous, current = filled[pos - 2], filled[pos - 1], filled[pos]
        for slot in range(len(current)):
            if current[slot] == 0:
                current[slot] = previous[slot] * 2 - older[slot]

    return filled
                

# Latest probability vector from the model (None while there is nothing to show)
res = None
# Results of the most recent frame comparisons (only the last two are kept)
last_comparisons = []
prev_frame = None

cap = cv2.VideoCapture(0)
try:
    # Set mediapipe model; the context manager closes it when the loop ends,
    # whether it ends normally or through an exception.
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():

            ret, frame = cap.read()
            if not ret:
                break

            frame = cv2.resize(frame,(640,480))

            comparison_result = compare_frames(prev_frame,frame,4000000)
            prev_frame = frame

            last_comparisons.append(int(comparison_result))
            last_comparisons = last_comparisons[-2:]

            image, results = mediapipe_detection(frame, holistic)

            # 2. Prediction logic
            if evaluate_list_comparisons(last_comparisons):
                keypoints = extract_keypoints(results)

                # Keep a sliding window of the last n_frames keypoint vectors
                sequence.append(keypoints)
                sequence = sequence[-n_frames:]
                sequence = fill_keypoints(sequence)

                draw_landmark_from_array(image,keypoints)

                if len(sequence) == n_frames:
                    res = model.predict(np.expand_dims(sequence, axis=0))[0]
                    best = int(np.argmax(res))
                    predictions.append(best)

                    # 3. Viz logic
                    # Accept a word only when the last two predictions agree.
                    # The earlier check, np.unique(predictions[-2:])[0] == argmax,
                    # compared against the *smallest* of the two class indices and
                    # therefore also passed whenever the class index went down.
                    if len(set(predictions[-2:])) == 1:
                        if res[best] > threshold:
                            # Do not repeat the word that is already last in the sentence
                            if not sentence or actions[best] != sentence[-1]:
                                sentence.append(actions[best])

                    if len(sentence) > 5:
                        sentence = sentence[-5:]
            else:
                res = None

            image = cv2.resize(image,(840,640))

            image = view_probability(res, actions, image)
            image = view_sentence(sentence,image)

            cv2.imshow('OpenCV Feed', image)

            # Break
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always give the webcam back and close the preview window, even when an
    # exception interrupts the loop.
    cap.release()
    cv2.destroyAllWindows()

In [24]:
# Manual cleanup cell: release the webcam and close all OpenCV windows
# (presumably for when the capture loop stopped before its own cleanup ran).
cap.release()
cv2.destroyAllWindows()

(640, 840, 3)

In [12]:
# Manual cleanup cell: release the webcam handle.
cap.release()

In [13]:
# Manual cleanup cell: close all OpenCV windows.
cv2.destroyAllWindows()

In [26]:
# Scratch arrays for trying out a "replace zeros with the value from `old`" fill.
old = np.array([5,2,2,2,2,2])
test = np.array([5,2,3,0,5,1])


In [30]:
# Scratch: element-wise bitwise AND of the array with itself (gives back the same values).
test&test

array([5, 2, 3, 0, 5, 1])

In [32]:
# Wherever `test` holds a zero, copy in the value at the same position of `old`
zero_slots = test == 0
test[zero_slots] = old[zero_slots]

In [33]:
# Show the array after the zero fill.
test

array([5, 2, 3, 2, 5, 1])