In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp

In [2]:
# !pip install --upgrade arabic-reshaper
import arabic_reshaper
# !pip install python-bidi
#!pip install pyttsx3 
from bidi.algorithm import get_display

from PIL import ImageFont, ImageDraw, Image

# Keypoints using MP Holistic

In [3]:
# Short aliases for the two MediaPipe modules used by the helper functions
# and the real-time loop in this notebook.
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [4]:
def mediapipe_detection(image, model):
    """Run a MediaPipe model on a single BGR frame.

    The frame is converted to RGB before being handed to ``model.process``
    and is flagged read-only for the duration of that call. Afterwards the
    flag is restored and the frame is converted back to BGR.

    Args:
        image: Frame in BGR channel order (as read by OpenCV).
        model: Object exposing ``process(image)``, e.g. the Holistic model.

    Returns:
        tuple: ``(bgr_frame, results)`` where ``results`` is whatever
        ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Lock the buffer while the model reads it, then unlock it again.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, results

In [5]:
def draw_styled_landmarks(image, results):
    """Draw face, pose and hand landmarks from ``results`` onto ``image``.

    ``image`` is modified in place; nothing is returned. Each landmark group
    gets its own pair of drawing styles (one for the landmark points, one
    for the connections between them).

    Args:
        image: Frame to draw on.
        results: Holistic output exposing ``face_landmarks``,
            ``pose_landmarks``, ``left_hand_landmarks`` and
            ``right_hand_landmarks``.
    """
    # One row per landmark group:
    # (landmarks, connections, landmark style, connection style)
    layers = (
        # Face. The connection colour used to be (80, 256, 121); 256 is not a
        # valid 8-bit channel value, so it is written as 255 here.
        (results.face_landmarks, mp_holistic.FACEMESH_CONTOURS,
         dict(color=(80, 110, 10), thickness=1, circle_radius=1),
         dict(color=(80, 255, 121), thickness=1, circle_radius=1)),
        # Pose
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
         dict(color=(80, 22, 10), thickness=2, circle_radius=4),
         dict(color=(80, 44, 121), thickness=2, circle_radius=2)),
        # Left hand
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         dict(color=(121, 22, 76), thickness=2, circle_radius=4),
         dict(color=(121, 44, 250), thickness=2, circle_radius=2)),
        # Right hand
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         dict(color=(245, 117, 66), thickness=2, circle_radius=4),
         dict(color=(245, 66, 230), thickness=2, circle_radius=2)),
    )

    for landmarks, connections, landmark_style, connection_style in layers:
        mp_drawing.draw_landmarks(
            image, landmarks, connections,
            mp_drawing.DrawingSpec(**landmark_style),
            mp_drawing.DrawingSpec(**connection_style),
        )

In [6]:
def extract_keypoints(results):
    """Flatten the holistic landmarks of one frame into a 1-D feature vector.

    The vector is the concatenation, in this order, of:
      * pose:       33 landmarks x (x, y, z, visibility)
      * face:       468 landmarks x (x, y, z)
      * left hand:  21 landmarks x (x, y, z)
      * right hand: 21 landmarks x (x, y, z)

    A group that is missing (falsy) in ``results`` contributes a block of
    zeros of the corresponding length.

    Args:
        results: Object exposing ``pose_landmarks``, ``face_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``; each is
            either falsy or has a ``landmark`` iterable.

    Returns:
        numpy.ndarray: the concatenated 1-D array.
    """
    def _flatten(group, fields, n_points):
        # Missing detection -> zero placeholder of the expected size.
        if not group:
            return np.zeros(n_points * len(fields))
        rows = [[getattr(point, name) for name in fields] for point in group.landmark]
        return np.array(rows).flatten()

    xyz = ("x", "y", "z")
    parts = [
        _flatten(results.pose_landmarks, xyz + ("visibility",), 33),
        _flatten(results.face_landmarks, xyz, 468),
        _flatten(results.left_hand_landmarks, xyz, 21),
        _flatten(results.right_hand_landmarks, xyz, 21),
    ]
    return np.concatenate(parts)

# Preprocess Data and Create Labels and Features

In [7]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [8]:
# Root folder that holds the exported keypoint arrays (.npy files)
DATA_PATH = 'MP_Data 4'

# Class labels the model is trained to detect. The loading cell also uses
# each label as a sub-folder name under DATA_PATH, so the spelling here has
# to match the folders on disk.
actions = np.array([
    'ineed',
    'ambulance',
    'where',
    'street',
    'thankyou',
    'work',
    'bootcamp',
    'in this',
    'i_need_ambulance',
    'i_want',
    'report',
    'accedint',
])

# Number of sequences (videos) per action configured for this notebook.
# NOTE(review): the loading cell enumerates the folders found on disk and
# does not read this value.
no_sequences = 10

# Number of frames in each sequence
sequence_length = 30

In [9]:
# Map every action label to its integer class index (its position in `actions`)
label_map = dict(zip(actions, range(len(actions))))

In [10]:
# Show the label -> class-index mapping
label_map

{'ineed': 0,
 'ambulance': 1,
 'where': 2,
 'street': 3,
 'thankyou': 4,
 'work': 5,
 'bootcamp': 6,
 'in this': 7,
 'i_need_ambulance': 8,
 'i_want': 9,
 'report': 10,
 'accedint': 11}

In [28]:
# Load every recorded sequence from disk.
#   sequences[i] -> list of `sequence_length` per-frame keypoint arrays
#   labels[i]    -> integer class index of that sequence
sequences, labels = [], []
for action in actions:
    action_dir = os.path.join(DATA_PATH, action)

    # Keep only numerically named entries: stray items such as
    # `.ipynb_checkpoints` or `Thumbs.db` would otherwise make the integer
    # conversion fail. Sorting numerically makes the sample order (and so
    # the seeded train/test split) independent of the OS listing order.
    sequence_dirs = sorted(
        (name for name in os.listdir(action_dir) if name.isdecimal()),
        key=int,
    )

    for sequence in sequence_dirs:
        window = [
            np.load(os.path.join(action_dir, sequence, "{}.npy".format(frame_num)))
            for frame_num in range(sequence_length)
        ]
        sequences.append(window)
        labels.append(label_map[action])

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'MP_Data 4\\ineed'

In [17]:
# Sanity check: (number of sequences, frames per sequence, features per frame)
np.array(sequences).shape

(480, 30, 1662)

In [18]:
# Sanity check: one label per loaded sequence
np.array(labels).shape

(480,)

In [19]:
# Feature tensor: stack the nested list of per-frame arrays into one ndarray
X = np.array(sequences)

In [20]:
# Confirm the feature tensor has the shape the model's input layer expects
X.shape

(480, 30, 1662)

In [21]:
# One-hot encode the integer class indices and cast the result to int
y = to_categorical(labels).astype(int)

In [25]:
# Hold out 20% of the sequences for testing. `stratify=y` requests a
# class-proportional split and `random_state` fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=5)

In [26]:
# Training targets: (number of training sequences, number of classes)
y_train.shape

(384, 12)

In [27]:
# Test targets: (number of test sequences, number of classes)
y_test.shape

(96, 12)

# Build and Train GRU Neural Network

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, GRU
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

In [12]:
# TensorBoard callback writing its event files to the local `Logs` folder
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

In [13]:
# Sequence classifier: two stacked GRU layers followed by a small dense head.
# Input is one window of 30 frames x 1662 keypoint features; the softmax
# output has one unit per entry in `actions`.
model = Sequential([
    GRU(256, return_sequences=True, activation='tanh', input_shape=(30, 1662)),
    GRU(128, return_sequences=False, activation='tanh'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(actions.shape[0], activation='softmax'),
])

In [14]:
# Print the layer-by-layer architecture and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru (GRU)                    (None, 30, 256)           1474560   
_________________________________________________________________
gru_1 (GRU)                  (None, 128)               148224    
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 12)                396       
Total params: 1,633,516
Trainable params: 1,633,516
Non-trainable params: 0
_________________________________________________________________


In [32]:
# Checkpoint callback that keeps only the weights of the best epoch.
#
# The model is compiled with metrics=['categorical_accuracy'], so the
# validation metric is logged as 'val_categorical_accuracy'. The previous
# value 'val_accuracy' never appears in the training logs, which means the
# callback could never save anything.
#
# NOTE(review): this callback is not included in the `callbacks` list passed
# to `model.fit` in this notebook; add it there if checkpoints are wanted.
checkpoint_filepath = './tmp/checkpoint'
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_categorical_accuracy',
    mode='max',
    save_best_only=True)

In [33]:
# Multi-class setup: Nadam optimizer, categorical cross-entropy loss on the
# one-hot targets, and categorical accuracy as the reported metric.
model.compile(optimizer='nadam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [34]:
# Both callbacks are created without an explicit `monitor`, so they track
# Keras' default monitored quantity.
#   - early stopping: give up after 18 epochs without improvement and roll
#     back to the best weights seen
#   - LR schedule: halve the learning rate after 3 epochs without improvement
early_stopping = EarlyStopping(patience=18, verbose=1, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(factor=.5, patience=3, verbose=1)

# The epoch count is only an upper bound; early stopping ends the run.
# 20% of the training data is held back for validation.
model.fit(
    X_train,
    y_train,
    epochs=2000,
    batch_size=5,
    validation_split=.2,
    shuffle=True,
    callbacks=[tb_callback, early_stopping, reduce_lr],
)

Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 00019: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 00026: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 00038: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000


Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 00045: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 00052: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 00063: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
Epoch 71/2000
Epoch 72/2000
Epoch 73/2000
Epoch 74/2000
Epoch 75/2000
Epoch 76/2000
Epoch 77/2000
Epoch 78/2000
Epoch 79/2000
Epoch 80/2000
Epoch 81/2000
Epoch 82/2000
Epoch 00082: ReduceLROnPlateau reducing learning rate to 7.812500371073838e-06.
Epoch 83/2000
Epoch 84/2000
Epoch 85/2000
Epoch 86/2000
Epoch 87/2000
Epoch 88/2000
Epoch 89/2000
Epoch 90/2000
Epoch 91

Epoch 126/2000
Epoch 127/2000
Epoch 128/2000
Epoch 129/2000
Epoch 130/2000
Epoch 131/2000
Epoch 132/2000
Epoch 133/2000
Epoch 134/2000
Epoch 135/2000
Epoch 136/2000
Epoch 137/2000
Epoch 138/2000
Epoch 139/2000
Epoch 00139: ReduceLROnPlateau reducing learning rate to 9.765625463842298e-07.
Epoch 140/2000
Epoch 141/2000
Epoch 142/2000
Epoch 143/2000
Epoch 144/2000
Epoch 145/2000
Epoch 146/2000
Epoch 147/2000
Epoch 148/2000
Epoch 149/2000
Epoch 150/2000
Epoch 151/2000
Epoch 00151: ReduceLROnPlateau reducing learning rate to 4.882812731921149e-07.
Epoch 152/2000
Epoch 153/2000
Epoch 154/2000
Epoch 155/2000
Epoch 00155: ReduceLROnPlateau reducing learning rate to 2.4414063659605745e-07.
Epoch 156/2000
Epoch 157/2000
Epoch 158/2000
Epoch 00158: ReduceLROnPlateau reducing learning rate to 1.2207031829802872e-07.
Epoch 159/2000
Epoch 160/2000
Epoch 161/2000
Epoch 00161: ReduceLROnPlateau reducing learning rate to 6.103515914901436e-08.
Epoch 162/2000
Epoch 163/2000
Epoch 164/2000
Epoch 00164: 

Epoch 198/2000
Epoch 199/2000
Epoch 200/2000
Epoch 00200: ReduceLROnPlateau reducing learning rate to 7.450580950807417e-12.
Epoch 201/2000
Epoch 202/2000
Epoch 203/2000
Epoch 00203: ReduceLROnPlateau reducing learning rate to 3.725290475403709e-12.
Epoch 204/2000
Epoch 205/2000
Epoch 206/2000
Epoch 00206: ReduceLROnPlateau reducing learning rate to 1.8626452377018543e-12.
Epoch 207/2000
Epoch 208/2000
Epoch 209/2000
Epoch 00209: ReduceLROnPlateau reducing learning rate to 9.313226188509272e-13.
Epoch 210/2000
Epoch 211/2000
Epoch 00211: early stopping


<keras.callbacks.History at 0x1f90dfa5eb0>

# Make Predictions

In [35]:
# Class-probability predictions for every test sequence
res = model.predict(X_test)

In [36]:
# Spot check: predicted label for test sample 4
actions[np.argmax(res[4])]

'street'

In [37]:
# Spot check: ground-truth label for the same test sample
actions[np.argmax(y_test[4])]

'street'

# Save Weights

In [38]:
# Persist the trained model to `action.h5`
model.save('action.h5')

In [None]:
# del model

In [15]:
# Restore the trained weights from `action.h5` into the model built above.
# NOTE(review): presumably this lets a fresh kernel skip the training cells;
# the architecture cell must have been run first.
model.load_weights('action.h5')

# Evaluation using Confusion Matrix and Accuracy

In [18]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [19]:
# Predicted class probabilities for the test set (requires the data-loading
# and train/test split cells to have been run in this kernel)
yhat = model.predict(X_test)

NameError: name 'X_test' is not defined

In [41]:
# Convert one-hot targets and predicted probabilities to integer class indices.
# Predictions are recomputed here rather than read from an existing `yhat`:
# the old form replaced `yhat` with its own argmax, so running the cell a
# second time failed on the already-flattened list.
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(model.predict(X_test), axis=1).tolist()

In [42]:
# One 2x2 confusion matrix per class
multilabel_confusion_matrix(ytrue, yhat)

array([[[87,  3],
        [ 0,  6]],

       [[90,  0],
        [ 2,  4]],

       [[89,  1],
        [ 0,  6]],

       [[90,  0],
        [ 1,  5]],

       [[82,  2],
        [ 2, 10]],

       [[83,  1],
        [ 2, 10]],

       [[81,  3],
        [ 2, 10]],

       [[83,  1],
        [ 1, 11]],

       [[90,  0],
        [ 1,  5]],

       [[90,  0],
        [ 0,  6]],

       [[90,  0],
        [ 0,  6]],

       [[90,  0],
        [ 0,  6]]], dtype=int64)

In [43]:
# Overall accuracy on the held-out test set
accuracy_score(ytrue, yhat)

0.8854166666666666

In [20]:
from tensorflow.keras.losses import CategoricalCrossentropy


In [45]:
# Categorical cross-entropy of the model's predictions on the test set,
# returned as a plain NumPy scalar
cc = CategoricalCrossentropy()
cc(y_test, model.predict(X_test)).numpy()

0.45852625

# Test in Real Time

In [16]:
from scipy import stats
 
import pyttsx3
from gtts import gTTS  
from playsound import playsound  

In [17]:
# Bar colours for the probability overlay; the palette is cycled when there
# are more classes than entries.
colors = [(245,117,16), (117,245,16), (16,117,245),(16,117,245)]*4
def prob_viz(res, actions, input_frame, colors):
    """Overlay one horizontal probability bar per class on a copy of a frame.

    Args:
        res: Iterable of class probabilities, one per entry in ``actions``.
        actions: Sequence of class labels, indexed like ``res``.
        input_frame: Frame to annotate; it is copied, not modified.
        colors: Non-empty sequence of colour tuples. Colours are reused
            cyclically, so it may be shorter than ``res``.

    Returns:
        The annotated copy of ``input_frame``.
    """
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        top = 60 + num * 40
        # Modulo indexing avoids an IndexError when classes outnumber colours.
        bar_color = colors[num % len(colors)]
        # Bar length is the probability scaled to 0-100 px.
        cv2.rectangle(output_frame, (0, top), (int(prob*100), top + 30), bar_color, -1)
        cv2.putText(output_frame, actions[num], (0, top + 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)

    return output_frame

In [58]:
# 1. New detection variables
sequence = []       # rolling window of per-frame keypoint vectors
sentence = []       # recognised action labels shown in the top banner
predictions = []    # argmax class index of every prediction made
threshold = 0.5     # minimum probability for a prediction to be accepted
Text_Speech=pyttsx3.init()

# Action label -> Arabic phrase, looked up as `dict1.get(label, label)`.
# The mapping used to be keyed by the Arabic phrase (with a misspelt label as
# the value), so a lookup by label could never succeed.
dict1 =  {"i_need_ambulance": "احتاج سيارة اسعاف"}


cap = cv2.VideoCapture('./ambulance 8.avi')
# cap = cv2.VideoCapture('./WhatsApp Video 2021-12-08 at 14.31.03.mp4')

# cap = cv2.VideoCapture(0)
fourcc = cv2.VideoWriter_fourcc(*'MJPG')
out = cv2.VideoWriter('output2.avi', fourcc, 30.0, (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                                                    int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))))

# try/finally guarantees the capture, the writer and the preview window are
# released even if a frame raises or the cell is interrupted.
try:
    # The font does not change between frames, so load it once.
    fontpath = "arial.ttf" # <== https://www.freefontspro.com/14454/arial.ttf
    font = ImageFont.truetype(fontpath, 32)

    # Set mediapipe model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5, model_complexity=2) as holistic:
        while cap.isOpened():

            # Read feed
            ret, frame = cap.read()
            if not ret: # end of video
                break

            # Make detections
            image, results = mediapipe_detection(frame, holistic)

            # Draw landmarks
            draw_styled_landmarks(image, results)

            # 2. Prediction logic: keep only the 30 most recent frames
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-30:]

            # Show how many frames of the current window have been collected
            cv2.putText(image, str(len(sequence)), (30,100), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

            if len(sequence) == 30:
                res = model.predict(np.expand_dims(sequence, axis=0))[0]
                best = np.argmax(res)
                print(actions[best], format(res[best], '.0%'))

                # convert text to speech
                Text_Speech.say(actions[best])
                Text_Speech.runAndWait()

                # NOTE(review): a fixed phrase is synthesised and played on
                # every prediction, independent of the predicted action.
                obj = gTTS(text="شكراً لهذا المعسكر سدايا", lang='ar', slow=False)

                # Save the synthesised audio to exam.mp3, then play it
                obj.save("exam.mp3")
                playsound("exam.mp3")

                predictions.append(best)

                # 3. Sentence logic: accept confident predictions only, and
                # do not repeat the label that is already last in the sentence
                if res[best] > threshold:
                    if len(sentence) > 0:
                        if actions[best] != sentence[-1]:
                            sentence.append(actions[best])
                    else:
                        sentence.append(actions[best])

                    sequence = [] # start new sequence

                # Keep the banner short
                if len(sentence) > 5:
                    sentence = sentence[-3:]

                # Viz probabilities
                # image = prob_viz(res, actions, image, colors)

            cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3,30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            # NOTE(review): the overlay text is hard-coded; the commented line
            # below derives it from the recognised sentence instead.
            text = "احتاج مساعدة اين الشارع"
            # text = ' '.join([dict1.get(i, i) for i in sentence])

            reshaped_text = arabic_reshaper.reshape(text)    # correct its shape
            bidi_text = get_display(reshaped_text)           # correct its direction
            img_pil = Image.fromarray(image)
            draw = ImageDraw.Draw(img_pil)
            draw.text((100, 80),bidi_text, font = font)
            img = np.array(img_pil)

            # Show to screen
            cv2.imshow('OpenCV Feed', img)
            # NOTE(review): the file receives `image` (without the Arabic
            # overlay) while the window shows `img` - confirm this is intended.
            out.write(image)

            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    out.release()
    cap.release()
    cv2.destroyAllWindows()

ambulance 100%


In [40]:
# Manual cleanup: release the video writer and capture and close the preview
# window (presumably for when the real-time cell stopped before reaching its
# own release calls).
out.release()
cap.release()
cv2.destroyAllWindows()

In [70]:
# Quick check of the label lookup: labels missing from `dict1` fall back to
# themselves, so an unknown label passes through unchanged.
sentence=['a']
text = ' '.join([dict1.get(i, i) for i in sentence])
text

'a'

In [50]:
# from imutils.video import FileVideoStream
# from imutils.video import WebcamVideoStream
# import imutils

In [197]:
# # 1. New detection variables
# sequence = []
# sentence = []
# predictions = []
# threshold = 0.7
# i = 0

# # cap = cv2.VideoCapture('./ineed-9_hiz8v33F.mp4')
# fvs = WebcamVideoStream(0).start()
# # fourcc = cv2.VideoWriter_fourcc(*'MJPG')
# # out = cv2.VideoWriter('output2.avi', fourcc, 30.0, (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),  
# #                                                     int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))))

# # Set mediapipe model 
# with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5, model_complexity=2) as holistic:
#     while True:
            
#         # Read feed
#         frame = fvs.read()
        
#         if i == 30:
#             i = 1
#         else:
#             i+=1
#         cv2.putText(image, str(i), (50,5), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

#         # Make detections
#         image, results = mediapipe_detection(frame, holistic)
# #         print(results)
        
#         # Draw landmarks
#         draw_styled_landmarks(image, results)
        
#         # 2. Prediction logic
#         keypoints = extract_keypoints(results)
#         sequence.append(keypoints)
#         sequence = sequence[-30:]
        
        
#         if len(sequence) == 30:
#             res = model.predict(np.expand_dims(sequence, axis=0))[0]
# #             print(actions[np.argmax(res)])
#             predictions.append(np.argmax(res))
            
            
#         #3. logic
#             if np.unique(predictions[-10:])[0]==np.argmax(res): 
#                 if res[np.argmax(res)] > threshold: 
                    
#                     if len(sentence) > 0: 
#                         if actions[np.argmax(res)] != sentence[-1]:
#                             sentence.append(actions[np.argmax(res)])
#                     else:
#                         sentence.append(actions[np.argmax(res)])
                    
#                     sequence = [] # start new sequence

#             if len(sentence) > 5: 
#                 sentence = sentence[-5:]

#             # Viz probabilities
#             image = prob_viz(res, actions, image, colors)
            
#         cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
#         cv2.putText(image, ' '.join(sentence), (3,30), 
#                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
# #         print(' '.join(sentence), end=' ')
        
#         # Show to screen
#         cv2.imshow('OpenCV Feed', image)
# #         out.write(image)
        
#         if cv2.waitKey(10) & 0xFF == ord('q'):
#             break
    

# #     out.release()
#     cv2.destroyAllWindows()
#     fvs.stop()
#     fvs.stream.release()

In [208]:
# cv2.destroyAllWindows()
# fvs.stop()
# fvs.stream.release()