In [None]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp

In [None]:
# !pip install --upgrade arabic-reshaper
import arabic_reshaper
# !pip install python-bidi
#!pip install pyttsx3 
from bidi.algorithm import get_display

from PIL import ImageFont, ImageDraw, Image

# Keypoints using MP Holistic

In [None]:
# Short aliases for the two MediaPipe solution modules used throughout the notebook.
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [None]:
def mediapipe_detection(image, model):
    """Run `model` on one BGR frame.

    Args:
        image: frame in OpenCV's BGR channel order.
        model: object exposing `process(rgb_image)` (here a MediaPipe Holistic instance).

    Returns:
        (image, results): the frame converted back to BGR, and whatever
        `model.process` returned for it.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # colour conversion BGR -> RGB
    rgb_frame.flags.writeable = False                   # read-only while the model processes it
    results = model.process(rgb_frame)                  # make prediction
    rgb_frame.flags.writeable = True                    # writeable again for later drawing
    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)  # colour conversion RGB -> BGR
    return bgr_frame, results

In [None]:
def draw_styled_landmarks(image, results):
    """Draw face, pose and both hand landmark sets onto `image` in place.

    Args:
        image: BGR frame to draw on (modified in place).
        results: holistic detection result exposing `face_landmarks`,
            `pose_landmarks`, `left_hand_landmarks` and `right_hand_landmarks`.

    Each layer is drawn with its own landmark style and connection style.
    Colour channels are 8-bit, so every component must lie in 0..255.
    """
    # (landmarks, connections, landmark style, connection style) — drawn in this order.
    layers = (
        # Face
        (results.face_landmarks, mp_holistic.FACEMESH_CONTOURS,
         dict(color=(80, 110, 10), thickness=1, circle_radius=1),
         dict(color=(80, 255, 121), thickness=1, circle_radius=1)),  # was 256: out of 8-bit range
        # Pose
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
         dict(color=(80, 22, 10), thickness=2, circle_radius=4),
         dict(color=(80, 44, 121), thickness=2, circle_radius=2)),
        # Left hand
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         dict(color=(121, 22, 76), thickness=2, circle_radius=4),
         dict(color=(121, 44, 250), thickness=2, circle_radius=2)),
        # Right hand
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         dict(color=(245, 117, 66), thickness=2, circle_radius=4),
         dict(color=(245, 66, 230), thickness=2, circle_radius=2)),
    )
    for landmarks, connections, landmark_style, connection_style in layers:
        mp_drawing.draw_landmarks(image, landmarks, connections,
                                  mp_drawing.DrawingSpec(**landmark_style),
                                  mp_drawing.DrawingSpec(**connection_style))

In [None]:
def extract_keypoints(results):
    """Flatten one holistic detection result into a single 1-D feature vector.

    Layout, in order: pose (x, y, z, visibility per landmark), face (x, y, z),
    left hand (x, y, z), right hand (x, y, z). A missing part is replaced by
    zeros of length 33*4, 468*3, 21*3 and 21*3 respectively, so a frame with
    nothing detected yields 1662 zeros.
    """
    def _flatten(landmark_list, fields, expected_count):
        # Zero padding keeps the vector layout fixed when a body part is not detected.
        if not landmark_list:
            return np.zeros(expected_count * len(fields))
        rows = [[getattr(point, name) for name in fields] for point in landmark_list.landmark]
        return np.array(rows).flatten()

    xyz = ('x', 'y', 'z')
    parts = [
        _flatten(results.pose_landmarks, xyz + ('visibility',), 33),
        _flatten(results.face_landmarks, xyz, 468),
        _flatten(results.left_hand_landmarks, xyz, 21),
        _flatten(results.right_hand_landmarks, xyz, 21),
    ]
    return np.concatenate(parts)

# Preprocess Data and Create Labels and Features

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [None]:
# Path for exported data, numpy arrays
DATA_PATH = os.path.join('MP_Data 4')

# Actions (class labels) that we try to detect.
# The loading cell joins each name onto DATA_PATH, so renaming a label here
# (e.g. fixing the 'accedint' spelling) also requires renaming its folder.
actions = np.array(['ineed', 'ambulance', 'where', 'street', 'thankyou', 'work',
                   'bootcamp', 'in this', 'i_need_ambulance', 'i_want', 'report', 'accedint'])

# Number of videos (sequences) per action.
# NOTE(review): not referenced anywhere else in the visible notebook — the
# loader reads every sequence folder it finds.
no_sequences = 10

# Videos are going to be 30 frames in length
sequence_length = 30

# Folder start
# start_folder = 30

In [None]:
# Map each action name to its integer class index (its position in `actions`).
label_map = dict(zip(actions, range(len(actions))))

In [None]:
# Show the action -> class index mapping.
label_map

In [None]:
# Load every recorded sequence from disk.
# `sequences` becomes a list of windows (sequence_length keypoint arrays each);
# `labels` holds the matching integer class index for each window.
sequences, labels = [], []
for action in actions:
    action_dir = os.path.join(DATA_PATH, action)
    # Keep only numerically named sequence folders: stray entries such as
    # '.DS_Store' or '.ipynb_checkpoints' would otherwise crash the int conversion.
    # Sorting makes the sample order (and therefore the seeded train/test split)
    # reproducible, since os.listdir returns entries in arbitrary order.
    sequence_ids = sorted(int(name) for name in os.listdir(action_dir) if name.isdigit())
    for sequence in sequence_ids:
        window = [
            np.load(os.path.join(action_dir, str(sequence), "{}.npy".format(frame_num)))
            for frame_num in range(sequence_length)
        ]
        sequences.append(window)
        labels.append(label_map[action])

In [None]:
# Sanity check: shape of the stacked windows.
np.array(sequences).shape

In [None]:
# Sanity check: one label per loaded window.
np.array(labels).shape

In [None]:
# Feature tensor: all loaded windows stacked into one array.
X = np.array(sequences)

In [None]:
# Show the feature tensor's shape.
X.shape

In [None]:
# One-hot encode the labels. num_classes is pinned to the number of actions so
# the target width always matches the model's output layer, even if the
# highest-indexed action has no recorded sequences (to_categorical would
# otherwise infer the width from the largest label present).
y = to_categorical(labels, num_classes=len(actions)).astype(int)

In [None]:
# Hold out 20% of the windows for testing, stratified on the targets,
# with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=5,
)

In [None]:
# Shape of the training targets.
y_train.shape

In [None]:
# Shape of the test targets.
y_test.shape

# Build and Train GRU Neural Network

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, GRU
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

In [None]:
# TensorBoard callback; it is given the relative 'Logs' directory as its log_dir.
log_dir = 'Logs'
tb_callback = TensorBoard(log_dir=log_dir)

In [None]:
# Two stacked GRU layers followed by a small dense head.
# Input is one window of 30 frames x 1662 keypoint features per frame
# (33*4 pose + 468*3 face + 21*3 + 21*3 hands, as produced by extract_keypoints).
# The softmax output has one unit per action.
model = Sequential([
    GRU(256, return_sequences=True, activation='tanh', input_shape=(30,1662)),
    GRU(128, return_sequences=False, activation='tanh'),
    # GRU(64, return_sequences=False, activation='tanh'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(actions.shape[0], activation='softmax'),
])

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Checkpoint callback that keeps only the best weights seen so far.
# NOTE: it only takes effect if it is included in the `callbacks` list passed
# to model.fit.
checkpoint_filepath = './tmp/checkpoint'
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    # The model is compiled with metrics=['categorical_accuracy'], so the
    # validation metric is logged as 'val_categorical_accuracy'. Monitoring
    # 'val_accuracy' would never find a value and nothing would be saved.
    monitor='val_categorical_accuracy',
    mode='max',
    save_best_only=True)

In [None]:
# Configure training: Nadam optimizer, categorical cross-entropy loss on the
# one-hot targets, and categorical accuracy as the reported metric.
model.compile(
    optimizer='nadam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

In [None]:
# Train with 20% of the training data held out for validation.
# epochs=2000 is only an upper bound: EarlyStopping ends training after 18
# epochs without improvement and restores the best weights, while
# ReduceLROnPlateau halves the learning rate after 3 stagnant epochs.
# NOTE(review): model_checkpoint_callback is not part of this callback list.
model.fit(
    X_train, y_train,
    epochs=2000,
    batch_size=5,
    validation_split=.2,
    shuffle=True,
    callbacks=[
        tb_callback,
        EarlyStopping(patience=18, verbose=1, restore_best_weights=True),
        ReduceLROnPlateau(factor=.5, patience=3, verbose=1),
    ],
)

# Make Predictions

In [None]:
# Class probabilities for every test window (one row per window).
res = model.predict(X_test)

In [None]:
# Spot check: predicted action for test sample 4.
actions[np.argmax(res[4])]

In [None]:
# Spot check: ground-truth action for the same test sample.
actions[np.argmax(y_test[4])]

# Save Weights

In [None]:
# Persist the trained model to 'action.h5'.
model.save('action.h5')

In [None]:
# del model

In [None]:
# Load weights from 'action.h5' into the existing `model` object
# (the architecture cell must have been run first).
model.load_weights('action.h5')

# Evaluation using Confusion Matrix and Accuracy

In [None]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [None]:
# Predicted class probabilities for the test set.
yhat = model.predict(X_test)

In [None]:
# Collapse one-hot targets and predicted probabilities to class indices.
# Predictions are recomputed here instead of being read back from `yhat`, so
# re-running this cell never applies argmax(axis=1) to the already-reduced
# 1-D list (which would raise an axis error).
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(model.predict(X_test), axis=1).tolist()

In [None]:
# Per-class confusion matrices computed from the true and predicted class indices.
multilabel_confusion_matrix(ytrue, yhat)

In [None]:
# Overall fraction of test windows classified correctly.
accuracy_score(ytrue, yhat)

In [None]:
from tensorflow.keras.losses import CategoricalCrossentropy


In [None]:
# Categorical cross-entropy of the model's predictions on the test set.
cc = CategoricalCrossentropy()
cc(y_test, model.predict(X_test)).numpy()

# Test in Real Time

In [None]:
from scipy import stats
 
import pyttsx3
from gtts import gTTS  
from playsound import playsound  

In [None]:
# Bar colours for the probability overlay; prob_viz cycles through them.
colors = [(245,117,16), (117,245,16), (16,117,245),(16,117,245)]*4
def prob_viz(res, actions, input_frame, colors):
    """Overlay one horizontal probability bar per class on a copy of the frame.

    Args:
        res: iterable of per-class probabilities.
        actions: class names, indexed in the same order as `res`.
        input_frame: image to annotate; it is copied, not modified.
        colors: sequence of colour tuples; reused cyclically when there are
            more classes than colours.

    Returns:
        The annotated copy of `input_frame`.
    """
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        top = 60 + num * 40  # each bar occupies a 40-pixel-high row below the banner
        # Modulo indexing avoids an IndexError when classes outnumber colours.
        bar_color = colors[num % len(colors)]
        cv2.rectangle(output_frame, (0, top), (int(prob*100), top + 30), bar_color, -1)
        cv2.putText(output_frame, actions[num], (0, top + 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)

    return output_frame

In [None]:
# Run the trained model over a video stream: detect keypoints per frame, classify
# each full 30-frame window, speak the prediction, and show/record the annotated feed.

# 1. New detection variables
sequence = []        # rolling window of the most recent keypoint vectors
sentence = []        # accepted action labels shown in the top banner
predictions = []     # history of predicted class indices
threshold = 0.5      # minimum probability for a prediction to be accepted
Text_Speech = pyttsx3.init()

# Action label -> Arabic phrase. Lookups are done by action label
# (dict1.get(label, label)), so the label must be the key and must be spelled
# exactly as in `actions`.
dict1 = {"i_need_ambulance": "احتاج سيارة اسعاف"}


cap = cv2.VideoCapture('./ambulance 8.avi')
# cap = cv2.VideoCapture('./WhatsApp Video 2021-12-08 at 14.31.03.mp4')

# cap = cv2.VideoCapture(0)
fourcc = cv2.VideoWriter_fourcc(*'MJPG')
out = cv2.VideoWriter('output2.avi', fourcc, 30.0, (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                                                    int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))))

try:
    # The Arabic overlay text is constant, so shape it and load the font once
    # instead of on every frame. Move this back into the loop if the text is
    # ever derived from `sentence` (see the commented line).
    text = "احتاج مساعدة اين الشارع"
#     text = ' '.join([dict1.get(i, i) for i in sentence])
    reshaped_text = arabic_reshaper.reshape(text)    # correct its shape
    bidi_text = get_display(reshaped_text)           # correct its direction
    fontpath = "arial.ttf" # <== https://www.freefontspro.com/14454/arial.ttf
    font = ImageFont.truetype(fontpath, 32)

    # Set mediapipe model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5, model_complexity=2) as holistic:
        while cap.isOpened():

            # Read feed
            ret, frame = cap.read()
            if not ret: # end of video
                break

            # Make detections
            image, results = mediapipe_detection(frame, holistic)

            # Draw landmarks
            draw_styled_landmarks(image, results)

            # 2. Prediction logic: keep only the latest 30 keypoint vectors
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-30:]

            # Show how full the current window is
            cv2.putText(image, str(len(sequence)), (30,100), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

            if len(sequence) == 30:
                res = model.predict(np.expand_dims(sequence, axis=0))[0]
                best = np.argmax(res)          # index of the most probable action
                best_action = actions[best]
                print(best_action, format(res[best], '.0%'))

                # Convert text to speech.
                # NOTE(review): both speech calls block the loop and run for every
                # full window, and the gTTS phrase is fixed rather than derived
                # from the prediction — confirm this is intended.
                Text_Speech.say(best_action)
                Text_Speech.runAndWait()
                obj = gTTS(text="شكراً لهذا المعسكر سدايا", lang='ar', slow=False)

                # Save the synthesized audio to exam.mp3, then play it
                obj.save("exam.mp3")
                playsound("exam.mp3")

                predictions.append(best)

                # 3. Accept the prediction only when it is confident enough
                if res[best] > threshold:
                    # Skip immediate repeats of the last accepted action
                    if len(sentence) == 0 or best_action != sentence[-1]:
                        sentence.append(best_action)

                    sequence = [] # start new sequence

                # Trim the banner to the three latest actions once it exceeds five
                if len(sentence) > 5:
                    sentence = sentence[-3:]

                # Viz probabilities
#                 image = prob_viz(res, actions, image, colors)

            cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3,30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            # OpenCV's putText cannot render Arabic, so draw it through PIL
            img_pil = Image.fromarray(image)
            draw = ImageDraw.Draw(img_pil)
            draw.text((100, 80),bidi_text, font = font)
            img = np.array(img_pil)

            # Show to screen.
            # NOTE(review): the recorded file receives `image` (without the
            # Arabic overlay) while the window shows `img` — confirm intended.
            cv2.imshow('OpenCV Feed', img)
            out.write(image)

            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the capture, the writer and the window, even when a frame
    # fails mid-loop or the kernel is interrupted.
    out.release()
    cap.release()
    cv2.destroyAllWindows()

In [None]:
# Manual cleanup: repeats the release calls from the end of the real-time cell,
# for use when that cell stopped before reaching them.
out.release()
cap.release()
cv2.destroyAllWindows()

In [None]:
# Scratch check of the dict1 lookup: entries missing from dict1 fall back to
# themselves. NOTE: this overwrites the `sentence` list used by the real-time cell.
sentence=['a']
text = ' '.join([dict1.get(i, i) for i in sentence])
text

In [None]:
# from imutils.video import FileVideoStream
# from imutils.video import WebcamVideoStream
# import imutils

In [None]:
# # 1. New detection variables
# sequence = []
# sentence = []
# predictions = []
# threshold = 0.7
# i = 0

# # cap = cv2.VideoCapture('./ineed-9_hiz8v33F.mp4')
# fvs = WebcamVideoStream(0).start()
# # fourcc = cv2.VideoWriter_fourcc(*'MJPG')
# # out = cv2.VideoWriter('output2.avi', fourcc, 30.0, (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),  
# #                                                     int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))))

# # Set mediapipe model 
# with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5, model_complexity=2) as holistic:
#     while True:
            
#         # Read feed
#         frame = fvs.read()
        
#         if i == 30:
#             i = 1
#         else:
#             i+=1
#         cv2.putText(image, str(i), (50,5), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

#         # Make detections
#         image, results = mediapipe_detection(frame, holistic)
# #         print(results)
        
#         # Draw landmarks
#         draw_styled_landmarks(image, results)
        
#         # 2. Prediction logic
#         keypoints = extract_keypoints(results)
#         sequence.append(keypoints)
#         sequence = sequence[-30:]
        
        
#         if len(sequence) == 30:
#             res = model.predict(np.expand_dims(sequence, axis=0))[0]
# #             print(actions[np.argmax(res)])
#             predictions.append(np.argmax(res))
            
            
#         #3. logic
#             if np.unique(predictions[-10:])[0]==np.argmax(res): 
#                 if res[np.argmax(res)] > threshold: 
                    
#                     if len(sentence) > 0: 
#                         if actions[np.argmax(res)] != sentence[-1]:
#                             sentence.append(actions[np.argmax(res)])
#                     else:
#                         sentence.append(actions[np.argmax(res)])
                    
#                     sequence = [] # start new sequence

#             if len(sentence) > 5: 
#                 sentence = sentence[-5:]

#             # Viz probabilities
#             image = prob_viz(res, actions, image, colors)
            
#         cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
#         cv2.putText(image, ' '.join(sentence), (3,30), 
#                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
# #         print(' '.join(sentence), end=' ')
        
#         # Show to screen
#         cv2.imshow('OpenCV Feed', image)
# #         out.write(image)
        
#         if cv2.waitKey(10) & 0xFF == ord('q'):
#             break
    

# #     out.release()
#     cv2.destroyAllWindows()
#     fvs.stop()
#     fvs.stream.release()

In [None]:
# cv2.destroyAllWindows()
# fvs.stop()
# fvs.stream.release()