In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
import keras
from PIL import Image, ImageDraw, ImageFont

In [2]:
mp_hol = mp.solutions.holistic # model
mp_draw = mp.solutions.drawing_utils

In [3]:
def mp_detect(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    result = model.process(image) # Make prediction
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, result

In [4]:
def draw_lm(image, results):
    mp_draw.draw_landmarks(image, results.face_landmarks, mp_hol.FACEMESH_CONTOURS)
    mp_draw.draw_landmarks(image, results.pose_landmarks, mp_hol.POSE_CONNECTIONS)
    mp_draw.draw_landmarks(image, results.left_hand_landmarks, mp_hol.HAND_CONNECTIONS)
    mp_draw.draw_landmarks(image, results.right_hand_landmarks, mp_hol.HAND_CONNECTIONS)

In [5]:
def draw_styled_lm(image, results):
    mp_draw.draw_landmarks(image, results.face_landmarks, mp_hol.FACEMESH_CONTOURS, 
                             mp_draw.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1), 
                             mp_draw.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)
                             ) 
    mp_draw.draw_landmarks(image, results.pose_landmarks, mp_hol.POSE_CONNECTIONS,
                             mp_draw.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4), 
                             mp_draw.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             ) 
    mp_draw.draw_landmarks(image, results.left_hand_landmarks, mp_hol.HAND_CONNECTIONS, 
                             mp_draw.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4), 
                             mp_draw.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             )   
    mp_draw.draw_landmarks(image, results.right_hand_landmarks, mp_hol.HAND_CONNECTIONS, 
                             mp_draw.DrawingSpec(color=(155,117,66), thickness=2, circle_radius=4), 
                             mp_draw.DrawingSpec(color=(155,66,230), thickness=2, circle_radius=2)
                             ) 

In [6]:
def ext_keypoints(results1):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results1.pose_landmarks.landmark]).flatten() if results1.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results1.face_landmarks.landmark]).flatten() if results1.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results1.left_hand_landmarks.landmark]).flatten() if results1.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results1.right_hand_landmarks.landmark]).flatten() if results1.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh])

In [19]:
DATA_PATH = os.path.join('Data_1') 

actions = np.array(['сәлеметсіз бе', 'рақмет', 'сау болыңыз'])

no_seq = 30

seq_length = 30

start_folder = 1

In [20]:
for action in actions: 
    for sequence in range(1,no_seq+1):
        try: 
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
        except:
            pass

In [21]:
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

In [22]:
label_map = {label:num for num, label in enumerate(actions)}

In [23]:
sequences, labels = [], []
for action in actions:
    for sequence in range(1, no_seq):
        window = []
        for frame_num in range(seq_length):
            res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num)))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

In [24]:
X = np.array(sequences)

In [25]:
y = to_categorical(labels).astype(int)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)

In [27]:
#build and train lstm
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.callbacks import TensorBoard
from keras.layers import Input
from keras import optimizers
import tensorflow as tf

In [28]:
log_dir1 = os.path.join('Logs_4.0')
tb_callback = TensorBoard(log_dir=log_dir1)

In [29]:
model = Sequential()
model.add(Input(shape=(30,1662)))
model.add(LSTM(64, return_sequences=True, activation='relu'))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))

In [30]:
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [73]:
model.fit(X_train, y_train,epochs=300, callbacks=[tb_callback])

Epoch 1/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - categorical_accuracy: 0.2705 - loss: 2.6911
Epoch 2/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - categorical_accuracy: 0.2065 - loss: 29.7990
Epoch 3/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - categorical_accuracy: 0.2893 - loss: 58.2727
Epoch 4/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - categorical_accuracy: 0.1894 - loss: 20.6458
Epoch 5/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - categorical_accuracy: 0.2102 - loss: 17.3294
Epoch 6/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - categorical_accuracy: 0.1946 - loss: 117.9280
Epoch 7/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - categorical_accuracy: 0.2841 - loss: 58.5065
Epoch 8/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step

<keras.src.callbacks.history.History at 0x1f39f64fd40>

In [31]:
#make predictions
res = model.predict(X_test)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 688ms/step


In [57]:
del model

In [32]:
#save weight
model.load_weights('action_qz.h5')

In [33]:
#Evaluation using Confusion Matrix and Accuracy
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [34]:
y_hat = model.predict(X_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step


In [35]:
y_true = np.argmax(y_test, axis=1).tolist()
y_hat = np.argmax(y_hat, axis=1).tolist()

In [36]:
multilabel_confusion_matrix(y_true, y_hat)

array([[[3, 0],
        [0, 2]],

       [[3, 0],
        [0, 2]],

       [[4, 0],
        [0, 1]]], dtype=int64)

In [37]:
accuracy_score(y_true, y_hat)

1.0

In [38]:
#test
from scipy import stats

In [39]:
colors = [(155, 117, 16), (117, 155, 16), (16, 117, 155)]

def prob_viz_with_pil(res, actions, input_frame, colors):
    output_frame = input_frame.copy()
    font_size = 1
    font_scale = 1.5
    font = ImageFont.truetype("arial.ttf", 24)

    for num, prob in enumerate(res):
        cv2.rectangle(output_frame, (0, 60 + num * 40), (int(prob * 100), 90 + num * 40), colors[num], -1)

        pil_img = Image.fromarray(output_frame)
        draw = ImageDraw.Draw(pil_img)

        text_bbox = draw.textbbox((0, 0), actions[num], font=font)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]

        text_x = 10
        text_y = 60 + num * 40 + (30 - text_height) // 2
        draw.text((text_x, text_y), actions[num], fill=(255, 255, 255), font=font)

        output_frame = np.array(pil_img)

    return output_frame

In [41]:
sequence = []
sentence = []
predictions = []
threshold = 0.8

cap = cv2.VideoCapture(0)

with mp_hol.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        image, result = mp_detect(frame, holistic)
        draw_styled_lm(image, result)

        keypoints = ext_keypoints(result)
        sequence.append(keypoints)
        sequence = sequence[-30:]

        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            predictions.append(np.argmax(res))

            if np.unique(predictions[-10:])[0] == np.argmax(res):
                if res[np.argmax(res)] > threshold:
                    if len(sentence) > 0:
                        if actions[np.argmax(res)] != sentence[-1]:
                            sentence.append(actions[np.argmax(res)])
                    else:
                        sentence.append(actions[np.argmax(res)])

            if len(sentence) > 5:
                sentence = sentence[-5:]

            image = prob_viz_with_pil(res, actions, image, colors)

        cv2.rectangle(image, (0, 0), (640, 40), (155, 117, 16), -1)

        pil_img = Image.fromarray(image)
        draw = ImageDraw.Draw(pil_img)
        font = ImageFont.truetype("arial.ttf", 24)

        text_bbox = draw.textbbox((3, 5), ' '.join(sentence), font=font)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]

        text_x = 3
        text_y = 5

        if text_y + text_height > 40:
            text_y = 40 - text_height - 5 

        draw.text((text_x, text_y), ' '.join(sentence), fill=(255, 255, 255), font=font)
        image = np.array(pil_img)

        cv2.imshow('QazSign', image)

        if cv2.waitKey(10) & 0xFF == ord('d'):
            sentence.clear()

        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 457ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2