In [1]:
import cv2
import mediapipe as mp
import numpy as np
import time, os
import pandas as pd
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.backend import clear_session
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import load_model
from PIL import ImageFont, ImageDraw, Image
from google.protobuf.json_format import MessageToDict

In [2]:
# Read the label table and keep it as a plain NumPy array (one row per CSV row).
labels = pd.read_csv('./dataset_1/label.csv').values

In [16]:
actions = labels
# LSTM window size: number of consecutive frames that form one training sample.
seq_length = 20

# Gather every file below the video directory.
# NOTE(review): each video is labelled by position (actions[idx]), which assumes
# os.walk yields the files in the same order as the rows of label.csv -- confirm.
VIDEO_FILES = []
dir_path = './dataset_1/viedo'
for (root, directories, files) in os.walk(dir_path):
    for file in files:
        VIDEO_FILES.append(os.path.join(root, file))

# MediaPipe hands model
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(
    model_complexity=0,
    max_num_hands=2,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5)

os.makedirs('dataset', exist_ok=True)
zero = np.zeros(15)  # padding for the angle slot of a hand that was not detected

# Landmark index tables: each bone runs from a parent joint to a child joint,
# and each angle is measured between two consecutive bones of one finger.
parent_idx = [0,1,2,3,0,5,6,7,0,9,10,11,0,13,14,15,0,17,18,19]
child_idx = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
bone_a_idx = [0,1,2,4,5,6,8,9,10,12,13,14,16,17,18]
bone_b_idx = [1,2,3,5,6,7,9,10,11,13,14,15,17,18,19]

# Windows are collected per video and concatenated once at the end, so a short
# or unreadable first video can no longer break every following video.
window_batches = []
n_windows = 0

try:
    for idx, file in enumerate(VIDEO_FILES):
        cap = cv2.VideoCapture(file)
        action = actions[idx][0]
        data = []

        try:
            while cap.isOpened():

                ret, img = cap.read()
                if not ret:
                    break

                img = cv2.resize(img, dsize=(800,450))
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                result = hands.process(img)
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

                if result.multi_hand_landmarks:
                    # NOTE(review): the handedness of the FIRST detected hand is applied
                    # to every hand in the frame. Kept as-is so the features stay
                    # identical to what the inference cell computes; indexing
                    # multi_handedness per hand would need a matching change there
                    # and a re-extraction of the dataset.
                    first_label = MessageToDict(result.multi_handedness[0])['classification'][0]['label']

                    data_arr = []
                    right_hand, left_hand = np.zeros((21,3)), np.zeros((21,3))
                    for res in result.multi_hand_landmarks:
                        joint = np.zeros((21,3))
                        for j, lm in enumerate(res.landmark):
                            joint[j] = [lm.x, lm.y, lm.z]

                        # Bone vectors (child - parent), normalised to unit length: [20, 3]
                        v = joint[child_idx, :3] - joint[parent_idx, :3]
                        v = v / np.linalg.norm(v, axis=1)[:, np.newaxis]

                        # Angle between consecutive bones via arccos of the dot product.
                        # The dot product is clipped because rounding can push it just
                        # outside [-1, 1], which would make arccos return NaN.
                        cos_angle = np.einsum('nt,nt->n', v[bone_a_idx, :], v[bone_b_idx, :])
                        angle = np.arccos(np.clip(cos_angle, -1.0, 1.0))  # [15,]
                        angle = np.degrees(angle)  # Convert radian to degree

                        angle_label = np.array(angle, dtype=np.float32)

                        if first_label == 'Right':
                            right_hand = joint
                        else:
                            left_hand = joint

                        mp_drawing.draw_landmarks(img, res, mp_hands.HAND_CONNECTIONS)

                        data_arr.extend(angle_label)

                    # One hand only: pad the other hand's 15 angles with zeros.
                    if len(data_arr) == 15:
                        if first_label == 'Right':
                            data_arr = np.concatenate((zero, data_arr))
                        else:
                            data_arr = np.concatenate((data_arr, zero))

                    # Frames with more than two hands' worth of angles are not recorded.
                    if len(data_arr) <= 30:
                        hand_distance = left_hand - right_hand
                        hand_distance /= np.linalg.norm(hand_distance, axis=1)[:, np.newaxis]
                        data_arr = np.concatenate((data_arr, hand_distance.flatten()))
                        # The last column carries the video index; it is mapped to a class later.
                        data_arr = np.append(data_arr, idx)
                        data.append(data_arr)

                cv2.imshow('img', img)
                if cv2.waitKey(1) == ord('q'):
                    break
        finally:
            # Release every capture, not only the last one opened.
            cap.release()

        data = np.array(data)

        # A video needs more than seq_length usable frames to yield any window.
        if len(data) <= seq_length:
            print('ERROR!!!!!!!!!!!!!!!!!', action, idx, f'({len(data)} usable frames)')
            continue

        full_seq_data = np.array(
            [data[start:start + seq_length] for start in range(len(data) - seq_length)])
        window_batches.append(full_seq_data)
        n_windows += len(full_seq_data)
        print(action, idx, (n_windows,) + full_seq_data.shape[1:])
finally:
    cv2.destroyAllWindows()

if not window_batches:
    raise RuntimeError('No training windows were extracted from ' + dir_path)

full_data = np.concatenate(window_batches)

np.save(os.path.join('dataset', 'fulldata_angle_distance'), full_data)

개 1 (75, 20, 94)
공원 2 (112, 20, 94)
금요일 3 (156, 20, 94)
내년 4 (187, 20, 94)
내일 5 (209, 20, 94)
냄새나다 6 (236, 20, 94)
누나 7 (262, 20, 94)
동생 8 (292, 20, 94)
목요일 9 (321, 20, 94)
아래 10 (358, 20, 94)
바다 11 (402, 20, 94)
배고프다 12 (448, 20, 94)
병원 13 (476, 20, 94)
불 14 (518, 20, 94)
산 15 (551, 20, 94)
삼키다 16 (577, 20, 94)
선생님 17 (615, 20, 94)
수요일 18 (637, 20, 94)
아빠 19 (671, 20, 94)
아파트 20 (715, 20, 94)
앞 21 (740, 20, 94)
어제 22 (760, 20, 94)
어지러움 23 (788, 20, 94)
언니 24 (818, 20, 94)
엄마 25 (837, 20, 94)
오늘 26 (859, 20, 94)
오른쪽 27 (895, 20, 94)
오빠 28 (921, 20, 94)
올해 29 (961, 20, 94)
왼쪽 30 (1008, 20, 94)
월요일 31 (1028, 20, 94)
위에 32 (1073, 20, 94)
음식물 33 (1121, 20, 94)
일요일 34 (1174, 20, 94)
자동차 35 (1223, 20, 94)
작년 36 (1273, 20, 94)
집 37 (1297, 20, 94)
택시 38 (1339, 20, 94)
토요일 39 (1379, 20, 94)
학교 40 (1406, 20, 94)
형 41 (1424, 20, 94)
화요일 42 (1452, 20, 94)
화장실 43 (1473, 20, 94)
0 44 (1514, 20, 94)
1 45 (1563, 20, 94)
2 46 (1611, 20, 94)
3 47 (1657, 20, 94)
4 48 (1705, 20, 94)
5 49 (1754, 20, 94)
6 

3 372 (18273, 20, 94)
4 373 (18299, 20, 94)
5 374 (18328, 20, 94)
6 375 (18352, 20, 94)
7 376 (18379, 20, 94)
8 377 (18402, 20, 94)
9 378 (18424, 20, 94)
10 379 (18453, 20, 94)
가렵다 380 (18472, 20, 94)
개 381 (18513, 20, 94)


In [6]:
# Reload the label table (DataFrame plus its array view) and the saved windows.
labels = pd.read_csv('./dataset_1/label.csv')
lab = labels.values
data = np.load('./dataset/fulldata_angle_distance.npy')

In [19]:
# Unique label names; the position of a name in `arr` is its class index.
# The names are sorted because the iteration order of a set of strings changes
# between Python processes (hash randomisation), which would silently change the
# class-index -> label mapping whenever the kernel is restarted.
arr = sorted({row[0] for row in lab}, key=str)

In [20]:
# The last feature column holds the index of the source video. Translate it to
# the class index of that video's label (its position in `arr`).
# NOTE: `data` is rewritten in place, so running this cell a second time without
# reloading `data` would remap values that are already class indices.
video_ids = data[:, 0, -1].astype(int)
class_ids = [arr.index(lab[video_id][0]) for video_id in video_ids]
data[:, :, -1] = np.asarray(class_ids, dtype=data.dtype)[:, np.newaxis]

In [34]:
# Everything except the last column is the feature vector; the last column of
# the first frame of each window is that window's class index.
x = data[..., :-1]
y = data[:, 0, -1]

x.shape, y.shape

((18513, 20, 93), (18513,))

In [35]:
# One-hot encode the class indices (to_categorical is imported in the first cell).
# NOTE: overwrites `y`; re-running this cell would encode the one-hot matrix again.
y = to_categorical(y)

In [36]:
# Width of the one-hot matrix, i.e. the number of output classes.
y.shape[1]

88

In [37]:
# Hold out 20 % of the windows for testing, stratified by class.
# random_state is fixed so the split (and the reported accuracy) is reproducible.
# NOTE(review): windows are overlapping slices of the same videos, so near-identical
# windows can land on both sides of a random split; the test accuracy is therefore
# likely optimistic -- consider splitting by video instead.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42)

In [39]:
# Start from a clean Keras graph so layer names restart at "lstm" / "dense".
clear_session()

# One LSTM over the 20-frame window of 93 features, followed by a small tanh MLP
# and a softmax over the one-hot classes.
model = Sequential([
    LSTM(128, input_shape=(20, 93)),
    Dense(128, activation='tanh'),
    Dense(64, activation='tanh'),
    Dense(32, activation='tanh'),
    Dense(y.shape[-1], activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 128)               113664    
                                                                 
 dense (Dense)               (None, 128)               16512     
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 88)                2904      
                                                                 
Total params: 143,416
Trainable params: 143,416
Non-trainable params: 0
_________________________________________________________________


In [40]:
import datetime

# Timestamp formatted as "MM-DD_HH:MM"; it is embedded in the checkpoint file name.
t = datetime.datetime.now().strftime('%m-%d_%H:%M')
t

'04-26_18:21'

In [41]:
# Stop when val_loss has not improved for 5 epochs and roll back to the best weights.
es = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=5,
    restore_best_weights=True,
    verbose=1,
)
# Write a checkpoint every time val_loss reaches a new minimum.
mc = ModelCheckpoint(
    f'model_{t}.h5',
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)

# validation_split reserves a fraction of the training data for validation.
hist = model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=100,
    callbacks=[es, mc],
    verbose=1,
)

Epoch 1/100
Epoch 1: val_loss improved from inf to 1.76335, saving model to model_04-26_18:21.h5
Epoch 2/100
Epoch 2: val_loss improved from 1.76335 to 1.19140, saving model to model_04-26_18:21.h5
Epoch 3/100
Epoch 3: val_loss improved from 1.19140 to 0.88472, saving model to model_04-26_18:21.h5
Epoch 4/100
Epoch 4: val_loss improved from 0.88472 to 0.67125, saving model to model_04-26_18:21.h5
Epoch 5/100
Epoch 5: val_loss improved from 0.67125 to 0.57576, saving model to model_04-26_18:21.h5
Epoch 6/100
Epoch 6: val_loss improved from 0.57576 to 0.46268, saving model to model_04-26_18:21.h5
Epoch 7/100
Epoch 7: val_loss improved from 0.46268 to 0.39841, saving model to model_04-26_18:21.h5
Epoch 8/100
Epoch 8: val_loss improved from 0.39841 to 0.34291, saving model to model_04-26_18:21.h5
Epoch 9/100
Epoch 9: val_loss improved from 0.34291 to 0.30966, saving model to model_04-26_18:21.h5
Epoch 10/100
Epoch 10: val_loss improved from 0.30966 to 0.24896, saving model to model_04-26_1

In [42]:
# Save the trained model under a fixed file name (separate from the checkpoint
# file written by ModelCheckpoint during training).
model.save('model_0426_1821.h5')

In [44]:
# Class probabilities for the held-out windows.
y_pred = model.predict(x_test)

# Compare the most likely predicted class with the one-hot ground truth
# (accuracy_score is imported in the first cell).
true_classes = np.argmax(y_test, axis=1)
predicted_classes = np.argmax(y_pred, axis=1)
accuracy_score(true_classes, predicted_classes)

0.957871995679179

In [51]:
# test
# Live webcam demo: extract the same hand features as the training data, classify
# the most recent seq_length frames and overlay the predicted label.

# NOTE(review): TensorFlow was imported earlier in this notebook, so these
# variables may be set too late to influence GPU selection in this kernel -- confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'


actions = arr
seq_length = 20


# NOTE(review): the extraction cell created Hands with model_complexity=0 and did
# not mirror its frames, whereas this cell uses the default complexity and flips
# the image -- confirm the features are still comparable.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(
    max_num_hands=2,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5)

# Load the font once, before the camera is opened, instead of on every frame.
font = ImageFont.truetype("fonts/gulim.ttc", 20)

cap = cv2.VideoCapture(0)


seq = []         # most recent per-frame feature vectors (at most seq_length)
action_seq = []  # most recent confident predictions (at most 3)

try:
    while cap.isOpened():
        ret, img = cap.read()
        if not ret:
            # Camera unavailable or stream ended: stop instead of crashing on None.
            break

        img = cv2.flip(img, 1)
        img = cv2.resize(img, dsize=(800,450))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        result = hands.process(img)
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

        if result.multi_hand_landmarks is not None:
            # NOTE(review): the handedness of the FIRST detected hand is applied to
            # every hand in the frame. Kept as-is because the model was trained on
            # features computed the same way.
            first_label = MessageToDict(result.multi_handedness[0])['classification'][0]['label']

            hand_arr = []
            right_hand, left_hand = np.zeros((21,3)), np.zeros((21,3))
            for res in result.multi_hand_landmarks:
                joint = np.zeros((21, 3))
                for j, lm in enumerate(res.landmark):
                    joint[j] = [lm.x, lm.y, lm.z]

                # Compute angles between joints
                v1 = joint[[0,1,2,3,0,5,6,7,0,9,10,11,0,13,14,15,0,17,18,19], :3] # Parent joint
                v2 = joint[[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], :3] # Child joint
                v = v2 - v1 # [20, 3]
                # Normalize v
                v = v / np.linalg.norm(v, axis=1)[:, np.newaxis]

                # Angle via arccos of the dot product; clipped because rounding can
                # push the dot product just outside [-1, 1] and produce NaN.
                cos_angle = np.einsum('nt,nt->n',
                    v[[0,1,2,4,5,6,8,9,10,12,13,14,16,17,18],:],
                    v[[1,2,3,5,6,7,9,10,11,13,14,15,17,18,19],:]) # [15,]
                angle = np.arccos(np.clip(cos_angle, -1.0, 1.0))

                angle = np.degrees(angle) # Convert radian to degree
                angle_label = np.array(angle, dtype=np.float32)

                if first_label == 'Right':
                    right_hand = joint
                else:
                    left_hand = joint

                hand_arr.extend(angle_label)

                mp_drawing.draw_landmarks(img, res, mp_hands.HAND_CONNECTIONS)

            # One hand only: pad the other hand's 15 angles with zeros.
            if len(hand_arr) == 15:
                if first_label == 'Right':
                    hand_arr = np.concatenate((np.zeros(15), hand_arr))
                else:
                    hand_arr = np.concatenate((hand_arr, np.zeros(15)))

            # Frames with more than two hands' worth of angles are not used.
            if len(hand_arr) <= 30:
                hand_distance = left_hand - right_hand
                hand_distance /= np.linalg.norm(hand_distance, axis=1)[:, np.newaxis]
                hand_arr = np.concatenate((hand_arr, hand_distance.flatten()))
                seq.append(hand_arr)
                # Only the last seq_length frames are ever read; drop older ones.
                del seq[:-seq_length]

                if len(seq) >= seq_length:
                    input_data = np.expand_dims(np.array(seq[-seq_length:], dtype=np.float32), axis=0)
                    y_pred = model.predict(input_data).squeeze()
                    i_pred = int(np.argmax(y_pred))
                    conf = y_pred[i_pred]

                    # Low-confidence predictions are ignored.
                    if conf >= 0.7:
                        action = actions[i_pred]
                        action_seq.append(action)
                        # Only the last three predictions are compared.
                        del action_seq[:-3]

                        if len(action_seq) >= 3:
                            # Show the label only when three predictions in a row agree.
                            this_action = '?'
                            if action_seq[-1] == action_seq[-2] == action_seq[-3]:
                                this_action = action
                            # Text is drawn through PIL using the TrueType font loaded above.
                            img = Image.fromarray(img)
                            draw = ImageDraw.Draw(img)
                            draw.text((30,50), this_action, font=font, fill=(0,0,255))
                            img = np.array(img)

        # Always refresh the preview and poll the keyboard, including on frames
        # that produced no prediction, so the window never freezes and 'q' works.
        cv2.imshow('img', img)
        if cv2.waitKey(1) == ord('q'):
            break
finally:
    # Release the window and camera even if an exception interrupts the loop.
    cv2.destroyAllWindows()
    cap.release()