In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp

2. Keypoints using MP Holistic

In [2]:
mp_holistic_model = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils 

In [3]:
def mediapipe_detection(frame , model):
    frame  = cv2.cvtColor(frame , cv2.COLOR_BGR2RGB) # model expect RGB 
    frame.flags.writeable = False # save some memory while processing
    results = model.process(frame) # making prediction
    frame.flags.writeable = True
    frame = cv2.cvtColor(frame , cv2.COLOR_RGB2BGR) # converting back to BGR
    return frame , results


In [4]:
def draw_landmarks(frame, results):
    # Draw face connections
    mp_drawing.draw_landmarks(frame, results.face_landmarks, mp_holistic_model.FACEMESH_TESSELATION)
    # Draw pose connections
    mp_drawing.draw_landmarks(frame, results.pose_landmarks, mp_holistic_model.POSE_CONNECTIONS) 
    # Draw left hand connections
    mp_drawing.draw_landmarks(frame, results.left_hand_landmarks, mp_holistic_model.HAND_CONNECTIONS)
    # Draw right hand connections
    mp_drawing.draw_landmarks(frame, results.right_hand_landmarks, mp_holistic_model.HAND_CONNECTIONS)


In [5]:
def draw_styled_landmarsks(frame, results):
    # Draw face connections
    mp_drawing.draw_landmarks(frame, results.face_landmarks, mp_holistic_model.FACEMESH_TESSELATION, 
                            #joint color , thickness , circle radius
                            mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1),
                            #lines color , thinkness
                            mp_drawing.DrawingSpec(color=(80,256,121), thickness=1)
                            ) 
    # Draw pose connections
    mp_drawing.draw_landmarks(frame, results.pose_landmarks, mp_holistic_model.POSE_CONNECTIONS,
                            mp_drawing.DrawingSpec(color=(80,110,10), thickness=2, circle_radius=4), 
                            mp_drawing.DrawingSpec(color=(80,256,121), thickness=2)
                            )  
    # Draw left hand connections
    mp_drawing.draw_landmarks(frame, results.left_hand_landmarks, mp_holistic_model.HAND_CONNECTIONS, 
                            mp_drawing.DrawingSpec(color=(80,110,10), thickness=2, circle_radius=4), 
                            mp_drawing.DrawingSpec(color=(80,256,121), thickness=2)
                            )   
    # Draw right hand connections  
    mp_drawing.draw_landmarks(frame, results.right_hand_landmarks, mp_holistic_model.HAND_CONNECTIONS, 
                            mp_drawing.DrawingSpec(color=(80,110,10), thickness=2, circle_radius=4), 
                            mp_drawing.DrawingSpec(color=(80,256,121), thickness=2)
                            )  

Here is the explaination of what the following block of code is doing
-   we capture the frame using webcamp
-   we pass it to the holistic model to get keypoint
-   we pass tha image and the key points to function that draw these keypoint on the image

3. Extract Keypoint Values

In [6]:
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh])

4. Setup Folders for Collection

In [7]:
#that is the path for exported data
DATA_PATH = os.path.join('MP_Data')
#actions we are tring to predict
actions = np.array(['hello' , 'thanks' , 'money' , 'nothing'])
#thirty video worth of data
no_sequences = 30
#videos are going to be 30 frames
sequences_length = 30

5. Collect Keypoint Values for Training and Testing

6. Preprocess Data and Create Labels and Features

In [8]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [9]:
label_map = {label:num for num , label in enumerate(actions)}

In [10]:
label_map

{'hello': 0, 'thanks': 1, 'money': 2, 'nothing': 3}

In [11]:
sequences , labels = [] , []
for action in actions :
    for sequence in range(no_sequences):
        window = []
        for frame_num in range(sequences_length):
            res = np.load(os.path.join(DATA_PATH , action , str(sequence) , '{}.npy'.format(frame_num)))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])


In [12]:
X = np.array(sequences)
y = to_categorical(labels).astype(int)

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard
import tensorflow as tf


model = Sequential([
  LSTM(64, return_sequences=True, activation='relu', input_shape=(30,1662)),
  LSTM(128, return_sequences=True, activation='relu'),
  LSTM(64, return_sequences=False, activation='relu'),
  Dense(64, activation='relu'),
  Dense(32, activation='relu'),
  Dense(4, activation='softmax')])
adam = tf.keras.optimizers.Adam(
    learning_rate=0.0001, # that small learning rate is very important
    beta_1=0.9,
    beta_2=0.999)

model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['categorical_accuracy'])


In [23]:
model.fit(X, y, epochs=250,  batch_size=10 , verbose=2)

Epoch 1/250
12/12 - 2s - loss: 0.7685 - categorical_accuracy: 0.6917 - 2s/epoch - 192ms/step
Epoch 2/250
12/12 - 1s - loss: 0.8240 - categorical_accuracy: 0.6583 - 568ms/epoch - 47ms/step
Epoch 3/250
12/12 - 1s - loss: 1.1377 - categorical_accuracy: 0.5333 - 568ms/epoch - 47ms/step
Epoch 4/250
12/12 - 1s - loss: 0.8971 - categorical_accuracy: 0.5667 - 583ms/epoch - 49ms/step
Epoch 5/250
12/12 - 1s - loss: 0.7977 - categorical_accuracy: 0.6667 - 582ms/epoch - 48ms/step
Epoch 6/250
12/12 - 1s - loss: 0.6858 - categorical_accuracy: 0.7000 - 569ms/epoch - 47ms/step
Epoch 7/250
12/12 - 1s - loss: 0.6156 - categorical_accuracy: 0.7167 - 583ms/epoch - 49ms/step
Epoch 8/250
12/12 - 1s - loss: 0.4813 - categorical_accuracy: 0.8083 - 572ms/epoch - 48ms/step
Epoch 9/250
12/12 - 1s - loss: 0.7393 - categorical_accuracy: 0.6500 - 578ms/epoch - 48ms/step
Epoch 10/250
12/12 - 1s - loss: 0.6412 - categorical_accuracy: 0.7500 - 603ms/epoch - 50ms/step
Epoch 11/250
12/12 - 1s - loss: 0.4861 - categorica

Epoch 87/250
12/12 - 1s - loss: 0.0726 - categorical_accuracy: 0.9667 - 621ms/epoch - 52ms/step
Epoch 88/250
12/12 - 1s - loss: 0.0339 - categorical_accuracy: 0.9917 - 633ms/epoch - 53ms/step
Epoch 89/250
12/12 - 1s - loss: 0.0215 - categorical_accuracy: 1.0000 - 630ms/epoch - 52ms/step
Epoch 90/250
12/12 - 1s - loss: 0.0178 - categorical_accuracy: 1.0000 - 623ms/epoch - 52ms/step
Epoch 91/250
12/12 - 1s - loss: 0.0157 - categorical_accuracy: 1.0000 - 617ms/epoch - 51ms/step
Epoch 92/250
12/12 - 1s - loss: 0.0106 - categorical_accuracy: 1.0000 - 616ms/epoch - 51ms/step
Epoch 93/250
12/12 - 1s - loss: 0.0130 - categorical_accuracy: 1.0000 - 630ms/epoch - 52ms/step
Epoch 94/250
12/12 - 1s - loss: 0.0414 - categorical_accuracy: 0.9833 - 623ms/epoch - 52ms/step
Epoch 95/250
12/12 - 1s - loss: 0.0510 - categorical_accuracy: 0.9833 - 616ms/epoch - 51ms/step
Epoch 96/250
12/12 - 1s - loss: 0.0264 - categorical_accuracy: 0.9833 - 629ms/epoch - 52ms/step
Epoch 97/250
12/12 - 1s - loss: 0.0233 -

KeyboardInterrupt: 

In [15]:
model.save('action.h5')

7. calling the model and start testing

In [16]:
from tensorflow import keras
model = keras.models.load_model('action.h5')

In [21]:
colors = (16 , 117 , 245)
def prob_viz(res , actions , input_frame , colors):
    output_frame = input_frame.copy()
    for num , prob in enumerate(res):
        cv2.rectangle(output_frame , (0 , 60 + num * 40 ) , #start_point : represents the top left corner of rectangle
            (int(prob * 100) ,  90 + num* 40 ), #end_point : represents the bottom right corner of rectangle
            color , -1
                )
        cv2.putText(output_frame, actions[num]
            , (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX
            , 1, (255,255,255), 2, cv2.LINE_AA)
    return output_frame


In [24]:
# detection variables
sequence = []
sentence = []
predictions = []
threshold = 0.95

cap = cv2.VideoCapture(0)
# Getting the settings of our video capture
frame_width = int(cap.get(3))
frame_height = int(cap.get(4))

writ = cv2.VideoWriter('keyPointRecognition.avi',cv2.VideoWriter_fourcc('M','J','P','G'), 10, (frame_width,frame_height))

# Set mediapipe model 
with mp_holistic_model.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()

        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        print(results)
        
        # Draw landmarks
        draw_styled_landmarsks(image, results)
        
        # 2. Prediction logic
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]
        
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            print(actions[np.argmax(res)])
            predictions.append(np.argmax(res))
            
            
            if res[np.argmax(res)] > threshold:
                if len(sentence) > 0 :
                    if actions[np.argmax(res)] != sentence[-1]:
                        sentence.append(actions[np.argmax(res)])
                else:
                    sentence.append(actions[np.argmax(res)])
                    
            if len(sentence) > 5 :
                sentence = sentence[-5 :]
            image = prob_viz(res , actions , image , colors)
        
        cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3,30), 
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        
        # Show to screen
        cv2.imshow('OpenCV Feed', image)
        writ.write(image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    cap.release()
    writ.release()
    cv2.destroyAllWindows()

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

<class 'mediapipe.python.solution_base.SolutionOutputs'>
nothing
<class 'mediapipe.python.solution_base.SolutionOutputs'>
nothing
<class 'mediapipe.python.solution_base.SolutionOutputs'>
nothing
<class 'mediapipe.python.solution_base.SolutionOutputs'>
nothing
<class 'mediapipe.python.solution_base.SolutionOutputs'>
nothing
<class 'mediapipe.python.solution_base.SolutionOutputs'>
nothing
<class 'mediapipe.python.solution_base.SolutionOutputs'>
money
<class 'mediapipe.python.solution_base.SolutionOutputs'>
money
<class 'mediapipe.python.solution_base.SolutionOutputs'>
money
<class 'mediapipe.python.solution_base.SolutionOutputs'>
money
<class 'mediapipe.python.solution_base.SolutionOutputs'>
money
<class 'mediapipe.python.solution_base.SolutionOutputs'>
money
<class 'mediapipe.python.solution_base.SolutionOutputs'>
money
<class 'mediapipe.python.solution_base.SolutionOutputs'>
money
<class 'mediapipe.python.solution_base.SolutionOutputs'>
money
<class 'mediapipe.python.solution_base.Solu