In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp

2. Keypoints using MP Holistic

In [2]:
# Short aliases for the two MediaPipe modules used throughout this notebook:
# the holistic solution (model class plus the *_CONNECTIONS constants) ...
mp_holistic_model = mp.solutions.holistic
# ... and the helpers that draw landmarks onto a frame.
mp_drawing = mp.solutions.drawing_utils

In [3]:
def mediapipe_detection(frame, model):
    """Run ``model`` on a BGR frame and return the frame together with the result.

    Args:
        frame: image in BGR channel order (as delivered by OpenCV).
        model: object exposing ``process(image)``; it is given the RGB version
            of ``frame``.

    Returns:
        A ``(frame, results)`` tuple: the image converted back to BGR and
        whatever ``model.process`` returned.
    """
    # The model expects RGB input, OpenCV works in BGR.
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Mark the buffer read-only while the model looks at it, then unlock it again.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    # Hand back a BGR image so it can be drawn on and displayed with OpenCV.
    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, results


In [4]:
def draw_landmarks(frame, results):
    """Draw face, pose and both hand landmark sets on ``frame`` with default styling.

    Args:
        frame: image passed to ``mp_drawing.draw_landmarks`` as the drawing target.
        results: detection result exposing ``face_landmarks``, ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.
    """
    # (landmark list, connection set) pairs, drawn in this order:
    # face, pose, left hand, right hand.
    layers = (
        (results.face_landmarks, mp_holistic_model.FACEMESH_TESSELATION),
        (results.pose_landmarks, mp_holistic_model.POSE_CONNECTIONS),
        (results.left_hand_landmarks, mp_holistic_model.HAND_CONNECTIONS),
        (results.right_hand_landmarks, mp_holistic_model.HAND_CONNECTIONS),
    )
    for landmark_list, connections in layers:
        mp_drawing.draw_landmarks(frame, landmark_list, connections)


In [5]:
def draw_styled_landmarsks(frame, results):
    """Draw face, pose and both hand landmark sets on ``frame`` with custom styling.

    The (misspelled) function name is kept as-is because other cells call it
    under this name.

    Args:
        frame: image passed to ``mp_drawing.draw_landmarks`` as the drawing target.
        results: detection result exposing ``face_landmarks``, ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.
    """
    joint_color = (80, 110, 10)
    # Each channel must fit in 8 bits (0-255); this used to be (80, 256, 121).
    line_color = (80, 255, 121)

    # Thin style for the dense face mesh: small joints, 1px lines.
    fine_joints = mp_drawing.DrawingSpec(color=joint_color, thickness=1, circle_radius=1)
    fine_lines = mp_drawing.DrawingSpec(color=line_color, thickness=1)
    # Bolder style for pose and hands: larger joints, 2px lines.
    bold_joints = mp_drawing.DrawingSpec(color=joint_color, thickness=2, circle_radius=4)
    bold_lines = mp_drawing.DrawingSpec(color=line_color, thickness=2)

    # (landmark list, connection set, joint style, line style), drawn in this
    # order: face, pose, left hand, right hand.
    layers = (
        (results.face_landmarks, mp_holistic_model.FACEMESH_TESSELATION, fine_joints, fine_lines),
        (results.pose_landmarks, mp_holistic_model.POSE_CONNECTIONS, bold_joints, bold_lines),
        (results.left_hand_landmarks, mp_holistic_model.HAND_CONNECTIONS, bold_joints, bold_lines),
        (results.right_hand_landmarks, mp_holistic_model.HAND_CONNECTIONS, bold_joints, bold_lines),
    )
    for landmark_list, connections, joint_spec, line_spec in layers:
        mp_drawing.draw_landmarks(frame, landmark_list, connections, joint_spec, line_spec)

Here is an explanation of what the following block of code does:
-   we capture a frame from the webcam
-   we pass it to the holistic model to get the keypoints
-   we pass the image and the keypoints to a function that draws these keypoints on the image

In [6]:
# Live preview: read webcam frames, run the holistic model on each one and
# show the frame with the landmarks drawn on it. Press 'q' to stop.
cap = cv2.VideoCapture(0)
try:
    # Set up the MediaPipe holistic model for the duration of the preview.
    with mp_holistic_model.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # The read failed (camera busy, unplugged, stream ended ...);
                # there is no usable frame to process, so stop the preview.
                print('Could not read a frame from the camera, stopping preview.')
                break

            # Make the detection.
            frame, results = mediapipe_detection(frame, model=holistic)
            # Draw the landmarks on the frame.
            draw_styled_landmarsks(frame, results)
            # Show the annotated frame on screen.
            cv2.imshow('openCV read', frame)

            if cv2.waitKey(10) & 0xff == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if a frame failed to process.
    cap.release()
    cv2.destroyAllWindows()

3. Extract Keypoint Values

In [7]:
def extract_keypoints(results):
    """Flatten all landmark groups of ``results`` into one 1-D feature vector.

    The vector is the concatenation, in this order, of:
    pose (x, y, z, visibility per landmark), face (x, y, z), left hand
    (x, y, z) and right hand (x, y, z). A group that is missing (falsy) is
    replaced by zeros of length 33*4, 468*3, 21*3 and 21*3 respectively.

    Args:
        results: object exposing ``pose_landmarks``, ``face_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``; each present
            group has a ``landmark`` iterable of points.

    Returns:
        1-D ``numpy.ndarray`` with the concatenated values.
    """
    def _flatten(group, fields, missing_size):
        # Stand-in of zeros keeps the output layout fixed when nothing was detected.
        if not group:
            return np.zeros(missing_size)
        rows = [[getattr(point, name) for name in fields] for point in group.landmark]
        return np.array(rows).flatten()

    xyz = ('x', 'y', 'z')
    parts = [
        _flatten(results.pose_landmarks, xyz + ('visibility',), 33 * 4),
        _flatten(results.face_landmarks, xyz, 468 * 3),
        _flatten(results.left_hand_landmarks, xyz, 21 * 3),
        _flatten(results.right_hand_landmarks, xyz, 21 * 3),
    ]
    return np.concatenate(parts)

In [8]:
# Flatten the landmarks held in `results` (left over from the preview loop
# above) into one feature vector.
results_test = extract_keypoints(results)
# Number of values we get for each frame.
print(results_test.shape)

(1662,)


4. Setup Folders for Collection

In [9]:
# Root folder the collected keypoint arrays are exported to.
DATA_PATH = os.path.join('MP_Data')

# Actions (class labels) we are trying to predict.
actions = np.array([
    'hello',
    'thanks',
    'money',
    'nothing',
])

# Number of videos recorded per action.
no_sequences = 30

# Number of frames in each video.
sequences_length = 30

In [10]:
# Create one folder per (action, video index) under DATA_PATH.
for action in actions:
    for sequence in range(no_sequences):
        # exist_ok=True makes re-running this cell harmless, while real
        # failures (permissions, invalid path, ...) are still raised instead
        # of being silently swallowed.
        os.makedirs(os.path.join(DATA_PATH, action, str(sequence)), exist_ok=True)

5. Collect Keypoint Values for Training and Testing

In [13]:
# Record the training data: for every action, capture `no_sequences` videos of
# `sequences_length` frames each and save the keypoints of every frame to
# DATA_PATH/<action>/<video index>/<frame index>.npy. Press 'q' to abort.
cap = cv2.VideoCapture(0)
# Set when the user presses 'q' so that every loop level stops, not only the
# innermost frame loop.
stop_requested = False
try:
    # Set up the MediaPipe holistic model for the whole collection run.
    with mp_holistic_model.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        # Loop through the actions.
        for action in actions:
            # Videos for each action.
            for sequence in range(no_sequences):
                # Frames for each video of each action.
                for frame_num in range(sequences_length):
                    ret, frame = cap.read()
                    if not ret:
                        # Without a frame there is nothing to process or save;
                        # fail with a message that says where collection stopped.
                        raise RuntimeError(
                            'Could not read a frame from the camera while collecting '
                            '{} video {} frame {}'.format(action, sequence, frame_num)
                        )

                    # Make the detection.
                    frame, results = mediapipe_detection(frame, model=holistic)
                    # Draw the landmarks on the frame.
                    draw_styled_landmarsks(frame, results)

                    is_first_frame = frame_num == 0
                    if is_first_frame:
                        cv2.putText(frame, 'STARTING COLLECTION', (120,200),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv2.LINE_AA)
                    cv2.putText(frame, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                    # Show to screen.
                    cv2.imshow('OpenCV Feed', frame)
                    if is_first_frame:
                        # Pause before each video so the subject can get ready.
                        cv2.waitKey(2000)

                    # np.save appends the '.npy' extension to the path itself.
                    keypoints = extract_keypoints(results)
                    npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                    np.save(npy_path, keypoints)

                    if cv2.waitKey(10) & 0xff == ord('q'):
                        stop_requested = True
                        break
                if stop_requested:
                    break
            if stop_requested:
                break
finally:
    # Always free the camera and close the window, even after an error.
    cap.release()
    cv2.destroyAllWindows()

6. Preprocess Data and Create Labels and Features

In [20]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [21]:
# Map every action name to its integer class index (its position in `actions`).
label_map = dict(zip(actions, range(len(actions))))

In [25]:
# Read the saved keypoints back from disk: one window (list of per-frame
# arrays) per recorded video, plus the integer label of its action.
sequences = []
labels = []
for action in actions:
    for sequence in range(no_sequences):
        video_dir = os.path.join(DATA_PATH, action, str(sequence))
        window = [
            np.load(os.path.join(video_dir, '{}.npy'.format(frame_num)))
            for frame_num in range(sequences_length)
        ]
        sequences.append(window)
        labels.append(label_map[action])


In [30]:
# Features: all windows stacked into a single array (one entry per video).
X = np.asarray(sequences)
# Targets: integer labels turned into one-hot rows, stored as ints.
one_hot_labels = to_categorical(labels)
y = one_hot_labels.astype(int)

In [31]:
# Hold out 5% of the videos for testing. A fixed random_state makes the split
# (and everything trained on it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

7. Build and Train LSTM Neural Network

In [32]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard