In [None]:
# Basics
import mediapipe as mp
import cv2
import numpy as np
import time
import matplotlib.pyplot as plt
import os
from datetime import datetime, timedelta

# For modelling
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras import backend as K
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from tensorflow.keras.models import load_model

# Intro: Keypoints using the holistic MediaPipe model

In [None]:
# Short aliases for the MediaPipe drawing utilities and the holistic solution module
mp_drawing = mp.solutions.drawing_utils
mp_holistic = mp.solutions.holistic

In [None]:
def mediapipe_detection(image, model):
    """
    Run a MediaPipe model on a BGR frame.

    The frame is converted to RGB, handed to ``model.process`` and converted
    back to BGR afterwards.

    Parameters
    ----------
    image : BGR frame as delivered by OpenCV.
    model : object exposing ``process(rgb_image)`` (the holistic model in this notebook).

    Returns
    -------
    tuple
        ``(bgr_image, results)`` - the round-tripped frame and whatever
        ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # NOTE(review): the buffer is marked read-only while the model runs,
    # presumably so MediaPipe can use it without copying - confirm.
    rgb_frame.flags.writeable = False
    detection = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, detection

In [None]:
def draw_landmarks(image, results):
    """
    Draw the right-hand and left-hand landmarks (with hand connections) onto ``image``.

    Only the two hand landmark sets of ``results`` are drawn; ``image`` is
    modified in place and nothing is returned.
    """
    both_hands = (results.right_hand_landmarks, results.left_hand_landmarks)
    for hand_landmarks in both_hands:
        mp_drawing.draw_landmarks(image, hand_landmarks, mp_holistic.HAND_CONNECTIONS)

In [None]:
def get_bbox_coords(results):
    """
    Compute the bounding box around the right-hand landmarks.

    Parameters
    ----------
    results : object with a ``right_hand_landmarks`` attribute whose
        ``landmark`` sequence holds points with ``x`` and ``y`` attributes.

    Returns
    -------
    tuple
        ``(x_min, y_min, x_max, y_max)`` in the same units as the landmark
        coordinates (callers in this notebook scale them by the frame size).
        ``(0, 0, 0, 0)`` when no right hand is present or it has no landmarks.

    Notes
    -----
    Only the "no hand" cases map to the zero box. Any other error raised while
    reading the landmarks now propagates instead of being silently swallowed.
    """
    hand = getattr(results, "right_hand_landmarks", None)
    if hand is None:
        return 0, 0, 0, 0

    points = list(hand.landmark)
    if not points:
        return 0, 0, 0, 0

    xs = [point.x for point in points]
    ys = [point.y for point in points]
    return min(xs), min(ys), max(xs), max(ys)

In [None]:
def draw_bbox(image, results):
    """
    Draw a red bounding box around the detected right hand onto ``image``.

    The box returned by ``get_bbox_coords`` is scaled by the actual width and
    height of ``image`` (taken from ``image.shape``), so the box is placed
    correctly for any frame size. Nothing is drawn when ``results`` contains
    no right hand.

    Parameters
    ----------
    image : frame to draw on (modified in place); ``image.shape[:2]`` is
        read as ``(height, width)``.
    results : detection results passed on to ``get_bbox_coords``.
    """
    if not results.right_hand_landmarks:
        return

    frame_height, frame_width = image.shape[:2]
    x_min, y_min, x_max, y_max = get_bbox_coords(results)

    # Plain Python ints: truncation matches the former ``astype(int)`` conversion
    top_left = (int(x_min * frame_width), int(y_min * frame_height))
    bottom_right = (int(x_max * frame_width), int(y_max * frame_height))

    cv2.rectangle(image, top_left, bottom_right, (0, 0, 255), 2)

In [None]:
# Live preview: show hand landmarks and the right-hand bounding box until 'q' is pressed
cap = cv2.VideoCapture(1)
cv2.startWindowThread()

try:
    # Initiate holistic model; the context manager closes it when the loop ends
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:

        while cap.isOpened():
            # Read frame; stop when the camera delivers nothing instead of
            # passing an empty frame on to the colour conversion
            ret, frame = cap.read()
            if not ret:
                break

            # Make Detections
            image, results = mediapipe_detection(frame, holistic)

            # Draw left and right hand landmarks
            draw_landmarks(image, results)

            # Draw bbox
            draw_bbox(image, results)

            # Show to screen
            cv2.imshow('Video Feed', image)

            # Break program. A single waitKey both refreshes the window and
            # reads the key; a second call could swallow the 'q' press.
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even after an error or interrupt
    cap.release()
    cv2.destroyAllWindows()

In [None]:
# Live preview (context-managed model): hand landmarks plus bounding box until 'q' is pressed
cap = cv2.VideoCapture(1)
cv2.startWindowThread()

try:
    # Initiate holistic model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:

        while cap.isOpened():
            # Read frame; leave the loop when no frame could be grabbed
            ret, frame = cap.read()
            if not ret:
                break

            # Make Detections
            image, results = mediapipe_detection(frame, holistic)

            # Draw left and right hand landmarks
            draw_landmarks(image, results)

            # Draw bbox
            draw_bbox(image, results)

            # Show to screen
            cv2.imshow('Video Feed', image)

            # Break program. Only one waitKey per frame so that the 'q'
            # key press cannot be consumed by an earlier, unchecked call.
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Release the camera and close the window on every exit path
    cap.release()
    cv2.destroyAllWindows()

In [None]:
# Manual cleanup: release the camera and close all OpenCV windows
# (presumably kept for the case that the loop cell above was interrupted)
cap.release()
cv2.destroyAllWindows()

# Define function to extract keypoint values

In [None]:
# Define function to extract all values for right hand only!
def extract_keypoints(results):
    """
    Flatten the right-hand landmarks of a holistic MediaPipe result into one vector.

    Returns a 1-D numpy array holding ``x, y, z`` of every right-hand landmark,
    landmark after landmark. When ``results`` has no right hand, a zero vector
    of length ``21 * 3`` is returned instead.
    """
    hand = results.right_hand_landmarks
    if not hand:
        return np.zeros(21 * 3)

    coordinates = [(point.x, point.y, point.z) for point in hand.landmark]
    return np.array(coordinates).flatten()

In [None]:
# Check shape: compare the keypoint vector of the current `results` with the expected length 21 * 3
extract_keypoints(results).shape, 21 * 3

# Setup folder structure

In [None]:
# Gesture classes to record and how many samples are captured per class
actions = ['fist', 'palm', 'index', 'ok', 'thumb_up']
imgs_per_action = 55

In [None]:
# Root folder for the recorded keypoints; one sub-folder per action is used below
base_path = '../../data/hand_detection_mp/'

In [None]:
# Delete existing files
#for action in actions:
 #   for file in os.listdir(os.path.join(base_path, action)):
  #      os.remove(os.path.join(base_path, action, file))

# Generate data for hand gestures

In [None]:
# Take pictures for each action and save landmarks as numpy array
cap = cv2.VideoCapture(1)
stop_requested = False  # set when 'q' is pressed or the camera stops delivering frames

try:
    # Initiate holistic model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:

        for action in actions:
            # Make sure the target folder exists before the first np.save
            action_dir = os.path.join(base_path, action)
            os.makedirs(action_dir, exist_ok=True)

            for img_num in range(imgs_per_action):

                # Read frame; abort the whole collection if the camera fails
                ret, frame = cap.read()
                if not ret:
                    stop_requested = True
                    break

                # Make Detections
                image, results = mediapipe_detection(frame, holistic)

                # Draw left and right hand landmarks
                draw_landmarks(image, results)

                # Apply wait logic: longer pause before the first sample of an action
                if img_num == 0:
                    cv2.putText(image, 'STARTING COLLECTION', (100,200),
                               cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv2.LINE_AA)
                    cv2.putText(image, f'Collecting frames for {action}', (15,50),
                               cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)
                    wait_ms = 3000
                else:
                    cv2.putText(image, f'Collecting frames for {action}, ({img_num}/{imgs_per_action})', (15,50),
                               cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)
                    wait_ms = 1000

                # Show to screen and remember the key pressed during the pause,
                # so a 'q' typed while waiting is not lost
                cv2.imshow('OpenCV Feed', image)
                key = cv2.waitKey(wait_ms) & 0xFF

                # Export keypoints (np.save appends the .npy extension)
                keypoints = extract_keypoints(results)
                npy_path = os.path.join(action_dir, f'keypoints_{action}_{img_num}')
                np.save(npy_path, keypoints)

                # Break program
                if key == ord('q'):
                    stop_requested = True
                    break

            # Leave the outer loop as well; re-polling the keyboard here would
            # miss the 'q' that was already consumed above
            if stop_requested:
                break
finally:
    # Free the camera and close the window on every exit path
    cap.release()
    cv2.destroyAllWindows()

In [None]:
# Manual cleanup: release the camera and close all OpenCV windows
# (presumably kept for the case that the collection cell above was interrupted)
cap.release()
cv2.destroyAllWindows()

# Preprocess the data

In [None]:
# Map every action name to its integer class index (its position in `actions`)
label_map = dict(zip(actions, range(len(actions))))
label_map

In [None]:
# Load the stored keypoint vectors together with their class labels
images, labels = [], []
for action in actions:
    action_dir = os.path.join(base_path, action)
    # Sorted for a reproducible sample order; anything that is not a saved
    # array (e.g. hidden system files) is ignored
    for file in sorted(os.listdir(action_dir)):
        if not file.endswith('.npy'):
            continue
        res = np.load(os.path.join(action_dir, file))
        # An all-zero vector marks a frame without a detected right hand.
        # Keep the sample as soon as ANY value is non-zero, so a valid sample
        # with a single exact-zero coordinate is not dropped.
        if np.any(res != 0):
            images.append(res)
            labels.append(label_map[action])

In [None]:
# Shape of one sample and the number of loaded samples / labels (the two counts should match)
images[0].shape, len(images), len(labels)

In [None]:
# Stack the keypoint vectors into one feature array
X = np.array(images)
X.shape

In [None]:
# One-hot encode the labels. num_classes is fixed to the number of actions so the
# width always matches the softmax layer, even if the last class has no samples.
y = to_categorical(labels, num_classes=len(actions)).astype(int)
y.shape

In [None]:
# Hold out 10% as test data, stratified by class; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)

In [None]:
# Shapes of the train and test partitions
X_train.shape, y_train.shape, X_test.shape, y_test.shape

# Create neural network

In [None]:
# Clear the Keras backend session before a new model is built
K.clear_session()

In [None]:
# Dense classifier: 63 keypoint values in, three hidden layers of 32 ReLU units,
# one softmax output per action
model = Sequential([
    Dense(32, activation='relu', input_shape=(63,)),
    Dense(32, activation='relu'),
    Dense(32, activation='relu'),
    Dense(len(actions), activation='softmax'),
])

In [None]:
# Adam optimizer with categorical cross-entropy loss; categorical accuracy is tracked as metric
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [None]:
# Train for 250 epochs with validation_split=0.1; the returned history is plotted below
history = model.fit(X_train, y_train, validation_split = 0.1, epochs=250)

In [None]:
# Print the layer overview of the model
model.summary()

In [None]:
# Training vs. validation accuracy per epoch
fig, ax = plt.subplots()
ax.plot(history.history['categorical_accuracy'], label = 'train_data')
ax.plot(history.history['val_categorical_accuracy'], label = 'validation_data')
# Title and axis labels so the figure can be read on its own
ax.set(title='Categorical accuracy', xlabel='epoch', ylabel='categorical accuracy')
ax.legend();

In [None]:
# Training vs. validation loss per epoch
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label = 'train_data')
ax.plot(history.history['val_loss'], label = 'validation_data')
# Title and axis labels so the figure can be read on its own
ax.set(title='Categorical cross-entropy loss', xlabel='epoch', ylabel='loss')
ax.legend();

# Evaluate model

In [None]:
# Show score for train and test data (evaluated with the loss and metric set in compile)
model.evaluate(X_train, y_train), model.evaluate(X_test, y_test)

In [None]:
# Determine predictions for test data (one row of class probabilities per sample)
y_pred = model.predict(X_test)

In [None]:
# Plot confusion matrix
# Passing all class ids explicitly keeps the matrix at len(actions) x len(actions)
# even if a class is missing from the test split, so it always matches display_labels.
class_ids = list(range(len(actions)))
cm = confusion_matrix(y_true=np.argmax(y_test, axis=1),
                      y_pred=np.argmax(y_pred, axis=1),
                      labels=class_ids)

disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                             display_labels=actions)

fig, ax = plt.subplots(figsize=(10,10))

disp.plot(ax=ax);

In [None]:
# Persist the trained model as an h5 file in the current working directory
model.save('hand_gesture_model.h5')

# Realtime gesture detection

In [None]:
# Load the model back from the h5 file saved above (for testing purposes)
model = load_model('hand_gesture_model.h5')

In [None]:
# Realtime loop: detect the right hand, classify its gesture and draw a labelled box
cap = cv2.VideoCapture(1)
cv2.startWindowThread()

# Initialize variables
predict_interval = timedelta(seconds=0.5)              # minimum time between two predictions
datetime_prv = datetime.now() - timedelta(seconds=1)   # time of the last prediction
class_prob = 0                                         # probability of the last predicted class

try:
    # Initiate holistic model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:

        while cap.isOpened():
            # Read frame; stop when the camera delivers nothing
            ret, frame = cap.read()
            if not ret:
                break

            # Make Detections
            image, results = mediapipe_detection(frame, holistic)

            # Draw left and right hand landmarks
            draw_landmarks(image, results)

            # Predict hand gesture, at most once per predict_interval
            now = datetime.now()
            if results.right_hand_landmarks and now >= datetime_prv + predict_interval:
                keypoints = extract_keypoints(results).reshape(1, -1)
                y_pred = model.predict(keypoints)
                class_id = np.argmax(y_pred)
                class_prob = np.max(y_pred)
                # Remember when we predicted; without this update the
                # interval check above would pass on every frame
                datetime_prv = now

            # Draw hand gesture if detected (class_id is set whenever class_prob > 0)
            if class_prob > 0.5 and results.right_hand_landmarks:

                # Draw bbox around hand
                draw_bbox(image, results)

                # Top-left corner of the bbox in pixels, scaled by the real frame size
                frame_height, frame_width = image.shape[:2]
                x_min, y_min = get_bbox_coords(results)[:2]
                left, top = int(x_min * frame_width), int(y_min * frame_height)

                # Draw label box (80 x 30 px, sitting on top of the bbox corner)
                cv2.rectangle(image, (left, top - 30), (left + 80, top), (0,0,255), -1)

                # Put text in label
                cv2.putText(image, f'{actions[class_id]}',
                            (left, top - 5),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            # Show to screen
            cv2.imshow('Video Feed', image)

            # Break program. One waitKey per frame; an extra unchecked call
            # could consume the 'q' key press.
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Release the camera and close the window on every exit path
    cap.release()
    cv2.destroyAllWindows()

In [None]:
# Manual cleanup: release the camera and close all OpenCV windows
# (presumably kept for the case that the realtime cell above was interrupted)
cap.release()
cv2.destroyAllWindows()

In [None]:
# Function that wraps everything for drone operations
def model_full(image, model, res, holistic=None):
    """
    Detect the right hand in ``image``, classify its gesture and describe its bounding box.

    Parameters
    ----------
    image : BGR frame to analyse. This argument is what gets processed
        (previously a global ``frame`` variable was read instead).
    model : classifier exposing ``predict``; receives the flattened
        keypoints with shape ``(1, n_values)``.
    res : sequence whose first two entries are used as width and height
        to scale the bounding box to pixels.
    holistic : optional landmark detector exposing ``process`` (e.g. the
        MediaPipe holistic model). When omitted, ``model`` is used for the
        detection step as before, which only works if ``model`` offers both
        ``process`` and ``predict``.

    Returns
    -------
    tuple
        ``((class_id, class_prob), (center, area))`` where ``class_id`` is the
        index of the most probable class, ``class_prob`` its probability,
        ``center`` the ``(x, y)`` pixel centre of the bounding box and ``area``
        its pixel area.

    Notes
    -----
    When no right hand is detected, the prediction is made on an all-zero
    keypoint vector and the bounding box is ``(0, 0, 0, 0)``. Callers should
    check ``area`` / ``class_prob`` before acting on the result.
    """
    # Landmark detection and gesture classification need different objects;
    # fall back to `model` only to stay compatible with existing callers.
    detector = model if holistic is None else holistic
    _, results = mediapipe_detection(image, detector)

    # Predict hand gesture, i.e. determine class_id with max probability
    keypoints = extract_keypoints(results).reshape(1, -1)
    y_pred = model.predict(keypoints)
    class_id = np.argmax(y_pred)
    class_prob = np.max(y_pred)

    # Calculate bbox params -> center point and area for tracking
    width, height = res[0], res[1]
    x_min, y_min, x_max, y_max = get_bbox_coords(results)
    left, top = int(x_min * width), int(y_min * height)
    right, bottom = int(x_max * width), int(y_max * height)

    center = (left + right) // 2, (top + bottom) // 2
    area = (right - left) * (bottom - top)

    return (class_id, class_prob), (center, area)