In [1]:
import cv2
import mediapipe as mp
import numpy as np
from keras.models import load_model
import joblib
import os
import shutil

In [26]:


def run_model_by_video(video_path, max_sequence_length=50, show=True):
    """Classify the hand gesture shown in a video file.

    Every frame is passed through MediaPipe Hands. Each detected hand adds one
    (21, 3) array of x/y/z landmark coordinates to the sequence, so a frame
    with two hands adds two entries. The sequence is truncated or zero-padded
    to ``max_sequence_length`` entries, flattened to
    (1, max_sequence_length, 63) and passed to the saved Keras model.

    Args:
        video_path: Path of the video file to classify.
        max_sequence_length: Number of landmark sets per sample (default 50).
            Presumably this has to match the length used in training - verify
            against the training notebook before changing it.
        show: When True (default) every frame is displayed with the landmarks
            drawn on it and pressing 'q' stops reading early. Pass False to
            run without any GUI calls.

    Returns:
        The decoded gesture label, or None when no hand was detected in the
        video (there is nothing to classify in that case).
    """
    num_landmarks = 21    # landmarks reported per hand
    num_coordinates = 3   # x, y, z

    mp_hands = mp.solutions.hands
    mp_drawing = mp.solutions.drawing_utils

    # Load the trained LSTM model
    model = load_model('gesture_classification.keras')

    landmarks_sequence = []  # one (21, 3) array per detected hand, in frame order

    cap = cv2.VideoCapture(video_path)
    try:
        # The context manager closes the MediaPipe graph even if reading fails.
        with mp_hands.Hands(static_image_mode=False, max_num_hands=2,
                            min_detection_confidence=0.6,
                            min_tracking_confidence=0.5) as hands:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                # OpenCV delivers BGR frames, MediaPipe expects RGB
                results = hands.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

                # multi_hand_landmarks is None when no hand is found
                for hand_landmarks in results.multi_hand_landmarks or []:
                    landmarks_sequence.append(np.array(
                        [[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark]
                    ))
                    if show:
                        mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                if show:
                    # Display the frame with or without landmarks
                    cv2.imshow('Hand Gestures', frame)
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        break
    finally:
        # Release the capture and close all windows, also on error/interrupt
        cap.release()
        if show:
            cv2.destroyAllWindows()

    if not landmarks_sequence:
        # Previously this case crashed inside np.concatenate (1-D vs 3-D input).
        print("No hand landmarks detected in", video_path)
        return None

    # Truncate or zero-pad the sequence to exactly max_sequence_length entries
    kept = landmarks_sequence[:max_sequence_length]
    sequence = np.zeros((max_sequence_length, num_landmarks, num_coordinates))
    sequence[:len(kept)] = np.stack(kept)

    print(sequence.shape)

    # (sequence_length, 21, 3) -> (1, sequence_length, 63): a batch of one sample
    model_input = sequence.reshape(1, max_sequence_length, num_landmarks * num_coordinates)

    print(model_input.shape)

    predicted_probs = model.predict(model_input)
    print(predicted_probs)

    # The class with the highest probability is the predicted gesture
    predicted_class = np.argmax(predicted_probs)

    # Decode the numeric label back to the original class label
    label_encoder = joblib.load('label_encoder.joblib')
    predicted_label = label_encoder.inverse_transform([predicted_class])[0]

    print("Predicted Gesture:", predicted_label)
    return predicted_label




In [3]:


def run_model_by_frames(frames, max_sequence_length=50, show=True):
    """Classify the hand gesture shown in a list of already decoded frames.

    Each frame is passed through MediaPipe Hands. Every detected hand adds one
    (21, 3) array of x/y/z landmark coordinates to the sequence, which is then
    truncated or zero-padded to ``max_sequence_length`` entries, flattened to
    (1, max_sequence_length, 63) and passed to the saved Keras model.

    Args:
        frames: Iterable of BGR frames (they are converted with
            ``cv2.COLOR_BGR2RGB``). With ``show=True`` the landmarks are drawn
            onto these frames in place.
        max_sequence_length: Number of landmark sets per sample (default 50).
            Presumably this has to match the length used in training - verify
            against the training notebook before changing it.
        show: When True (default) every frame is displayed with its landmarks
            and pressing 'q' stops processing early. Pass False to run
            without any GUI calls.

    Returns:
        The decoded gesture label, or None when no hand was detected in any
        frame (there is nothing to classify in that case).
    """
    num_landmarks = 21    # landmarks reported per hand
    num_coordinates = 3   # x, y, z

    mp_hands = mp.solutions.hands
    mp_drawing = mp.solutions.drawing_utils

    # Load the trained LSTM model
    model = load_model('gesture_classification.keras')

    landmarks_sequence = []  # one (21, 3) array per detected hand, in frame order

    try:
        # The context manager closes the MediaPipe graph even if processing fails.
        with mp_hands.Hands(static_image_mode=False, max_num_hands=2,
                            min_detection_confidence=0.6,
                            min_tracking_confidence=0.5) as hands:
            for frame in frames:
                results = hands.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

                # multi_hand_landmarks is None when no hand is found
                for hand_landmarks in results.multi_hand_landmarks or []:
                    landmarks_sequence.append(np.array(
                        [[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark]
                    ))
                    if show:
                        mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                if show:
                    # Display the frame with landmarks
                    cv2.imshow('Hand Gestures', frame)
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        break
    finally:
        if show:
            cv2.destroyAllWindows()

    if not landmarks_sequence:
        # Previously this case crashed inside np.concatenate (1-D vs 3-D input).
        print("No hand landmarks detected in the given frames")
        return None

    # Truncate or zero-pad the sequence to exactly max_sequence_length entries
    kept = landmarks_sequence[:max_sequence_length]
    sequence = np.zeros((max_sequence_length, num_landmarks, num_coordinates))
    sequence[:len(kept)] = np.stack(kept)

    print(sequence.shape)

    # (sequence_length, 21, 3) -> (1, sequence_length, 63): a batch of one sample
    model_input = sequence.reshape(1, max_sequence_length, num_landmarks * num_coordinates)

    print(model_input.shape)

    predicted_probs = model.predict(model_input)
    print(predicted_probs)

    # The class with the highest probability is the predicted gesture
    predicted_class = np.argmax(predicted_probs)

    # Decode the numeric label back to the original class label
    label_encoder = joblib.load('label_encoder.joblib')
    predicted_label = label_encoder.inverse_transform([predicted_class])[0]

    print("Predicted Gesture:", predicted_label)
    return predicted_label




In [32]:

# Separate the segmented videos
#
# Reads 'multiclass.mp4' frame by frame; every run of at least
# `min_gesture_frames` consecutive frames in which MediaPipe finds a hand is
# written to 'gestures/gesture_<n>.avi'.

# Run time - 62 sec

# Initialize Mediapipe hands module
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=2, min_detection_confidence=0.6, min_tracking_confidence=0.5)

# Load the input video
video_path = 'multiclass.mp4'
cap = cv2.VideoCapture(video_path)

# Create a folder for saving gesture videos
output_folder = 'gestures'

if os.path.exists(output_folder):
    try:
        shutil.rmtree(output_folder)
        print(f"Contents of '{output_folder}' removed successfully.")
    except Exception as e:
        print(f"Error removing contents of '{output_folder}': {e}")
else:
    print(f"'{output_folder}' does not exist.")
# rmtree removes the directory itself, so it must be (re)created in both
# branches; otherwise the video writers below target a missing folder.
os.makedirs(output_folder, exist_ok=True)

# Initialize variables for gesture segmentation
frame_buffer = []        # frames of the gesture currently in progress
gesture_frames = []      # frames of the gesture written most recently
gesture_detected = False
min_gesture_frames = 30  # Minimum number of frames to consider as a gesture
gesture_count = 0

# Output video writer
output_video_writer = None

try:
    while cap.isOpened():
        ret, frame = cap.read()

        hand_in_frame = False
        if ret:
            # Convert the BGR frame to RGB for Mediapipe and detect hands
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = hands.process(rgb_frame)
            hand_in_frame = bool(results.multi_hand_landmarks)

        if hand_in_frame:
            gesture_detected = True
            frame_buffer.append(frame)
            continue

        # Either the hand left the frame or the video ended: close the current
        # segment. Handling end-of-video here keeps a gesture that is still in
        # progress on the last frame from being dropped.
        if gesture_detected and len(frame_buffer) >= min_gesture_frames:
            gesture_frames = frame_buffer

            # Save the gesture_frames as a separate video
            gesture_count += 1
            output_path = os.path.join(output_folder, f'gesture_{gesture_count}.avi')
            height, width, _ = gesture_frames[0].shape
            output_video_writer = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'XVID'), 30, (width, height))
            try:
                for gesture_frame in gesture_frames:
                    output_video_writer.write(gesture_frame)
            finally:
                # Release the output video writer even if a write fails
                output_video_writer.release()
                output_video_writer = None

        gesture_detected = False
        frame_buffer = []

        if not ret:
            break
finally:
    # Free the capture and the MediaPipe graph even when the cell is interrupted
    cap.release()
    hands.close()
    cv2.destroyAllWindows()


In [29]:
# Concurrent threads for video writing and frame extraction

# Run time - 60 sec

import cv2
import os
import mediapipe as mp
import concurrent.futures

# Scan the input video once and record the frame range of every gesture.
# Result: `frame_indices`, a list of (start_frame, end_frame) pairs where
# start_frame is the first frame with a hand and end_frame is the first frame
# after the gesture (i.e. the range is [start_frame, end_frame)).

# Initialize Mediapipe hands module
mp_hands = mp.solutions.hands

# Load the input video (left open: the next step seeks in it to read the segments back)
video_path = 'multiclass.mp4'
cap = cv2.VideoCapture(video_path)

# Create a folder for saving gesture videos
output_folder = 'gestures'

if os.path.exists(output_folder):
    try:
        shutil.rmtree(output_folder)
        print(f"Contents of '{output_folder}' removed successfully.")
    except Exception as e:
        print(f"Error removing contents of '{output_folder}': {e}")
else:
    print(f"'{output_folder}' does not exist.")
# rmtree removes the directory itself, so it must be (re)created in both
# branches; otherwise the video writers target a missing folder.
os.makedirs(output_folder, exist_ok=True)

# Initialize variables for gesture segmentation
frame_indices = []
gesture_detected = False
min_gesture_frames = 30  # Minimum number of frames to consider as a gesture
start_frame = 0

# The context manager closes the MediaPipe graph when the scan is done.
with mp_hands.Hands(static_image_mode=False, max_num_hands=2,
                    min_detection_confidence=0.6,
                    min_tracking_confidence=0.5) as hands:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # POS_FRAMES points at the NEXT frame, so the one just read is one less
        current_frame = int(cap.get(cv2.CAP_PROP_POS_FRAMES)) - 1

        # Convert the BGR frame to RGB for Mediapipe and detect hands
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb_frame)

        if results.multi_hand_landmarks:
            if not gesture_detected:
                gesture_detected = True
                start_frame = current_frame
        elif gesture_detected:
            # current_frame is the first frame without a hand
            end_frame = current_frame
            num_frames = end_frame - start_frame
            if num_frames >= min_gesture_frames:
                frame_indices.append((start_frame, end_frame))
            gesture_detected = False

# A gesture still in progress when the video ends has no closing hand-free
# frame; record it here so it is not dropped.
if gesture_detected:
    end_frame = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
    if end_frame - start_frame >= min_gesture_frames:
        frame_indices.append((start_frame, end_frame))
    gesture_detected = False

# Process gesture frames concurrently
def process_gesture(gesture_frames, output_path, fps=30):
    """Write a sequence of frames to ``output_path`` as an XVID-encoded video.

    Args:
        gesture_frames: Frames in playback order. The output size is taken
            from the ``shape`` (height, width, channels) of the first frame.
        output_path: Destination path of the video file.
        fps: Frame rate stored in the output file (default 30).

    Returns:
        None. Nothing is written when ``gesture_frames`` is empty.
    """
    if len(gesture_frames) == 0:
        # No frame to take the size from. Returning here avoids an IndexError
        # that would otherwise be lost inside the worker thread.
        return

    height, width, _ = gesture_frames[0].shape
    output_video_writer = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'XVID'), fps, (width, height))
    try:
        for gesture_frame in gesture_frames:
            output_video_writer.write(gesture_frame)
    finally:
        # Always finalize the file, even if a write fails part-way
        output_video_writer.release()

# Read every recorded segment back from the video (main thread) and hand the
# frames to a worker thread that encodes them to a file.
try:
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        for start_frame, end_frame in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

            # end_frame is the first frame AFTER the gesture, so exactly
            # end_frame - start_frame frames belong to it (reading one more
            # would append a hand-free frame to every clip).
            gesture_frames = []
            for _ in range(end_frame - start_frame):
                ret, frame = cap.read()
                if not ret:
                    break
                gesture_frames.append(frame)

            if not gesture_frames:
                # Seek/read failed; there is nothing to write for this segment
                continue

            output_path = os.path.join(output_folder, f'gesture_{len(futures) + 1}.avi')
            futures.append(executor.submit(process_gesture, gesture_frames, output_path))

        # Exceptions raised inside a worker stay hidden unless the future is
        # inspected; report them instead of losing them silently.
        for future in futures:
            error = future.exception()
            if error is not None:
                print(f"Failed to write gesture video: {error}")
finally:
    # Release resources
    cap.release()
    cv2.destroyAllWindows()


In [33]:
import os

# Classify every clip in the 'gestures' folder.
# os.listdir returns entries in arbitrary order; sorting makes the output
# order reproducible between runs.
video_paths = sorted(os.listdir('gestures'))
for video in video_paths:
    print(video)
    run_model_by_video(os.path.join('gestures', video))

gesture_1.avi
(50, 21, 3)
(1, 50, 63)
[[0.00959661 0.00557783 0.00600773 0.00147017 0.8236088  0.00692855
  0.13257937 0.01423097]]
Predicted Gesture: hot
gesture_2.avi
(50, 21, 3)
(1, 50, 63)
[[0.03237595 0.30407634 0.09674851 0.03742347 0.08001267 0.15176386
  0.2673899  0.03020922]]
Predicted Gesture: call
gesture_3.avi
(50, 21, 3)
(1, 50, 63)
[[0.01526649 0.51758665 0.0360455  0.01415846 0.03755576 0.06115115
  0.29816326 0.02007264]]
Predicted Gesture: call
gesture_4.avi
(50, 21, 3)
(1, 50, 63)
[[0.63244915 0.02498532 0.03682843 0.05022684 0.04561914 0.02656052
  0.02094135 0.16238932]]
Predicted Gesture: accident
thief_001_02.AVI
(50, 21, 3)
(1, 50, 63)
[[0.05628915 0.022482   0.00686561 0.00338208 0.3042078  0.00885591
  0.02008308 0.5778343 ]]
Predicted Gesture: thief


In [28]:
# Classify one segmented clip on its own.
run_model_by_video('gestures/gesture_4.avi')

(50, 21, 3)
(1, 50, 63)
[[0.63244915 0.02498532 0.03682843 0.05022684 0.04561914 0.02656052
  0.02094135 0.16238932]]
Predicted Gesture: accident


In [5]:

# Get the frames of segmented gestures and get those landmarks
#
# Reads 'multiclass.mp4' frame by frame; every run of at least
# `min_gesture_frames` consecutive frames in which MediaPipe finds a hand is
# passed directly to run_model_by_frames, without writing a video file.

# Run time - 50 sec

# Initialize Mediapipe hands module
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=2, min_detection_confidence=0.6, min_tracking_confidence=0.5)

# Load the input video
video_path = 'multiclass.mp4'
cap = cv2.VideoCapture(video_path)

# Initialize variables for gesture segmentation
frame_buffer = []        # frames of the gesture currently in progress
gesture_frames = []      # frames of the gesture classified most recently
gesture_detected = False
min_gesture_frames = 30  # Minimum number of frames to consider as a gesture
gesture_count = 0

try:
    while cap.isOpened():
        ret, frame = cap.read()

        hand_in_frame = False
        if ret:
            # Convert the BGR frame to RGB for Mediapipe and detect hands
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = hands.process(rgb_frame)
            hand_in_frame = bool(results.multi_hand_landmarks)

        if hand_in_frame:
            gesture_detected = True
            frame_buffer.append(frame)
            continue

        # Either the hand left the frame or the video ended: close the current
        # segment. Handling end-of-video here keeps a gesture that is still in
        # progress on the last frame from being skipped.
        if gesture_detected and len(frame_buffer) >= min_gesture_frames:
            gesture_frames = frame_buffer  # one gesture video
            gesture_count += 1
            run_model_by_frames(gesture_frames)

        gesture_detected = False
        frame_buffer = []

        if not ret:
            break
finally:
    # Free the capture and the MediaPipe graph even when the cell is interrupted
    cap.release()
    hands.close()
    cv2.destroyAllWindows()


KeyboardInterrupt: 