In [1]:
import cv2
import mediapipe as mp
import numpy as np
import os

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=2, min_detection_confidence=0.6, min_tracking_confidence=0.5)
# mp_drawing = mp.solutions.drawing_utils  # Import the drawing utilities

# Define your class names based on the directory structure
class_names = os.listdir('dataset')

# List to store sequences of landmarks and corresponding labels
all_landmarks_sequences = []
all_labels = []

for class_name in class_names:
    class_directory = os.path.join('dataset', class_name)
    video_files = os.listdir(class_directory)
    print(class_directory)
    
    for video_file in video_files:
        video_path = os.path.join(class_directory, video_file)
        
        # Open the video file
        cap = cv2.VideoCapture(video_path)
        landmarks_sequence = []  # List to store sequences of landmarks for the current video
        
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            
            # Convert the frame to RGB and process it with MediaPipe Hands
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = hands.process(rgb_frame)
            
            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    # Convert the NormalizedLandmark objects to numerical representation
                    landmarks_numeric = np.array([[landmark.x, landmark.y, landmark.z] for landmark in hand_landmarks.landmark])
                    
                    # Store the hand landmarks in the sequence
                    landmarks_sequence.append(landmarks_numeric)
                
                # Draw landmarks on the frame
#                 for hand_landmarks in results.multi_hand_landmarks:
#                     mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
            
            # Display the frame with or without landmarks
#             cv2.imshow('Hand Gestures', frame)
            
#             if cv2.waitKey(1) & 0xFF == ord('q'):
#                 break
        
        # Release the capture and close the window
        cap.release()
        # cv2.destroyAllWindows()
        
        # Convert the landmarks sequence to a numpy array and append to all_landmarks_sequences
        all_landmarks_sequences.append(np.array(landmarks_sequence))
        # Append the corresponding label for this video
        all_labels.append(class_name)

# all_landmarks_sequences now contains the sequences of landmarks from all videos
# all_labels contains the corresponding labels for each video

# You can then use all_landmarks_sequences and all_labels as input to your LSTM model


dataset\accident
dataset\call
dataset\doctor
dataset\help
dataset\hot
dataset\lose
dataset\pain
dataset\thief


In [3]:
print(all_landmarks_sequences)

[array([[[ 6.11661613e-01,  5.56410968e-01, -7.77883855e-08],
        [ 5.74619532e-01,  4.96822566e-01, -3.33774351e-02],
        [ 5.36894500e-01,  4.79425609e-01, -5.30877821e-02],
        ...,
        [ 5.40468395e-01,  7.63139606e-01, -3.96992639e-02],
        [ 5.33113778e-01,  8.01465869e-01, -4.67492081e-02],
        [ 5.29700756e-01,  8.36948097e-01, -5.10624312e-02]],

       [[ 2.26993099e-01,  5.27400196e-01, -4.91309549e-07],
        [ 2.81688392e-01,  4.62135732e-01,  5.30224573e-03],
        [ 3.33680540e-01,  4.54467177e-01,  5.97620150e-03],
        ...,
        [ 3.52259755e-01,  6.48450136e-01,  5.40488493e-03],
        [ 3.45640421e-01,  6.25937104e-01,  1.58901308e-02],
        [ 3.30402881e-01,  6.20105267e-01,  2.28922963e-02]],

       [[ 6.09202385e-01,  5.58536172e-01,  1.37505737e-07],
        [ 5.73884368e-01,  4.96845216e-01, -4.12765555e-02],
        [ 5.38347244e-01,  4.86720383e-01, -6.46886602e-02],
        ...,
        [ 5.41963220e-01,  7.54947424e-01

In [4]:
# Assuming max_sequence_length is the desired sequence length
max_sequence_length = 50  # Change this to your desired value - no of frames

# Preprocess the landmarks sequences to have a consistent length
processed_landmarks_sequences = []
num_landmarks = 21

for landmarks_sequence in all_landmarks_sequences:
    # Truncate or pad the sequence to match max_sequence_length
    if len(landmarks_sequence) >= max_sequence_length:
        processed_landmarks_sequences.append(landmarks_sequence[:max_sequence_length])
    else:
        padding = [np.zeros(num_landmarks)] * (max_sequence_length - len(landmarks_sequence))
        processed_landmarks_sequences.append(landmarks_sequence + padding)


# Assuming landmarks_sequence is a list of NormalizedLandmark objects
# Convert the landmarks to a numerical format
# def convert_landmarks_to_numerical(landmarks_sequence):
#     numeric_landmarks_sequence = []

#     for landmarks_frame in landmarks_sequence:
#         landmarks_frame_numeric = np.array([[landmark.x, landmark.y, landmark.z] for landmark in landmarks_frame])
#         numeric_landmarks_sequence.append(landmarks_frame_numeric)

#     # Convert the numeric landmarks sequence to a numpy array
#     X = np.array(numeric_landmarks_sequence)
#     return X

# all_landmarks_sequences = convert_landmarks_to_numerical(all_landmarks_sequences)
# all_labels = convert_landmarks_to_numerical(all_labels)


# Convert the processed sequences to numpy array
all_landmarks_sequences = np.array(processed_landmarks_sequences)
all_labels = np.array(all_labels)

In [5]:
print(all_landmarks_sequences.shape)
print(all_labels.shape)

(412, 50, 21, 3)
(412,)


In [6]:
# Save the preprocessed data
np.save('all_landmarks_sequences.npy', all_landmarks_sequences)
np.save('all_labels.npy', all_labels)

In [4]:
import numpy as np

# Load the preprocessed data
all_landmarks_sequences = np.load('all_landmarks_sequences.npy', allow_pickle=True)
all_labels = np.load('all_labels.npy', allow_pickle=True)


In [5]:
# Assuming all_landmarks_sequences has shape (num_samples, sequence_length, num_landmarks, num_coordinates)
# You need to reshape it to (num_samples, sequence_length, num_landmarks * num_coordinates)
all_landmarks_sequences = all_landmarks_sequences.reshape(
    all_landmarks_sequences.shape[0],
    all_landmarks_sequences.shape[1],
    all_landmarks_sequences.shape[2] * all_landmarks_sequences.shape[3]
)

# Continue with the rest of your code


In [6]:
print(all_landmarks_sequences.shape)
print(all_labels.shape)

(412, 50, 63)
(412,)


In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import joblib

# Assuming all_landmarks_sequences contains sequences of landmarks and all_labels contains corresponding labels
# all_landmarks_sequences.shape: (num_samples, sequence_length, num_landmarks)
# all_labels.shape: (num_samples,)

# Convert labels to numerical values
label_encoder = LabelEncoder()
numeric_labels = label_encoder.fit_transform(all_labels)

# One-hot encode the numeric labels
onehot_encoder = OneHotEncoder(sparse=False)
onehot_labels = onehot_encoder.fit_transform(numeric_labels.reshape(-1, 1))

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(all_landmarks_sequences, onehot_labels, test_size=0.2)


joblib.dump(label_encoder, 'label_encoder.joblib')




['label_encoder.joblib']

In [22]:
print('X_train : ', len(X_train))
print('y_train : ', len(y_train))
print('\nX_test : ', len(X_test))
print('y_test : ', len(y_test))

X_train :  329
y_train :  329

X_test :  83
y_test :  83


In [23]:
print(X_train.shape)

(329, 50, 63)


In [24]:


# Define the LSTM model
model = Sequential([
    LSTM(units=128, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True),
    LSTM(units=64),
    Dense(onehot_labels.shape[1], activation='softmax')  # Use the number of classes for the output dimension
])

In [25]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x274ecd0af20>

In [26]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 50, 128)           98304     
                                                                 
 lstm_3 (LSTM)               (None, 64)                49408     
                                                                 
 dense_1 (Dense)             (None, 8)                 520       
                                                                 
Total params: 148,232
Trainable params: 148,232
Non-trainable params: 0
_________________________________________________________________


In [27]:
# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_accuracy}')


Test Accuracy: 0.8192771077156067


In [28]:
model.save('gesture_classification.keras')

In [8]:
import cv2
import mediapipe as mp
import numpy as np
from keras.models import load_model

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=2, min_detection_confidence=0.6, min_tracking_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils  # Import the drawing utilities

# Load the trained LSTM model
model = load_model('gesture_classification.keras')  # Replace with the path to your trained model

# Open the video file
video_path = 'hot1.mp4'
cap = cv2.VideoCapture(video_path)

landmarks_sequence = []  # List to store sequences of landmarks

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    
    # Convert the frame to RGB and process it with MediaPipe Hands
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(rgb_frame)
    
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            # Convert the NormalizedLandmark objects to numerical representation
            landmarks_numeric = np.array([[landmark.x, landmark.y, landmark.z] for landmark in hand_landmarks.landmark])
            
            # Store the hand landmarks in the sequence
            landmarks_sequence.append(landmarks_numeric)
            
        # Draw landmarks on the frame
        for hand_landmarks in results.multi_hand_landmarks:
            mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
    
    # Display the frame with or without landmarks
    cv2.imshow('Hand Gestures', frame)
    
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the capture and close all windows
cap.release()
cv2.destroyAllWindows()


# Assuming max_sequence_length is the desired sequence length
max_sequence_length = 50  # Change this to your desired value - no of frames

# Preprocess the landmarks sequences to have a consistent length
num_landmarks = 21

# Truncate or pad the sequence to match max_sequence_length
if len(landmarks_sequence) >= max_sequence_length:
    landmarks_sequence = landmarks_sequence[:max_sequence_length]
else:
    padding = [np.zeros(num_landmarks)] * (max_sequence_length - len(landmarks_sequence))
    landmarks_sequence = landmarks_sequence + padding

# Convert the landmarks sequence to a numpy array
landmarks_sequence = np.array(landmarks_sequence)

print(landmarks_sequence.shape)

# Assuming all_landmarks_sequences has shape (num_samples, sequence_length, num_landmarks, num_coordinates)
# You need to reshape it to (num_samples, sequence_length, num_landmarks * num_coordinates)
landmarks_sequence = landmarks_sequence.reshape(
    1,
    landmarks_sequence.shape[0],
    landmarks_sequence.shape[1] * landmarks_sequence.shape[2]
)

# Use the trained model to predict
# Ensure that you preprocess the landmarks_sequence similarly to how you preprocessed during training
# predicted_probs = model.predict(np.expand_dims(landmarks_sequence, axis=0))

print(landmarks_sequence.shape)

predicted_probs = model.predict(landmarks_sequence)
print(predicted_probs)

# Get the class with the highest probability as the predicted gesture
predicted_class = np.argmax(predicted_probs)

label_encoder = joblib.load('label_encoder.joblib')

# Decode the numeric label back to the original class label
predicted_label = label_encoder.inverse_transform([predicted_class])[0]

print("Predicted Gesture:", predicted_label)

# ValueError: could not broadcast input array from shape (21,3) into shape (21,)
# This error occurs if hands are not detected correctly


(50, 21, 3)
(1, 50, 63)
[[0.00827048 0.00841927 0.01746368 0.00210104 0.8540393  0.02059546
  0.06660526 0.02250554]]
Predicted Gesture: hot
