In [1]:
import cv2
import numpy as np
import tensorflow as tf
from tensorflow import keras
import time
import mediapipe as mp
from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

KERAS_MODEL_PATH = 'C:\\Users\\Marco\\Downloads\\Telegram Desktop\\model.keras'
TFLITE_MODEL_PATH = '/Users/sebastianosanson/Development/VCS_Project/dynamic-hgr/transformed_pose_saved_model.tflite'

In [2]:
# For pose landmarks

MARGIN = 100  # pixels
FONT_SIZE = 1
FONT_THICKNESS = 1
HANDEDNESS_TEXT_COLOR = (88, 205, 54)  # vibrant green

In [3]:
# --- Define Label Mapping ---
label_mapping = {
    'scroll_down': 0,
    'scroll_left': 1,
    'scroll_right': 2,
    'scroll_up': 3,
    'zoom_in': 4,
    'zoom_out': 5
}
# Create a reverse mapping for display purposes: index -> label
index_to_label = {v: k for k, v in label_mapping.items()}

# --- Load Models ---
# Load InceptionV3 as a feature extractor (for converting frames to 2048-dim feature vectors)
feature_extractor = keras.applications.InceptionV3(
    weights='imagenet',
    include_top=False,
    pooling='avg'  # Global average pooling to get a 2048-dim feature vector
)
# Expected input size for InceptionV3
INCEPTION_SIZE = (75, 75)

In [4]:
def draw_pose_landmarks_on_image(rgb_image, detection_result):
  pose_landmarks_list = detection_result.pose_landmarks
  annotated_image = np.copy(rgb_image)
  x_coordinates = []
  y_coordinates = []

  # Check if any pose landmarks were detected
  if pose_landmarks_list:
    # Loop through the detected poses to visualize.
    for idx in range(len(pose_landmarks_list)):
      pose_landmarks = pose_landmarks_list[idx]

      # Draw the pose landmarks.
      pose_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
      pose_landmarks_proto.landmark.extend([
        landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z) for landmark in pose_landmarks
      ])
      solutions.drawing_utils.draw_landmarks(
        annotated_image,
        pose_landmarks_proto,
        solutions.pose.POSE_CONNECTIONS,
        solutions.drawing_styles.get_default_pose_landmarks_style())

      # Get the coordinates of all the keypoints
      height, width, _ = annotated_image.shape
      x_coordinates += [int(landmark.x * width) for landmark in pose_landmarks]
      y_coordinates += [int(landmark.y * height) for landmark in pose_landmarks]

    return annotated_image, x_coordinates, y_coordinates
  else:
    # Return default values or handle the case when no landmarks are found
    return rgb_image, [], []  # Return original image and empty lists for x, y

In [5]:
def crop(rgb_image, x_coordinates, y_coordinates):

    height, width, _ = rgb_image.shape

    x_min = max(min(x_coordinates) - MARGIN, 0)
    x_max = min(max(x_coordinates) + MARGIN, width)
    y_min = max(min(y_coordinates) - MARGIN, 0)
    y_max = min(max(y_coordinates) + MARGIN, height)

    return rgb_image[y_min:y_max, x_min:x_max]

In [6]:
def cropping_on_pose(cv2image):
    # Convert the image from BGR (OpenCV default) to RGB.
    image_rgb = cv2.cvtColor(cv2image, cv2.COLOR_BGR2RGB)
    
    # Create a Mediapipe Image object with the appropriate format.
    mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=cv2image)
    
    # Set up the pose landmarker options.
    base_options = python.BaseOptions(model_asset_path='/Users/sebastianosanson/Development/VCS_Project/task-model/pose_landmarker_heavy.task')
    options = vision.PoseLandmarkerOptions(
        base_options=base_options, 
        output_segmentation_masks=True
    )
    detector = vision.PoseLandmarker.create_from_options(options)
    
    # Detect the pose landmarks.
    detection_result = detector.detect(mp_image)
    
    # Draw landmarks (assuming your function supports Mediapipe's Image objects).
    annotated_image, x, y = draw_pose_landmarks_on_image(mp_image.numpy_view(), detection_result)
    
    # Check if the necessary coordinates were found.
    if not x or not y:
        return cv2image
    else:
        # Crop the image using your custom crop function.
        cropped_image = crop(mp_image.numpy_view(), x, y)
        return cropped_image

# TFLITE Model

In [7]:
# Load the gesture recognition model as a TFLite model
interpreter = tf.lite.Interpreter(model_path=TFLITE_MODEL_PATH)
interpreter.allocate_tensors()
# Get input and output tensor details from the TFLite model
input_details = interpreter.get_input_details()    # Expected shape: [1, 10, 2048]
output_details = interpreter.get_output_details()  # Expected shape: [1, num_classes]

INFO: Created TensorFlow Lite delegate for select TF ops.
INFO: TfLiteFlexDelegate delegate: 2 nodes delegated out of 38 nodes with 2 partitions.

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [None]:
cap  = cv2.VideoCapture(1)  # 0 is typically the default webcam

cap_width = 160
cap_height = 120

cap.set(cv2.CAP_PROP_FRAME_WIDTH, cap_width)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, cap_height)

desired_fps = 5
cap.set(cv2.CAP_PROP_FPS, desired_fps)

# Read back the FPS value to check if it was set correctly
actual_fps = cap.get(cv2.CAP_PROP_FPS)
print("Requested FPS:", desired_fps)
print("Actual FPS:", actual_fps)

if not cap.isOpened():
    raise RuntimeError("Could not open webcam.")

# --- Countdown Before Recording ---
countdown_time = 5  # seconds
start_time = time.time()
while True:
    ret, frame = cap.read()
    if not ret:
        continue

    frame = cv2.flip(frame, 1)  # Mirror the frame

    elapsed = time.time() - start_time
    remaining = int(countdown_time - elapsed) + 1  # +1 to make the countdown feel natural
    if remaining <= 0:
        break

    cv2.putText(frame, f"Recording starts in {remaining}...", (50, 50),
                cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 3, cv2.LINE_AA)
    cv2.imshow("Webcam", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

print("Recording gesture...")
# CV2 Print on frame Recording gesture

cv2.putText(frame, "Recording gesture...", (50, 50),
            cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 3, cv2.LINE_AA)

# --- Capture Frames for Gesture ---
frames_captured = 0
captured_frames = []  # list to store the raw frames

while frames_captured < 40:
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame.")
        continue

    frame = cv2.flip(frame, 1)  # Mirror the frame

    # Optionally display the frame while capturing
    cv2.imshow("Webcam", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

    # Append frame to our list
    captured_frames.append(frame.copy())
    # Optionally save each frame as PNG
    cv2.imwrite(f"/Users/sebastianosanson/Development/VCS_Project/captured_frames/frame_{frames_captured}.png", frame)

    frames_captured += 1

# Release the webcam and close any OpenCV windows
cap.release()
cv2.destroyAllWindows()

selected_frames = captured_frames[::4]  # Take every 4th frame

# --- Process All Captured Frames ---
features_list = []

for idx, frame in enumerate(selected_frames):
    # --- Cropping on Pose ---
    cropped_image = cropping_on_pose(frame)
    if cropped_image is None:
        print(f"Cropping failed on frame {idx}. Skipping this frame.")
        continue

    cv2.imwrite(f"/Users/sebastianosanson/Development/VCS_Project/cropped_frames/frame_{idx}.png", cropped_image)

    # --- Preprocess Each Frame ---
    # Resize frame to the expected input size for InceptionV3
    frame_resized = cv2.resize(cropped_image, INCEPTION_SIZE)

    cv2.imwrite(f"/Users/sebastianosanson/Development/VCS_Project/resized_frames/frame_{idx}.png", frame_resized)
    # Convert BGR (default in OpenCV) to RGB
    frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
    # Convert to tensor and add batch dimension
    image_tensor = tf.convert_to_tensor(frame_rgb, dtype=tf.float32)
    print(f"Preprocessed frame shape: {image_tensor.shape}")

    preprocessed_frame = np.expand_dims(image_tensor, axis=0)

    # --- Extract Features Using InceptionV3 ---
    features = feature_extractor.predict(preprocessed_frame)
    # Assuming features shape is (1, 2048), extract the feature vector
    print(f"Extracted features shape: {features.shape}")
    features_list.append(features[0])

if not features_list:
    raise ValueError("No valid frames were processed.")

# Convert the list of feature vectors into a numpy array with shape (N, 2048)
features_array = np.array(features_list, dtype='float32')
print("Captured features shape:", features_array.shape)

# --- Prepare Features for TFLite LSTM Model ---
# The LSTM expects a batch dimension, so reshape to (1, N, 2048)
input_for_lstm = np.expand_dims(features_array, axis=0)

# --- Perform Inference with the TFLite Model ---
input_data = input_for_lstm.astype(input_details[0]['dtype'])
interpreter.set_tensor(input_details[0]['index'], input_data)
interpreter.invoke()
output_data = interpreter.get_tensor(output_details[0]['index'])
predicted_probabilities = output_data[0]

# --- Print the Predicted Probabilities for Each Gesture ---
print("Predicted probabilities:")
for i, prob in enumerate(predicted_probabilities):
    print(f"{index_to_label[i]}: {prob:.4f}")




Requested FPS: 5
Actual FPS: 30.0


2025-02-07 19:30:40.065 python[32943:414237] +[IMKClient subclass]: chose IMKClient_Modern
2025-02-07 19:30:40.065 python[32943:414237] +[IMKInputSession subclass]: chose IMKInputSession_Modern


Recording gesture...


I0000 00:00:1738953046.994218  414237 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1738953047.156856  571235 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1738953047.256471  571234 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1738953047.361716  571235 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.


Preprocessed frame shape: (75, 75, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Extracted features shape: (1, 2048)


I0000 00:00:1738953048.720818  414237 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1
W0000 00:00:1738953048.880343  571506 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1738953048.974605  571506 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Preprocessed frame shape: (75, 75, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 219ms/step
Extracted features shape: (1, 2048)


I0000 00:00:1738953049.628889  414237 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1
W0000 00:00:1738953049.825006  571640 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1738953049.927353  571639 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Preprocessed frame shape: (75, 75, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
Extracted features shape: (1, 2048)
Preprocessed frame shape: (75, 75, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step

I0000 00:00:1738953050.230189  414237 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1
W0000 00:00:1738953050.384927  571745 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1738953050.470552  571745 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
Extracted features shape: (1, 2048)
Preprocessed frame shape: (75, 75, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step

I0000 00:00:1738953050.696778  414237 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1
W0000 00:00:1738953050.953276  571822 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1738953051.011499  571822 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Extracted features shape: (1, 2048)
Preprocessed frame shape: (75, 75, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step

I0000 00:00:1738953051.226348  414237 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1
W0000 00:00:1738953051.387514  571920 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1738953051.452963  571920 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Extracted features shape: (1, 2048)
Preprocessed frame shape: (75, 75, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step


I0000 00:00:1738953051.664046  414237 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1
W0000 00:00:1738953051.803390  572010 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1738953051.851074  572013 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Extracted features shape: (1, 2048)
Preprocessed frame shape: (75, 75, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step


I0000 00:00:1738953052.032082  414237 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1
W0000 00:00:1738953052.218618  572097 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1738953052.270853  572097 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Extracted features shape: (1, 2048)
Preprocessed frame shape: (75, 75, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step


I0000 00:00:1738953052.461853  414237 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1
W0000 00:00:1738953052.591094  572201 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1738953052.642736  572201 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1738953052.842317  414237 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1


Extracted features shape: (1, 2048)


W0000 00:00:1738953052.986639  572281 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1738953053.032122  572283 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Preprocessed frame shape: (75, 75, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
Extracted features shape: (1, 2048)
Captured features shape: (10, 2048)
Predicted probabilities:
scroll_down: 0.2373
scroll_left: 0.1400
scroll_right: 0.0624
scroll_up: 0.0094
zoom_in: 0.2234
zoom_out: 0.3275


: 

# Keras Model


In [5]:
# --- Step 1: Load your pre-trained models ---

# Load the LSTM model (replace 'model.keras' with your model filename)
lstm_model = keras.models.load_model(KERAS_MODEL_PATH)

In [22]:
# --- Step 2: Initialize webcam capture ---

cap = cv2.VideoCapture(0)  # 0 is typically the default webcam
if not cap.isOpened():
    raise RuntimeError("Could not open webcam.")

# --- Countdown Before Recording ---
countdown_time = 3  # seconds
start_time = time.time()
while True:
    ret, frame = cap.read()
    if not ret:
        continue

    # Calculate remaining time for the countdown
    elapsed = time.time() - start_time
    remaining = int(countdown_time - elapsed) + 1  # +1 for a more natural countdown display
    if remaining <= 0:
        break

    # Overlay the countdown text on the frame
    cv2.putText(frame, f"Recording starts in {remaining}...", (50, 50),
                cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 3, cv2.LINE_AA)
    cv2.imshow("Webcam", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

print("Recording gesture...")

# --- Capture Frames for Gesture ---
frames_captured = 0
features_list = []

while frames_captured < 10:
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame.")
        continue

    # Optionally display the frame while capturing
    cv2.imshow("Webcam", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

    # --- Preprocess Each Frame ---
    # Resize to InceptionV3 expected size
    frame_resized = cv2.resize(frame, INCEPTION_SIZE)
    # Convert BGR (OpenCV default) to RGB
    frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
    # Convert to float32 and preprocess using InceptionV3 preprocessor
    frame_rgb = frame_rgb.astype('float32')
    preprocessed_frame = keras.applications.inception_v3.preprocess_input(frame_rgb)
    # Expand dims to create batch dimension: (1, 299, 299, 3)
    preprocessed_frame = np.expand_dims(preprocessed_frame, axis=0)

    # --- Extract Features Using InceptionV3 ---
    features = feature_extractor.predict(preprocessed_frame)
    # features will have shape (1, 2048); extract the vector
    features = features[0]
    features_list.append(features)

    frames_captured += 1

# Release the webcam and close any OpenCV windows
cap.release()
cv2.destroyAllWindows()

# Convert list of feature vectors into a numpy array with shape (10, 2048)
features_array = np.array(features_list, dtype='float32')
print("Captured features shape:", features_array.shape)

# --- Prepare Features for LSTM Model ---
# The LSTM expects a batch dimension, so reshape to (1, 10, 2048)
input_for_lstm = np.expand_dims(features_array, axis=0)

# --- Perform Inference with the LSTM Model ---
prediction = lstm_model.predict(input_for_lstm)
# prediction is an array of shape (1, num_classes); extract the probabilities
predicted_probabilities = prediction[0]

# Print the predicted probabilities for each gesture
print("Predicted probabilities:")
for i, prob in enumerate(predicted_probabilities):
    print(f"{index_to_label[i]}: {prob:.4f}")


Recording gesture...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Captured features shape: (10, 2048)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 240ms/step
Predicted probabilities:
scroll_down: 0.1233
scroll_left: 0.1292
scroll_right: 0.0419
scroll_up: 0.0077
zoom_in: 0.0018
zoom_out: 0.6961
