In [73]:
import cv2
import numpy as np
import tensorflow as tf
from tensorflow import keras
import time

KERAS_MODEL_PATH = 'C:\\Users\\Marco\\Downloads\\Telegram Desktop\\model.keras'
TFLITE_MODEL_PATH = 'model.tflite'

In [74]:
# --- Define Label Mapping ---
label_mapping = {
    'scroll_down': 0,
    'scroll_left': 1,
    'scroll_right': 2,
    'scroll_up': 3,
    'zoom_in': 4,
    'zoom_out': 5
}
# Create a reverse mapping for display purposes: index -> label
index_to_label = {v: k for k, v in label_mapping.items()}

# --- Load Models ---
# Load InceptionV3 as a feature extractor (for converting frames to 2048-dim feature vectors)
feature_extractor = keras.applications.InceptionV3(
    weights='imagenet',
    include_top=False,
    pooling='avg'  # Global average pooling to get a 2048-dim feature vector
)
# Expected input size for InceptionV3
INCEPTION_SIZE = (75, 75)

# TFLITE Model

In [75]:
# Load the gesture recognition model as a TFLite model
interpreter = tf.lite.Interpreter(model_path=TFLITE_MODEL_PATH)
interpreter.allocate_tensors()
# Get input and output tensor details from the TFLite model
input_details = interpreter.get_input_details()    # Expected shape: [1, 10, 2048]
output_details = interpreter.get_output_details()  # Expected shape: [1, num_classes]

In [76]:
# --- Step 2: Initialize Webcam Capture ---
cap = cv2.VideoCapture(0)  # 0 is typically the default webcam

desired_fps = 5
cap.set(cv2.CAP_PROP_FPS, desired_fps)

# Read back the FPS value to check if it was set correctly
actual_fps = cap.get(cv2.CAP_PROP_FPS)
print("Requested FPS:", desired_fps)
print("Actual FPS:", actual_fps)

if not cap.isOpened():
    raise RuntimeError("Could not open webcam.")

# --- Countdown Before Recording ---
countdown_time = 3  # seconds
start_time = time.time()
while True:
    ret, frame = cap.read()
    if not ret:
        continue

    # Calculate remaining time for the countdown
    elapsed = time.time() - start_time
    remaining = int(countdown_time - elapsed) + 1  # +1 to make the countdown feel natural
    if remaining <= 0:
        break

    # Overlay the countdown text on the frame
    cv2.putText(frame, f"Recording starts in {remaining}...", (50, 50),
                cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 3, cv2.LINE_AA)
    cv2.imshow("Webcam", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

print("Recording gesture...")

# --- Capture Frames for Gesture ---
frames_captured = 0
features_list = []

while frames_captured < 10:
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame.")
        continue

    # Optionally display the frame while capturing
    cv2.imshow("Webcam", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

    # Save Each frame as png
    cv2.imwrite(f"frame_{frames_captured}.png", frame)

    # --- Preprocess Each Frame ---
    # Resize frame to the expected input size for InceptionV3
    frame_resized = cv2.resize(frame, INCEPTION_SIZE)
    # Convert BGR (default in OpenCV) to RGB
    frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
    # Convert to float32 and preprocess for InceptionV3
    frame_rgb = frame_rgb.astype('float32')
    preprocessed_frame = keras.applications.inception_v3.preprocess_input(frame_rgb)
    # Expand dims to create a batch dimension (1, 299, 299, 3)
    preprocessed_frame = np.expand_dims(preprocessed_frame, axis=0)

    # --- Extract Features Using InceptionV3 ---
    features = feature_extractor.predict(preprocessed_frame)
    # features has shape (1, 2048); extract the feature vector
    features = features[0]
    features_list.append(features)

    frames_captured += 1

# Release the webcam and close any OpenCV windows
cap.release()
cv2.destroyAllWindows()

# Convert the list of feature vectors into a numpy array with shape (10, 2048)
features_array = np.array(features_list, dtype='float32')
print("Captured features shape:", features_array.shape)

# --- Prepare Features for TFLite LSTM Model ---
# The LSTM expects a batch dimension, so reshape to (1, 10, 2048)
input_for_lstm = np.expand_dims(features_array, axis=0)

# --- Perform Inference with the TFLite Model ---
# Ensure the input data matches the expected type
input_data = input_for_lstm.astype(input_details[0]['dtype'])
interpreter.set_tensor(input_details[0]['index'], input_data)
interpreter.invoke()
# Retrieve the output from the model
output_data = interpreter.get_tensor(output_details[0]['index'])
# output_data is of shape (1, num_classes); extract probabilities from the first (and only) sample
predicted_probabilities = output_data[0]

# --- Print the Predicted Probabilities for Each Gesture ---
print("Predicted probabilities:")
for i, prob in enumerate(predicted_probabilities):
    print(f"{index_to_label[i]}: {prob:.4f}")

Requested FPS: 5
Actual FPS: 5.0
Recording gesture...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Captured features shape: (10, 2048)
Predicted probabilities:
scroll_down: 0.0156
scroll_left: 0.0110
scroll_right: 0.0008
scroll_up: 0.0005
zoom_in: 0.0019
zoom_out: 0.9702


# Keras Model


In [5]:
# --- Step 1: Load your pre-trained models ---

# Load the LSTM model (replace 'model.keras' with your model filename)
lstm_model = keras.models.load_model(KERAS_MODEL_PATH)

In [22]:
# --- Step 2: Initialize webcam capture ---

cap = cv2.VideoCapture(0)  # 0 is typically the default webcam
if not cap.isOpened():
    raise RuntimeError("Could not open webcam.")

# --- Countdown Before Recording ---
countdown_time = 3  # seconds
start_time = time.time()
while True:
    ret, frame = cap.read()
    if not ret:
        continue

    # Calculate remaining time for the countdown
    elapsed = time.time() - start_time
    remaining = int(countdown_time - elapsed) + 1  # +1 for a more natural countdown display
    if remaining <= 0:
        break

    # Overlay the countdown text on the frame
    cv2.putText(frame, f"Recording starts in {remaining}...", (50, 50),
                cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 3, cv2.LINE_AA)
    cv2.imshow("Webcam", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

print("Recording gesture...")

# --- Capture Frames for Gesture ---
frames_captured = 0
features_list = []

while frames_captured < 10:
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame.")
        continue

    # Optionally display the frame while capturing
    cv2.imshow("Webcam", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

    # --- Preprocess Each Frame ---
    # Resize to InceptionV3 expected size
    frame_resized = cv2.resize(frame, INCEPTION_SIZE)
    # Convert BGR (OpenCV default) to RGB
    frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
    # Convert to float32 and preprocess using InceptionV3 preprocessor
    frame_rgb = frame_rgb.astype('float32')
    preprocessed_frame = keras.applications.inception_v3.preprocess_input(frame_rgb)
    # Expand dims to create batch dimension: (1, 299, 299, 3)
    preprocessed_frame = np.expand_dims(preprocessed_frame, axis=0)

    # --- Extract Features Using InceptionV3 ---
    features = feature_extractor.predict(preprocessed_frame)
    # features will have shape (1, 2048); extract the vector
    features = features[0]
    features_list.append(features)

    frames_captured += 1

# Release the webcam and close any OpenCV windows
cap.release()
cv2.destroyAllWindows()

# Convert list of feature vectors into a numpy array with shape (10, 2048)
features_array = np.array(features_list, dtype='float32')
print("Captured features shape:", features_array.shape)

# --- Prepare Features for LSTM Model ---
# The LSTM expects a batch dimension, so reshape to (1, 10, 2048)
input_for_lstm = np.expand_dims(features_array, axis=0)

# --- Perform Inference with the LSTM Model ---
prediction = lstm_model.predict(input_for_lstm)
# prediction is an array of shape (1, num_classes); extract the probabilities
predicted_probabilities = prediction[0]

# Print the predicted probabilities for each gesture
print("Predicted probabilities:")
for i, prob in enumerate(predicted_probabilities):
    print(f"{index_to_label[i]}: {prob:.4f}")


Recording gesture...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Captured features shape: (10, 2048)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 240ms/step
Predicted probabilities:
scroll_down: 0.1233
scroll_left: 0.1292
scroll_right: 0.0419
scroll_up: 0.0077
zoom_in: 0.0018
zoom_out: 0.6961
