In [3]:
# %pip installs into the environment of the running kernel (a bare `!pip` may
# resolve to a different interpreter). The extra is quoted so shells that
# glob-expand square brackets do not mangle it.
%pip install mediapipe numpy opencv-python "tensorflow[and-cuda]" pandas joblib

Collecting mediapipe
  Using cached mediapipe-0.10.21-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting numpy
  Using cached numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting opencv-python
  Downloading opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting joblib
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting tensorflow[and-cuda]
  Downloading tensorflow-2.19.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting absl-py (from mediapipe)
  Downloading absl_py-2.3.0-py3-none-any.whl.metadata (2.4 kB)
Collecting flatbuffers>=2.0 (from mediapipe)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting jax (from mediapipe)
  Downloading jax-0.6.

In [14]:
import cv2
import mediapipe as mp
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
import joblib

# Load both classifiers, then the label encoders that map class indices to names
model_static = load_model('../model/StaticBiLSTM/static_gestures_model.keras')
model_dynamic = load_model('../model/DynamicBiLSTM/dynamic_gestures_model.keras')
label_encoder_static = joblib.load('../model/StaticBiLSTM/static_gestures_labels.pkl')
label_encoder_dynamic = joblib.load('../model/DynamicBiLSTM/dynamic_gestures_labels.pkl')

# Motion gating: smoothed motion below the threshold is treated as a static
# gesture, anything else as a dynamic one
motion_threshold = 1.8
motion_history_len = 5  # number of recent motion samples that are averaged
motion_history = []

# Cooldown: frames to wait after a recognition before predicting again
cooldown_threshold = 10
cooldown_frames = 0

# Recognition state shared with the capture loop
recognized_sign = recognized_type = ''
last_prediction = None
static_buffer, dynamic_buffer = [], []

# MediaPipe handles
mp_drawing = mp.solutions.drawing_utils
mp_holistic = mp.solutions.holistic
def normalize_landmarks_buffer(buffer):
    """Shift a sequence of hand-landmark frames relative to a reference point.

    Each frame is a flat sequence of 63 values (21 landmarks x 3 coordinates,
    ordered x, y, z). Frames with any other length are reported and dropped.
    The x/y/z of landmark 1 in the first *valid* frame is subtracted from
    landmarks 1..20 of every frame; landmark 0 is left untouched.

    NOTE(review): the reference is landmark 1 while landmark 0 is skipped;
    presumably this mirrors the preprocessing used when the models were
    trained -- confirm before changing it.

    Args:
        buffer: iterable of per-frame landmark sequences.

    Returns:
        numpy.ndarray of shape (n_valid_frames, 63), or a (1, 63) array of
        zeros when no frame is valid.
    """
    valid_rows = []
    for frame_idx, row in enumerate(buffer):
        if len(row) != 63:
            print(f"[WARNING] Frame {frame_idx} has invalid length {len(row)}")
            continue
        valid_rows.append(row)

    if not valid_rows:
        print("[WARNING] No valid frames in buffer")
        return np.zeros((1, 63))

    norm_array = np.array(valid_rows, dtype=float)

    # Take the reference from the first valid frame rather than from frame
    # index 0, which may have been dropped above.
    reference = norm_array[0, 3:6].copy()

    # Columns 3..62 hold landmarks 1..20; repeat the (x, y, z) reference once
    # per landmark and subtract it from every frame in one step.
    norm_array[:, 3:] -= np.tile(reference, 20)

    print(f"[DEBUG] Normalized buffer shape: {norm_array.shape}")
    return norm_array

def extract_right_hand_landmarks(results):
    """Flatten the right-hand landmarks of a Holistic result.

    Args:
        results: object exposing ``right_hand_landmarks``; when set, its
            ``landmark`` attribute is iterated and each item must provide
            ``x``, ``y`` and ``z``.

    Returns:
        A flat list ``[x0, y0, z0, x1, y1, z1, ...]``, or ``None`` when no
        right hand is present.
    """
    hand = results.right_hand_landmarks
    if not hand:
        return None

    flat = []
    for point in hand.landmark:
        flat.extend((point.x, point.y, point.z))
    return flat

def get_hand_bbox(landmarks, image_width, image_height):
    """Return the pixel bounding box of a hand, clamped to the image.

    Args:
        landmarks: object whose ``landmark`` attribute is an iterable of
            points with ``x`` and ``y`` attributes; they are scaled by the
            image width and height respectively.
        image_width: image width in pixels.
        image_height: image height in pixels.

    Returns:
        Tuple ``(x_min, y_min, x_max, y_max)`` of ints. Every coordinate is
        limited to ``[0, image_width]`` / ``[0, image_height]`` so a hand that
        lies partly or fully outside the frame never yields a negative or
        inverted box.
    """
    def _clamp(value, upper):
        # Truncate to int first (as before), then limit to [0, upper].
        return min(max(int(value), 0), upper)

    xs = [lm.x * image_width for lm in landmarks.landmark]
    ys = [lm.y * image_height for lm in landmarks.landmark]
    return (
        _clamp(min(xs), image_width),
        _clamp(min(ys), image_height),
        _clamp(max(xs), image_width),
        _clamp(max(ys), image_height),
    )

# Camera initialisation and real-time recognition loop.
#
# Per frame: estimate global motion from the grayscale frame difference,
# smooth it over the last `motion_history_len` samples, and route the
# right-hand landmarks either to the static model (low motion) or to the
# dynamic model (high motion). Press 'q' in the preview window to stop.
static_window = 5     # frames collected per static-model prediction
dynamic_window = 30   # frames collected per dynamic-model prediction

cap = cv2.VideoCapture(0)
try:
    if not cap.isOpened():
        # Raise instead of calling exit(): in a notebook exit() takes the
        # kernel session down instead of just stopping this cell.
        raise RuntimeError("Cannot open camera")

    ret, prev_frame = cap.read()
    if not ret:
        # Without this check cvtColor would be handed an empty frame.
        raise RuntimeError("Cannot read initial frame from camera")
    prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)

    with mp_holistic.Holistic(min_detection_confidence=0.7, min_tracking_confidence=0.7) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Mean absolute grayscale difference between consecutive frames
            curr_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            frame_diff = cv2.absdiff(prev_gray, curr_gray)
            motion_level = np.sum(frame_diff) / (frame.shape[0] * frame.shape[1])
            prev_gray = curr_gray.copy()

            # Motion smoothing (moving average over the most recent samples)
            motion_history.append(motion_level)
            if len(motion_history) > motion_history_len:
                motion_history.pop(0)
            smooth_motion = np.mean(motion_history)

            print(f"[DEBUG] Motion level: {motion_level:.3f}, Smooth motion: {smooth_motion:.3f}")

            image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = holistic.process(image_rgb)
            image_height, image_width, _ = frame.shape

            landmarks = extract_right_hand_landmarks(results)

            if landmarks:
                mp_drawing.draw_landmarks(
                    frame, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

                bbox = get_hand_bbox(results.right_hand_landmarks, image_width, image_height)
                x1, y1, x2, y2 = bbox
                box_color = (0, 255, 0) if recognized_type == "Statyczny" else (255, 0, 0)
                cv2.rectangle(frame, (x1, y1), (x2, y2), box_color, 2)

                if smooth_motion < motion_threshold:
                    dynamic_buffer.clear()
                    static_buffer.append(landmarks)
                    # Keep only the newest frames. Otherwise the buffer outgrows
                    # the window during cooldown and the first window after the
                    # cooldown is always rejected for having the wrong length.
                    del static_buffer[:-static_window]
                    if len(static_buffer) >= static_window:
                        if cooldown_frames == 0:
                            norm_static = normalize_landmarks_buffer(static_buffer)
                            if norm_static.shape[0] == static_window:
                                norm_static = np.expand_dims(norm_static, axis=0)
                                prediction = model_static.predict(norm_static)[0]
                                label = label_encoder_static.inverse_transform([np.argmax(prediction)])[0]
                                conf = np.max(prediction)
                                print(f"[STATIC] Predicted: {label} (conf: {conf:.2f})")
                                if conf > 0.5 and label != last_prediction:
                                    recognized_sign = label
                                    recognized_type = "Statyczny"
                                    last_prediction = label
                                    cooldown_frames = cooldown_threshold
                                    static_buffer.clear()
                                    dynamic_buffer.clear()
                                    print(f"[INFO] Recognized static gesture: {recognized_sign} with confidence {conf:.2f}")
                            else:
                                print("[WARNING] Normalized static buffer shape unexpected:", norm_static.shape)
                            # Start a fresh window after every prediction attempt
                            static_buffer.clear()
                        else:
                            print(f"[DEBUG] Static prediction skipped due to cooldown: {cooldown_frames}")
                else:
                    static_buffer.clear()
                    dynamic_buffer.append(landmarks)
                    # Same trimming as for the static buffer
                    del dynamic_buffer[:-dynamic_window]
                    if len(dynamic_buffer) >= dynamic_window:
                        if cooldown_frames == 0:
                            norm_dynamic = normalize_landmarks_buffer(dynamic_buffer)
                            if norm_dynamic.shape[0] == dynamic_window:
                                norm_dynamic = np.expand_dims(norm_dynamic, axis=0)
                                prediction = model_dynamic.predict(norm_dynamic)[0]
                                label = label_encoder_dynamic.inverse_transform([np.argmax(prediction)])[0]
                                conf = np.max(prediction)
                                print(f"[DYNAMIC] Predicted: {label} (conf: {conf:.2f})")
                                if conf > 0.5 and label != last_prediction:
                                    recognized_sign = label
                                    recognized_type = "Dynamiczny"
                                    last_prediction = label
                                    cooldown_frames = cooldown_threshold
                                    static_buffer.clear()
                                    dynamic_buffer.clear()
                                    print(f"[INFO] Recognized dynamic gesture: {recognized_sign} with confidence {conf:.2f}")
                            else:
                                print("[WARNING] Normalized dynamic buffer shape unexpected:", norm_dynamic.shape)
                            # Start a fresh window after every prediction attempt
                            dynamic_buffer.clear()
                        else:
                            print(f"[DEBUG] Dynamic prediction skipped due to cooldown: {cooldown_frames}")

                # Draw the label at the top of the frame, horizontally centred
                if recognized_sign and recognized_type:
                    label_text = f'{recognized_sign} ({recognized_type})'
                    label_pos_x = image_width // 2
                    label_pos_y = 30
                    print(f"[DEBUG] Drawing label: {label_text} at ({label_pos_x}, {label_pos_y})")
                    (text_width, text_height), baseline = cv2.getTextSize(
                        label_text, cv2.FONT_HERSHEY_SIMPLEX, 1.0, 3)
                    text_x = label_pos_x - text_width // 2
                    text_y = label_pos_y
                    cv2.putText(
                        frame,
                        label_text,
                        (text_x, text_y),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        1.0,
                        (0, 255, 0) if recognized_type == "Statyczny" else (255, 0, 0),
                        3,
                        lineType=cv2.LINE_AA
                    )
            else:
                # No right hand in view: drop all buffered frames and the last result
                static_buffer.clear()
                dynamic_buffer.clear()
                recognized_sign = ''
                recognized_type = ''
                last_prediction = None

            cooldown_frames = max(0, cooldown_frames - 1)

            cv2.imshow('Sign Language Recognition', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the preview window, even when the loop
    # above raised, so the device is not left locked for the next run.
    cap.release()
    cv2.destroyAllWindows()

I0000 00:00:1748774948.380845   28916 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1748774948.381728   38706 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 23.2.1-1ubuntu3.1~22.04.3), renderer: Mesa Intel(R) UHD Graphics 620 (KBL GT2)
W0000 00:00:1748774948.446154   38697 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1748774948.461765   38701 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1748774948.464360   38697 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1748774948.465118   38699 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback 

[DEBUG] Motion level: 1.513, Smooth motion: 1.513
[DEBUG] Motion level: 1.431, Smooth motion: 1.472
[DEBUG] Motion level: 1.482, Smooth motion: 1.475
[DEBUG] Motion level: 1.617, Smooth motion: 1.511
[DEBUG] Motion level: 1.423, Smooth motion: 1.493
[DEBUG] Motion level: 1.555, Smooth motion: 1.502
[DEBUG] Motion level: 1.460, Smooth motion: 1.508
[DEBUG] Motion level: 2.632, Smooth motion: 1.737
[DEBUG] Motion level: 1.518, Smooth motion: 1.718
[DEBUG] Motion level: 1.450, Smooth motion: 1.723
[DEBUG] Motion level: 1.399, Smooth motion: 1.692
[DEBUG] Motion level: 1.360, Smooth motion: 1.672
[DEBUG] Motion level: 1.394, Smooth motion: 1.424
[DEBUG] Motion level: 1.385, Smooth motion: 1.398
[DEBUG] Motion level: 1.481, Smooth motion: 1.404
[DEBUG] Motion level: 1.376, Smooth motion: 1.399
[DEBUG] Motion level: 1.390, Smooth motion: 1.405
[DEBUG] Motion level: 1.428, Smooth motion: 1.412
[DEBUG] Motion level: 1.505, Smooth motion: 1.436
[DEBUG] Motion level: 1.422, Smooth motion: 1.424


In [12]:
import cv2
import mediapipe as mp
import numpy as np
from tensorflow.keras.models import load_model
import joblib

# Load the static-gesture model and its label encoder
model_static = load_model('../model/StaticBiLSTM/static_gestures_model.keras')
label_encoder_static = joblib.load('../model/StaticBiLSTM/static_gestures_labels.pkl')

# Load the dynamic-gesture model and its label encoder
model_dynamic = load_model('../model/DynamicBiLSTM/dynamic_gestures_model.keras')
label_encoder_dynamic = joblib.load('../model/DynamicBiLSTM/dynamic_gestures_labels.pkl')

# Tunable parameters
motion_threshold = 1.9    # smoothed motion below this selects the static model
motion_history_len = 5    # number of motion samples averaged together
cooldown_threshold = 10   # frames to wait after a recognition

# Mutable state used by the capture loop
motion_history = []
static_buffer, dynamic_buffer = [], []
cooldown_frames = 0
last_prediction = None
recognized_sign = recognized_type = ''

# MediaPipe handles
mp_drawing = mp.solutions.drawing_utils
mp_holistic = mp.solutions.holistic

def normalize_landmarks_buffer(buffer):
    """Shift landmarks 1..20 of every frame by a reference from the first frame.

    The reference is the x/y/z triple stored in columns 3..5 of the first
    frame (landmark 1). It is subtracted from columns 3..62 of every frame;
    columns 0..2 (landmark 0) are left unchanged.

    NOTE(review): landmark 1 is used as the reference and landmark 0 is
    skipped; presumably this matches the training-time preprocessing --
    confirm before changing it.

    Args:
        buffer: sequence of frames, each a flat sequence of landmark values.

    Returns:
        A new numpy array with the shifted values, or a (1, 63) zero array
        when the buffer is empty.
    """
    frames = np.array(buffer)
    if frames.shape[0] == 0:
        return np.zeros((1, 63))

    reference = frames[0, 3:6]  # landmark_1 x, y, z
    shifted = frames.copy()
    # One slice operation instead of a per-landmark loop: the reference is
    # repeated 20 times to line up with landmarks 1..20.
    shifted[:, 3:63] -= np.tile(reference, 20)
    return shifted

def extract_right_hand_landmarks(results):
    """Return the right-hand landmarks as one flat coordinate list.

    Args:
        results: object with a ``right_hand_landmarks`` attribute. When that
            attribute is truthy, its ``landmark`` items are read for their
            ``x``, ``y`` and ``z`` values.

    Returns:
        ``[x0, y0, z0, x1, y1, z1, ...]`` in landmark order, or ``None`` if
        no right hand is available.
    """
    detected = results.right_hand_landmarks
    if detected:
        coords = []
        for lm in detected.landmark:
            coords += [lm.x, lm.y, lm.z]
        return coords
    return None

def get_hand_bbox(landmarks, image_width, image_height):
    """Compute a hand's bounding box in pixels, limited to the image area.

    Args:
        landmarks: object whose ``landmark`` attribute yields points with
            ``x`` and ``y`` attributes; these are multiplied by the image
            width and height respectively.
        image_width: image width in pixels.
        image_height: image height in pixels.

    Returns:
        ``(x_min, y_min, x_max, y_max)`` as ints. Both corners are clamped to
        ``[0, image_width]`` and ``[0, image_height]``; previously only one
        side of each bound was clamped, so a hand outside the frame could
        produce negative or inverted boxes.
    """
    xs = [int(lm.x * image_width) for lm in landmarks.landmark]
    ys = [int(lm.y * image_height) for lm in landmarks.landmark]

    # Clamp every corner coordinate on both sides.
    x_min = min(max(min(xs), 0), image_width)
    x_max = min(max(max(xs), 0), image_width)
    y_min = min(max(min(ys), 0), image_height)
    y_max = min(max(max(ys), 0), image_height)
    return (x_min, y_min, x_max, y_max)

# Real-time recognition loop.
#
# Per frame: estimate global motion from the grayscale frame difference,
# smooth it over the last `motion_history_len` samples, and feed the
# right-hand landmarks to the static model (low motion) or the dynamic model
# (high motion). Press 'q' in the preview window to stop.
static_window = 5     # frames collected per static-model prediction
dynamic_window = 30   # frames collected per dynamic-model prediction

cap = cv2.VideoCapture(0)
try:
    if not cap.isOpened():
        # Raise instead of calling exit(): in a notebook exit() takes the
        # kernel session down instead of just stopping this cell.
        raise RuntimeError("Cannot open camera")

    ret, prev_frame = cap.read()
    if not ret:
        # Without this check cvtColor would be handed an empty frame.
        raise RuntimeError("Cannot read initial frame from camera")
    prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)

    with mp_holistic.Holistic(min_detection_confidence=0.7, min_tracking_confidence=0.7) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Mean absolute grayscale difference between consecutive frames
            curr_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            frame_diff = cv2.absdiff(prev_gray, curr_gray)
            motion_level = np.sum(frame_diff) / (frame.shape[0] * frame.shape[1])
            prev_gray = curr_gray.copy()

            # Moving average over the most recent motion samples
            motion_history.append(motion_level)
            if len(motion_history) > motion_history_len:
                motion_history.pop(0)
            smooth_motion = np.mean(motion_history)

            image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = holistic.process(image_rgb)
            image_height, image_width, _ = frame.shape

            landmarks = extract_right_hand_landmarks(results)

            if landmarks:
                mp_drawing.draw_landmarks(frame, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
                bbox = get_hand_bbox(results.right_hand_landmarks, image_width, image_height)
                x1, y1, x2, y2 = bbox
                box_color = (0, 255, 0) if recognized_type == "Statyczny" else (255, 0, 0)
                cv2.rectangle(frame, (x1, y1), (x2, y2), box_color, 2)

                if smooth_motion < motion_threshold:
                    dynamic_buffer.clear()
                    static_buffer.append(landmarks)
                    # Keep only the newest frames so the buffer never outgrows
                    # the window while predictions are on cooldown.
                    del static_buffer[:-static_window]
                    if len(static_buffer) >= static_window and cooldown_frames == 0:
                        norm_static = normalize_landmarks_buffer(static_buffer)
                        if norm_static.shape[0] == static_window:
                            norm_static = np.expand_dims(norm_static, axis=0)
                            prediction = model_static.predict(norm_static, verbose=0)[0]
                            label = label_encoder_static.inverse_transform([np.argmax(prediction)])[0]
                            conf = np.max(prediction)
                            if conf > 0.8 and label != last_prediction:
                                recognized_sign = label
                                recognized_type = "Statyczny"
                                last_prediction = label
                                cooldown_frames = cooldown_threshold
                                dynamic_buffer.clear()
                        # Start a fresh window after every attempt. Previously the
                        # buffer was cleared only on an accepted prediction, so one
                        # rejected attempt left it permanently longer than the
                        # window and no further prediction was ever made.
                        static_buffer.clear()
                else:
                    static_buffer.clear()
                    dynamic_buffer.append(landmarks)
                    # Same trimming as for the static buffer
                    del dynamic_buffer[:-dynamic_window]
                    if len(dynamic_buffer) >= dynamic_window and cooldown_frames == 0:
                        norm_dynamic = normalize_landmarks_buffer(dynamic_buffer)
                        if norm_dynamic.shape[0] == dynamic_window:
                            norm_dynamic = np.expand_dims(norm_dynamic, axis=0)
                            prediction = model_dynamic.predict(norm_dynamic, verbose=0)[0]
                            label = label_encoder_dynamic.inverse_transform([np.argmax(prediction)])[0]
                            conf = np.max(prediction)
                            if conf > 0.8 and label != last_prediction:
                                recognized_sign = label
                                recognized_type = "Dynamiczny"
                                last_prediction = label
                                cooldown_frames = cooldown_threshold
                                static_buffer.clear()
                        # Start a fresh window after every attempt (see static branch)
                        dynamic_buffer.clear()

                # Draw the recognised label centred at the top of the frame
                if recognized_sign and recognized_type:
                    label_text = f'{recognized_sign} ({recognized_type})'
                    label_pos_x = image_width // 2
                    label_pos_y = 30
                    (text_width, _), _ = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 1.0, 3)
                    text_x = label_pos_x - text_width // 2
                    cv2.putText(frame, label_text, (text_x, label_pos_y), cv2.FONT_HERSHEY_SIMPLEX,
                                1.0, (0, 255, 0) if recognized_type == "Statyczny" else (255, 0, 0), 3, cv2.LINE_AA)
            else:
                # No right hand in view: drop all buffered frames and the last result
                static_buffer.clear()
                dynamic_buffer.clear()
                recognized_sign = ''
                recognized_type = ''
                last_prediction = None

            cooldown_frames = max(0, cooldown_frames - 1)

            cv2.imshow('Sign Language Recognition', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the preview window, even when the loop
    # above raised, so the device is not left locked for the next run.
    cap.release()
    cv2.destroyAllWindows()

I0000 00:00:1748774800.628461   28916 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1748774800.629271   37621 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 23.2.1-1ubuntu3.1~22.04.3), renderer: Mesa Intel(R) UHD Graphics 620 (KBL GT2)
W0000 00:00:1748774800.708091   37613 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1748774800.733232   37615 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1748774800.735013   37617 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1748774800.735096   37611 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback 