In [10]:
pip install pandas numpy opencv-python mediapipe scikit-learn joblib


Note: you may need to restart the kernel to use updated packages.


In [15]:
import os
import urllib.request

import cv2
import mediapipe as mp
import numpy as np
import pandas as pd
import time

from mediapipe.tasks import python
from mediapipe.tasks.python import vision


In [16]:
MODEL_PATH = 'Data/hand_landmarker.task'
MODEL_URL = 'https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/1/hand_landmarker.task'

os.makedirs('Data', exist_ok=True)
if not os.path.isfile(MODEL_PATH):
    print('Downloading hand landmarker model...')
    urllib.request.urlretrieve(MODEL_URL, MODEL_PATH)

options = vision.HandLandmarkerOptions(
    base_options=python.BaseOptions(model_asset_path=MODEL_PATH),
    running_mode=vision.RunningMode.VIDEO,
    num_hands=2,
    min_hand_detection_confidence=0.6,
    min_hand_presence_confidence=0.6,
    min_tracking_confidence=0.6,
)
landmarker = vision.HandLandmarker.create_from_options(options)

HAND_CONNECTIONS = [
    (0,1),(1,2),(2,3),(3,4),
    (0,5),(5,6),(6,7),(7,8),
    (5,9),(9,10),(10,11),(11,12),
    (9,13),(13,14),(14,15),(15,16),
    (13,17),(17,18),(18,19),(19,20),
    (0,17)
]

def draw_hand(frame, hand_landmarks):
    h, w = frame.shape[:2]
    pts = []
    for lm in hand_landmarks:
        x = int(lm.x * w)
        y = int(lm.y * h)
        pts.append((x, y))
        cv2.circle(frame, (x, y), 3, (0, 255, 255), -1)

    for i, j in HAND_CONNECTIONS:
        if i < len(pts) and j < len(pts):
            cv2.line(frame, pts[i], pts[j], (0, 255, 0), 2)


Downloading hand landmarker model...


INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1772033904.758178   33806 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1772033904.783634   33803 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [17]:
def extract_hand(hand_landmarks):
    feat = []
    for lm in hand_landmarks:
        feat.extend([lm.x, lm.y, lm.z])
    return feat


In [18]:
cap = cv2.VideoCapture(0)

current_gesture = 'none'
collecting = False
records = []

print("""
q : quit
c : toggle collect
1-9 : set gesture (gesture_1, gesture_2...)
""")

frame_idx = 0

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    frame = cv2.flip(frame, 1)
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    frame_idx += 1
    ts_ms = int(time.time() * 1000)
    mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb)
    res = landmarker.detect_for_video(mp_image, ts_ms)

    L_feat = [0.0] * 63
    R_feat = [0.0] * 63
    L_exist = 0
    R_exist = 0

    if res.hand_landmarks:
        for hand_lm, handedness in zip(res.hand_landmarks, res.handedness):
            label = handedness[0].category_name
            feat = extract_hand(hand_lm)

            if label == 'Left':
                L_feat = feat
                L_exist = 1
            else:
                R_feat = feat
                R_exist = 1

            draw_hand(frame, hand_lm)

    if collecting:
        row = [ts_ms / 1000.0, current_gesture, L_exist, R_exist] + L_feat + R_feat
        records.append(row)

    cv2.putText(
        frame,
        f'Gesture: {current_gesture} | Collecting: {collecting}',
        (10, 30),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.7,
        (0, 255, 0) if collecting else (0, 0, 255),
        2
    )

    cv2.imshow('Landmark Collector', frame)

    key = cv2.waitKey(1) & 0xFF

    if key == ord('q'):
        break
    elif key == ord('c'):
        collecting = not collecting
        print('Collecting:', collecting)
    elif ord('1') <= key <= ord('9'):
        current_gesture = f'gesture_{key - ord("0")}'
        print('Gesture set:', current_gesture)

cap.release()
cv2.destroyAllWindows()
landmarker.close()
print('Collected rows:', len(records))



q : quit
c : toggle collect
1-9 : set gesture (gesture_1, gesture_2...)



W0000 00:00:1772033912.921781   33806 landmark_projection_calculator.cc:78] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.
QFontDatabase: Cannot find font directory /home/lystiger/Documents/ML2/mldm2/venv/lib/python3.12/site-packages/cv2/qt/fonts.
Note that Qt no longer ships fonts. Deploy some (from https://dejavu-fonts.github.io/ for example) or switch to fontconfig.
QFontDatabase: Cannot find font directory /home/lystiger/Documents/ML2/mldm2/venv/lib/python3.12/site-packages/cv2/qt/fonts.
Note that Qt no longer ships fonts. Deploy some (from https://dejavu-fonts.github.io/ for example) or switch to fontconfig.
QFontDatabase: Cannot find font directory /home/lystiger/Documents/ML2/mldm2/venv/lib/python3.12/site-packages/cv2/qt/fonts.
Note that Qt no longer ships fonts. Deploy some (from https://dejavu-fonts.github.io/ for example) or switch to fontconfig.
QFontDatabase: Cannot find font directory /home

In [19]:
os.makedirs('Data', exist_ok=True)

columns = (
    ['ts', 'gesture', 'L_exist', 'R_exist']
    + [f'L_{a}{i}' for i in range(21) for a in ['x','y','z']]
    + [f'R_{a}{i}' for i in range(21) for a in ['x','y','z']]
)

if len(records) == 0:
    raise ValueError('No records collected. In camera window press c to start collecting, then q to stop.')

df = pd.DataFrame(records, columns=columns)
out_path = 'Data/hand_landmarks.csv'
df.to_csv(out_path, index=False)
print('Saved:', out_path, '| shape =', df.shape)


Saved: (0, 129)


## Train CV Gesture Models (SVM, Random Forest, KNN, Logistic Regression)


In [20]:
import os
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression


In [21]:
candidate_paths = [
    'Data/hand_landmarks.csv',
    'hand_landmarks.csv'
]

dataset_path = next((p for p in candidate_paths if os.path.isfile(p) and os.path.getsize(p) > 0), None)
if dataset_path is None:
    raise FileNotFoundError(
        'No non-empty hand_landmarks.csv found. Run collection cell, press c to collect, then run save cell.'
    )

df = pd.read_csv(dataset_path)
if df.empty:
    raise ValueError(f'Dataset is empty: {dataset_path}. Collect more samples first.')

print('Loaded:', dataset_path, '| shape =', df.shape)
df.head()


EmptyDataError: No columns to parse from file

In [None]:
label_candidates = ['gesture', 'label']
label_col = next((c for c in label_candidates if c in df.columns), None)
if label_col is None:
    raise ValueError('Dataset must contain a label column: gesture or label')

drop_cols = {'ts', 'timestamp_s', 'timestamp_ms', 'frame_time', label_col}
feature_cols = [c for c in df.columns if c not in drop_cols]

X = df[feature_cols].apply(pd.to_numeric, errors='coerce').fillna(0.0)
y = df[label_col].astype(str)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print('Label column:', label_col)
print('Feature count:', len(feature_cols))
print('Train/Test:', X_train.shape, X_test.shape)


In [None]:
models = {
    'svm': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', SVC(kernel='rbf', C=10, gamma='scale', probability=True, random_state=42))
    ]),
    'random_forest': RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    'knn': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', KNeighborsClassifier(n_neighbors=5))
    ]),
    'logistic': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(max_iter=2000, random_state=42))
    ])
}

results = []
trained_models = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    trained_models[name] = model
    results.append({'model': name, 'accuracy': acc})

    print(f'\n=== {name.upper()} ===')
    print(f'Accuracy: {acc:.4f}')
    print(classification_report(y_test, y_pred))

results_df = pd.DataFrame(results).sort_values('accuracy', ascending=False).reset_index(drop=True)
results_df


In [None]:
best_model_name = results_df.loc[0, 'model']
best_model = trained_models[best_model_name]

bundle = {
    'model_name': best_model_name,
    'model': best_model,
    'feature_cols': feature_cols,
    'label_col': label_col
}

os.makedirs('Data', exist_ok=True)
model_path = 'Data/cv_best_model.joblib'
joblib.dump(bundle, model_path)
print('Saved best model:', best_model_name, '->', model_path)


## Live Webcam Prediction With Trained Model
Run this after training. Press `q` to quit.


In [None]:
bundle = joblib.load('Data/cv_best_model.joblib')
model = bundle['model']
feature_cols = bundle['feature_cols']
print('Loaded model:', bundle['model_name'])

options = vision.HandLandmarkerOptions(
    base_options=python.BaseOptions(model_asset_path='Data/hand_landmarker.task'),
    running_mode=vision.RunningMode.VIDEO,
    num_hands=2
)
landmarker = vision.HandLandmarker.create_from_options(options)

cap = cv2.VideoCapture(0)
frame_idx = 0
start_ts_ms = int(time.time() * 1000)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    frame = cv2.flip(frame, 1)
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    frame_idx += 1
    ts_ms = start_ts_ms + frame_idx
    mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb)
    res = landmarker.detect_for_video(mp_image, ts_ms)

    L_feat = [0.0] * 63
    R_feat = [0.0] * 63
    L_exist = 0
    R_exist = 0

    if res.hand_landmarks:
        for hand_lm, handedness in zip(res.hand_landmarks, res.handedness):
            label = handedness[0].category_name
            feat = extract_hand(hand_lm)
            if label == 'Left':
                L_feat = feat
                L_exist = 1
            else:
                R_feat = feat
                R_exist = 1

            draw_hand(frame, hand_lm)

    sample = {'L_exist': L_exist, 'R_exist': R_exist}

    for i in range(21):
        sample[f'L_x{i}'] = L_feat[i * 3 + 0]
        sample[f'L_y{i}'] = L_feat[i * 3 + 1]
        sample[f'L_z{i}'] = L_feat[i * 3 + 2]
        sample[f'R_x{i}'] = R_feat[i * 3 + 0]
        sample[f'R_y{i}'] = R_feat[i * 3 + 1]
        sample[f'R_z{i}'] = R_feat[i * 3 + 2]

    x_live = pd.DataFrame([sample])

    for c in feature_cols:
        if c not in x_live.columns:
            x_live[c] = 0.0

    x_live = x_live[feature_cols].apply(pd.to_numeric, errors='coerce').fillna(0.0)

    pred = model.predict(x_live)[0]
    conf = None
    if hasattr(model, 'predict_proba'):
        conf = float(model.predict_proba(x_live).max())

    text = f'Pred: {pred}' if conf is None else f'Pred: {pred} ({conf:.2f})'
    cv2.putText(frame, text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)

    cv2.imshow('CV Live Prediction', frame)
    if (cv2.waitKey(1) & 0xFF) == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()
landmarker.close()
