# Libraries

In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import mediapipe as mp
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.svm import SVC
from xgboost import XGBClassifier
from scipy.spatial import ConvexHull
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_curve,
    auc,
)

# Constants

In [None]:
# Filesystem layout: raw images live under DATASET_DIR (one sub-folder per
# label) and the extracted feature table is cached as a CSV under CACHE_DIR.
CACHE_DIR = "./cache"
DATASET_DIR = "./dataset"
DATASET_FILE = f"{CACHE_DIR}/dataset.csv"

# The cache folder must exist before the CSV is written into it.
os.makedirs(CACHE_DIR, exist_ok=True)

# Landmark index pairs whose distance becomes one "dist_<i>" feature each.
# The order is significant: the position in this list is the feature index.
LANDMARKS_PAIRS = [
    # Neighbouring fingertips, tip to tip
    (4, 8),
    (8, 12),
    (12, 16),
    (16, 20),
    # Wrist to every fingertip
    (0, 4),
    (0, 8),
    (0, 12),
    (0, 16),
    (0, 20),
    # Fingertip to a lower joint of the same finger (captures bending)
    (4, 3),
    (8, 6),
    (12, 10),
    (16, 14),
    (20, 18),
    # Thumb tip to the other fingertips (e.g., letters like G, L)
    (4, 12),
    (4, 16),
    (4, 20),
    # Palm width, base to base
    # NOTE(review): (2, 20) ends on landmark 20, which LANDMARKS_TIPS lists as
    # a fingertip rather than a finger base - confirm this pair is intended.
    (5, 17),
    (2, 20),
]

# Landmark index triples (a, vertex, b); the angle at the middle index becomes
# one "ang_<i>" feature each. The order is significant here as well.
LANDMARKS_TRIPLETS = [
    # Index finger
    (5, 6, 7),
    (6, 7, 8),
    # Middle finger
    (9, 10, 11),
    (10, 11, 12),
    # Ring finger
    (13, 14, 15),
    (14, 15, 16),
    # Pinky finger
    (17, 18, 19),
    (18, 19, 20),
    # Thumb
    (1, 2, 3),
    (2, 3, 4),
    # Palm structure
    (0, 5, 9),
    (0, 9, 13),
    (0, 13, 17),
]

# Fingertip indices and the joint each one is compared against when counting
# extended fingers (paired position by position).
LANDMARKS_TIPS = [4, 8, 12, 16, 20]
LANDMARKS_PIPS = [3, 6, 10, 14, 18]

# Initialize mediapipe

In [None]:
# Handles into the MediaPipe "hands" solution and its drawing helpers.
mp_hands = mp.solutions.hands
mp_draw = mp.solutions.drawing_utils

# Shared detector instance: static-image mode disabled, at most one hand
# reported per processed image.
hands = mp_hands.Hands(
    max_num_hands=1,
    static_image_mode=False,
)

# Get the coordinates and extract the features

In [None]:
def normalize_landmarks(points):
    """Make landmarks translation- and scale-independent.

    The points are shifted so that landmark 0 sits at the origin, then divided
    by the distance between landmarks 0 and 9. When that distance is zero the
    shifted points are returned without scaling.

    Args:
        points: Array of landmark coordinates indexed by landmark number.

    Returns:
        A new array with the same shape; the input is not modified.
    """
    shifted = points - points[0]

    reference_length = np.linalg.norm(shifted[0] - shifted[9])
    if reference_length == 0:
        # Degenerate hand (landmarks 0 and 9 coincide): skip the division.
        return shifted

    return shifted / reference_length

def pairwise_distances(landmarks, pairs):
    """Return the Euclidean distance for each requested landmark pair.

    Args:
        landmarks: Array of landmark coordinates indexed by landmark number.
        pairs: Iterable of (i, j) landmark index pairs.

    Returns:
        1-D array with one distance per pair, in the order of `pairs`.
    """
    return np.array(
        [np.linalg.norm(landmarks[first] - landmarks[second]) for first, second in pairs]
    )

def angles(landmarks, triples):
    """Return the angle, in degrees, at the middle landmark of each triple.

    For a triple (i, j, k) the angle is measured at landmark j between the
    vectors pointing from j to i and from j to k.

    Args:
        landmarks: Array of landmark coordinates indexed by landmark number.
        triples: Iterable of (i, j, k) landmark index triples.

    Returns:
        1-D array with one angle per triple, in the order of `triples`.
    """

    def _angle_at(first, vertex, last):
        arm_a = landmarks[first] - landmarks[vertex]
        arm_b = landmarks[last] - landmarks[vertex]
        # The small constant keeps the division finite for zero-length arms.
        norm_product = np.linalg.norm(arm_a) * np.linalg.norm(arm_b) + 1e-8
        # Clip before arccos so rounding cannot push the cosine outside [-1, 1].
        cosine = np.clip(np.dot(arm_a, arm_b) / norm_product, -1, 1)
        return np.degrees(np.arccos(cosine))

    return np.array([_angle_at(i, j, k) for i, j, k in triples])

def convex_hull_area(landmarks):
    """Return the area enclosed by the convex hull of the landmarks.

    SciPy names the hull measures after the 3-D case: for 2-D points
    `ConvexHull.area` is the hull *perimeter* and `ConvexHull.volume` is the
    enclosed area. This function therefore reads `volume` for 2-D input and
    keeps `area` (the surface area) for higher-dimensional input.

    Feature tables cached while this function still returned the 2-D
    perimeter hold a different quantity in their "area" column and should be
    regenerated.

    Args:
        landmarks: Array of point coordinates, one point per row.

    Returns:
        The enclosed area for 2-D points, the hull surface area otherwise.

    Raises:
        Whatever `scipy.spatial.ConvexHull` raises for input it cannot build
        a hull from (for example degenerate, collinear points).
    """
    hull = ConvexHull(landmarks)
    if hull.points.shape[1] == 2:
        return hull.volume
    return hull.area

def count_extended_fingers(landmarks, finger_tips, finger_pips):
    """Count fingers whose tip lies farther from landmark 0 than its paired joint.

    Args:
        landmarks: Array of landmark coordinates indexed by landmark number.
        finger_tips: Landmark indices of the fingertips.
        finger_pips: Landmark indices of the joints compared against, paired
            with `finger_tips` position by position.

    Returns:
        Number of (tip, joint) pairs where the tip is strictly farther from
        landmark 0 than the joint.
    """
    origin = landmarks[0]

    def _reach(index):
        return np.linalg.norm(landmarks[index] - origin)

    return sum(
        1
        for tip_idx, joint_idx in zip(finger_tips, finger_pips)
        if _reach(tip_idx) > _reach(joint_idx)
    )

def bounding_box_features(landmarks):
    """Return the axis-aligned bounding box size and aspect ratio of the points.

    Args:
        landmarks: Sequence of points; element 0 of each point is x and
            element 1 is y.

    Returns:
        Tuple (width, height, aspect_ratio) where aspect_ratio is
        width / (height + 1e-6).
    """

    def _extent(axis):
        values = [point[axis] for point in landmarks]
        return max(values) - min(values)

    box_width = _extent(0)
    box_height = _extent(1)
    # The small constant avoids dividing by zero for a flat (zero-height) box.
    ratio = box_width / (box_height + 1e-6)
    return box_width, box_height, ratio

def hand_orientation_angle(landmarks):
    """Return the direction of the vector from landmark 0 to landmark 9.

    Args:
        landmarks: Sequence of points indexed by landmark number.

    Returns:
        The angle `arctan2(dy, dx)` of that vector, in radians.
    """
    direction = np.subtract(landmarks[9], landmarks[0])
    return np.arctan2(direction[1], direction[0])

def _hand_to_sample(hand, img_w, img_h, label):
    """Turn one detected hand into a labelled feature row (a dict)."""
    # Landmark coordinates are scaled by the image size to get pixel positions.
    points = np.array([[lm.x * img_w, lm.y * img_h] for lm in hand.landmark])
    points = normalize_landmarks(points)

    # Key insertion order defines the CSV column order, so keep it stable:
    # label, dist_*, ang_*, area, angle, width, height, aspect_ratio, n_extended.
    sample = {"label": label}
    for i, val in enumerate(pairwise_distances(points, LANDMARKS_PAIRS)):
        sample[f"dist_{i}"] = val
    for i, val in enumerate(angles(points, LANDMARKS_TRIPLETS)):
        sample[f"ang_{i}"] = val

    width, height, aspect_ratio = bounding_box_features(points)
    sample["area"] = convex_hull_area(points)
    sample["angle"] = hand_orientation_angle(points)
    sample["width"] = width
    sample["height"] = height
    sample["aspect_ratio"] = aspect_ratio
    sample["n_extended"] = count_extended_fingers(points, LANDMARKS_TIPS, LANDMARKS_PIPS)
    return sample


def create_dataset():
    """Extract hand features from every dataset image and cache them as CSV.

    Each sub-folder of DATASET_DIR is treated as one label and each file in it
    as one image. Every detected hand produces one row; the table is written
    to DATASET_FILE. Entries that are not folders, files that OpenCV cannot
    decode, and images without a detected hand are skipped.
    """
    data = []

    # Use a dedicated detector in static-image mode: the dataset consists of
    # unrelated stills, so detection must run on every image instead of
    # tracking landmarks from the previous one as the video-mode detector does.
    with mp_hands.Hands(static_image_mode=True, max_num_hands=1) as detector:
        for label in tqdm(os.listdir(DATASET_DIR), desc="Reading and extracting", unit="label"):
            label_path = os.path.join(DATASET_DIR, label)
            if not os.path.isdir(label_path):
                # Stray files next to the label folders (e.g. OS metadata).
                continue

            for image in os.listdir(label_path):
                image_path = os.path.join(label_path, image)

                img = cv2.imread(image_path)
                if img is None:
                    # cv2.imread signals an unreadable/non-image file with None.
                    print(f"Warning: Could not read image {image_path}. Skipping...")
                    continue

                h, w = img.shape[:2]
                results = detector.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

                if not results.multi_hand_landmarks:
                    print(f"Warning: No hand landmarks found in {image_path}. Skipping...")
                    continue

                for hand in results.multi_hand_landmarks:
                    data.append(_hand_to_sample(hand, w, h, label))

    df = pd.DataFrame(data)
    df.to_csv(DATASET_FILE, index=False)
    
# Build the feature CSV only when no cached copy exists yet.
if os.path.exists(DATASET_FILE):
    print("Dataset file found, loading...")
else:
    print("Not found dataset file, creating...")
    create_dataset()

# Train and evaluate the models

In [None]:
# Load the cached feature table: one row per detected hand, "label" is the
# target column and every other column is a feature.
df = pd.read_csv(DATASET_FILE)

X = df.drop(columns=["label"])
y = df["label"]

# Encode the labels as integers 0..n_classes-1
le = LabelEncoder()
y_encoded = le.fit_transform(y)
class_names = le.classes_
n_classes = len(class_names)
class_ids = list(range(n_classes))

# Binarize the labels for the multiclass ROC
y_binarized = label_binarize(y_encoded, classes=range(n_classes))

# Split the data. Stratify whenever the data allows it so that every class is
# present in both splits; stratification needs at least two samples per class
# and a test split with at least one slot per class.
test_size = 0.2
can_stratify = (
    np.bincount(y_encoded).min() >= 2
    and int(len(y_encoded) * test_size) >= n_classes
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=test_size,
    random_state=42,
    stratify=y_encoded if can_stratify else None,
)
y_test_binarized = label_binarize(y_test, classes=class_ids)
if n_classes == 2:
    # label_binarize emits a single column for binary problems; expand it so
    # it lines up with the two columns returned by predict_proba.
    y_test_binarized = np.hstack([1 - y_test_binarized, y_test_binarized])

# Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=5000),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "SVM": SVC(probability=True),
    "KNN": KNeighborsClassifier(n_neighbors=6),
    "XGBoost": XGBClassifier(eval_metric='mlogloss', use_label_encoder=False)
}

# Name (key into `models`) and accuracy of the best model seen so far
best_accuracy = 0
best_model = None

# Train and evaluate
for name, model in models.items():
    print(f"\n🔍 Evaluando: {name}")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    probas = model.predict_proba(X_test)

    # Accuracy and report
    acc = accuracy_score(y_test, preds)

    # The `is None` check guarantees a model is selected even if every
    # accuracy is 0, so later lookups in `models` never receive None.
    if best_model is None or acc > best_accuracy:
        best_accuracy = acc
        best_model = name

    print(f"📊 Accuracy: {acc:.4f}")
    # Pass the full label list so the report and the matrix always cover every
    # class, even one that is absent from this test split.
    print(classification_report(y_test, preds, labels=class_ids, target_names=class_names))

    # Confusion matrix
    cm = confusion_matrix(y_test, preds, labels=class_ids)
    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=False, cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.title(f"{name} - Matriz de Confusión ({n_classes} clases)")
    plt.xlabel("Predicho")
    plt.ylabel("Real")
    plt.tight_layout()
    plt.show()

    # ROC curve, micro-averaged over all (sample, class) decisions
    fpr_micro, tpr_micro, _ = roc_curve(y_test_binarized.ravel(), probas.ravel())
    roc_auc_micro = auc(fpr_micro, tpr_micro)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr_micro, tpr_micro, label=f"Micro-average ROC (AUC = {roc_auc_micro:.2f})", color='blue')
    plt.plot([0, 1], [0, 1], 'k--', label='Random')
    plt.title(f"{name} - ROC Micro-average")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Predict with WebCam

In [None]:
# Live demo: read webcam frames, detect one hand, extract the same features
# used for training and overlay the best model's prediction on the frame.
# Press "q" in the preview window to stop.
cap = cv2.VideoCapture(0)

try:
    if not cap.isOpened():
        # Raising a bare string is itself a TypeError in Python 3; raise a
        # real exception carrying the intended message instead.
        raise RuntimeError("The camera couldn't be open")

    # Resolve the selected classifier once instead of on every frame.
    classifier = models[best_model]

    while True:
        ret, frame = cap.read()
        if not ret:
            print("Frame Error")
            break

        h, w, _ = frame.shape
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        results = hands.process(rgb_frame)

        if results.multi_hand_landmarks:
            for hand in results.multi_hand_landmarks:
                mp_draw.draw_landmarks(frame, hand, mp_hands.HAND_CONNECTIONS)

                # Landmark coordinates are scaled by the frame size to get
                # pixel positions, then normalized exactly as for training.
                points = np.array([[lm.x * w, lm.y * h] for lm in hand.landmark])
                points = normalize_landmarks(points)

                sample = {}
                for i, val in enumerate(pairwise_distances(points, LANDMARKS_PAIRS)):
                    sample[f"dist_{i}"] = val
                for i, val in enumerate(angles(points, LANDMARKS_TRIPLETS)):
                    sample[f"ang_{i}"] = val

                width, height, aspect_ratio = bounding_box_features(points)
                sample["area"] = convex_hull_area(points)
                sample["angle"] = hand_orientation_angle(points)
                sample["width"] = width
                sample["height"] = height
                sample["aspect_ratio"] = aspect_ratio
                sample["n_extended"] = count_extended_fingers(points, LANDMARKS_TIPS, LANDMARKS_PIPS)

                # Align the columns with the training matrix before predicting.
                sample_df = pd.DataFrame([sample])
                sample_df = sample_df.reindex(columns=X.columns, fill_value=0)
                pred = classifier.predict(sample_df)[0]
                pred_label = le.inverse_transform([pred])[0]
                cv2.putText(frame, f"Predicted: {pred_label}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        cv2.imshow('Webcam with Hand Landmarks', frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the preview window, including when
    # feature extraction or prediction raises mid-loop.
    cap.release()
    cv2.destroyAllWindows()