In [1]:
!pip -q install mediapipe opencv-python-headless pandas numpy tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.6/35.6 MB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.0/50.0 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
opentelemetry-proto 1.37.0 requires protobuf<7.0,>=5.0, but you have protobuf 4.25.8 which is incompatible.
a2a-sdk 0.3.10 requires protobuf>=5.29.5, but you have protobuf 4.25.8 whic

In [2]:
import os, json
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm

import mediapipe as mp

# =========================
# CONFIG
# =========================
DATA_ROOT = "/kaggle/input/tunisian-sign-language-dataset/(First ever) Tunisian Sign Language Dataset/Data"  # <-- change this
OUTPUT_CSV = "dataset_keypoints.csv"
OUTPUT_LABELS = "labels.json"
BAD_IMAGES_LOG = "bad_images.txt"

IMG_EXTS = (".jpg", ".jpeg", ".png", ".bmp", ".webp")


# =========================
# NORMALIZATION
# =========================
def normalize_landmarks(landmarks_xyz: np.ndarray) -> np.ndarray:
    """
    landmarks_xyz: shape (21, 3) in MediaPipe normalized coords [0..1] (x,y) and z ~ relative.
    Steps:
      1) center by wrist (index 0)
      2) scale by distance wrist->index_mcp (index 5) to make it size-invariant
    returns flattened vector shape (63,)
    """
    pts = landmarks_xyz.copy()

    wrist = pts[0]              # (x,y,z)
    pts = pts - wrist           # center

    ref = pts[5]                # index_mcp relative to wrist
    scale = np.linalg.norm(ref[:2]) + 1e-8  # use x,y distance (more stable)
    pts = pts / scale

    return pts.reshape(-1)      # (63,)


# =========================
# MEDIAPIPE INIT
# =========================
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=True,     # for images
    max_num_hands=1,            # 1 hand for now
    model_complexity=1,
    min_detection_confidence=0.5
)


# =========================
# SCAN CLASSES
# =========================
def list_classes_and_images(data_root: str):
    """
    Finds all leaf folders (words) under themes.
    Expected structure:
      Data/
        Demandes/
          oui/ img...
          non/ img...
        Jours/
          ...
    Returns list of (label, img_path)
    """
    samples = []
    for theme in sorted(os.listdir(data_root)):
        theme_path = os.path.join(data_root, theme)
        if not os.path.isdir(theme_path):
            continue

        for label in sorted(os.listdir(theme_path)):
            label_path = os.path.join(theme_path, label)
            if not os.path.isdir(label_path):
                continue

            for fn in os.listdir(label_path):
                if fn.lower().endswith(IMG_EXTS):
                    samples.append((label, os.path.join(label_path, fn)))

    return samples


samples = list_classes_and_images(DATA_ROOT)
print("Total images found:", len(samples))
print("Example:", samples[:3])


# =========================
# BUILD DATASET
# =========================
rows = []
bad = []

for label, path in tqdm(samples):
    img = cv2.imread(path)
    if img is None:
        bad.append(path)
        continue

    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    res = hands.process(img_rgb)

    if not res.multi_hand_landmarks:
        bad.append(path)
        continue

    lm = res.multi_hand_landmarks[0].landmark
    landmarks_xyz = np.array([[p.x, p.y, p.z] for p in lm], dtype=np.float32)  # (21,3)

    feat = normalize_landmarks(landmarks_xyz)  # (63,)
    row = {"label": label, "path": path}
    for i in range(63):
        row[f"f{i}"] = float(feat[i])
    rows.append(row)

df = pd.DataFrame(rows)
print("✅ Valid samples:", len(df))
print("❌ Bad samples:", len(bad))
display(df.head())


# =========================
# SAVE OUTPUTS
# =========================
df.to_csv(OUTPUT_CSV, index=False)

labels = sorted(df["label"].unique().tolist())
with open(OUTPUT_LABELS, "w", encoding="utf-8") as f:
    json.dump(labels, f, ensure_ascii=False, indent=2)

with open(BAD_IMAGES_LOG, "w", encoding="utf-8") as f:
    for p in bad:
        f.write(p + "\n")

print("Saved:", OUTPUT_CSV, OUTPUT_LABELS, BAD_IMAGES_LOG)
print("Number of classes:", len(labels))
print("Classes:", labels[:20], "..." if len(labels) > 20 else "")


2025-12-15 00:24:19.919337: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765758260.108822      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765758260.159073      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1765758272.216889     125 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1765758272.244352     125 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Total images found: 4423
Example: [('3aslema', '/kaggle/input/tunisian-sign-language-dataset/(First ever) Tunisian Sign Language Dataset/Data/Demandes/3aslema/3aslema(1).jpg'), ('3aslema', '/kaggle/input/tunisian-sign-language-dataset/(First ever) Tunisian Sign Language Dataset/Data/Demandes/3aslema/3aslema(18).jpg'), ('3aslema', '/kaggle/input/tunisian-sign-language-dataset/(First ever) Tunisian Sign Language Dataset/Data/Demandes/3aslema/3aslema(33).jpg')]


  0%|          | 0/4423 [00:00<?, ?it/s]W0000 00:00:1765758272.610819     124 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.
100%|██████████| 4423/4423 [02:02<00:00, 35.97it/s]

✅ Valid samples: 3871
❌ Bad samples: 552





Unnamed: 0,label,path,f0,f1,f2,f3,f4,f5,f6,f7,...,f53,f54,f55,f56,f57,f58,f59,f60,f61,f62
0,3aslema,/kaggle/input/tunisian-sign-language-dataset/(...,0.0,0.0,0.0,0.25894,-0.555474,-0.070004,0.825894,-0.839742,...,-0.61744,1.478223,0.401342,-0.597795,1.345782,0.35683,-0.453064,1.097425,0.37191,-0.358269
1,3aslema,/kaggle/input/tunisian-sign-language-dataset/(...,0.0,0.0,0.0,-0.31256,-0.357951,-0.1135,-0.729675,-0.53253,...,-0.18846,-1.120789,0.71018,-0.25459,-1.371996,0.749078,-0.289462,-1.576172,0.76745,-0.308617
2,3aslema,/kaggle/input/tunisian-sign-language-dataset/(...,0.0,0.0,0.0,0.332884,-0.300327,-0.034415,0.78703,-0.340908,...,-0.137908,0.840609,0.864979,-0.145318,1.026104,0.952996,-0.105731,1.160058,1.020696,-0.07053
3,3aslema,/kaggle/input/tunisian-sign-language-dataset/(...,0.0,0.0,0.0,0.345207,-0.301865,-0.040079,0.808136,-0.330393,...,-0.119071,0.839767,0.889445,-0.129981,1.030681,0.982355,-0.092482,1.170667,1.052416,-0.058343
4,3aslema,/kaggle/input/tunisian-sign-language-dataset/(...,0.0,0.0,0.0,0.325659,-0.30426,-0.054396,0.794374,-0.34783,...,-0.113627,0.822273,0.868083,-0.117329,1.013579,0.954293,-0.082376,1.153289,1.022712,-0.052568


Saved: dataset_keypoints.csv labels.json bad_images.txt
Number of classes: 57
Classes: ['3aslema', '3ayla', '5adamet', '5al-3am', '5mis', '5ou', 'a7ad', 'assam', 'baladya', 'banka', 'barnamjk', 'bent', 'bou', 'bousta', 'car', 'chabeb', 'cv', 'dar', 'demande', 'eben'] ...


In [3]:
import pandas as pd

df = pd.read_csv("dataset_keypoints.csv")
print(df.shape)                 # (N, 65) -> label + path + 63 features
print(df["label"].nunique())    # nb de mots
print(df["label"].value_counts().head(10))  # distribution


(3871, 65)
57
label
5al-3am    184
5ou        183
eben       159
bent       156
o5t        143
bou        137
jad        131
mar2a      124
se7a       116
baladya    109
Name: count, dtype: int64
