# Sign Language Dataset Exploration

This notebook explores sign language datasets used in the LinguaSign project. We'll visualize the data, extract statistics, and gain insights into the structure of the datasets.

## Setup

First, let's import the necessary libraries and set up our environment.

In [None]:
# Add parent directory to path for imports
import sys
import os
sys.path.append(os.path.dirname(os.getcwd()))

# Standard libraries
import json
import pickle
import random
from pathlib import Path

# Data processing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

# Computer vision libraries
import cv2
import mediapipe as mp

# Set plotting style
# Apply matplotlib's 'fivethirtyeight' style sheet and make 'viridis' the
# default seaborn colour palette for every figure drawn below.
plt.style.use('fivethirtyeight')
sns.set_palette('viridis')
# IPython magic (not plain Python): render figures inline in the notebook.
# This line only works when the cell is run by Jupyter/IPython.
%matplotlib inline

# Set random seed for reproducibility
# Seeds Python's `random` module, which the cells below use for
# random.choice / random.sample.
random.seed(42)
# Seeds NumPy's global random state as well, so any NumPy-based sampling
# added later is also repeatable.
np.random.seed(42)

## Dataset Configuration

Set up paths to the datasets we want to explore.

In [None]:
# Locations of the raw datasets, expressed relative to this notebook
DATASETS_DIR = Path('../datasets/raw')
WLASL_DIR = DATASETS_DIR.joinpath('wlasl')
PHOENIX_DIR = DATASETS_DIR.joinpath('phoenix')

# Report whether each dataset folder is present on disk
wlasl_found = WLASL_DIR.exists()
phoenix_found = PHOENIX_DIR.exists()
print(f"WLASL dataset exists: {wlasl_found}")
print(f"PHOENIX dataset exists: {phoenix_found}")

If either check above prints `False`, download the missing dataset first using the scripts in the `datasets/download_scripts` directory.

## Exploring WLASL Dataset

Let's explore the WLASL (Word-Level American Sign Language) dataset.

In [None]:
# Load WLASL metadata
# Sort the matches so the same file is chosen on every run if several
# WLASL_*.json files are present; glob itself yields them in arbitrary order.
wlasl_json_candidates = sorted(WLASL_DIR.glob('WLASL_*.json'))
if not wlasl_json_candidates:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No WLASL_*.json metadata file found in {WLASL_DIR}. "
        "Download the dataset first."
    )
wlasl_json_path = wlasl_json_candidates[0]
with open(wlasl_json_path, 'r', encoding='utf-8') as f:
    # The code below treats this as a list of entries, each with a 'gloss'
    # label and an 'instances' list holding one item per video.
    wlasl_data = json.load(f)

print(f"Number of glosses: {len(wlasl_data)}")

# (gloss, number of videos) pairs, most frequent first
gloss_counts = [(item['gloss'], len(item['instances'])) for item in wlasl_data]
gloss_counts.sort(key=lambda x: x[1], reverse=True)

# Count total videos (reuses the per-gloss counts computed above)
total_videos = sum(count for _, count in gloss_counts)
print(f"Total number of videos: {total_videos}")

# Get the top 10 glosses with the most videos
top_glosses = gloss_counts[:10]

# Plot the top glosses
if top_glosses:
    top_labels = [gloss for gloss, _ in top_glosses]
    top_values = [count for _, count in top_glosses]
    plt.figure(figsize=(12, 6))
    sns.barplot(x=top_labels, y=top_values)
    plt.title('Top 10 Glosses by Number of Videos')
    plt.xlabel('Gloss')
    plt.ylabel('Number of Videos')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    print("Metadata file contains no glosses; nothing to plot.")

Let's look at the distribution of videos per gloss.

In [None]:
# Number of video instances recorded for each gloss entry
video_counts = [len(entry['instances']) for entry in wlasl_data]

# Histogram of how many glosses have a given number of videos
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(video_counts, bins=50)
ax.set_title('Distribution of Videos per Gloss')
ax.set_xlabel('Number of Videos')
ax.set_ylabel('Number of Glosses')
ax.grid(True)
plt.show()

# Summary statistics of the per-gloss video counts
mean_videos = np.mean(video_counts)
median_videos = np.median(video_counts)
fewest_videos = np.min(video_counts)
most_videos = np.max(video_counts)
print(f"Mean videos per gloss: {mean_videos:.2f}")
print(f"Median videos per gloss: {median_videos}")
print(f"Min videos per gloss: {fewest_videos}")
print(f"Max videos per gloss: {most_videos}")

## Visualizing Sign Language Data

Let's visualize some videos from the dataset to better understand the data.

In [None]:
# Initialize MediaPipe solutions
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

def visualize_video(video_path, max_frames=10):
    """Plot evenly spaced frames of a video with MediaPipe landmarks overlaid.

    Prints the video's dimensions, FPS and frame count, samples up to
    ``max_frames`` frames spread evenly between the first and last frame,
    runs MediaPipe Holistic on each one, draws any detected pose and hand
    landmarks, and shows the annotated frames in a grid of at most five
    columns.

    Args:
        video_path: Path of the video file, passed to ``cv2.VideoCapture``.
        max_frames: Maximum number of frames to sample and display.

    Returns:
        None. An unopenable video or a video with no readable frames is
        reported with a printed message rather than an exception.
    """
    # Open the video file
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Could not open video {video_path}")
        return

    # (frame index, annotated RGB image) pairs. The index is stored with the
    # image so titles stay correct even when some reads fail and are skipped.
    annotated_frames = []
    try:
        # Get video properties
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)  # kept as float: e.g. 29.97 is common
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        print(f"Video properties:")
        print(f"  - Dimensions: {frame_width}x{frame_height}")
        print(f"  - FPS: {fps:.2f}")
        print(f"  - Total frames: {frame_count}")

        # Never ask for more samples than there are frames (that would yield
        # duplicate indices), and skip sampling when the container reports
        # no frames (linspace would otherwise produce negative indices).
        num_samples = min(max_frames, frame_count)
        if num_samples > 0:
            frame_indices = np.linspace(0, frame_count - 1, num_samples, dtype=int)

            # Initialize MediaPipe Holistic
            with mp_holistic.Holistic(min_detection_confidence=0.5,
                                      min_tracking_confidence=0.5) as holistic:
                for idx in frame_indices:
                    # Seek to the sampled frame and decode it
                    cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
                    success, image = cap.read()
                    if not success:
                        continue

                    # OpenCV decodes to BGR; MediaPipe and matplotlib use RGB
                    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

                    # Process image with MediaPipe
                    results = holistic.process(image_rgb)

                    # Draw on a copy so the processed input stays untouched
                    annotated_image = image_rgb.copy()
                    landmark_sets = (
                        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS),
                        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
                        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
                    )
                    for landmark_list, connections in landmark_sets:
                        if landmark_list:
                            mp_drawing.draw_landmarks(
                                annotated_image, landmark_list, connections
                            )

                    annotated_frames.append((int(idx), annotated_image))
    finally:
        # Release the capture even if decoding or MediaPipe raised
        cap.release()

    if not annotated_frames:
        print("No frames could be read from the video.")
        return

    # Size the grid to the frames actually decoded (up to 5 per row) so that
    # nothing is silently dropped when max_frames > 10.
    n_frames = len(annotated_frames)
    n_cols = min(5, n_frames)
    n_rows = (n_frames + n_cols - 1) // n_cols
    _, axes = plt.subplots(
        n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows), squeeze=False
    )
    axes = axes.flatten()

    for ax, (frame_idx, frame) in zip(axes, annotated_frames):
        ax.imshow(frame)
        ax.set_title(f"Frame {frame_idx}")
        ax.axis('off')

    # Hide any unused cells in the last row
    for ax in axes[n_frames:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
# Draw one gloss entry at random, then one of its video instances
random_gloss = random.choice(wlasl_data)
random_instance = random.choice(random_gloss['instances'])
video_id = random_instance['video_id']
video_path = WLASL_DIR.joinpath('videos', f"{video_id}.mp4")

print(f"Selected gloss: {random_gloss['gloss']}")
print(f"Video ID: {video_id}")

# Only attempt the visualization when the video file is actually on disk
if not video_path.exists():
    print(f"Video file not found: {video_path}")
else:
    visualize_video(str(video_path))

## Exploring Processed Features

Now let's look at the processed features that are used for model training.

In [None]:
# Path to processed data
PROCESSED_DIR = Path('../datasets/processed/wlasl')
LANDMARKS_DIR = PROCESSED_DIR / 'landmarks'

# Always define landmark_files (empty when nothing is available) so later
# cells can rely on it. Sorting makes the seeded random picks reproducible,
# since glob returns files in arbitrary filesystem order.
landmark_files = sorted(LANDMARKS_DIR.glob('*.pkl')) if LANDMARKS_DIR.exists() else []

# Check if processed data exists
if not LANDMARKS_DIR.exists():
    print("Processed landmarks directory not found. Please run preprocessing scripts first.")
elif not landmark_files:
    # The directory exists but is empty; random.choice would raise IndexError.
    print(f"No landmark files (*.pkl) found in {LANDMARKS_DIR}. Please run preprocessing scripts first.")
else:
    # Count landmark files
    print(f"Number of landmark files: {len(landmark_files)}")

    # Load a random landmark file
    random_landmark_file = random.choice(landmark_files)
    # NOTE: pickle.load can execute arbitrary code; only open landmark files
    # produced by this project's own preprocessing scripts.
    with open(random_landmark_file, 'rb') as f:
        landmarks = pickle.load(f)

    print(f"Selected landmark file: {random_landmark_file.name}")
    print(f"Number of frames: {len(landmarks)}")

    if landmarks:
        # The first frame is used as a representative of the per-frame layout
        first_frame = landmarks[0]

        # Print the keys in the landmarks
        print(f"Keys in landmarks: {first_frame.keys()}")

        # Show shapes of different landmarks (None marks a missing detection).
        # np.shape also copes with values that are not NumPy arrays.
        for key, value in first_frame.items():
            if value is not None:
                print(f"{key} shape: {np.shape(value)}")
    else:
        print("Selected landmark file contains no frames.")

## Visualizing Hand Landmarks

Let's visualize the hand landmarks for a better understanding of the data.

In [None]:
def _plot_single_hand(ax, hand, title):
    """Draw one hand's landmark points and connecting skeleton on a 3D axes."""
    x = hand[:, 0]
    y = hand[:, 1]
    z = hand[:, 2]
    # Colour points by landmark index so individual joints can be told apart
    ax.scatter(x, y, z, c=range(len(x)), cmap='viridis')

    # Connect landmarks according to MediaPipe hand connections
    for start_idx, end_idx in mp_holistic.HAND_CONNECTIONS:
        ax.plot([x[start_idx], x[end_idx]],
                [y[start_idx], y[end_idx]],
                [z[start_idx], z[end_idx]], 'k-')

    ax.set_title(title)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')


def plot_hand_landmarks(landmarks, frame_idx=0):
    """Plot the hand landmarks of one frame in 3D.

    The left hand is drawn in the left subplot and the right hand in the
    right subplot; a hand that was not detected leaves its subplot out.

    Args:
        landmarks: Sequence of per-frame mappings. Each frame's 'left_hand'
            and 'right_hand' entries are either None or an array indexed as
            ``[:, 0]``, ``[:, 1]``, ``[:, 2]`` for x, y and z.
        frame_idx: Index of the frame in ``landmarks`` to plot.

    Returns:
        None. Prints a message and returns early when neither hand is
        present in the frame.
    """
    frame = landmarks[frame_idx]
    left_hand = frame.get('left_hand')
    right_hand = frame.get('right_hand')

    # Check if hand landmarks exist
    if left_hand is None and right_hand is None:
        print("No hand landmarks found in this frame.")
        return

    fig = plt.figure(figsize=(10, 5))

    # Fixed subplot slots keep each hand on its own side of the figure even
    # when only one of them is available.
    hands = ((121, left_hand, 'Left Hand'), (122, right_hand, 'Right Hand'))
    for position, hand, title in hands:
        if hand is not None:
            _plot_single_hand(fig.add_subplot(position, projection='3d'), hand, title)

    plt.tight_layout()
    plt.show()

In [None]:
# Plot hand landmarks for a random frame
# `landmarks` only exists if the processed-features cell found and loaded a file.
if 'landmarks' in locals():
    # Frames where both hands were detected, and frames with at least one hand
    both_hands_frames = [i for i, frame in enumerate(landmarks)
                         if frame['left_hand'] is not None and frame['right_hand'] is not None]
    any_hand_frames = [i for i, frame in enumerate(landmarks)
                       if frame['left_hand'] is not None or frame['right_hand'] is not None]

    # Get a frame index where both hands are visible if possible; otherwise
    # fall back to any frame that shows at least one hand.
    valid_frames = both_hands_frames or any_hand_frames

    if valid_frames:
        frame_idx = random.choice(valid_frames)
        print(f"Plotting hand landmarks for frame {frame_idx}")
        plot_hand_landmarks(landmarks, frame_idx)
    else:
        print("No frames with hand landmarks found.")

## Dataset Statistics

Let's compute some statistics about the dataset.

In [None]:
# If processed data exists, compute statistics
if LANDMARKS_DIR.exists():
    # Collect statistics from a sample of landmark files. Sorting first makes
    # the seeded sample reproducible regardless of filesystem listing order.
    sample_size = min(100, len(landmark_files))
    sample_files = random.sample(sorted(landmark_files), sample_size)

    # Statistics to collect: frames per file, plus the per-file FRACTION of
    # frames with each hand-detection outcome.
    frame_counts = []
    left_hand_counts = []
    right_hand_counts = []
    both_hands_counts = []
    no_hands_counts = []
    skipped_empty = 0

    # Collect statistics
    for file in tqdm(sample_files, desc="Computing statistics"):
        # NOTE: pickle.load can execute arbitrary code; only open landmark
        # files produced by this project's own preprocessing scripts.
        with open(file, 'rb') as f:
            lm = pickle.load(f)

        n_frames = len(lm)
        if n_frames == 0:
            # Detection fractions are undefined for a file without frames
            # (this used to raise ZeroDivisionError), so leave it out.
            skipped_empty += 1
            continue

        # Count hands in a single pass over the frames
        left = right = both = neither = 0
        for frame in lm:
            has_left = frame['left_hand'] is not None
            has_right = frame['right_hand'] is not None
            left += has_left
            right += has_right
            both += has_left and has_right
            neither += not (has_left or has_right)

        frame_counts.append(n_frames)
        left_hand_counts.append(left / n_frames)
        right_hand_counts.append(right / n_frames)
        both_hands_counts.append(both / n_frames)
        no_hands_counts.append(neither / n_frames)

    if skipped_empty:
        print(f"Skipped {skipped_empty} landmark file(s) with no frames.")

    if not frame_counts:
        # np.min / np.max raise on empty input, so stop here.
        print("No non-empty landmark files available; cannot compute statistics.")
    else:
        # Compute statistics
        print(f"Frame count statistics:")
        print(f"  - Mean: {np.mean(frame_counts):.2f}")
        print(f"  - Median: {np.median(frame_counts)}")
        print(f"  - Min: {np.min(frame_counts)}")
        print(f"  - Max: {np.max(frame_counts)}")

        print(f"\nHand detection statistics (percentage of frames):")
        print(f"  - Left hand: {np.mean(left_hand_counts) * 100:.2f}%")
        print(f"  - Right hand: {np.mean(right_hand_counts) * 100:.2f}%")
        print(f"  - Both hands: {np.mean(both_hands_counts) * 100:.2f}%")
        print(f"  - No hands: {np.mean(no_hands_counts) * 100:.2f}%")

        # Plot frame count distribution
        plt.figure(figsize=(10, 5))
        plt.hist(frame_counts, bins=20)
        plt.title('Distribution of Frame Counts')
        plt.xlabel('Number of Frames')
        plt.ylabel('Number of Videos')
        plt.grid(True)
        plt.show()

## Conclusion

In this notebook, we've explored the WLASL dataset and visualized sign language data. We've gained insights into the structure of the dataset, the distribution of videos per gloss, and the characteristics of the processed features.

Key findings:
- The dataset contains a diverse set of sign language gestures
- MediaPipe effectively extracts hand and body landmarks
- There is variability in the number of frames per video
- Hand detection is not perfect, with some frames missing hand landmarks

These insights will help us design better models for sign language recognition and translation.