In [2]:
from module.data_processing import SignLanguageDataset

In [5]:
# Spot-check each dataset for inconsistent tensor shapes (first 20 samples plus up to 10 random ones)
import torch

def check_dataset_consistency(dataset_name, num_head=20, num_random=10):
    """Spot-check a dataset for samples whose tensors have differing shapes.

    Loads ``SignLanguageDataset(dataset_name)``, inspects the first
    ``num_head`` samples, prints the shape distribution seen so far, and then
    inspects up to ``num_random`` further samples drawn at random from the
    remainder of the dataset. Every newly encountered shape is reported as it
    is found.

    Args:
        dataset_name: Argument forwarded to ``SignLanguageDataset``.
        num_head: How many leading samples to inspect (default 20).
        num_random: Maximum number of additional random samples to inspect
            from the rest of the dataset (default 10).

    Returns:
        dict mapping each observed ``sample.shape`` to the number of
        *inspected* samples that had it (leading and random samples alike).
    """
    import random  # local import so the cell does not depend on another cell's imports

    dataset = SignLanguageDataset(dataset_name)
    total = len(dataset)
    print(f"Total samples in {dataset_name}: {total}")

    # shape -> number of inspected samples with that shape
    shapes = {}

    def record(index):
        # Tally one sample; announce the shape the first time it is seen.
        sample, label = dataset[index]
        shape = sample.shape
        if shape not in shapes:
            print(f"Found new shape: {shape} at index {index} with label {label}")
        shapes[shape] = shapes.get(shape, 0) + 1

    # Check first few samples
    head = min(total, max(num_head, 0))
    for i in range(head):
        record(i)

    # Reported before the random pass, so it describes the leading samples only.
    print(f"Shape distribution: {shapes}")

    # Check some random samples if dataset is large
    if total > head:
        extra = min(max(num_random, 0), total - head)
        for i in random.sample(range(head, total), extra):
            # Count every sampled item, not just unseen shapes, so the
            # returned tallies are consistent across both passes.
            record(i)

    return shapes

# Run the consistency check on each split in turn, announcing the split first.
shapes_by_split = {}
for banner, split_dir in (
    ("Checking training dataset:", "data_train"),
    ("\nChecking test dataset:", "data_test"),
):
    print(banner)
    shapes_by_split[split_dir] = check_dataset_consistency(split_dir)

# Keep the per-split results under their own names.
train_shapes = shapes_by_split["data_train"]
test_shapes = shapes_by_split["data_test"]

Checking training dataset:
Total samples in data_train: 36
Found new shape: torch.Size([30, 225]) at index 0 with label 0
Shape distribution: {torch.Size([30, 225]): 20}

Checking test dataset:
Total samples in data_test: 9
Found new shape: torch.Size([30, 225]) at index 0 with label 0
Shape distribution: {torch.Size([30, 225]): 9}


In [4]:
# Find all samples with shape [30, 126] and identify their paths
import os
import torch
import numpy as np

def find_samples_with_shape(dataset_path, target_shape):
    """Find every sequence folder whose stacked frames have ``target_shape``.

    Walks ``dataset_path/<action>/<sequence>/`` and, for each sequence folder
    containing ``.npy`` files, loads the files in sorted name order, stacks
    them and compares the resulting shape with ``target_shape``. Entries whose
    name starts with ``.`` and entries that are not directories are ignored at
    both the action and the sequence level.

    Args:
        dataset_path: Root directory of the dataset.
        target_shape: Shape to look for, as ``torch.Size``, tuple or list
            (number of frames first, then the per-frame shape).

    Returns:
        list of dicts with keys ``'path'``, ``'action'``, ``'sequence'``,
        ``'shape'`` (a ``torch.Size``) and ``'num_frames'``, ordered by action
        name and then sequence name (both sorted lexicographically).
    """
    # Normalise once so torch.Size, tuple and list targets all compare equal.
    wanted_shape = tuple(target_shape)
    found_samples = []

    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for action in sorted(os.listdir(dataset_path)):
        if action.startswith('.'):
            continue

        action_path = os.path.join(dataset_path, action)
        if not os.path.isdir(action_path):
            continue

        for sequence in sorted(os.listdir(action_path)):
            if sequence.startswith('.'):
                continue

            sequence_path = os.path.join(action_path, sequence)
            if not os.path.isdir(sequence_path):
                continue

            # Only folders that actually contain .npy frames count as sequences.
            npy_files = sorted(f for f in os.listdir(sequence_path) if f.endswith('.npy'))
            if not npy_files:
                continue

            frames = [np.load(os.path.join(sequence_path, frame_file)) for frame_file in npy_files]

            # Frames of differing shapes cannot be stacked into one tensor, so
            # the sequence cannot match any target shape. Report it and move on
            # instead of letting the stacking error abort the whole scan.
            if len({frame.shape for frame in frames}) > 1:
                print(f"Skipping {sequence_path}: frames have differing shapes")
                continue

            sequence_data = torch.tensor(np.array(frames))

            if tuple(sequence_data.shape) == wanted_shape:
                found_samples.append({
                    'path': sequence_path,
                    'action': action,
                    'sequence': sequence,
                    'shape': sequence_data.shape,
                    'num_frames': len(npy_files)
                })

    return found_samples

# Find samples with shape [30, 126] in both train and test datasets
target_shape = torch.Size([30, 126])

print("Finding samples with shape", target_shape, "in training data:")
train_samples = find_samples_with_shape("data_train", target_shape)
for idx, sample in enumerate(train_samples):
    print(f"{idx+1}. {sample['action']}/{sample['sequence']} - Shape: {sample['shape']}, Frames: {sample['num_frames']}")

print("\nFinding samples with shape", target_shape, "in test data:")
test_samples = find_samples_with_shape("data_test", target_shape)
for idx, sample in enumerate(test_samples):
    print(f"{idx+1}. {sample['action']}/{sample['sequence']} - Shape: {sample['shape']}, Frames: {sample['num_frames']}")

# Check one of these samples to understand what's different
if train_samples or test_samples:
    # Prefer a training sample; fall back to a test sample when there is none.
    sample_path = train_samples[0]['path'] if train_samples else test_samples[0]['path']
    # Restrict to .npy frames: the folder may also hold other entries (for
    # example hidden files, which sort before digit-named frames) that
    # np.load cannot read. Every returned sample has at least one .npy file.
    frame_files = sorted(f for f in os.listdir(sample_path) if f.endswith('.npy'))
    frame_file = frame_files[0]  # First frame
    frame_path = os.path.join(sample_path, frame_file)
    frame_data = np.load(frame_path)

    print(f"\nAnalyzing sample frame from: {frame_path}")
    print(f"Frame shape: {frame_data.shape}")
    print(f"Data type: {frame_data.dtype}")

    # Check if we have less data points than expected
    if len(frame_data) == 126:
        print("This frame has exactly 126 data points (likely missing pose data)")
        # Split the 126 values into two 63-value halves, reported as left/right hand.
        left_hand = frame_data[:63]
        right_hand = frame_data[63:126]
        print(f"Left hand data points: {len(left_hand)}")
        print(f"Right hand data points: {len(right_hand)}")

Finding samples with shape torch.Size([30, 126]) in training data:

Finding samples with shape torch.Size([30, 126]) in test data:
