In [None]:
import zipfile

# Location of the Phoenix-2014 video archive and the folder it is unpacked into.
zip_file = "/nas/Dataset/Phoenix/phoenix-2014-videos.zip"
extract_dir = "extracted_videos"  # Destination folder; change as needed

with zipfile.ZipFile(zip_file, mode='r') as zip_ref:
    # Show every member of the archive before unpacking it
    print("Files in the zip:")
    for filename in zip_ref.namelist():
        print(filename)

    # Unpack the whole archive into extract_dir
    zip_ref.extractall(path=extract_dir)
    print(f"Videos extracted to: {extract_dir}")

In [None]:
import os
import gzip
import pickle
import numpy as np
import pandas as pd

def read_phoenix_split(split_path):
    """
    Load one Phoenix-2014 split stored as a gzip-compressed pickle.

    Any failure (missing file, bad gzip/pickle data, an object without a
    length) is reported on stdout instead of being raised.

    Args:
        split_path (str): Path to the gzipped split file

    Returns:
        The unpickled object (expected to be a list of sample dictionaries),
        or None if anything went wrong while reading it
    """
    # NOTE: pickle can execute arbitrary code while loading; only use this on
    # split files that come from a trusted source.
    try:
        with gzip.open(split_path, 'rb') as f:
            data = pickle.load(f)
        # Kept inside the try: an object without len() is treated as an error
        print(f"Successfully loaded split file: {len(data)} samples")
    except Exception as e:
        print(f"Error reading split file {split_path}: {str(e)}")
        return None
    else:
        return data

def analyze_split_data(split_data):
    """
    Summarise the content of one dataset split.

    Args:
        split_data (list): List of dictionaries containing sample information

    Returns:
        dict: Statistics about the split, or None when split_data is empty
        or None. Keys:
            - 'num_samples': number of samples
            - 'fields_available': union of the keys found in all samples
            - 'sample_lengths': 'num_frames' values of the samples that have one
            - 'glosses': set of distinct 'gloss' values
            - 'signers': set of distinct 'signer' values, or None when no
              sample has a 'signer' field
            - 'avg_length' / 'min_length' / 'max_length': only present when
              at least one sample has 'num_frames'
    """
    if not split_data:
        return None

    # Decide from ALL samples whether signer information exists. Looking only
    # at the first sample would leave 'signers' as None and crash on .add()
    # as soon as a later sample carries a 'signer' field.
    has_signer = any('signer' in sample for sample in split_data)

    stats = {
        'num_samples': len(split_data),
        'fields_available': set().union(*(sample.keys() for sample in split_data)),
        'sample_lengths': [],
        'glosses': set(),
        'signers': set() if has_signer else None,
    }

    for sample in split_data:
        if 'num_frames' in sample:
            stats['sample_lengths'].append(sample['num_frames'])
        if 'gloss' in sample:
            stats['glosses'].add(sample['gloss'])
        if 'signer' in sample:
            stats['signers'].add(sample['signer'])

    # Length statistics are only meaningful when some lengths were collected
    if stats['sample_lengths']:
        stats['avg_length'] = np.mean(stats['sample_lengths'])
        stats['min_length'] = np.min(stats['sample_lengths'])
        stats['max_length'] = np.max(stats['sample_lengths'])

    return stats

def analyze_keypoints(keypoints):
    """
    Describe a keypoint collection using its first entry as a representative.

    Args:
        keypoints (dict): Dictionary containing keypoint data, keyed by sample

    Returns:
        dict: Always holds 'num_samples' and 'sample_key_format' (the first
        key). When the first entry has a 'keypoints' array, the array's
        'shape', its first three axis sizes ('num_frames', 'num_keypoints',
        'dimensions') and its 'min_value' / 'max_value' / 'mean_value' are
        added as well.
    """
    sample_count = len(keypoints)
    first_key = list(keypoints.keys())[0]
    stats = {
        'num_samples': sample_count,
        'sample_key_format': first_key,
    }

    # Only the first sample is inspected
    first_entry = keypoints[first_key]
    if 'keypoints' not in first_entry:
        return stats

    kp = first_entry['keypoints']
    stats.update(
        shape=kp.shape,
        num_frames=kp.shape[0],
        num_keypoints=kp.shape[1],
        dimensions=kp.shape[2],
        min_value=float(np.min(kp)),
        max_value=float(np.max(kp)),
        mean_value=float(np.mean(kp)),
    )
    return stats

def load_phoenix_dataset(config):
    """
    Load complete Phoenix dataset including splits and keypoints

    Split files that do not exist are skipped with a warning. A split that
    exists but cannot be read is stored as None (the value returned by
    read_phoenix_split), and so is its analysis entry.

    Reading the keypoint file and analysing it are handled separately: a
    failure while analysing no longer discards keypoints that were loaded
    successfully, it only leaves 'keypoints' out of the analysis dictionary.

    Args:
        config (dict): Configuration dictionary containing paths
            Required keys:
            - train: path to train split file
            - dev: path to dev split file
            - test: path to test split file
            - keypoint_file: path to keypoints pickle file

    Returns:
        tuple: (splits_dict, keypoints_dict, analysis_dict); keypoints_dict
        is None when the keypoint file could not be read
    """
    splits = {}
    analysis = {'splits': {}}

    # Read each split file
    for split_name in ['train', 'dev', 'test']:
        split_path = config[split_name]
        if not os.path.exists(split_path):
            print(f"Warning: Split file not found at {split_path}")
            continue
        splits[split_name] = read_phoenix_split(split_path)
        analysis['splits'][split_name] = analyze_split_data(splits[split_name])

    # Read keypoints.
    # NOTE: pickle can execute arbitrary code while loading; only point
    # 'keypoint_file' at data from a trusted source.
    try:
        with open(config['keypoint_file'], 'rb') as f:
            keypoints = pickle.load(f)
    except Exception as e:
        print(f"Error reading keypoint file: {str(e)}")
        return splits, None, analysis

    # Analysis is best-effort: report a problem but keep the loaded data
    try:
        print(f"Successfully loaded keypoints for {len(keypoints)} samples")
        analysis['keypoints'] = analyze_keypoints(keypoints)
    except Exception as e:
        print(f"Error analyzing keypoints: {str(e)}")

    return splits, keypoints, analysis

# Example usage: load every split plus the keypoints and print a summary.
if __name__ == "__main__":
    base_path = "data/phoenix-2014"  # Adjust this to your actual base path
    config = {
        'train': os.path.join(base_path, "phoenix-2014.train"),
        'dev': os.path.join(base_path, "phoenix-2014.dev"),
        'test': os.path.join(base_path, "phoenix-2014.test"),
        'keypoint_file': '/nas/Dataset/Phoenix/phoenix-2014-keypoints.pkl'
    }

    splits, keypoints, analysis = load_phoenix_dataset(config)

    # Print detailed analysis
    print("\nDATASET ANALYSIS:")
    print("=" * 50)

    # Split analysis (entries are None for splits that failed to load)
    for split_name, stats in analysis['splits'].items():
        if stats:
            print(f"\n{split_name.upper()} Split Statistics:")
            print(f"Number of samples: {stats['num_samples']}")
            print(f"Available fields: {', '.join(sorted(stats['fields_available']))}")
            if stats.get('avg_length'):
                print(f"Average sequence length: {stats['avg_length']:.1f} frames")
                print(f"Sequence length range: {stats['min_length']} - {stats['max_length']} frames")
            if stats['glosses']:
                print(f"Number of unique glosses: {len(stats['glosses'])}")
            if stats['signers']:
                print(f"Number of unique signers: {len(stats['signers'])}")

    # Keypoint analysis
    if 'keypoints' in analysis:
        kp_stats = analysis['keypoints']
        print("\nKEYPOINT Statistics:")
        print(f"Number of samples: {kp_stats['num_samples']}")
        # Shape and value statistics are only present when the inspected
        # sample had a 'keypoints' array; indexing them blindly would raise
        # KeyError.
        if 'shape' in kp_stats:
            print(f"Data shape: {kp_stats['shape']} (frames, keypoints, coordinates)")
            print(f"Value range: [{kp_stats['min_value']:.3f}, {kp_stats['max_value']:.3f}]")
            print(f"Mean value: {kp_stats['mean_value']:.3f}")

In [None]:
import os
import gzip
import pickle
import numpy as np
from pprint import pprint

def read_and_show_examples(split_path, num_examples=3):
    """
    Load a gzipped, pickled split file and print a preview of its samples.

    For each previewed sample every field is printed; list/array values
    whose text form exceeds 100 characters are summarised by type and shape.
    The union of all field names in the file is printed afterwards. Any
    error while loading or printing is reported on stdout.

    Args:
        split_path (str): Path to the gzipped split file
        num_examples (int): Number of examples to display

    Returns:
        The loaded data, or None if an error occurred
    """
    try:
        with gzip.open(split_path, 'rb') as f:
            data = pickle.load(f)

        print(f"\nLoaded {len(data)} samples from {os.path.basename(split_path)}")
        print("\nFirst few examples:")
        print("=" * 80)

        preview = data[:num_examples]
        for i, sample in enumerate(preview):
            print(f"\nExample {i+1}:")
            print("-" * 40)
            # Walk over the fields of this sample
            for key, value in sample.items():
                is_sequence = isinstance(value, (np.ndarray, list))
                if not (is_sequence and len(str(value)) > 100):
                    print(f"{key}: {value}")
                    continue
                # Long sequences are summarised rather than dumped
                print(f"{key}: {type(value)} with shape {np.array(value).shape}")

        # Collect the field names used anywhere in the file
        all_fields = set().union(*(set(d.keys()) for d in data))
        print("\nAll available fields in the dataset:")
        print(all_fields)
    except Exception as e:
        print(f"Error reading split file {split_path}: {str(e)}")
        return None
    else:
        return data

if __name__ == "__main__":
    base_path = "data/phoenix-2014"
    splits = {
        'train': os.path.join(base_path, "phoenix-2014.train"),
        'dev': os.path.join(base_path, "phoenix-2014.dev"),
        'test': os.path.join(base_path, "phoenix-2014.test")
    }

    # Preview each split that is present on disk
    for split_name, split_path in splits.items():
        if not os.path.exists(split_path):
            continue

        print(f"\n{split_name.upper()} SPLIT:")
        data = read_and_show_examples(split_path)

        # Nothing more to report for a split that failed to load or is empty
        if not data:
            continue

        # Extra summary figures for the split
        print(f"\nAdditional statistics for {split_name} split:")
        print(f"Total number of samples: {len(data)}")
        # The first sample decides whether a field is reported at all
        if 'gloss' in data[0]:
            unique_glosses = {d['gloss'] for d in data}
            print(f"Number of unique glosses: {len(unique_glosses)}")
        if 'signer' in data[0]:
            unique_signers = {d['signer'] for d in data}
            print(f"Number of unique signers: {len(unique_signers)}")

In [None]:
import gzip
import pickle

def read_split_file(filepath):
    """Print the sample count and the first three samples of a gzipped, pickled split file.

    Errors are reported on stdout; nothing is returned.
    """
    print(f"\nReading: {filepath}")
    print("-" * 50)

    try:
        with gzip.open(filepath, 'rb') as f:
            data = pickle.load(f)
    except Exception as e:
        print(f"Error: {str(e)}")
        return

    # Displaying is guarded separately but reports problems the same way
    try:
        print(f"Total samples: {len(data)}")
        print("\nFirst 3 samples:")
        for i, sample in enumerate(data[:3]):
            print(f"\nSample {i+1}:", sample, sep="\n")
    except Exception as e:
        print(f"Error: {str(e)}")

# Paths of the three split files to display, relative to the working
# directory. This cell runs as soon as it is executed (no __main__ guard).
splits = [
    "data/phoenix-2014/phoenix-2014.train",
    "data/phoenix-2014/phoenix-2014.dev",
    "data/phoenix-2014/phoenix-2014.test"
]

# Print the sample count and first samples of every split; read_split_file
# reports unreadable files itself, so one bad path does not stop the loop.
for split_file in splits:
    read_split_file(split_file)