## Generate GT RTTM with ID-based Splitting

Split videos by participant ID, ensuring all videos from the same child stay in the same split (train/dev/test).

In [1]:
import json
import pandas as pd
from pathlib import Path
from glob import glob
from collections import Counter, defaultdict
import numpy as np

# Load the participant table that maps each video file name to a child ID.
csv_path = "/home/nele_pauline_suffo/ProcessedData/childlens_annotations/keeper/v1/childlens_participant_info.csv"
# Read both identifier columns as text. Letting pandas infer a numeric dtype
# would drop leading zeros, and a single empty cell would turn the whole column
# into floats (e.g. "266216.0"), so the keys would no longer match the names
# derived from the JSON files.
id_mapping_df = pd.read_csv(csv_path, sep=';', dtype={'file_name': str, 'child_id': str})

# Create mapping from file_name to ID. Rows missing either value are left out:
# otherwise astype(str) would turn them into the literal string 'nan', and all
# such videos would be grouped under one bogus participant instead of being
# reported as files without an ID.
complete_rows = id_mapping_df.dropna(subset=['file_name', 'child_id'])
file_name_to_id = dict(zip(complete_rows['file_name'].astype(str), complete_rows['child_id'].astype(str)))

print(f"Loaded {len(file_name_to_id)} video-to-ID mappings")
print("Sample mappings:", dict(list(file_name_to_id.items())[:5]))

# Folder holding the annotation JSON files, and the folder all generated
# RTTM/UEM/LST/pickle files are written to.
input_folder = "/home/nele_pauline_suffo/ProcessedData/childlens_annotations/keeper/v1"
output_dir = Path("/home/nele_pauline_suffo/ProcessedData/audio_cls_input")
# parents=True so a fresh machine without the parent folder does not fail here
output_dir.mkdir(parents=True, exist_ok=True)

# Only annotations with one of these event IDs are turned into speech segments
valid_event_ids = {"child_talking", "other_person_talking", "overheard_speech", "singing/humming"}

# Step 1: Load all JSON files and collect metadata with ID grouping.
# id_to_files maps participant ID -> list of per-video info dicts; videos whose
# file name has no entry in file_name_to_id end up in files_without_id.
id_to_files = defaultdict(list)
files_without_id = []

# glob() returns paths in arbitrary filesystem order. Sorting keeps the
# per-participant file order (and with it the order of the generated list
# files and any tie-breaking between equally long participants) reproducible.
json_files = sorted(glob(f"{input_folder}/*.json"))
for json_file in json_files:
    try:
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        video_name = data.get('video_name', '')
        if not video_name:
            print(f"Warning: No video_name found in {json_file}")
            continue

        annotations = data.get('annotations', [])
        if not annotations:
            print(f"Warning: No annotations found in {json_file}")
            continue

        # The video length is approximated by the latest annotation end time
        duration = max(ann.get('endTime', 0) for ann in annotations)

        # Extract file_name from video_name: strip only a trailing .mp4
        # extension (any capitalisation), never a match in the middle of the name
        if video_name.lower().endswith('.mp4'):
            file_name = video_name[:-len('.mp4')]
        else:
            file_name = video_name
        participant_id = file_name_to_id.get(file_name, None)

        file_info = {
            "path": json_file,
            "uri": video_name,
            "file_name": file_name,
            "duration": duration,
            "participant_id": participant_id
        }

        if participant_id:
            id_to_files[participant_id].append(file_info)
        else:
            files_without_id.append(file_info)
            print(f"Warning: No ID found for file_name '{file_name}' from video '{video_name}'")

    except (OSError, ValueError, TypeError, AttributeError) as e:
        # Unreadable file, invalid JSON/encoding, or an unexpected JSON shape:
        # report the file and carry on with the remaining ones.
        print(f"Skipping file {json_file} due to error: {e}")

print(f"\nFound {len(id_to_files)} unique participant IDs")
print(f"Files without ID: {len(files_without_id)}")

# Step 2: Add up the duration of every participant's videos, then rank the
# participants from the longest to the shortest total.
id_durations = {
    pid: sum(info["duration"] for info in infos)
    for pid, infos in id_to_files.items()
}

# Longest participants first (descending total duration) for balanced splitting
sorted_ids = sorted(id_durations, key=id_durations.get, reverse=True)

print("\nTop 10 IDs by total duration:")
for rank, pid in enumerate(sorted_ids[:10], start=1):
    seconds = id_durations[pid]
    n_videos = len(id_to_files[pid])
    print(f"  {rank}. ID {pid}: {seconds:.1f}s ({n_videos} videos)")

# Step 3: Split IDs into train/dev/test, aiming for 80% / 10% / 10% of the
# total duration. Whole participants are assigned, so no child is shared
# between splits.
total_duration = sum(id_durations.values())
target_train = 0.8 * total_duration
target_dev = 0.1 * total_duration
target_test = 0.1 * total_duration

train_ids, dev_ids, test_ids = [], [], []
train_duration, dev_duration, test_duration = 0, 0, 0

for participant_id in sorted_ids:
    duration = id_durations[participant_id]

    # Seconds each split is still short of its target (clamped at zero)
    train_need = max(0, target_train - train_duration)
    dev_need = max(0, target_dev - dev_duration)
    test_need = max(0, target_test - test_duration)

    # Greedy choice: the split with the largest absolute shortfall gets the
    # participant. Ties are resolved in the order train, dev, test.
    if train_need >= dev_need and train_need >= test_need:
        train_ids.append(participant_id)
        train_duration += duration
    elif dev_need >= test_need:
        dev_ids.append(participant_id)
        dev_duration += duration
    else:
        test_ids.append(participant_id)
        test_duration += duration

# Flatten files by split
train_files = [f for pid in train_ids for f in id_to_files[pid]]
dev_files = [f for pid in dev_ids for f in id_to_files[pid]]
test_files = [f for pid in test_ids for f in id_to_files[pid]]

splits = {
    "train": train_files,
    "dev": dev_files,
    "test": test_files
}

# Share of the total per split; reported as 0.0 when no file was loaded at
# all, so the report does not fail with a ZeroDivisionError.
train_pct, dev_pct, test_pct = (
    (seconds / total_duration * 100) if total_duration else 0.0
    for seconds in (train_duration, dev_duration, test_duration)
)

print(f"\nüìä ID-based Split Results:")
print(f"Train: {len(train_ids)} IDs, {len(train_files)} files, {train_duration:.1f}s ({train_pct:.1f}%)")
print(f"Dev:   {len(dev_ids)} IDs, {len(dev_files)} files, {dev_duration:.1f}s ({dev_pct:.1f}%)")
print(f"Test:  {len(test_ids)} IDs, {len(test_files)} files, {test_duration:.1f}s ({test_pct:.1f}%)")

print(f"\nTrain IDs: {train_ids}")
print(f"Dev IDs: {dev_ids}")
print(f"Test IDs: {test_ids}")

Loaded 192 video-to-ID mappings
Sample mappings: {'100898': '266216', '106910': '271693', '108844': '265891', '114534': '264362', '117071': '265674'}

Found 58 unique participant IDs
Files without ID: 0

Top 10 IDs by total duration:
  1. ID 280599: 13110.3s (9 videos)
  2. ID 262222: 8276.9s (5 videos)
  3. ID 262472: 7608.9s (5 videos)
  4. ID 260439: 7336.4s (5 videos)
  5. ID 266216: 6803.4s (6 videos)
  6. ID 266686: 5794.1s (5 videos)
  7. ID 265619: 5767.4s (5 videos)
  8. ID 279536: 5543.6s (4 videos)
  9. ID 266799: 5443.0s (6 videos)
  10. ID 262381: 5400.6s (3 videos)

üìä ID-based Split Results:
Train: 38 IDs, 144 files, 157380.8s (79.9%)
Dev:   10 IDs, 23 files, 19953.4s (10.1%)
Test:  10 IDs, 25 files, 19716.5s (10.0%)

Train IDs: ['280599', '262222', '262472', '260439', '266216', '266686', '265619', '279536', '266799', '262381', '260730', '262564', '262703', '260777', '280429', '265566', '265943', '262020', '260455', '263229', '265674', '271693', '267139', '264362', '26

In [2]:
# Step 4: Process each split and write RTTM files with ID-based splitting
def _speaker_label(event_id, fields):
    """Map an annotation's event ID (plus its person fields) to a voice type.

    Returns "KCHI", "CHI", "FEM", "MAL" or "SPEECH", or "NA" when the
    annotation cannot be assigned to any of them.
    """
    if event_id in ("child_talking", "singing/humming"):
        return "KCHI"
    if event_id == "overheard_speech":
        return "SPEECH"
    if event_id == "other_person_talking":
        age_group = fields.get("1st Person Age Group", "")
        gender = fields.get("1st Person Gender", "")
        if age_group in ("Child", "Infant"):
            return "CHI"
        if age_group in ("Adult", "Adolescent"):
            if gender == "Female":
                return "FEM"
            if gender == "Male":
                return "MAL"
    # NOTE(review): other_person_talking with an unrecognised age group or
    # gender lands here and is dropped entirely, so it does not contribute a
    # SPEECH segment either. Confirm that this is intended.
    return "NA"


all_df_rows = []
all_rttm_lines_combined = []
speaker_durations = Counter()  # Total minutes per voice type over all splits
split_speaker_durations = defaultdict(Counter)  # Minutes per voice type, per split

for split_name, files_in_split in splits.items():
    rttm_lines_split = []

    for f_info in files_in_split:
        try:
            with open(f_info["path"], "r") as file_handle:
                data = json.load(file_handle)

            uri = data.get('video_name', '')
            participant_id = f_info.get('participant_id', 'UNKNOWN')

            # Process annotations
            for annotation in data.get('annotations', []):
                try:
                    event_id = annotation.get('eventId', '')
                    if event_id not in valid_event_ids:
                        continue

                    # Get timing information; empty or negative spans are ignored
                    start_sec = annotation.get('startTime', 0)
                    end_sec = annotation.get('endTime', 0)
                    duration_sec = end_sec - start_sec
                    if duration_sec <= 0:
                        continue

                    speaker_id = _speaker_label(event_id, annotation.get('fields', {}))
                    if speaker_id == "NA":
                        continue

                    duration_minutes = duration_sec / 60.0

                    # Every specific voice type is additionally emitted as a
                    # generic SPEECH segment; overheard speech is SPEECH only.
                    if speaker_id == "SPEECH":
                        labels = ("SPEECH",)
                    else:
                        labels = (speaker_id, "SPEECH")

                    for label in labels:
                        # RTTM line: the voice type is the speaker-name field.
                        # The participant ID is not part of the RTTM line; it
                        # is only stored in the DataFrame row below.
                        rttm_line = f"SPEAKER {uri} 1 {start_sec:.3f} {duration_sec:.3f} <NA> <NA> {label} <NA> <NA>"
                        rttm_lines_split.append(rttm_line)
                        all_rttm_lines_combined.append(rttm_line)

                        # Duration tracking (total and per split)
                        speaker_durations[label] += duration_minutes
                        split_speaker_durations[split_name][label] += duration_minutes

                        # DataFrame row for this voice type
                        all_df_rows.append({
                            "audio_file_name": uri,
                            "Utterance_Start": round(start_sec, 3),
                            "Utterance_Duration": round(duration_sec, 3),
                            "Voice_type": label,
                            "Utterance_End": round(end_sec, 3),
                            "Participant_ID": participant_id,
                            "Split": split_name
                        })

                except Exception as e:
                    # Best effort: one malformed annotation must not lose the file
                    print(f"Skipping annotation in {f_info['path']}: {e}")

        except Exception as e:
            print(f"Error reading file {f_info['path']}: {e}")

    # Save RTTM file for current split
    rttm_path_split = output_dir / f"{split_name}.rttm"
    with open(rttm_path_split, "w") as out_f:
        out_f.writelines(line + "\n" for line in rttm_lines_split)
    print(f"üìù RTTM file for {split_name} split saved to {rttm_path_split}")

# Save complete RTTM file
complete_rttm_path = output_dir / "complete.rttm"
with open(complete_rttm_path, "w") as out_f:
    out_f.writelines(line + "\n" for line in all_rttm_lines_combined)
print(f"üìù Complete RTTM file saved to {complete_rttm_path}")

# Persist all utterance rows as a pickled DataFrame with a fixed column order
if not all_df_rows:
    print("‚ÑπÔ∏è No data to create DataFrame.")
else:
    column_order = [
        "audio_file_name",
        "Utterance_Start",
        "Utterance_Duration",
        "Voice_type",
        "Utterance_End",
        "Participant_ID",
        "Split",
    ]
    combined_df = pd.DataFrame(all_df_rows)
    combined_df = combined_df[column_order]

    df_pkl_path = output_dir / "annotations_gt_id_split.pkl"
    combined_df.to_pickle(df_pkl_path)
    print(f"‚úÖ Combined DataFrame saved to {df_pkl_path} ({len(combined_df)} rows)")

# Write the UEM files: one line per video, "<uri> 1 0.000 <duration>"
all_files_flat = train_files + dev_files + test_files
uem_lines = [
    f"{info['uri']} 1 0.000 {info['duration']:.3f}"
    for info in all_files_flat
]

uem_path = output_dir / "complete.uem"
with open(uem_path, "w") as uem_file:
    uem_file.writelines(f"{line}\n" for line in uem_lines)
print(f"‚úÖ Combined UEM file saved to {uem_path}")

# Per split: a .lst file naming its videos and a .uem file with their extents
for split_name, files in splits.items():
    lst_path = output_dir / f"{split_name}.lst"
    with open(lst_path, "w") as lst_file:
        lst_file.writelines(f"{info['uri']}\n" for info in files)

    uem_path = output_dir / f"{split_name}.uem"
    with open(uem_path, "w") as uem_file:
        uem_file.writelines(
            f"{info['uri']} 1 0.000 {info['duration']:.3f}\n" for info in files
        )

print("‚úÖ .lst and .uem files created for all splits.")

# Summary of what was written: file count, minutes per voice type (overall and
# per split) and the size of each split.
print(f"\n‚úÖ Total processed files: {len(all_files_flat)}")

print("\nüéôÔ∏è Total speaker durations across all splits (minutes):")
for speaker_id in ['KCHI', 'CHI', 'FEM', 'MAL', 'SPEECH']:
    duration_minutes = speaker_durations[speaker_id]
    print(f"  {speaker_id}: {duration_minutes:.2f} minutes")

print("\nüìä Speaker durations per split (minutes):")
for split_name in ["train", "dev", "test"]:
    print(f"\n  {split_name.upper()}:")
    for speaker_id in ['KCHI', 'CHI', 'FEM', 'MAL', 'SPEECH']:
        duration_minutes = split_speaker_durations[split_name][speaker_id]
        # Deliberately not named total_duration: that global holds the total
        # seconds used by the split report and must not be overwritten with a
        # per-voice-type minutes value.
        speaker_total = speaker_durations[speaker_id]
        percentage = (duration_minutes / speaker_total * 100) if speaker_total > 0 else 0
        print(f"    {speaker_id}: {duration_minutes:.2f} min ({percentage:.1f}% of total {speaker_id})")

print(f"\nüìä Final split summary:")
print(f"  Train: {len(train_ids)} IDs, {len(train_files)} files")
print(f"  Dev:   {len(dev_ids)} IDs, {len(dev_files)} files") 
print(f"  Test:  {len(test_ids)} IDs, {len(test_files)} files")
print(f"  Total: {len(id_to_files)} unique IDs, {len(all_files_flat)} files")

üìù RTTM file for train split saved to /home/nele_pauline_suffo/ProcessedData/audio_cls_input/train.rttm
üìù RTTM file for dev split saved to /home/nele_pauline_suffo/ProcessedData/audio_cls_input/dev.rttm
üìù RTTM file for test split saved to /home/nele_pauline_suffo/ProcessedData/audio_cls_input/test.rttm
üìù Complete RTTM file saved to /home/nele_pauline_suffo/ProcessedData/audio_cls_input/complete.rttm
‚úÖ Combined DataFrame saved to /home/nele_pauline_suffo/ProcessedData/audio_cls_input/annotations_gt_id_split.pkl (48519 rows)
‚úÖ Combined UEM file saved to /home/nele_pauline_suffo/ProcessedData/audio_cls_input/complete.uem
‚úÖ .lst and .uem files created for all splits.

‚úÖ Total processed files: 192

üéôÔ∏è Total speaker durations across all splits (minutes):
  KCHI: 963.44 minutes
  CHI: 53.55 minutes
  FEM: 492.96 minutes
  MAL: 223.41 minutes
  SPEECH: 2158.24 minutes

üìä Speaker durations per split (minutes):

  TRAIN:
    KCHI: 744.64 min (77.3% of total KCHI)
    C

## Copy Test Files to Destination Folder

Copy the audio file of every video listed in `test.lst` from the source directory to the destination directory.

In [None]:
import os
import shutil
from pathlib import Path

# Define source and destination directories
source_dir = Path("/home/nele_pauline_suffo/ProcessedData/childlens_audio")  # Folder X - where audio files are stored
destination_dir = Path("/home/nele_pauline_suffo/ProcessedData/childlens_audio/childlens_audio_test")  # Folder Y - where to copy test files
test_lst_path = output_dir / "test.lst"  # Path to test.lst file

# Create destination directory if it doesn't exist
destination_dir.mkdir(parents=True, exist_ok=True)

print(f"Source directory: {source_dir}")
print(f"Destination directory: {destination_dir}")
print(f"Test list file: {test_lst_path}")

# Read test.lst file to get list of video files
if test_lst_path.exists():
    with open(test_lst_path, 'r') as f:
        # One video name per line; blank lines are ignored
        test_video_files = [line.strip() for line in f if line.strip()]

    print(f"\nFound {len(test_video_files)} video files in test.lst")

    # Every listed video ends up in exactly one of these three lists, so the
    # summary counts always add up to the number of expected files.
    copied_files = []
    missing_files = []
    failed_files = []

    for video_file in test_video_files:
        # The audio file carries the full video name plus ".wav"
        # (e.g. 252685.MP4 -> 252685.MP4.wav)
        audio_filename = video_file.replace('.MP4', '.MP4.wav').replace('.mp4', '.mp4.wav')

        source_path = source_dir / audio_filename
        destination_path = destination_dir / audio_filename

        if not source_path.exists():
            missing_files.append(audio_filename)
            print(f"‚ö†Ô∏è Missing: {audio_filename}")
            continue

        try:
            shutil.copy2(source_path, destination_path)
        except Exception as e:
            # Previously such files were counted neither as copied nor as missing
            failed_files.append(audio_filename)
            print(f"‚ùå Error copying {audio_filename}: {e}")
        else:
            copied_files.append(audio_filename)
            print(f"‚úÖ Copied: {audio_filename}")

    # Summary
    print(f"\nüìä Copy Summary:")
    print(f"  Successfully copied: {len(copied_files)} files")
    print(f"  Missing files: {len(missing_files)} files")
    print(f"  Failed to copy: {len(failed_files)} files")
    print(f"  Total expected: {len(test_video_files)} files")

    if missing_files:
        print(f"\n‚ö†Ô∏è Missing files:")
        for missing in missing_files[:10]:  # Show first 10 missing files
            print(f"    {missing}")
        if len(missing_files) > 10:
            print(f"    ... and {len(missing_files) - 10} more")

    print(f"\n‚úÖ Test audio files copied to: {destination_dir}")

else:
    print(f"‚ùå test.lst file not found at: {test_lst_path}")
    print("Make sure you've run the previous cells to generate the test.lst file.")

Source directory: /home/nele_pauline_suffo/ProcessedData/childlens_audio
Destination directory: /home/nele_pauline_suffo/ProcessedData/childlens_audio_test
Test list file: /home/nele_pauline_suffo/ProcessedData/audio_cls_input/test.lst

Found 25 video files in test.lst
‚úÖ Copied: 252685.MP4.wav
‚úÖ Copied: 252685.MP4.wav
‚úÖ Copied: 306565.MP4.wav
‚úÖ Copied: 306565.MP4.wav
‚úÖ Copied: 365908.MP4.wav
‚úÖ Copied: 365908.MP4.wav
‚úÖ Copied: 282498.MP4.wav
‚úÖ Copied: 282498.MP4.wav
‚úÖ Copied: 417338.MP4.wav
‚úÖ Copied: 417338.MP4.wav
‚úÖ Copied: 384179.MP4.wav
‚úÖ Copied: 384179.MP4.wav
‚úÖ Copied: 326740.MP4.wav
‚úÖ Copied: 326740.MP4.wav
‚úÖ Copied: 610898.MP4.wav
‚úÖ Copied: 610898.MP4.wav
‚úÖ Copied: 403769.MP4.wav
‚úÖ Copied: 403769.MP4.wav
‚úÖ Copied: 519475.MP4.wav
‚úÖ Copied: 519475.MP4.wav
‚úÖ Copied: 512533.MP4.wav
‚úÖ Copied: 512533.MP4.wav
‚úÖ Copied: 488644.MP4.wav
‚úÖ Copied: 488644.MP4.wav
‚úÖ Copied: 261047.MP4.wav
‚úÖ Copied: 560558.MP4.wav
‚úÖ Copied: 261047.MP4.wav
‚