In [1]:
from datasets import load_dataset
import torchaudio
import torch
import os
import pandas as pd
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the corpus (it ships a single "test" split) and partition it 80/20.
# The seed pins the shuffle so the partition and item order are the same on
# every Restart & Run All.
dataset = load_dataset("Jzuluaga/atco2_corpus_1h")["test"].train_test_split(test_size=0.2, seed=42)

In [3]:
# --- Configuration ---
# Root folder; the four folders below are direct children of it.
main_dir = "STT/data"

audio_dir, ref_dir, audio_test_dir, ref_test_dir = (
    os.path.join(main_dir, leaf)
    for leaf in ("audio", "ref", "audio_test", "ref_test")
)

sample_rate = 16000
test_ratio = 0.2  # Adjust as needed

In [4]:
import os
import torch
import torchaudio
from sklearn.model_selection import train_test_split

# === CONFIG ===
vad_audio_base = "VAD_Input/Audio"
vad_ground_dir = "VAD_Input/Ground"
sample_rate = 16000
test_ratio = 0.2
dev_ratio = 0.1  # portion of train set that goes to DEV

# Ensure directories exist
# NOTE(review): nothing in this cell deletes files from an earlier export. If the
# partition changes between runs, WAVs from the previous run stay in their old
# split folders; clear VAD_Input/Audio before re-exporting.
for d in [os.path.join(vad_audio_base, split) for split in ["TRAIN", "DEV", "TEST"]] + [vad_ground_dir]:
    os.makedirs(d, exist_ok=True)

# Combine dataset entries
if "all" in dataset:
    all_items = list(dataset["all"])
else:
    all_items = list(dataset["train"]) + list(dataset["test"])

# Filter valid items: keep only entries that carry every field the export needs
valid_items = [
    item for item in all_items
    if all(k in item for k in ["id", "audio", "text", "segment_start_time", "segment_end_time"])
]

# Deduplicate by ID (a later entry with the same ID replaces an earlier one)
unique_items = {str(item["id"]): item for item in valid_items}

# === SPLITTING ===
# Sort the IDs first: train_test_split shuffles by position, so a fixed
# random_state only reproduces the same partition when the input order is
# itself stable across runs.
all_ids = sorted(unique_items)
train_ids, test_ids = train_test_split(all_ids, test_size=test_ratio, random_state=42)
# dev_ratio is applied to what is left after removing TEST, not to the full set.
train_ids, dev_ids = train_test_split(train_ids, test_size=dev_ratio, random_state=42)

# Ensure disjoint sets
train_set = set(train_ids)
dev_set = set(dev_ids)
test_set = set(test_ids)

overlap = (train_set & dev_set) | (train_set & test_set) | (dev_set & test_set)
if overlap:
    raise ValueError(f"❌ Overlapping IDs across splits: {overlap}")

# === HELPER FUNCTION ===
def save_pair(item, split):
    """Write one dataset item as a 16-bit PCM WAV plus a tab-separated reference file.

    The WAV goes to ``<vad_audio_base>/<SPLIT>/<id>.wav`` and the reference line
    ``start<TAB>end<TAB>text`` to ``<vad_ground_dir>/<id>.txt``.

    Args:
        item: Mapping with keys "id", "audio" (which holds an "array" of
            samples), "text", "segment_start_time" and "segment_end_time".
        split: Split name; upper-cased to form the audio sub-folder.

    Returns:
        True when both files were written, False when the item was skipped
        (empty or NaN audio) or writing failed. Callers may ignore the value.
    """
    split_dir = os.path.join(vad_audio_base, split.upper())
    os.makedirs(split_dir, exist_ok=True)

    item_id = str(item["id"])
    audio_path = os.path.join(split_dir, f"{item_id}.wav")
    ref_path = os.path.join(vad_ground_dir, f"{item_id}.txt")

    # Process audio data
    audio_array = item["audio"]["array"]

    # Reject empty clips before any reduction: .max()/.min() on a zero-length
    # array raise instead of returning a value, so this guard must come first.
    if len(audio_array) == 0:
        print(f"Warning: Empty waveform for {item_id}")
        return False

    # Ensure audio data is in the correct range [-1, 1] by scaling down to the peak
    peak = max(abs(audio_array.max()), abs(audio_array.min()))
    if peak > 1.0:
        audio_array = audio_array / peak

    # Convert to correct format for torchaudio: a 2-D (1, n_samples) float tensor
    waveform = torch.tensor(audio_array, dtype=torch.float32).unsqueeze(0)

    # Validate waveform
    if torch.isnan(waveform).any():
        print(f"Warning: NaN values found in waveform for {item_id}")
        return False

    if waveform.numel() == 0:
        print(f"Warning: Empty waveform for {item_id}")
        return False

    try:
        # Save audio with explicit format settings
        # NOTE(review): the header uses the notebook-level sample_rate, not
        # item["audio"]["sampling_rate"] -- confirm the corpus really is 16 kHz.
        torchaudio.save(
            audio_path,
            waveform,
            sample_rate=sample_rate,
            encoding='PCM_S',
            bits_per_sample=16
        )

        # Verify the saved file (a failed check only warns, it never aborts)
        try:
            verification = torchaudio.load(audio_path)
            if verification[0].size() != waveform.size():
                print(f"Warning: Size mismatch in saved file for {item_id}")
        except Exception as e:
            print(f"Warning: Could not verify saved file for {item_id}: {e}")

        # Save text with STT-compatible format
        # Explicit UTF-8 so the result does not depend on the platform's default
        # encoding (which cannot represent every character on some systems).
        # NOTE(review): the timestamps are written verbatim. If they are offsets
        # into a longer source recording rather than into this clip, they will
        # not line up with the saved WAV -- confirm.
        with open(ref_path, "w", encoding="utf-8") as f:
            f.write(f"{item['segment_start_time']}\t{item['segment_end_time']}\t{item['text']}\n")

    except Exception as e:
        print(f"Error saving {item_id}: {e}")
        return False

    return True

# === WRITE TO DISK ===
# Export each partition into the split folder of the same name (TRAIN, DEV, TEST order).
for split_name, split_ids in (("TRAIN", train_ids), ("DEV", dev_ids), ("TEST", test_ids)):
    for item_id in split_ids:
        save_pair(unique_items[item_id], split_name)

# Summary of how many IDs were assigned to each partition.
print(f"✅ STT-compatible VAD export complete:")
print(f"  TRAIN: {len(train_ids)}")
print(f"  DEV:   {len(dev_ids)}")
print(f"  TEST:  {len(test_ids)}")


✅ STT-compatible VAD export complete:
  TRAIN: 626
  DEV:   70
  TEST:  175


In [5]:
import os
from pydub import AudioSegment
import soundfile as sf
import numpy as np
from pathlib import Path

def verify_file_matching(vad_audio_base="VAD_Input/Audio",
                         vad_ground_dir="VAD_Input/Ground",
                         splits=("DEV", "TEST", "TRAIN")):
    """
    Verify that every audio file in each split has a matching ground truth file.

    A ``<split>/<stem>.wav`` is "matched" when ``<vad_ground_dir>/<stem>.txt`` exists.
    Ground truth files without audio are not reported.

    Args:
        vad_audio_base: Directory containing one sub-folder per split.
        vad_ground_dir: Directory containing the ``*.txt`` ground truth files.
        splits: Names of the split sub-folders to check.

    Returns:
        ``{'matched': {split: [file names]}, 'unmatched': {split: [file names]}}``
        with each list sorted by file name.
    """
    audio_root = Path(vad_audio_base)
    splits = list(splits)

    results = {
        'matched': {},
        'unmatched': {}
    }

    # Stems of all ground truth files, for O(1) membership tests below
    ground_truth_files = {f.stem for f in Path(vad_ground_dir).glob('*.txt')}

    # Check each split
    for split in splits:
        matched_names = []
        unmatched_names = []

        # sorted(): glob order depends on the filesystem; sorting keeps the
        # report identical from run to run.
        for audio_file in sorted((audio_root / split).glob('*.wav')):
            if audio_file.stem in ground_truth_files:
                matched_names.append(audio_file.name)
            else:
                unmatched_names.append(audio_file.name)

        results['matched'][split] = matched_names
        results['unmatched'][split] = unmatched_names

    # Print summary
    print("=== File Matching Summary ===")
    for split in splits:
        matched = len(results['matched'][split])
        unmatched = len(results['unmatched'][split])
        total = matched + unmatched

        print(f"\n{split}:")
        print(f"  Total files: {total}")
        print(f"  Matched: {matched}")
        print(f"  Unmatched: {unmatched}")

        if unmatched > 0:
            print("\n  Unmatched files:")
            for file in results['unmatched'][split]:
                print(f"    - {file}")

    return results

# Run the verification with its default directories and keep the returned
# matched/unmatched lists in matching_results
matching_results = verify_file_matching()

=== File Matching Summary ===

DEV:
  Total files: 70
  Matched: 70
  Unmatched: 0

TEST:
  Total files: 175
  Matched: 175
  Unmatched: 0

TRAIN:
  Total files: 626
  Matched: 626
  Unmatched: 0


In [None]:
from pathlib import Path
import random
from pydub import AudioSegment
import soundfile as sf
import numpy as np
import matplotlib.pyplot as plt
from datetime import timedelta
import os
import pandas as pd

def _read_gt_segments(gt_path):
    """Parse a tab-separated ground truth file into a list of segment dicts.

    Each usable line is ``start<TAB>end[<TAB>text]``; lines with fewer than two
    fields are ignored. Returns ``[{'start': float, 'end': float, 'text': str}, ...]``
    in file order.
    """
    segments = []
    # errors='replace': only the timestamps feed the checks, so an undecodable
    # byte in the transcript must not abort the verification.
    with open(gt_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            parts = line.strip().split('\t', 2)
            if len(parts) >= 2:
                segments.append({
                    'start': float(parts[0]),
                    'end': float(parts[1]),
                    'text': parts[2] if len(parts) > 2 else ""
                })
    return segments


def test_segmentation_random_files(n_files=5):
    """
    Test segmentation on n random files and verify that the split WAV files add up to the original file length.
    Also compares original and processed ground truth files.

    Candidates are the WAVs under ``VAD_Input/Audio/<split>`` whose stem also
    appears under ``VAD_Output/<split>/speech``. For every sampled file the
    function prints a duration check, a ground truth comparison and a timeline,
    then a summary table and a final verdict.

    A sampled file whose expected outputs are missing is reported and skipped
    (it counts as a failure in the verdict) instead of aborting the whole run.

    Args:
        n_files: Maximum number of files to sample.

    Returns:
        None; all results are printed.
    """
    # Setup paths
    input_base_dir = Path("VAD_Input")
    output_base_dir = Path("VAD_Output")
    splits = ["DEV", "TEST", "TRAIN"]

    # Stems of every file that already has a speech output
    processed_files = set()
    for split in splits:
        speech_dir = output_base_dir / split / "speech"
        if speech_dir.exists():
            processed_files.update(file.stem for file in speech_dir.glob("*.wav"))

    # Original files that have been processed. sorted(): glob order depends on
    # the filesystem, and random.sample is only reproducible under a fixed seed
    # when the candidate list is in a stable order.
    all_audio_files = []
    for split in splits:
        audio_dir = input_base_dir / "Audio" / split
        for file in sorted(audio_dir.glob("*.wav")):
            if file.stem in processed_files:
                all_audio_files.append((split, file))

    # Randomly select n files (never more than available, never a negative count)
    sample_size = max(0, min(n_files, len(all_audio_files)))
    test_files = random.sample(all_audio_files, sample_size)

    print(f"=== Testing {len(test_files)} Random Files ===\n")
    print("🔍 Verification checks:")
    print("  ✅ Duration check: Original duration = Speech duration + Non-speech duration")
    print("  ✅ Content check: Original audio segments properly sorted into speech/non-speech")
    print("  ✅ Ground truth verification: Original and processed ground truth files consistent\n")

    result_table = []
    skipped = []

    for split, audio_file in test_files:
        input_gt_file = input_base_dir / "Ground" / f"{audio_file.stem}.txt"
        speech_output_file = output_base_dir / split / "speech" / f"{audio_file.stem}.wav"
        non_speech_output_file = output_base_dir / split / "non_speech" / f"{audio_file.stem}.wav"
        speech_gt_file = output_base_dir / "Ground" / f"{audio_file.stem}_speech.txt"
        non_speech_gt_file = output_base_dir / "Ground" / f"{audio_file.stem}_non_speech.txt"

        print(f"\n{'='*80}")
        print(f"📊 File: {audio_file.name} [{split}]")
        print(f"{'='*80}")

        # Check inputs up front so one incomplete file cannot abort the run
        required_files = [
            input_gt_file,
            speech_output_file,
            non_speech_output_file,
            speech_gt_file,
            non_speech_gt_file,
        ]
        missing_files = [path for path in required_files if not path.exists()]
        if missing_files:
            print("  ⚠️ Skipping: required file(s) not found:")
            for path in missing_files:
                print(f"    - {path}")
            skipped.append(audio_file.name)
            continue

        # === LOAD ALL AUDIO FILES ===
        # len() of an AudioSegment is its duration in milliseconds
        original_audio = AudioSegment.from_file(str(audio_file))
        original_duration_ms = len(original_audio)
        original_duration_sec = original_duration_ms / 1000.0

        speech_audio = AudioSegment.from_file(str(speech_output_file))
        speech_duration_ms = len(speech_audio)
        speech_duration_sec = speech_duration_ms / 1000.0

        non_speech_audio = AudioSegment.from_file(str(non_speech_output_file))
        non_speech_duration_ms = len(non_speech_audio)
        non_speech_duration_sec = non_speech_duration_ms / 1000.0

        combined_duration_ms = speech_duration_ms + non_speech_duration_ms
        difference_ms = abs(original_duration_ms - combined_duration_ms)

        # === DURATION VERIFICATION ===
        print("\n🕒 DURATION CHECK:")
        print(f"  Original audio: {timedelta(seconds=original_duration_sec)} ({original_duration_ms} ms)")
        print(f"  Speech audio:   {timedelta(seconds=speech_duration_sec)} ({speech_duration_ms} ms)")
        print(f"  Non-speech:     {timedelta(seconds=non_speech_duration_sec)} ({non_speech_duration_ms} ms)")
        print(f"  Speech + Non-speech: {timedelta(seconds=(combined_duration_ms/1000))} ({combined_duration_ms} ms)")

        duration_ok = difference_ms <= 10  # Allow small rounding differences
        if duration_ok:
            print(f"  ✅ Durations match! Difference: {difference_ms} ms (within tolerance)")
        else:
            print(f"  ❌ Durations DO NOT match! Difference: {difference_ms} ms")

        # === GROUND TRUTH VERIFICATION ===
        print("\n📄 GROUND TRUTH COMPARISON:")

        orig_segments = _read_gt_segments(input_gt_file)
        speech_segments = _read_gt_segments(speech_gt_file)
        non_speech_segments = _read_gt_segments(non_speech_gt_file)

        # Calculate total durations from ground truth
        orig_duration_from_gt = sum(seg['end'] - seg['start'] for seg in orig_segments)
        speech_duration_from_gt = sum(seg['end'] - seg['start'] for seg in speech_segments)
        non_speech_duration_from_gt = sum(seg['end'] - seg['start'] for seg in non_speech_segments)

        print(f"  Original ground truth segments: {len(orig_segments)}")
        print(f"  Speech ground truth segments: {len(speech_segments)}")
        print(f"  Non-speech ground truth segments: {len(non_speech_segments)}")
        print(f"  Original duration from GT: {orig_duration_from_gt:.2f}s")
        print(f"  Speech duration from GT: {speech_duration_from_gt:.2f}s")
        print(f"  Non-speech duration from GT: {non_speech_duration_from_gt:.2f}s")

        # Every processed speech segment must also exist in the original ground
        # truth (start and end equal within a small epsilon).
        speech_gt_match = True
        for seg in speech_segments:
            match_found = any(
                abs(seg['start'] - orig_seg['start']) < 0.001
                and abs(seg['end'] - orig_seg['end']) < 0.001
                for orig_seg in orig_segments
            )
            if not match_found:
                speech_gt_match = False
                print(f"  ⚠️ Speech segment not found in original GT: {seg['start']} - {seg['end']}")

        if speech_gt_match:
            print("  ✅ Speech ground truth segments match original!")

        # Verify that combined speech and non-speech segments cover the entire audio
        combined_duration_from_gt = speech_duration_from_gt + non_speech_duration_from_gt
        gt_diff = abs(original_duration_sec - combined_duration_from_gt)

        gt_durations_ok = gt_diff < 0.1  # Allow small rounding differences
        if gt_durations_ok:
            print(f"  ✅ Ground truth durations add up! Difference: {gt_diff:.3f}s")
        else:
            print(f"  ❌ Ground truth durations DO NOT add up! Difference: {gt_diff:.3f}s")

        # A segment mismatch is a ground truth inconsistency too, so it must
        # feed the verdict rather than only produce a warning line.
        gt_ok = gt_durations_ok and speech_gt_match

        # === TIMELINE VISUALIZATION ===
        print("\n⏱️ TIMELINE VIEW:")
        print("  Original segments:")
        timeline = []
        current_time = 0
        for seg in sorted(orig_segments, key=lambda x: x['start']):
            if current_time < seg['start']:
                timeline.append({
                    'start': current_time,
                    'end': seg['start'],
                    'type': 'NON-SPEECH',
                    'text': ''
                })
            timeline.append({
                'start': seg['start'],
                'end': seg['end'],
                'type': 'SPEECH',
                'text': seg['text']
            })
            # max(): a segment nested inside an earlier one must not move the
            # cursor backwards and create a bogus non-speech gap.
            current_time = max(current_time, seg['end'])

        if current_time < original_duration_sec:
            timeline.append({
                'start': current_time,
                'end': original_duration_sec,
                'type': 'NON-SPEECH',
                'text': ''
            })

        for event in timeline:
            duration = event['end'] - event['start']
            # Timestamps are shown at whole-second resolution
            start_time = str(timedelta(seconds=event['start'])).split('.')[0]
            end_time = str(timedelta(seconds=event['end'])).split('.')[0]

            if event['type'] == 'SPEECH':
                print(f"  {start_time} → {end_time} ({duration:.2f}s): 🗣️ {event['type']}")
                if event['text']:
                    print(f"    📝 {event['text'][:60]}{'...' if len(event['text']) > 60 else ''}")
            else:
                print(f"  {start_time} → {end_time} ({duration:.2f}s): 🔇 {event['type']}")

        # === FILE PATHS FOR REFERENCE ===
        print("\n📁 FILE REFERENCES:")
        print(f"  Original audio:        {audio_file}")
        print(f"  Original ground truth: {input_gt_file}")
        print(f"  Speech audio:          {speech_output_file}")
        print(f"  Non-speech audio:      {non_speech_output_file}")
        print(f"  Speech ground truth:   {speech_gt_file}")
        print(f"  Non-speech ground truth: {non_speech_gt_file}")

        # Add to results table
        result_table.append({
            'File': audio_file.name,
            'Split': split,
            'Original (s)': original_duration_sec,
            'Speech (s)': speech_duration_sec,
            'Non-speech (s)': non_speech_duration_sec,
            'Sum (s)': speech_duration_sec + non_speech_duration_sec,
            'Difference (ms)': difference_ms,
            'Duration OK': '✅' if duration_ok else '❌',
            'GT OK': '✅' if gt_ok else '❌'
        })

    # === SUMMARY TABLE ===
    if result_table:
        print("\n\n📊 SUMMARY OF TESTED FILES:")
        df = pd.DataFrame(result_table)
        print(df.to_string(index=False))

    if result_table or skipped:
        # Overall verdict
        all_duration_ok = all(result['Duration OK'] == '✅' for result in result_table)
        all_gt_ok = all(result['GT OK'] == '✅' for result in result_table)

        print("\n🏁 FINAL VERDICT:")
        if all_duration_ok and all_gt_ok and not skipped:
            print("  ✅ ALL TESTS PASSED! Audio splitting appears to be working correctly.")
        else:
            print("  ❌ SOME TESTS FAILED. Check the details above.")
            if not all_duration_ok:
                print("    - Duration matching issues detected")
            if not all_gt_ok:
                print("    - Ground truth consistency issues detected")
            if skipped:
                print(f"    - {len(skipped)} file(s) could not be verified (missing outputs)")

# Runs when the cell is executed as the main module.
if __name__ == "__main__":
    random.seed(42)  # Fixed seed: the same files are sampled each run, given the same candidate list
    test_segmentation_random_files()

=== Testing 5 Random Files ===


File: atco2_test-set-1h_LSZH_ZURICH_ApronS_121_75MHz_20210414_114326-A__000007-000684.wav
--------------------------------------------------------------------------------
Total duration: 0:00:06.770000

Timeline:
  0:00:00 → 0:00:00.070000 (0.07s): NON-SPEECH
  0:00:00.070000 → 0:00:06.840000 (6.77s): SPEECH
    Text: vista jet four three seven turn left via taxiway alfa echo six hold short runway one six

To verify this file:
1. Open audio file: VAD_Input\Audio\TRAIN\atco2_test-set-1h_LSZH_ZURICH_ApronS_121_75MHz_20210414_114326-A__000007-000684.wav
2. Open ground truth: VAD_Input\Ground\atco2_test-set-1h_LSZH_ZURICH_ApronS_121_75MHz_20210414_114326-A__000007-000684.txt
3. Listen to the audio and check if segments match the timeline above
--------------------------------------------------------------------------------

File: atco2_test-set-1h_LSGS_SION_Ground_Control_121_7MHz_20210502_134810-A__000028-000180.wav
----------------------------------------

In [7]:
from pathlib import Path
from pydub import AudioSegment
import soundfile as sf
import numpy as np
from datetime import timedelta

def process_audio_file(audio_file, gt_file, speech_dir, non_speech_dir):
    """
    Process a single audio file and split it into speech/non-speech segments.

    The ground truth file lists speech segments, one per line, as
    ``start<TAB>end[<TAB>text]`` with times in seconds. All speech segments are
    concatenated into ``<speech_dir>/<stem>_speech.wav``; everything between
    and after them goes to ``<non_speech_dir>/<stem>_non_speech.wav``.

    Args:
        audio_file: Path of the source audio file.
        gt_file: Path of the ground truth text file.
        speech_dir: Existing directory (Path) for the speech output.
        non_speech_dir: Existing directory (Path) for the non-speech output.

    Returns:
        Dict with 'total_duration' (seconds), 'speech_segments',
        'non_speech_segments', 'speech_file' and 'non_speech_file'.

    Raises:
        ValueError: If a non-blank ground truth line has no end time or a
            non-numeric timestamp.
    """
    def to_ms(seconds):
        # round(), not int(): values such as 6.84 * 1000 can land just below
        # the intended integer and would otherwise lose a millisecond.
        return int(round(seconds * 1000))

    # Load audio file; len() of an AudioSegment is its duration in milliseconds
    audio = AudioSegment.from_file(str(audio_file))
    total_duration = len(audio) / 1000.0

    # Read and sort ground truth segments
    speech_segments = []
    # errors='replace': an undecodable transcript byte must not stop the split.
    with open(gt_file, 'r', encoding='utf-8', errors='replace') as f:
        for line_no, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                continue  # tolerate blank lines
            # maxsplit=2 keeps tabs inside the transcript; the text field is
            # optional because stripping removes the tab before an empty text.
            parts = stripped.split('\t', 2)
            if len(parts) < 2:
                raise ValueError(f"{gt_file}: line {line_no} has no end time: {stripped!r}")
            speech_segments.append({
                'start': float(parts[0]),
                'end': float(parts[1]),
                'text': parts[2] if len(parts) > 2 else ""
            })
    speech_segments.sort(key=lambda x: x['start'])

    # Process speech segments
    combined_speech = AudioSegment.empty()
    speech_metadata = []
    for seg in speech_segments:
        combined_speech += audio[to_ms(seg['start']):to_ms(seg['end'])]
        speech_metadata.append(seg)

    # Save speech segments
    speech_output = speech_dir / f"{audio_file.stem}_speech.wav"
    # export() hands back the open output file; close it so the handle is not leaked.
    combined_speech.export(str(speech_output), format="wav", parameters=["-ac", "1"]).close()

    # Process non-speech segments: the gaps before, between and after speech
    current_time = 0.0
    combined_non_speech = AudioSegment.empty()
    non_speech_metadata = []

    for seg in speech_segments:
        if current_time < seg['start']:
            combined_non_speech += audio[to_ms(current_time):to_ms(seg['start'])]
            non_speech_metadata.append({
                'start': current_time,
                'end': seg['start']
            })
        # max(): a segment nested inside an earlier one must not pull the cursor
        # back, which would label already-counted speech as non-speech.
        current_time = max(current_time, seg['end'])

    # Handle final non-speech segment
    if current_time < total_duration:
        combined_non_speech += audio[to_ms(current_time):to_ms(total_duration)]
        non_speech_metadata.append({
            'start': current_time,
            'end': total_duration
        })

    # Save non-speech segments
    non_speech_output = non_speech_dir / f"{audio_file.stem}_non_speech.wav"
    combined_non_speech.export(str(non_speech_output), format="wav", parameters=["-ac", "1"]).close()

    return {
        'total_duration': total_duration,
        'speech_segments': speech_metadata,
        'non_speech_segments': non_speech_metadata,
        'speech_file': speech_output,
        'non_speech_file': non_speech_output
    }

def print_metadata(audio_file, metadata):
    """Print a readable report for one processed file.

    Args:
        audio_file: Path-like object of the source file; only ``.name`` is shown.
        metadata: Dict with 'total_duration', 'speech_segments' (dicts with
            'start', 'end', 'text'), 'non_speech_segments' (dicts with 'start',
            'end'), 'speech_file' and 'non_speech_file' (objects with ``.name``).
    """
    def clock(seconds):
        # timedelta's str() gives the H:MM:SS[.ffffff] form used in the report
        return timedelta(seconds=seconds)

    print("\n=== Metadata ===")
    print(f"Original file: {audio_file.name}")
    print(f"Original duration: {clock(metadata['total_duration'])}")

    print("\nSpeech segments:")
    for span in metadata['speech_segments']:
        begin, finish = clock(span['start']), clock(span['end'])
        print(f"[{begin} → {finish}] Text: {span['text']}")

    print("\nNon-speech segments:")
    for span in metadata['non_speech_segments']:
        begin, finish = clock(span['start']), clock(span['end'])
        print(f"[{begin} → {finish}]")

    print("\nOutput files:")
    speech_name = metadata['speech_file'].name
    print(f"Speech file: {speech_name}")
    non_speech_name = metadata['non_speech_file'].name
    print(f"Non-speech file: {non_speech_name}")

def test_single_file_split(audio_path, gt_path, output_dir="test_output"):
    """Test splitting functionality on a single audio/ground truth pair."""
    try:
        # Setup paths
        audio_file = Path(audio_path)
        gt_file = Path(gt_path)
        output_dir = Path(output_dir)
        speech_dir = output_dir / "speech"
        non_speech_dir = output_dir / "non_speech"
        
        # Create output directories
        speech_dir.mkdir(parents=True, exist_ok=True)
        non_speech_dir.mkdir(parents=True, exist_ok=True)
        
        print("\n=== Processing Single File ===")
        print(f"Audio file: {audio_file.name}")
        print(f"Ground truth: {gt_file.name}")
        
        # Process the audio file
        metadata = process_audio_file(audio_file, gt_file, speech_dir, non_speech_dir)
        
        # Print metadata after processing is complete
        print_metadata(audio_file, metadata)
        
        return True
        
    except Exception as e:
        print(f"Error processing file: {str(e)}")
        return False

if __name__ == "__main__":
    # Example usage
    example_stem = "atco2_test-set-1h_LKPR_RUZYNE_Radar_120_520MHz_20201026_145941-G__001153-001308"

    # The clip is not guaranteed to be in DEV, so look in every split folder.
    # If it is found nowhere, fall back to the DEV path so the failure message
    # still names a concrete file.
    candidate_paths = [
        Path("VAD_Input/Audio") / split_name / f"{example_stem}.wav"
        for split_name in ("DEV", "TEST", "TRAIN")
    ]
    audio_file = str(next((path for path in candidate_paths if path.exists()), candidate_paths[0]))
    gt_file = f"VAD_Input/Ground/{example_stem}.txt"

    success = test_single_file_split(audio_file, gt_file)
    if not success:
        print("Processing failed!")


=== Processing Single File ===
Audio file: atco2_test-set-1h_LKPR_RUZYNE_Radar_120_520MHz_20201026_145941-G__001153-001308.wav
Ground truth: atco2_test-set-1h_LKPR_RUZYNE_Radar_120_520MHz_20201026_145941-G__001153-001308.txt
Error processing file: [Errno 2] No such file or directory: 'VAD_Input\\Audio\\DEV\\atco2_test-set-1h_LKPR_RUZYNE_Radar_120_520MHz_20201026_145941-G__001153-001308.wav'
Processing failed!
