In [1]:
from datasets import load_dataset
import torchaudio
import torch
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# The corpus is read through its "test" split and divided 80/20.
# A fixed seed keeps the shuffle (and therefore the row order every later
# cell iterates over) identical across kernel restarts; without it the
# downstream position-based TRAIN/DEV/TEST split changes on every run.
dataset = load_dataset("Jzuluaga/atco2_corpus_1h")["test"].train_test_split(test_size=0.2, seed=42)

In [3]:
# Config: every STT directory hangs off one root folder
main_dir = "STT/data"
audio_dir, ref_dir, audio_test_dir, ref_test_dir = (
    os.path.join(main_dir, leaf)
    for leaf in ("audio", "ref", "audio_test", "ref_test")
)
sample_rate = 16000
test_ratio = 0.2  # Adjust as needed

In [4]:
import os
import torch
import torchaudio
from sklearn.model_selection import train_test_split

# === CONFIG ===
vad_audio_base, vad_ground_dir = "VAD_Input/Audio", "VAD_Input/Ground"
sample_rate = 16000
test_ratio = 0.2
dev_ratio = 0.1  # portion of train set that goes to DEV

# Create the per-split audio folders and the shared ground folder up front
output_dirs = [os.path.join(vad_audio_base, name) for name in ("TRAIN", "DEV", "TEST")]
output_dirs.append(vad_ground_dir)
for d in output_dirs:
    os.makedirs(d, exist_ok=True)

# Gather every record: the "all" split when present, otherwise train followed by test
if "all" not in dataset:
    all_items = [*dataset["train"], *dataset["test"]]
else:
    all_items = list(dataset["all"])

# Keep only records that carry every field the export needs
required_keys = ("id", "audio", "text", "segment_start_time", "segment_end_time")
valid_items = [
    item for item in all_items
    if not any(key not in item for key in required_keys)
]

# Index by stringified ID; a later record with the same ID replaces the earlier one
unique_items = {}
for record in valid_items:
    unique_items[str(record["id"])] = record

# === SPLITTING ===
all_ids = list(unique_items)
# Carve TEST off first, then take DEV out of what is left for TRAIN
train_ids, test_ids = train_test_split(all_ids, test_size=test_ratio, random_state=42)
train_ids, dev_ids = train_test_split(train_ids, test_size=dev_ratio, random_state=42)

# Sanity check: no ID may appear in more than one split
train_set, dev_set, test_set = set(train_ids), set(dev_ids), set(test_ids)

train_dev_common = train_set & dev_set
train_test_common = train_set & test_set
dev_test_common = dev_set & test_set
overlap = train_dev_common | train_test_common | dev_test_common
if overlap:
    raise ValueError(f"❌ Overlapping IDs across splits: {overlap}")

# === HELPER FUNCTION ===
def save_pair(item, split):
    """Write one dataset item to disk as a WAV file plus a reference text file.

    The audio is saved to ``<vad_audio_base>/<SPLIT>/<id>.wav`` and the
    reference to ``<vad_ground_dir>/<id>.txt`` as a single tab-separated line
    ``segment_start_time<TAB>segment_end_time<TAB>text``.

    Args:
        item: Mapping with the keys "id", "audio" (itself indexable by "array"
            and, when available, "sampling_rate"), "text",
            "segment_start_time" and "segment_end_time".
        split: Split name; it is upper-cased to form the audio sub-directory.
    """
    split_dir = os.path.join(vad_audio_base, split.upper())
    os.makedirs(split_dir, exist_ok=True)
    # The helper should not rely on another cell having created this folder.
    os.makedirs(vad_ground_dir, exist_ok=True)

    item_id = str(item["id"])
    audio_path = os.path.join(split_dir, f"{item_id}.wav")
    ref_path = os.path.join(vad_ground_dir, f"{item_id}.txt")

    audio = item["audio"]
    # Label the file with the rate the samples were actually decoded at, and
    # fall back to the configured rate only when the item does not carry one.
    # Writing samples under a different rate would change their duration.
    try:
        rate = int(audio["sampling_rate"])
    except KeyError:
        rate = sample_rate

    # Save audio: cast to float32 so the write does not depend on whichever
    # floating-point dtype the decoder produced, then add the channel axis.
    waveform = torch.tensor(audio["array"], dtype=torch.float32).unsqueeze(0)
    torchaudio.save(audio_path, waveform, sample_rate=rate)

    # Save text with STT-compatible format; explicit UTF-8 keeps the output
    # independent of the platform's default locale encoding.
    with open(ref_path, "w", encoding="utf-8") as f:
        f.write(f"{item['segment_start_time']}\t{item['segment_end_time']}\t{item['text']}\n")

# === WRITE TO DISK ===
# Export the splits in a fixed order: TRAIN, then DEV, then TEST
for split_name, split_ids in (("TRAIN", train_ids), ("DEV", dev_ids), ("TEST", test_ids)):
    for item_id in split_ids:
        save_pair(unique_items[item_id], split_name)

# Report how many items ended up in each split
n_train, n_dev, n_test = len(train_ids), len(dev_ids), len(test_ids)
print(f"✅ STT-compatible VAD export complete:")
print(f"  TRAIN: {n_train}")
print(f"  DEV:   {n_dev}")
print(f"  TEST:  {n_test}")


✅ STT-compatible VAD export complete:
  TRAIN: 626
  DEV:   70
  TEST:  175
