In [6]:
import torch
import torchaudio
import torchaudio.transforms as T
import torchaudio.functional as F
from tqdm import tqdm
import os

# Load VAD model
print("Loading model VAD...")
vad_model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                    model='silero_vad',
                                    force_reload=False,
                                    trust_repo=True)
get_speech_timestamps, _, _, _, _ = utils

def rms_normalize(waveform, target_rms=0.05, eps=1e-8):
    rms = torch.sqrt(torch.mean(waveform ** 2) + eps)
    gain = target_rms / rms
    return waveform * gain

def preprocess_audio_folder(
    input_dir,
    output_dir,
    target_sample_rate=16000,
    vad_model=None,
    get_speech_timestamps=None):
    """
    Preprocess all wav files in a folder and save results
    """

    assert vad_model is not None, "vad_model is required"
    assert get_speech_timestamps is not None, "get_speech_timestamps function is required"

    input_dir = os.path.abspath(input_dir)
    os.makedirs(output_dir, exist_ok=True)

    resampler_cache = {}

    for root, _, files in os.walk(input_dir):
        for file in tqdm(files, desc="Processing audio"):
            if not file.lower().endswith(".wav"):
                continue

            in_path = os.path.join(root, file)

            rel_path = os.path.relpath(root, input_dir)
            out_dir = os.path.join(output_dir, rel_path)
            os.makedirs(out_dir, exist_ok=True)
            out_path = os.path.join(out_dir, file)

            try:
                # Load audio
                waveform, orig_sr = torchaudio.load(in_path)

                if waveform.numel() == 0:
                    continue

                # Resample
                if orig_sr != target_sample_rate:
                    if orig_sr not in resampler_cache:
                        resampler_cache[orig_sr] = T.Resample(orig_sr, target_sample_rate)
                    waveform = resampler_cache[orig_sr](waveform)

                # Mono
                if waveform.shape[0] > 1:
                    waveform = torch.mean(waveform, dim=0, keepdim=True)

                # High-pass filter (>80Hz)
                waveform = F.highpass_biquad(
                    waveform,
                    sample_rate=target_sample_rate,
                    cutoff_freq=80)

                # RMS Normalize
                waveform = rms_normalize(waveform)

                # VAD
                wav_1d = waveform.squeeze()
                speech_timestamps = get_speech_timestamps(
                    wav_1d,
                    vad_model,
                    sampling_rate=target_sample_rate)

                if len(speech_timestamps) == 0:
                    continue

                speech_segments = [
                    wav_1d[ts["start"]:ts["end"]]
                    for ts in speech_timestamps
                    if ts["end"] > ts["start"]
                ]

                if len(speech_segments) == 0:
                    continue

                clean_waveform = torch.cat(speech_segments).unsqueeze(0)
                clean_waveform = clean_waveform.clamp(-1.0, 1.0)

                torchaudio.save(out_path, clean_waveform, target_sample_rate)

            except Exception as e:
                print(f"❌ Error {in_path}: {e}")

    print(f"✅ Done → {output_dir}")

Loading model VAD...


Using cache found in C:\Users\PC1/.cache\torch\hub\snakers4_silero-vad_master


### Preprocessing for VSASV

In [7]:
preprocess_audio_folder(
    input_dir=r"E:\speech_data\wav\VSASV",
    output_dir=r"E:\speech_data\clean_wav\VSASV",
    target_sample_rate=16000,
    vad_model=vad_model,
    get_speech_timestamps=get_speech_timestamps)

Processing audio: 0it [00:00, ?it/s]
Processing audio: 100%|██████████| 25/25 [00:01<00:00, 18.16it/s]
Processing audio: 100%|██████████| 63/63 [00:03<00:00, 18.01it/s]
Processing audio: 100%|██████████| 47/47 [00:02<00:00, 18.43it/s]
Processing audio: 100%|██████████| 95/95 [00:06<00:00, 15.62it/s]
Processing audio: 100%|██████████| 57/57 [00:03<00:00, 18.05it/s]
Processing audio: 100%|██████████| 20/20 [00:00<00:00, 32.15it/s]
Processing audio: 100%|██████████| 50/50 [00:01<00:00, 32.49it/s]
Processing audio: 100%|██████████| 10/10 [00:00<00:00, 10.88it/s]
Processing audio: 100%|██████████| 15/15 [00:00<00:00, 20.00it/s]
Processing audio: 100%|██████████| 17/17 [00:00<00:00, 17.63it/s]
Processing audio: 100%|██████████| 13/13 [00:00<00:00, 27.68it/s]
Processing audio: 100%|██████████| 15/15 [00:00<00:00, 18.52it/s]
Processing audio: 100%|██████████| 100/100 [00:05<00:00, 19.84it/s]
Processing audio: 100%|██████████| 63/63 [00:04<00:00, 14.32it/s]
Processing audio: 100%|██████████| 87

✅ Done → E:\speech_data\clean_wav\VSASV





### Preprocessing for VoxVietnam

In [8]:
preprocess_audio_folder(
    input_dir=r"E:\speech_data\wav\Vox_train",
    output_dir=r"E:\speech_data\clean_wav\Vox_train",
    target_sample_rate=16000,
    vad_model=vad_model,
    get_speech_timestamps=get_speech_timestamps)

Processing audio: 0it [00:00, ?it/s]
Processing audio: 100%|██████████| 191/191 [00:47<00:00,  4.04it/s]
Processing audio: 100%|██████████| 149/149 [00:39<00:00,  3.80it/s]
Processing audio: 100%|██████████| 90/90 [00:24<00:00,  3.65it/s]
Processing audio: 100%|██████████| 58/58 [00:13<00:00,  4.14it/s]
Processing audio: 100%|██████████| 63/63 [00:17<00:00,  3.52it/s]
Processing audio: 100%|██████████| 58/58 [00:17<00:00,  3.39it/s]
Processing audio: 100%|██████████| 35/35 [00:10<00:00,  3.43it/s]
Processing audio: 100%|██████████| 43/43 [00:11<00:00,  3.72it/s]
Processing audio: 100%|██████████| 36/36 [00:11<00:00,  3.20it/s]
Processing audio: 100%|██████████| 37/37 [00:11<00:00,  3.11it/s]
Processing audio: 100%|██████████| 23/23 [00:06<00:00,  3.72it/s]
Processing audio: 100%|██████████| 16/16 [00:03<00:00,  4.10it/s]
Processing audio: 100%|██████████| 13/13 [00:03<00:00,  3.32it/s]
Processing audio: 100%|██████████| 17/17 [00:04<00:00,  4.02it/s]
Processing audio: 100%|██████████| 

✅ Done → E:\speech_data\clean_wav\Vox_train





### Merge

In [34]:
import shutil
import re

VSASV_DIR = r"E:\speech_data\clean_wav\VSASV"
VOX_DIR   = r"E:\speech_data\clean_wav\Vox_train"
OUT_DIR   = r"E:\speech_data\train"

os.makedirs(OUT_DIR, exist_ok=True)

def get_next_speaker_id(existing_ids):
    if not existing_ids:
        return 0
    return max(existing_ids) + 1

def parse_id(id_str):
    return int(id_str.replace("id", ""))

speaker_map = {}        # old_id -> new_id
used_speaker_ids = set()

def register_speaker(old_id):
    if old_id in speaker_map:
        return speaker_map[old_id]

    sid = parse_id(old_id)
    if sid in used_speaker_ids:
        sid = get_next_speaker_id(used_speaker_ids)

    used_speaker_ids.add(sid)
    speaker_map[old_id] = sid
    return sid

In [35]:
pattern = re.compile(r"00204[_-]?(\d+)")

for speaker in sorted(os.listdir(VOX_DIR)):
    speaker_path = os.path.join(VOX_DIR, speaker)
    if not os.path.isdir(speaker_path):
        continue

    new_sid = register_speaker(speaker)

    for wav in sorted(os.listdir(speaker_path)):
        if not wav.lower().endswith(".wav"):
            continue

        m = pattern.search(wav)
        if not m:
            print(f"No utt_id found: {wav}")
            continue

        utt_id = int(m.group(1))   # utt id after 00204

        new_name = f"id{new_sid:05d}_{utt_id:05d}.wav"

        src = os.path.join(speaker_path, wav)
        dst = os.path.join(OUT_DIR, new_name)

        if os.path.exists(dst):
            print(f"Duplicate files (overwrite risk): {dst}")

        shutil.copy2(src, dst)

Duplicate files (overwrite risk): E:\speech_data\train\id00065_00449.wav
Duplicate files (overwrite risk): E:\speech_data\train\id00065_00450.wav
Duplicate files (overwrite risk): E:\speech_data\train\id00065_00451.wav
Duplicate files (overwrite risk): E:\speech_data\train\id00065_00452.wav
Duplicate files (overwrite risk): E:\speech_data\train\id00065_00453.wav
Duplicate files (overwrite risk): E:\speech_data\train\id00065_00454.wav
Duplicate files (overwrite risk): E:\speech_data\train\id00065_00455.wav
Duplicate files (overwrite risk): E:\speech_data\train\id00065_00456.wav
Duplicate files (overwrite risk): E:\speech_data\train\id00065_00457.wav
Duplicate files (overwrite risk): E:\speech_data\train\id00065_00458.wav
Duplicate files (overwrite risk): E:\speech_data\train\id00065_00459.wav
Duplicate files (overwrite risk): E:\speech_data\train\id00065_00460.wav
Duplicate files (overwrite risk): E:\speech_data\train\id00065_00461.wav
Duplicate files (overwrite risk): E:\speech_data\tr

KeyboardInterrupt: 

In [None]:
# VSASV
for speaker in sorted(os.listdir(VSASV_DIR)):
    speaker_path = os.path.join(VSASV_DIR, speaker)
    if not os.path.isdir(speaker_path):
        continue

    new_sid = register_speaker(speaker)

    for wav in sorted(os.listdir(speaker_path)):
        if not wav.endswith(".wav"):
            continue

        utt_id = os.path.splitext(wav)[0]
        new_name = f"id{new_sid:05d}_{int(utt_id):05d}.wav"

        src = os.path.join(speaker_path, wav)
        dst = os.path.join(OUT_DIR, new_name)
        
        if os.path.exists(dst):
            print(f"Duplicate files (overwrite risk): {dst}")

        shutil.copy2(src, dst)