In [1]:
# Setup
!pip install -q \
git+https://github.com/huggingface/transformers.git \
datasets==2.19.0 \
soundfile==0.13.1 \
librosa==0.9.2 \
jiwer==3.0.3 \
git+https://github.com/huggingface/accelerate.git \
numpy==1.26.4 \
pandas \
tqdm

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.3/214.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.0/172.0 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Imports & Paths
import os, re, json, random, soundfile as sf, pandas as pd, numpy as np
from datasets import load_dataset, Dataset, Audio
from pathlib import Path
from tqdm import tqdm

# Đường dẫn đầu vào trên Kaggle
VIVOS_ROOT = '/kaggle/input/vivos-vietnamese/vivos'
CV_ROOT    = '/kaggle/input/common-voice-vi-21/vi'
NOISE_ROOT = '/kaggle/input/musan-noise/musan/noise'

# Thư mục xuất manifest
OUTPUT_DIR = '/kaggle/working/manifests'
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
import re
def normalize_text(text: str) -> str:
    # Đưa về chữ thường
    text = text.lower()

    # Loại bỏ dấu câu và ký tự không phải chữ cái, số, khoảng trắng
    text = re.sub(r"[^\w\sàáảãạăằắẳẵặâầấẩẫậèéẻẽẹêềếểễệìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵđ]", "", text)

    # Xoá khoảng trắng thừa
    text = re.sub(r"\s+", " ", text).strip()

    return text

In [4]:
# Thu thập VIVOS
def collect_vivos(vivos_root: str):
    items = []
    prompts = Path(vivos_root, "train", "prompts.txt")
    if prompts.exists():
        trans = {l.split()[0]: normalize_text(" ".join(l.split()[1:])) for l in prompts.read_text(encoding="utf8").splitlines()}
        for wav_path in Path(vivos_root, "train", "waves").rglob("*.wav"):
            utt_id = wav_path.stem
            if utt_id in trans:
                items.append({"audio_filepath": str(wav_path), "text": trans[utt_id]})
    return items

vivos_items = collect_vivos(VIVOS_ROOT)
print(f"Collected {len(vivos_items)} samples from VIVOS (train)")

Collected 11660 samples from VIVOS (train)


In [5]:
print(vivos_items[0])

{'audio_filepath': '/kaggle/input/vivos-vietnamese/vivos/train/waves/VIVOSSPK36/VIVOSSPK36_218.wav', 'text': 'hãy chọn phản ứng hóm hỉnh để chứng tỏ rằng bạn không bị lừa mà không làm căng thẳng tình hình'}


In [6]:
# Thu thập Common Voice Vi
def collect_common_voice(cv_root: str, min_dur=1.0, max_dur=15.0):
    # Đọc metadata
    meta = pd.read_csv(os.path.join(cv_root, 'validated.tsv'), sep='\t', keep_default_na=False)
    durs = pd.read_csv(os.path.join(cv_root, 'clip_durations.tsv'), sep='\t')

    # Đổi tên cột để khớp với expected logic
    durs.rename(columns={'clip':'path', 'duration[ms]': 'duration'}, inplace=True)
    durs['duration'] = durs['duration'] / 1000  # ms → s
    
    # Ghép và lọc theo thời lượng
    df = meta.merge(durs, on='path')
    df = df[(df['duration'] >= min_dur) & (df['duration'] <= max_dur)]

    # Tạo đường dẫn audio
    df['audio_filepath'] = df['path'].apply(lambda p: str(Path(cv_root, "clips", p)))
    df = df.rename(columns={'sentence': 'text'})

    df['text'] = df['text'].apply(normalize_text)
    return df[['audio_filepath', 'text']].to_dict(orient='records')

cv_items = collect_common_voice(CV_ROOT)
print(f"Collected {len(cv_items)} samples from Common Voice VI")

Collected 5191 samples from Common Voice VI


In [7]:
print(cv_items[0])

{'audio_filepath': '/kaggle/input/common-voice-vi-21/vi/clips/common_voice_vi_30580094.mp3', 'text': 'ạ dạ không ạ ngại quá'}


In [8]:
# Gộp & Chia Train/Dev
all_items = vivos_items + cv_items
print(f"Total before split: {len(all_items)} samples")

random.seed(42)
random.shuffle(all_items)

dev_size = int(len(all_items) * 0.10)
dev_items = all_items[:dev_size]
train_items = all_items[dev_size:]
print(f"Train: {len(train_items)} samples, Dev: {len(dev_items)} samples")

train_manifest = os.path.join(OUTPUT_DIR, 'train_manifest.jsonl')
dev_manifest   = os.path.join(OUTPUT_DIR, 'dev_manifest.jsonl')
with open(train_manifest, 'w', encoding='utf-8') as f:
    for item in train_items:
        f.write(json.dumps({'audio_filepath': item['audio_filepath'], 'text': item['text']}, ensure_ascii=False) + '\n')
with open(dev_manifest, 'w', encoding='utf-8') as f:
    for item in dev_items:
        f.write(json.dumps({'audio_filepath': item['audio_filepath'], 'text': item['text']}, ensure_ascii=False) + '\n')

print("Manifests created:")
print(" ", train_manifest)
print(" ", dev_manifest)

Total before split: 16851 samples
Train: 15166 samples, Dev: 1685 samples
Manifests created:
  /kaggle/working/manifests/train_manifest.jsonl
  /kaggle/working/manifests/dev_manifest.jsonl


In [9]:
# Tạo noise manifest từ MUSAN
noise_records = []
for root, _, files in os.walk(NOISE_ROOT):
    for fn in files:
        if fn.endswith('.wav'):
            noise_records.append({'audio_filepath': os.path.join(root, fn)})
noise_manifest = os.path.join(OUTPUT_DIR, 'noise_musan.jsonl')
with open(noise_manifest, 'w') as f:
    for rec in noise_records:
        f.write(json.dumps(rec) + '\n')
print(f"Noise manifest: {noise_manifest}, {len(noise_records)} files from 'noise' folder")

Noise manifest: /kaggle/working/manifests/noise_musan.jsonl, 930 files from 'noise' folder


In [10]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, TrainingArguments, Trainer, EarlyStoppingCallback, TrainerCallback
import torchaudio, torch
from jiwer import wer

# Dường dẫn manifest và noise
train_manifest = os.path.join(OUTPUT_DIR, 'train_manifest.jsonl')
dev_manifest   = os.path.join(OUTPUT_DIR, 'dev_manifest.jsonl')
NOISE_DIR = Path(NOISE_ROOT)

# Load processor & model
processor = Wav2Vec2Processor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h")
model = Wav2Vec2ForCTC.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h")
model.gradient_checkpointing_enable()
for p in model.wav2vec2.feature_extractor.parameters():
    p.requires_grad = False

# Load dataset
data_files = {"train": train_manifest, "validation": dev_manifest}
datasets = load_dataset("json", data_files=data_files)
datasets = datasets.cast_column("audio_filepath", Audio(sampling_rate=16000))

2025-05-31 06:21:45.812563: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748672506.200028      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748672506.318292      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.65k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

In [11]:
sample = datasets["train"][0]
print(sample["audio_filepath"])
print(sample["audio_filepath"]["array"].shape)
print(sample["text"])

{'path': '/kaggle/input/vivos-vietnamese/vivos/train/waves/VIVOSSPK26/VIVOSSPK26_019.wav', 'array': array([0.        , 0.        , 0.        , ..., 0.00021362, 0.00027466,
       0.        ]), 'sampling_rate': 16000}
(118000,)
ký sự hồi ký của tô hoài cũng được nhiều người đọc vì văn phong trong sáng hấp dẫn


In [12]:
sample = datasets["train"][-1]
print(sample["audio_filepath"])
print(sample["audio_filepath"]["array"].shape)
print(sample["text"])

{'path': '/kaggle/input/vivos-vietnamese/vivos/train/waves/VIVOSSPK12/VIVOSSPK12_R072.wav', 'array': array([ 0.00000000e+00,  0.00000000e+00, -3.05175781e-05, ...,
       -3.66210938e-03, -3.69262695e-03, -3.44848633e-03]), 'sampling_rate': 16000}
(52000,)
nó khiến bạn trở nên tự tin hơn


In [13]:
len(datasets['train'])

15166

In [14]:

import torchaudio.functional as F

def speed_perturb(wave: torch.Tensor, sr: int) -> torch.Tensor:
    sp = random.choice([0.9, 1.0, 1.1])
    if sp == 1.0:
        return wave
    new_sr = int(sr * sp)
    return F.resample(wave, orig_freq=sr, new_freq=new_sr)

def add_noise(wave, target_snr):
    noise_path = random.choice(noise_records)["audio_filepath"]
    n_wave, _ = torchaudio.load(noise_path)
    n_wave = n_wave.mean(0)
    
    wave = wave.squeeze()
    if len(n_wave) < len(wave):
        repeat = (len(wave) // len(n_wave)) + 1
        n_wave = n_wave.repeat(repeat)
    n_wave = n_wave[:len(wave)]
    
    if n_wave.shape != wave.shape:
        raise ValueError(f"Shape mismatch: wave {wave.shape}, noise {n_wave.shape}")

    rms_s = wave.pow(2).mean().sqrt()
    rms_n = n_wave.pow(2).mean().sqrt()
    gain = rms_s / (rms_n * 10 ** (target_snr / 20))
    return (wave + gain * n_wave).clamp(-1, 1)

def prepare(batch):
    audio = batch["audio_filepath"]["array"]
    sr    = batch["audio_filepath"]["sampling_rate"]
    
    # Speed Perturbation
    audio = torch.tensor(audio)
        
    # Noise Augmentation
    if random.random() < 0.25:
        snr = random.choice([6,9,12,15])
        audio = add_noise(audio, snr)
        
    inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
    labels = processor(text = batch["text"], return_tensors="pt")

    batch["input_values"] = inputs.input_values[0].squeeze()
    batch["attention_mask"] = inputs.get("attention_mask", None)
    batch["labels"]      = labels.input_ids[0].squeeze()
    return batch

processed_datasets = datasets.map(prepare, remove_columns=datasets["train"].column_names)

Map:   0%|          | 0/15166 [00:00<?, ? examples/s]

Map:   0%|          | 0/1685 [00:00<?, ? examples/s]

In [15]:
print(processed_datasets["train"][0]["labels"])

[96, 6, 46, 39, 19, 46, 71, 75, 57, 46, 96, 6, 46, 49, 28, 17, 46, 56, 30, 46, 71, 105, 24, 57, 46, 49, 48, 26, 58, 46, 89, 106, 69, 49, 46, 26, 71, 57, 9, 31, 46, 26, 58, 106, 21, 57, 46, 89, 74, 49, 46, 15, 40, 46, 15, 104, 26, 46, 98, 71, 105, 26, 58, 46, 56, 47, 105, 26, 58, 46, 39, 62, 26, 58, 46, 71, 99, 98, 46, 44, 97, 26]


In [16]:
print(processed_datasets["train"][-1]["labels"])

[26, 82, 46, 96, 71, 57, 52, 26, 46, 36, 50, 26, 46, 56, 47, 8, 46, 26, 12, 26, 46, 56, 19, 46, 56, 57, 26, 46, 71, 95, 26]


In [17]:
len(processed_datasets['train'])

15166

In [18]:
# Cấu hình SpecAugment
model.config.update({
    'mask_time_prob': 0.08,   # 8 %
    'mask_time_length': 10,
    'mask_feature_prob': 0.05,   # 5 %
    'mask_feature_length': 64,
    'num_time_masks': 2,
    'num_feature_masks': 2
})
print("SpecAug settings:", model.config.mask_time_prob, model.config.mask_feature_prob)

SpecAug settings: 0.08 0.05


In [19]:
from dataclasses import dataclass
from typing import Any

@dataclass
class DataCollatorCTC:
    processor: Any

    def __call__(self, features):
        input_features = [{"input_values": f["input_values"]} for f in features]
        label_features = [{"input_ids": f["labels"]} for f in features]

        batch = self.processor.pad(input_features, return_tensors="pt", padding=True)

        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt", padding=True)

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels
        return batch

collator = DataCollatorCTC(processor=processor)

In [20]:
batch = collator([processed_datasets["train"][i] for i in range(4)])
print(batch.keys())
print(batch["input_values"].shape)
print(batch["labels"].shape)

KeysView({'input_values': tensor([[ 9.1767e-09,  9.1767e-09,  9.1767e-09,  ...,  1.8950e-03,
          2.4364e-03,  9.1767e-09],
        [-8.6768e-04, -8.6768e-04, -8.6768e-04,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-7.2962e-03, -1.4802e-02, -1.0632e-02,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-1.1630e-01, -8.4196e-02,  7.4939e-02,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]]), 'labels': tensor([[  96,    6,   46,   39,   19,   46,   71,   75,   57,   46,   96,    6,
           46,   49,   28,   17,   46,   56,   30,   46,   71,  105,   24,   57,
           46,   49,   48,   26,   58,   46,   89,  106,   69,   49,   46,   26,
           71,   57,    9,   31,   46,   26,   58,  106,   21,   57,   46,   89,
           74,   49,   46,   15,   40,   46,   15,  104,   26,   46,   98,   71,
          105,   26,   58,   46,   56,   47,  105,   26,   58,   46,   39,   62,
           26,   58,   46,   71,   99,   98,   46,   44,   97,  

In [21]:
class UnfreezeCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step == 6000:
            for p in model.wav2vec2.feature_extractor.parameters():
                p.requires_grad = True
            print(">>> Unfroze feature extractor at step 6000")

def compute_metrics(pred):
    pred_ids = np.argmax(pred.predictions, axis=-1)
    pred_str = [s.strip() for s in processor.batch_decode(pred_ids)]
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    label_str = [s.strip() for s in processor.batch_decode(label_ids, group_tokens=False)]
    return {"wer": wer(label_str, pred_str)}

In [22]:
# Training Arguments

args = TrainingArguments(
    output_dir="./wav2vec2_vi_ft",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    eval_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    num_train_epochs=20,
    learning_rate=1e-5,
    warmup_steps=800,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    report_to="none",
    fp16=True
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=processed_datasets["train"],
    eval_dataset=processed_datasets["validation"],
    data_collator=collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4), UnfreezeCallback()]
)

In [23]:
# Train
trainer.train()



Step,Training Loss,Validation Loss,Wer
100,0.3764,0.197931,0.181102
200,691.7237,,1.0
300,0.0,,1.0
400,0.0,,1.0
500,0.0,,1.0




TrainOutput(global_step=500, training_loss=69.28197633361816, metrics={'train_runtime': 6613.1845, 'train_samples_per_second': 45.866, 'train_steps_per_second': 0.717, 'total_flos': 2.710740714233985e+18, 'train_loss': 69.28197633361816, 'epoch': 2.109704641350211})

In [24]:
# Save
model.save_pretrained("/kaggle/working/wav2vec2_vi_ft")
processor.save_pretrained("/kaggle/working/wav2vec2_vi_ft")

[]

In [25]:
import shutil
shutil.make_archive("/kaggle/working/wav2vec2_vi_ft", 'zip', "/kaggle/working/wav2vec2_vi_ft")

'/kaggle/working/wav2vec2_vi_ft.zip'