In [1]:
# Setup
!pip install -q numpy pandas transformers datasets[audio] accelerate torchaudio jiwer

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.1/21.1 MB[0m [31m74.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Load Dataset
import os
from datasets import Dataset, DatasetDict, Audio

def load_vivos_dataset(base_path):
    all_samples = []
    for split in ["train", "test"]:
        wav_dir = os.path.join(base_path, split, "waves")
        prompt_file = os.path.join(base_path, split, "prompts.txt")
        with open(prompt_file, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                file_id, text = line.split(" ", 1)
                speaker = file_id.split("_")[0]
                wav_path = os.path.join(wav_dir, speaker, file_id + ".wav")
                if os.path.exists(wav_path):
                    all_samples.append({"audio": wav_path, "transcription": text})
    return DatasetDict({"train": Dataset.from_list(all_samples)})

base_path = "/kaggle/input/vivos-vietnamese/vivos"
dataset = load_vivos_dataset(base_path)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

In [3]:
# Preprocess
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h")

def prepare_batch(batch):
    audio = batch["audio"]
    inputs = processor(audio["array"], sampling_rate=16000, return_tensors="pt", padding=True)

    # Lower case text
    clean_text = batch["transcription"].lower().strip()
    # Tokenize label riêng
    labels = processor(text=clean_text, return_tensors="pt", padding=True)

    batch["input_values"] = inputs.input_values[0]
    batch["attention_mask"] = inputs.get("attention_mask", None)
    batch["labels"] = labels.input_ids[0]
    return batch

dataset = dataset.map(prepare_batch, remove_columns=["audio"])

preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Map:   0%|          | 0/12420 [00:00<?, ? examples/s]

2025-05-25 07:11:41.316668: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748157101.480936      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748157101.531247      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
print(dataset["train"][2]["transcription"])
print(dataset["train"][2]["labels"])

TRONG SỐ CÁC QUỐC GIA CÔNG NGHIỆP PHÁT TRIỂN
[56, 47, 105, 26, 58, 46, 39, 23, 46, 49, 62, 49, 46, 20, 31, 23, 49, 46, 58, 57, 17, 46, 49, 30, 26, 58, 46, 26, 58, 71, 57, 37, 98, 46, 98, 71, 62, 56, 46, 56, 47, 57, 45, 26]


In [5]:
print(dataset["train"][2]["attention_mask"])

None


In [6]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import torch

@dataclass
class DataCollatorCTCWithPadding:
    processor: Any
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        input_features = [{"input_values": f["input_values"]} for f in features]
        label_features = [{"input_ids": f["labels"]} for f in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt"
        )

        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt"
            )

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels
        return batch

In [7]:
# Train with Accelerate
from transformers import Wav2Vec2ForCTC, TrainingArguments, Trainer

model = Wav2Vec2ForCTC.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h")

training_args = TrainingArguments(
    output_dir="./wav2vec2-vivos-ft",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    fp16=True,
    num_train_epochs=10,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    report_to="none",
)

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
model.freeze_feature_encoder()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    tokenizer=processor,
    data_collator=data_collator
)

config.json:   0%|          | 0.00/1.65k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

  trainer = Trainer(


In [8]:
config = """
compute_environment: LOCAL_MACHINE
deepspeed_config: {}
distributed_type: MULTI_GPU
downcast_bf16: 'no'
fp16: true
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
num_machines: 1
num_processes: 2
use_cpu: false
"""

import os
os.makedirs("/root/.cache/huggingface/accelerate", exist_ok=True)
with open("/root/.cache/huggingface/accelerate/default_config.yaml", "w") as f:
    f.write(config.strip())

In [9]:
# Train
trainer.train()



Step,Training Loss
50,0.2638
100,0.2466
150,0.2799
200,0.2374
250,0.219
300,0.2339
350,0.2399
400,0.2326
450,0.2222
500,0.2536




TrainOutput(global_step=7770, training_loss=0.18133114844829112, metrics={'train_runtime': 20133.0136, 'train_samples_per_second': 6.169, 'train_steps_per_second': 0.386, 'total_flos': 1.1122109724923132e+19, 'train_loss': 0.18133114844829112, 'epoch': 10.0})

In [10]:
# Save
model.save_pretrained("/kaggle/working/wav2vec2-vivos-finetune")
processor.save_pretrained("/kaggle/working/wav2vec2-vivos-finetune")

[]

In [11]:
import shutil
shutil.make_archive("/kaggle/working/wav2vec2-vivos-finetune", 'zip', "/kaggle/working/wav2vec2-vivos-finetune")

'/kaggle/working/wav2vec2-vivos-finetune.zip'