In [1]:
import pandas as pd
import numpy as np
import librosa
import soundfile as sf
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (Wav2Vec2ForCTC, Wav2Vec2Processor, TrainingArguments,
                          Trainer)
import torch
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Đọc DataFrame của bạn (nếu chưa có)
df_T6 = pd.read_csv("/home/jovyan/ai-core/speech_to_text/data/ghtk_T6/final_label.csv", delimiter=";")
df_T9 = pd.read_csv("/home/jovyan/ai-core/speech_to_text/data/ghtk_T9/final_label.csv", delimiter=";")

df_train = pd.concat([df_T6, df_T9], axis=0, ignore_index=True)
df_train.drop(["start", "end"], axis =1, inplace=True)

df_train.rename(columns={"folder": "audio", "text": "labeled_text"}, inplace=True)

df_test = pd.read_excel("final_label.xlsx")

train_dataset = Dataset.from_pandas(df_train.reset_index(drop=True))
test_dataset = Dataset.from_pandas(df_test.reset_index(drop=True))

In [3]:
train_dataset, test_dataset

(Dataset({
     features: ['audio', 'labeled_text'],
     num_rows: 17730
 }),
 Dataset({
     features: ['audio', 'labeled_text'],
     num_rows: 3864
 }))

In [4]:
# Tải processor
processor = Wav2Vec2Processor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h")

# Định nghĩa tập hợp các ký tự cần loại bỏ
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"“%\‘\’\']'

def remove_special_characters(batch):
    batch["labeled_text"] = batch["labeled_text"].lower()
    batch["labeled_text"] = re.sub(chars_to_ignore_regex, '', batch["labeled_text"])
    return batch

def speech_file_to_array(batch):
    speech_array, sampling_rate = librosa.load(batch["audio"], sr=16000)
    batch["speech"] = speech_array
    batch["sampling_rate"] = sampling_rate
    return batch

def prepare_dataset(batch):
    # Xử lý âm thanh
    batch = speech_file_to_array(batch)
    # Chuẩn bị input_values
    batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"]).input_values[0]
    # Chuẩn bị labels
    with processor.as_target_processor():
        batch["labels"] = processor(batch["labeled_text"]).input_ids
    return batch

# Áp dụng tiền xử lý
train_dataset = train_dataset.map(remove_special_characters)
test_dataset = test_dataset.map(remove_special_characters)

train_dataset = train_dataset.map(prepare_dataset, remove_columns=train_dataset.column_names)
test_dataset = test_dataset.map(prepare_dataset, remove_columns=test_dataset.column_names)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
                                                                  

In [5]:
import dataclasses
from typing import Any, Dict, List, Union

@dataclasses.dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt"
        )
        
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt"
            )

        # Thay thế giá trị padding trong labels bằng -100 để bỏ qua trong quá trình tính loss
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [6]:
model = Wav2Vec2ForCTC.from_pretrained(
    "nguyenvulebinh/wav2vec2-base-vietnamese-250h",
    gradient_checkpointing=True,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
)

# Đảm bảo mô hình sử dụng đúng số lượng lớp âm vị (vocab size)
model.config.vocab_size = len(processor.tokenizer)

# Định nghĩa tham số huấn luyện
training_args = TrainingArguments(
    output_dir="model/wav2vec2-finetuned-vietnamese",
    group_by_length=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="steps",
    num_train_epochs=5,
    fp16=torch.cuda.is_available(),  # Sử dụng FP16 nếu có GPU hỗ trợ
    save_steps=2000,
    eval_steps=2000,
    logging_steps=100,
    learning_rate=5e-6,
    warmup_steps=500,
    save_total_limit=5,
)

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


In [7]:
from jiwer import wer as wer_metric


# -

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred_str = processor.batch_decode(pred_ids)
    # Xóa các ký tự đặc biệt trong văn bản dự đoán
    pred_str = [re.sub(chars_to_ignore_regex, '', s).lower() for s in pred_str]

    # Xử lý labels
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(label_ids, group_tokens=False)
    label_str = [re.sub(chars_to_ignore_regex, '', s).lower() for s in label_str]

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}



In [8]:
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=processor.feature_extractor,
)

trainer.train()

KeyboardInterrupt: 