This notebook should be executed on a GPU runtime.

In [None]:
# Install the necessary libraries (--quiet keeps pip's output short)
!pip install datasets --quiet
!pip install transformers[torch] --quiet
!pip install --upgrade accelerate --quiet
!pip install evaluate --quiet
!pip install jiwer --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Mount Google Drive; the dataset, vocabulary and saved model all live under MyDrive/ASR_Colab.
# NOTE(review): later cells use paths relative to ./drive, so they presumably rely on the
# working directory being /content - confirm if the notebook is run from elsewhere.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Load the preprocessed dataset
from datasets import DatasetDict

ds = DatasetDict.load_from_disk("./drive/MyDrive/ASR_Colab/dataset.hf")

# Keep only the columns needed for training; dropping the rest saves RAM
model_columns = ['input_values', 'input_length', 'labels']
for split in ('train', 'test'):
    unused_columns = [name for name in ds[split].column_names if name not in model_columns]
    ds[split] = ds[split].remove_columns(unused_columns)

To learn about the roles of the tokenizer, feature extractor, data collator, etc. in this model, see https://huggingface.co/blog/fine-tune-xlsr-wav2vec2

In [None]:
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor

# CTC tokenizer built from the vocabulary file stored on Drive
tokenizer = Wav2Vec2CTCTokenizer(
    "./drive/MyDrive/ASR_Colab/vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

# Feature extractor configured for 16 kHz input, with normalization and attention masks enabled
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

# The processor bundles both objects behind a single interface
processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)

In [None]:
# Inspect one row of the processed dataset: its encoded form and the text it decodes to
from IPython.display import Audio

example = ds["train"][0]

# Encoded transcript first, then the first ten audio values
print(example["labels"], example["input_values"][:10], sep="\n")

# Decode the label ids back to text, then render a player for the clip
print(tokenizer.decode(example["labels"]))
Audio(example["input_values"], rate=16000)

[51, 53, 27, 11, 27, 51, 54, 11, 61, 96, 65, 51, 54, 41, 27, 54, 105, 51, 96, 54, 10, 27, 51, 65, 18, 85, 51]
[-0.00022279308177530766, -0.00036427180748432875, -0.00019891963165719062, -0.00048047080053947866, -0.0006540930480696261, -0.0004578224616125226, -0.00036907047615386546, -8.947730384534225e-05, -8.902730041882023e-05, -0.00034265706199221313]
دلیل اینکه اولاً کامل هست


In [None]:
import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """Collate preprocessed examples into one padded batch for CTC training.

    Every feature is a mapping holding "input_values" (the audio) and "labels"
    (the encoded transcript). The two are padded independently because they
    have different lengths and are padded by different components: the audio
    through ``processor.pad`` and the labels through ``processor.tokenizer.pad``.

    Attributes:
        processor: Provides ``pad`` for the audio inputs and a ``tokenizer``
            whose ``pad`` handles the label ids (a ``Wav2Vec2Processor`` in
            this notebook).
        padding: Padding strategy forwarded to both pad calls.
        max_length: ``max_length`` forwarded when padding the audio inputs.
        max_length_labels: ``max_length`` forwarded when padding the labels.
        pad_to_multiple_of: ``pad_to_multiple_of`` forwarded for the audio inputs.
        pad_to_multiple_of_labels: ``pad_to_multiple_of`` forwarded for the labels.
    """

    # Quoted forward reference: the class can be defined even when the name is not yet imported.
    processor: "Wav2Vec2Processor"
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples and return a batch of PyTorch tensors.

        Args:
            features: Examples, each with an "input_values" and a "labels" entry.

        Returns:
            The padded audio batch produced by ``processor.pad`` with an extra
            "labels" entry in which padded positions are set to -100.
        """
        # Split inputs and labels: they need different padding methods.
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Pad the labels with the tokenizer directly. This is what the deprecated
        # `processor.as_target_processor()` context manager delegated to, without
        # depending on an API that is scheduled for removal.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # Replace label padding with -100 so padded positions are excluded from the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch["attention_mask"].ne(1), -100)

        batch["labels"] = labels

        return batch

# Collator instance used to assemble batches
data_collator = DataCollatorCTCWithPadding(padding=True, processor=processor)

# The collator pads the audio and the transcripts of a batch so that they share the same size.
# Collate a few training rows and look at the resulting shapes:
example_features = [ds["train"][row] for row in (0, 1, 20)]
example_batch = data_collator(example_features)
{name: tensor.shape for name, tensor in example_batch.items()}



{'input_values': torch.Size([3, 101797]),
 'attention_mask': torch.Size([3, 101797]),
 'labels': torch.Size([3, 50])}

In [None]:
# Feel free to adjust these parameters if training doesn't converge
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    gradient_checkpointing=True,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    # len() also counts tokens added on top of vocab.json (vocab_size does not), so the
    # output layer covers every id the tokenizer can produce.
    vocab_size=len(processor.tokenizer),
)
# Freeze the pretrained feature encoder so its weights are not updated during fine-tuning
model.freeze_feature_encoder()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.weight', 'lm_head.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Change the following parameters if you deem it necessary, especially if you run out of GPU RAM
# or don't reach the desired performance
import inspect

from transformers import TrainingArguments

save_dir = 'wav2vec_cache'

# `evaluation_strategy` was renamed to `eval_strategy` and newer transformers releases reject
# the old name. The packages are installed unpinned, so use whichever keyword is accepted.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=save_dir,
    group_by_length=False,
    per_device_train_batch_size=1,  # small batch size to keep the GPU from running out of memory
    gradient_accumulation_steps=10, # accumulate gradients over 10 batches instead of using a larger batch
    num_train_epochs=4,             # more epochs can improve the model, but training takes longer
    fp16=True,
    save_steps=60,
    eval_steps=20,
    logging_steps=20,
    learning_rate=3e-4,
    warmup_steps=20,
    save_total_limit=1,
    load_best_model_at_end=True,
    **{eval_strategy_arg: "steps"},
)

In [None]:
# Load the WER (Word Error Rate) metric
import evaluate

wer_metric = evaluate.load("wer")
def compute_metrics(pred):
    """Compute the word error rate of a set of predictions.

    Args:
        pred: Object with ``predictions`` (logits; the argmax over the last axis
            gives the predicted token ids) and ``label_ids`` (reference ids in
            which -100 marks padding).

    Returns:
        A dict with a single "wer" entry holding the value returned by
        ``wer_metric.compute``.
    """
    # Imported here because the notebook's top-level numpy import sits in a later cell.
    import numpy as np

    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Swap the -100 padding marker for the pad token id so the labels can be decoded.
    # Work on a copy: the caller's label array is left untouched.
    label_ids = pred.label_ids.copy()
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [None]:
from transformers import Trainer
import numpy as np

import inspect

# Newer transformers releases take the preprocessing object as `processing_class` and
# deprecate `tokenizer`; older releases only know `tokenizer`. Use whichever is supported.
processor_arg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    **{processor_arg: processor.feature_extractor},
)

In [None]:
# Run fine-tuning with the settings defined in training_args
trainer.train()

In [None]:
# Save the trained model to Google Drive so the evaluation step can load it
trainer.save_model("./drive/MyDrive/ASR_Colab/model_weights")