This notebook is inspired by an excellent [HuggingFace tutorial](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_Tune_XLSR_Wav2Vec2_on_Turkish_ASR_with_%F0%9F%A4%97_Transformers.ipynb#scrollTo=bTjNp2KUYAl8).

# pip install

In [None]:
# Python packages and the espeak system package used by this notebook.
!pip install phonemizer
# -y answers apt's confirmation prompt, so the install cannot stall or abort
# when the cell runs without an interactive terminal.
!apt-get install -y espeak
# datasets and transformers are installed straight from their GitHub repositories
# (unpinned, so the installed versions change over time).
!pip install git+https://github.com/huggingface/datasets.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install torchaudio
!pip install librosa
!pip install jiwer

# notebook

In [None]:
# List the GPUs visible to this runtime.
!nvidia-smi -L

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-a13a7848-cd36-d106-2117-85c6688bf2cb)


In [None]:
# Mount Google Drive under /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Import libraries
from datasets import load_dataset, load_metric, ClassLabel, load_from_disk
import datasets
datasets.set_caching_enabled(False)

import torch

from dataclasses import dataclass, field

from typing import Any, Dict, List, Optional, Union

from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers import AdamW, get_linear_schedule_with_warmup, get_polynomial_decay_schedule_with_warmup

from torch.utils.data.dataloader import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

import random
import math
import pandas as pd
import numpy as np

from IPython.display import display, HTML

import re
import json
import os
from tqdm.notebook import tqdm

from utils import *
from trainer import Trainer

# Vis

In [None]:
# Visualisation
def show_random_elements(dataset, num_examples=10):
    """Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: object supporting `len()` and indexing with a list of integer
            positions; the indexed result is passed to `pd.DataFrame`.
        num_examples: number of distinct rows to show.

    Raises:
        AssertionError: if `num_examples` is larger than `len(dataset)`.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    last_index = len(dataset) - 1
    chosen = []
    # Keep drawing until enough distinct positions have been collected;
    # a position that was already chosen is simply drawn again.
    while len(chosen) < num_examples:
        candidate = random.randint(0, last_index)
        if candidate not in chosen:
            chosen.append(candidate)

    table = pd.DataFrame(dataset[chosen])
    display(HTML(table.to_html()))

# Load & Preprocess Dataset
**If you have not preprocessed the dataset yet, continue with this section; otherwise, skip it.**

## Download/Load

First, we choose a language (see https://huggingface.co/datasets/common_voice for the codes of the other available languages).

In [None]:
# Common Voice language code ("cs" = Czech); change it to pick another language of the dataset.
code_lang = "cs"

For this experiment, we chose Czech. \
Let's download the dataset.

In [None]:
# Load the Common Voice splits for the language selected in `code_lang`
# (previously hard-coded to "cs", which silently ignored that setting).
# train+validation are loaded as one pool; the official test split is kept separate.
cv_data_dir = "./cv-corpus-6.1-2020-12-11"
common_voice = load_dataset("common_voice", code_lang, data_dir=cv_data_dir, split="train+validation")
common_voice_test = load_dataset("common_voice", code_lang, data_dir=cv_data_dir, split="test")



If you are going to use only audio & transcription, you can remove the other columns.

In [None]:
# Drop the speaker/vote metadata columns from both splits.
metadata_columns = ["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"]
common_voice = common_voice.remove_columns(metadata_columns)
common_voice_test = common_voice_test.remove_columns(metadata_columns)

## Preprocess
We are going to preprocess the dataset

In [None]:
# Preview 20 random rows; 'path' is dropped only from the copy that is displayed.
show_random_elements(common_voice.remove_columns(['path']), num_examples=20)

What we do next depends on what we want to evaluate: the phoneme transcription or the word transcription.

### Word Transcription

We preprocess the text by removing special symbols such as `,.?!;`, because we do not use any language model at the output.

In [None]:
# Character class of the punctuation/symbols to strip from the transcriptions
# (you can modify or add characters here).
# It is a raw string because the backslashes are regex escapes: in a plain literal,
# sequences such as '\,' or '\?' are invalid string escapes and trigger warnings.
# NOTE(review): remove_special_characters is not defined in this notebook (presumably it
# comes from `utils`); confirm that it really reads this notebook-level variable,
# otherwise editing the pattern here has no effect.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\—\…\–\«\»]'
common_voice = common_voice.map(remove_special_characters, remove_columns=["sentence"])
common_voice_test = common_voice_test.map(remove_special_characters, remove_columns=["sentence"])

In [None]:
# Preview 20 random rows after text cleaning; 'path' is dropped only from the displayed copy.
show_random_elements(common_voice.remove_columns(['path']), num_examples=20)

### Phonem Transcription

For phoneme transcription, we first need to convert the text to phonemes.

In [None]:
# Apply text2phoneme to every example (4 worker processes), then rename the
# "sentence" column to "text" on both splits.
common_voice = common_voice.map(text2phoneme, num_proc=4).rename_column("sentence", "text")
common_voice_test = common_voice_test.map(text2phoneme, num_proc=4).rename_column("sentence", "text")

In [None]:
# Preview 20 random rows after phonemization; 'path' is dropped only from the displayed copy.
show_random_elements(common_voice.remove_columns(['path']), num_examples=20)

## Building Vocabulary

Since we use CTC as the top layer, the model classifies speech chunks into letters, so we now extract all the distinct letters and build our vocabulary from them.

In [None]:
# Run extract_all_chars once over each whole split: batch_size=-1 passes the full
# split as a single batch, and only the columns it returns are kept.
vocab_map_options = dict(batched=True, batch_size=-1, keep_in_memory=True)
vocab_train = common_voice.map(
    extract_all_chars,
    remove_columns=common_voice.column_names,
    **vocab_map_options,
)
vocab_test = common_voice_test.map(
    extract_all_chars,
    remove_columns=common_voice_test.column_names,
    **vocab_map_options,
)

Now we create the union of all distinct letters from both datasets, just as we would for a translation / generation task.

In [None]:
# Union of the characters found in the train and test splits.
# The union is sorted so that the character -> id mapping is deterministic: iterating a
# set of strings can yield a different order on every run, which would silently change
# the ids (and therefore the saved vocabulary) from one run to the next.
vocab_list = sorted(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
vocab_dict = {char: index for index, char in enumerate(vocab_list)}
vocab_dict

In [None]:
# Adding the word delimiter, the unknown token and the padding token.
# The cell is safe to re-run: tokens that are already present keep their ids.
if " " in vocab_dict:
    # "|" takes over the id of the space character, which makes word boundaries visible.
    vocab_dict["|"] = vocab_dict.pop(" ")
elif "|" not in vocab_dict:
    # No space in the data at all: still provide the delimiter token.
    vocab_dict["|"] = len(vocab_dict)
vocab_dict.setdefault("[UNK]", len(vocab_dict))
vocab_dict.setdefault("[PAD]", len(vocab_dict))
print(f"Our final layer will have as output dimension {len(vocab_dict)}")
print(f"Our final layer will have as output dimension {len(vocab_dict)}")

In [None]:
# Write the vocabulary to a JSON file inside parent_dir.
parent_dir = "..."  # Here you have to put where you want to save the vocabulary
vocab_path = os.path.join(parent_dir, 'czeck_phonem_vocab.json')
with open(vocab_path, 'w') as vocab_file:
    vocab_file.write(json.dumps(vocab_dict))

## Audio Preprocessing

Now we open the audio files and store their content (represented as NumPy arrays).

In [None]:
# Apply speech_file_to_array_fn to both splits; every pre-existing column is removed,
# so each split ends up with only the columns that the function returns.
common_voice, common_voice_test = (
    split.map(speech_file_to_array_fn, remove_columns=split.column_names)
    for split in (common_voice, common_voice_test)
)

### Resample
**If your dataset is already sampled at 16 kHz, skip this step** \

Wav2Vec2 (XLSR or English Only) was pretrained on the audio data of Babel, Multilingual LibriSpeech (MLS), and Common Voice. Most of those datasets were sampled at 16kHz, so that Common Voice, sampled at 48kHz, has to be downsampled to 16kHz for training. Therefore, we will have to downsample our fine-tuning data to 16kHz in the following.

In [None]:
# Downsample both splits from the original 48 kHz to 16 kHz (4 worker processes each).
common_voice, common_voice_test = (
    split.map(resample, num_proc=4)
    for split in (common_voice, common_voice_test)
)

# Train Dev Test
Now we split our data into three subsets. Fortunately, Common Voice already provides these subsets. \
Nevertheless, if you want to make your own split, you can follow the steps below (note that I do not modify the test set: it is provided by Common Voice, so keeping it unchanged allows a fair comparison with other results).

In [None]:
# Hold out 20% of the train+validation pool as the dev set; the seeds are fixed
# so that the split is reproducible.
np.random.seed(42)
data = common_voice.train_test_split(test_size=0.2, seed=42)
common_voice_train = data['train']
common_voice_validation = data['test']

If you want to experiment (as I did) and see how these pretrained models perform with little labeled data, you can split the training set into several subsets (10 min, 1 h and 10 h, for instance).

In [None]:
# Shuffle the training set with a fixed seed so that the result is reproducible.
common_voice_train = common_voice_train.shuffle(seed=42)

In [None]:
# Build nested training subsets holding roughly 10 minutes, 1 hour and 10 hours of audio.
# An utterance joins a subset as long as the audio accumulated *before* it does not exceed
# the budget, so each subset may overshoot its budget by at most one utterance.
total_len_seconds = 0
indices_10mn = []
indices_1h = []
indices_10h = []
for i in tqdm(range(len(common_voice_train))):
  # Largest budget exhausted: stop before loading another example.
  if total_len_seconds > 36000:
    break
  # Look the example up once and reuse it, instead of indexing the dataset twice
  # (each lookup builds the row again, speech array included).
  example = common_voice_train[i]
  speech_array, sampling_rate = example["speech"], example["sampling_rate"]
  duration_audio = len(speech_array) * (1/sampling_rate)
  if total_len_seconds <= 600: # 600 s => 10 minutes
    indices_10mn.append(i)
  if total_len_seconds <= 3600: # 3600 s => 60 minutes => 1 hour
    indices_1h.append(i)
  if total_len_seconds <= 36000: # 36000 s => 600 minutes => 10 hours
    indices_10h.append(i)
  total_len_seconds += duration_audio

In [None]:
# Materialise the three duration-limited training subsets from the collected indices.
common_voice_train_10mn = common_voice_train.select(indices_10mn)
common_voice_train_1h = common_voice_train.select(indices_1h)
common_voice_train_10h = common_voice_train.select(indices_10h)

In [None]:
def _apply_prepare_dataset(split):
    """Map prepare_dataset over `split` (batches of 8, 4 processes), keeping only its output columns."""
    return split.map(
        prepare_dataset,
        remove_columns=split.column_names,
        batch_size=8,
        num_proc=4,
        batched=True,
    )


# The same preparation is applied to every subset and split.
common_voice_train_10mn = _apply_prepare_dataset(common_voice_train_10mn)
common_voice_train_1h = _apply_prepare_dataset(common_voice_train_1h)
common_voice_train_10h = _apply_prepare_dataset(common_voice_train_10h)
common_voice_train = _apply_prepare_dataset(common_voice_train)
common_voice_validation = _apply_prepare_dataset(common_voice_validation)
common_voice_test = _apply_prepare_dataset(common_voice_test)

We can save the data to disk, but **you will need extra disk space as the files are huge**

In [None]:
# Save every prepared split under parent_dir, one directory per split.
# (The first call used to lack its closing parenthesis, which made the cell a SyntaxError.)
prepared_splits = {
    'train_czeck_phonem_10mn.files': common_voice_train_10mn,
    'train_czeck_phonem_1h.files': common_voice_train_1h,
    'train_czeck_phonem_10h.files': common_voice_train_10h,
    'train_czeck_phonem.files': common_voice_train,
    'validation_czeck_phonem.files': common_voice_validation,
    'test_czeck_phonem.files': common_voice_test,
}
for directory_name, split in prepared_splits.items():
    split.save_to_disk(os.path.join(parent_dir, directory_name))

# Training

In [None]:
# Directory that holds the saved datasets and the vocabulary file (replace with your own path).
parent_dir = '...'

In [None]:
# Reload the preprocessed splits saved under parent_dir.
# (The first call used to lack its closing parenthesis, which made the cell a SyntaxError.)
common_voice_train_10mn = load_from_disk(os.path.join(parent_dir, 'train_czeck_phonem_10mn.files'))
common_voice_train_1h = load_from_disk(os.path.join(parent_dir, 'train_czeck_phonem_1h.files'))
common_voice_train_10h = load_from_disk(os.path.join(parent_dir, 'train_czeck_phonem_10h.files'))
common_voice_train = load_from_disk(os.path.join(parent_dir, 'train_czeck_phonem.files'))
common_voice_validation = load_from_disk(os.path.join(parent_dir, 'validation_czeck_phonem.files'))
common_voice_test = load_from_disk(os.path.join(parent_dir, 'test_czeck_phonem.files'))

In [None]:
# Tokenizer built from the vocabulary JSON stored in parent_dir
vocab_path = os.path.join(parent_dir, 'czeck_phonem_vocab.json')
tokenizer = Wav2Vec2CTCTokenizer(
    vocab_path,
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

# Feature extractor for single-channel 16 kHz audio, with normalisation and attention masks
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

# The processor bundles the feature extractor and the tokenizer in a single object
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# Collator used by the data loaders to pad each batch
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

The first component of XLSR-Wav2Vec2 consists of a stack of CNN layers that are used to extract acoustically meaningful - but contextually independent - features from the raw speech signal. This part of the model has already been sufficiently trained during pretraining and as stated in the [paper](https://arxiv.org/pdf/2006.13979.pdf) does not need to be fine-tuned anymore. 
Thus, we can set the `requires_grad` to `False` for all parameters of the *feature extraction* part.

In [None]:
# Load the pretrained XLSR checkpoint with a CTC head. The output layer is sized to the
# tokenizer vocabulary and the tokenizer's padding token id is passed as pad_token_id.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    # "facebook/wav2vec2-base-960h",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    gradient_checkpointing=True,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

# Freeze the feature extractor (the CNN front-end is not fine-tuned)
model.freeze_feature_extractor()

# Move the model to the GPU
model.cuda()

# Samplers for the training subset (10 minutes here) and for the validation set.
# NOTE(review): trainer_pt_utils is not imported explicitly in this notebook; it presumably
# comes in through `from utils import *` -- confirm.
# NOTE(review): LengthGroupedSampler presumably groups examples of similar input length -- confirm.
model_input_name = processor.feature_extractor.model_input_names[0]
sampler_train = trainer_pt_utils.LengthGroupedSampler(common_voice_train_10mn, batch_size=12, model_input_name=model_input_name)
sampler_val = trainer_pt_utils.LengthGroupedSampler(common_voice_validation, batch_size=12, model_input_name=model_input_name)

# Data loaders: batches of 12, padded by data_collator, loaded by 4 workers
train_loader = DataLoader(common_voice_train_10mn, batch_size=12, sampler=sampler_train, collate_fn=data_collator, num_workers=4)
valid_loader = DataLoader(common_voice_validation, batch_size=12, sampler=sampler_val, collate_fn=data_collator, num_workers=4)

# Hyperparameters
learning_rate = 4e-4
n_epochs = 350

# One optimizer step per batch, so an epoch has len(train_loader) steps.
# The frequencies below are expressed in steps and each equals one epoch.
num_update_steps_per_epoch = len(train_loader)
max_steps = math.ceil(n_epochs * num_update_steps_per_epoch)
validation_freq = int(1*num_update_steps_per_epoch)
print_freq = int(1*num_update_steps_per_epoch)
scheduler_on_plateau_freq = int(num_update_steps_per_epoch)  # not used further in this cell

# Optimizer. The parameters are split into two groups (names returned by
# get_parameter_names minus biases, and all the others), but both groups currently use
# weight_decay=0.0, so the split has no effect until one of the values is changed.
decay_parameters = trainer_pt_utils.get_parameter_names(model, [torch.nn.LayerNorm])
decay_parameters = [name for name in decay_parameters if "bias" not in name]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if n in decay_parameters],
        "weight_decay": 0.0,
    },
    {
        "params": [p for n, p in model.named_parameters() if n not in decay_parameters],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

# Scheduler: warm up over 50 epochs' worth of steps, then polynomial decay down to lr_end
num_warmup_steps = int(50 * num_update_steps_per_epoch) # Number of steps needed to go from 0.0 to lr
#warmup_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, max_steps)
warmup_scheduler = get_polynomial_decay_schedule_with_warmup(optimizer, num_warmup_steps, max_steps, lr_end=1e-7)
reduce_lr_plateau = None  # plateau scheduler disabled; not used further in this cell
## reduce_lr_plateau = ReduceLROnPlateau(optimizer, factor=0.6, patience=7) ## To define when warmup scheduler is finished

model.zero_grad()
# Bookkeeping values; they are not read again in this cell.
current_total_steps = 0
current_best_wer = 2.0

# NOTE(review): Trainer is the project-local class imported from `trainer`; the meaning of
# the positional arguments (in particular the bare `False`) must be checked against its
# definition. The score type is 'PER' and `path` is handed to train().
trainer = Trainer(model, processor, optimizer, warmup_scheduler, validation_freq, print_freq, num_warmup_steps, False, type_score='PER')
trainer.train(train_loader, valid_loader, n_epochs, path="/content/examples/model.pt")

# Result on Test Set

In [None]:
# Score the test split, using the same path that was passed to trainer.train().
model_path = "/content/examples/model.pt"
sampler_test = trainer_pt_utils.LengthGroupedSampler(common_voice_test, batch_size=12, model_input_name=model_input_name)
test_loader = DataLoader(common_voice_test, batch_size=12, sampler=sampler_test, collate_fn=data_collator, num_workers=4)
# The score is computed outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
test_score = trainer.compute_score(test_loader, model_path)
print(f"The final PER score on the test set is {test_score}")