This notebook is inspired from a very good [HuggingFace Tutorial](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_Tune_XLSR_Wav2Vec2_on_Turkish_ASR_with_%F0%9F%A4%97_Transformers.ipynb#scrollTo=bTjNp2KUYAl8)

# pip install

In [None]:
!pip install phonemizer
!apt-get install espeak

Collecting phonemizer
[?25l  Downloading https://files.pythonhosted.org/packages/86/6d/fb1757f006b584469bc0b9d56209b2ac873420033133b7da58e49033862e/phonemizer-2.2.2-py3-none-any.whl (49kB)
[K     |██████▋                         | 10kB 15.3MB/s eta 0:00:01[K     |█████████████▎                  | 20kB 16.0MB/s eta 0:00:01[K     |███████████████████▉            | 30kB 13.0MB/s eta 0:00:01[K     |██████████████████████████▌     | 40kB 12.1MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 3.7MB/s 
Collecting segments
  Downloading https://files.pythonhosted.org/packages/1e/ae/02d31d73cfc3fa1dc74b7b7f14820fadc287e74406583d7af7b80fcaac41/segments-2.2.0-py2.py3-none-any.whl
Collecting csvw>=1.5.6
  Downloading https://files.pythonhosted.org/packages/8d/17/c915342268c458ac73a68d65e42d71d6e9a9eab4d1a10d03112f372513a6/csvw-1.10.1-py2.py3-none-any.whl
Collecting clldutils>=1.7.3
[?25l  Downloading https://files.pythonhosted.org/packages/78/b9/456cc0fa90dd0f0b5710a0e8e728

# notebook

In [1]:
!nvidia-smi -L

GPU 0: Tesla V100-SXM2-16GB (UUID: GPU-a1221ea1-57ce-3085-c6ad-57e4698851c0)


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install git+https://github.com/huggingface/datasets.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install torchaudio
!pip install librosa
!pip install jiwer

In [5]:
# Import libraries
from datasets import load_dataset, load_metric, ClassLabel, load_from_disk
import datasets
datasets.set_caching_enabled(False)

import torchaudio
import librosa
import torch

from dataclasses import dataclass, field

from typing import Any, Dict, List, Optional, Union

from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers import TrainingArguments, Trainer, AdamW, get_linear_schedule_with_warmup, get_polynomial_decay_schedule_with_warmup
from transformers import set_seed
from transformers import trainer_pt_utils
from transformers.trainer_pt_utils import DistributedTensorGatherer
from transformers.trainer_utils import EvalPrediction, denumpify_detensorize, PredictionOutput

from torch.utils.data.dataloader import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

import random
import math
import pandas as pd
import numpy as np

from IPython.display import display, HTML

import re
import json
import os
from tqdm.notebook import tqdm

# phonemizer
#from phonemizer import phonemize

# Utils

In [6]:
# text to phoneme
def text2phoneme(batch):
    batch["sentence"] = phonemize(batch["sentence"], language='cs', backend="espeak")
    return batch

# Visualisation
def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

# Metrics PER
def NeedlemanWunschAlignScore(seq1, seq2, d, m, r, normalize=True):

    N1, N2 = len(seq1), len(seq2)

    # Fill up the errors
    tmpRes_ = [[None for x in range(N2 + 1)] for y in range(N1 + 1)]
    for i in range(N1 + 1):
        tmpRes_[i][0] = i * d
    for j in range(N2 + 1):
        tmpRes_[0][j] = j * d

    for i in range(N1):
        for j in range(N2):

            match = r if seq1[i] == seq2[j] else m
            v1 = tmpRes_[i][j] + match
            v2 = tmpRes_[i + 1][j] + d
            v3 = tmpRes_[i][j + 1] + d
            tmpRes_[i + 1][j + 1] = max(v1, max(v2, v3))

    i = j = 0
    res = -tmpRes_[N1][N2]
    if normalize:
        res /= float(N1)
    return res

def get_seq_PER(seqLabels, detectedLabels):
    return NeedlemanWunschAlignScore(seqLabels, detectedLabels, -1, -1, 0,
                                     normalize=True)
    
def generate_per_score(refs, hyps):
  score = 0.0
  for ref, hyp in zip(refs, hyps):
    score += get_seq_PER(ref.replace('[UNK]', ''), hyp.replace('[UNK]', ''))
  return score/len(refs)

# Preprocessing
chars_to_ignore_regex = '[\(\)]'

def remove_special_characters(batch):
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
    batch["text"] = batch["text"].replace('`', '’')
    return batch

# Vocabulary
def extract_all_chars(batch):
  all_text = " ".join(batch["text"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

# Audio file
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = speech_array[0].numpy()
    batch["sampling_rate"] = sampling_rate
    batch["target_text"] = batch["text"]
    return batch

def resample(batch):
    batch["speech"] = librosa.resample(np.asarray(batch["speech"]), 48_000, 16_000)
    batch["sampling_rate"] = 16_000
    return batch

# Preparing dataset for training
def prepare_dataset(batch):
    # check that all files have the correct sampling rate
    assert (
        len(set(batch["sampling_rate"])) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["target_text"]).input_ids
    return batch
  
# Special Data Collator
@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

# Metric
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

# Input preparation
def prepare_inputs(inputs):
  for k, v in inputs.items():
    if isinstance(v, torch.Tensor):
      inputs[k] = v.cuda()
  return inputs

# Loss computation
def compute_loss(model, inputs, return_outputs=False):
  outputs = model(**inputs)
  loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
  return (loss, outputs) if return_outputs else loss

# Prediction Loop
def prediction_loop(data_loader, model, world_size):
  num_examples = len(data_loader.dataset)
  batch_size = data_loader.batch_size
  eval_losses_gatherer = DistributedTensorGatherer(world_size, num_examples,
                                                  make_multiple_of=batch_size)
  preds_gatherer = DistributedTensorGatherer(world_size, num_examples)
  labels_gatherer = DistributedTensorGatherer(world_size, num_examples)
  losses_host, preds_host, labels_host = None, None, None
  model.eval()

  for step, inputs in enumerate(data_loader):
    loss, logits, labels = prediction_step(model, inputs)
    losses = loss.repeat(batch_size)
    losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0)
    preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
    labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100)
    eval_losses_gatherer.add_arrays(trainer_pt_utils.nested_numpify(losses_host))
    preds_gatherer.add_arrays(trainer_pt_utils.nested_numpify(preds_host))
    labels_gatherer.add_arrays(trainer_pt_utils.nested_numpify(labels_host))
    losses_host, preds_host, labels_host = None, None, None
  
  eval_loss = eval_losses_gatherer.finalize()
  preds = preds_gatherer.finalize()
  labels_ids = labels_gatherer.finalize()

  preds_ids = np.argmax(preds, axis=-1)

  predicted_phonemes = processor.batch_decode(torch.from_numpy(preds_ids))
  true_phonemes = processor.batch_decode(torch.from_numpy(labels_ids))

  return generate_per_score(true_phonemes, predicted_phonemes)

# Prediction Single Batch
def prediction_step(model, inputs, label_names=["labels"]):
  has_labels = all(inputs.get(k) is not None for k in label_names)
  inputs = prepare_inputs(inputs)
  if hasattr(model, "config"):
    ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", [])
  else:
    ignore_keys = []

  if has_labels:
    labels = trainer_pt_utils.nested_detach(tuple(inputs.get(name) for name in label_names))
    if len(labels) == 1:
        labels = labels[0]
  else:
    labels = None
  
  with torch.no_grad():
    if has_labels:
      loss, outputs = compute_loss(model, inputs, True)
      loss = loss.mean().detach()
      if isinstance(outputs, dict):
        logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"])
      else:
        logits = outputs[1:]
    else:
      loss, outputs = None, model(**inputs)
      if isinstance(outputs, dict):
        logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"])
      else:
        logits = outputs
  
  logits = trainer_pt_utils.nested_detach(logits)
  if len(logits) == 1:
    logits = logits[0]

  return (loss, logits, labels)




# Ukrainian Dataset
IF YOU DON'T HAVE ALREADY THE DATASET PREPROCESSED CONTINUE, OTHERWISE SKIP THIS SECTION

We are going to download the ukrainian dataset \
**Note**: Most likely, the common voice link has expired. In this case, just go to [Common Voice's dataset website](https://commonvoice.mozilla.org/en/datasets), select your language, *e.g.* `Ukrainian`, enter your email address to get the "*Download*" button, click right, and click `Copy link address` to fill it in the cell below.

In [3]:
common_voice = load_dataset("common_voice", "uk", data_dir="./cv-corpus-6.1-2020-12-11", split="train+validation")
common_voice_test = load_dataset("common_voice", "uk", data_dir="./cv-corpus-6.1-2020-12-11", split="test")

Using custom data configuration uk-ad9f7b76efa9f3a0
Reusing dataset common_voice (/root/.cache/huggingface/datasets/common_voice/uk-ad9f7b76efa9f3a0/6.1.0/0041e06ab061b91d0a23234a2221e87970a19cf3a81b20901474cffffeb7869f)
Using custom data configuration uk-ad9f7b76efa9f3a0
Reusing dataset common_voice (/root/.cache/huggingface/datasets/common_voice/uk-ad9f7b76efa9f3a0/6.1.0/0041e06ab061b91d0a23234a2221e87970a19cf3a81b20901474cffffeb7869f)


In [4]:
common_voice = common_voice.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])
common_voice_test = common_voice_test.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])

## Transform to phoneme

In [6]:
import json
vocab_phon = json.load(open("/content/phones_matches_with_space_corrected_uk.json", 'rb'))
inv_vocab_phon = {v: k for k,v in vocab_phon.items()}
file_tokens = open("/content/validated_phones_with_space_corrected.txt", 'rb')

In [7]:
id_to_txt = dict()
for f in file_tokens:
  line_split = f.decode('utf8').strip().split()
  id = line_split[0].split('_')[-1]
  tokens_ids = line_split[1:]
  txt = ''.join([inv_vocab_phon[x] for x in tokens_ids])
  #txt = re.sub("[\(\[].*?[\)\]]", "", txt) 
  id_to_txt[id] = txt

In [8]:
def text2phoneme(batch):
  ID = batch['path'].split('_')[-1].split(".")[0]
  batch['sentence'] = id_to_txt[ID]
  return batch

In [9]:
common_voice = common_voice.map(text2phoneme)
common_voice_test = common_voice_test.map(text2phoneme)

HBox(children=(FloatProgress(value=0.0, max=7271.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3235.0), HTML(value='')))




## Preprocess

In [10]:
show_random_elements(common_voice.remove_columns(['path']), num_examples=30)

Unnamed: 0,sentence
0,didu didu tʃʲueti
1,burʃtin zustritʃʲjaitsja u vihɭjadi ʌkruhɭix ʃmatotʃʲkiv natitʃʲnix form
2,ja ni nɑ jotu pri tsʲomu ne baʒɑju skrivditi tatɑrski ɑbʌ tjurkski narodi
3,f pʌnidiɭʌk
4,i tomu ja bi duʒy xotif ɕob vi jakras pridiɭiɭi tsʲomu pitɑnnju biɭʃu uvɑhu
5,zɑraz evnux skʌsuvav nɑ nʲohʌ tʃʲi dobri tʃʲuv zɑras to vixodit pizniʃy
6,zɑraz nikoɭi meni mɑju kɭʌpoti nahɑɭniʃi
7,toptʌ tɑm nimɑi priznɑtʃʲinnja ɕodʌ tʌrhʌveɭnʌhʌ tsɛntru
8,ne vidɑju pʌtʃʲufsja piriɭjakanij ʃypit is kʌrotunki ɕos meni zɑ nohu
9,knjazʲ ʌdvernufsja pid stinu jot proviv rukoju po tisɑnix kʌɭodax


We are going to preprocess the text and remove some special symbol `,.?!;` as we don't have any language model at the output

## Building Vocabulary

As we are going to use a CTC (as top layer), we are going to classify speech chunks into letters, so now we will extract all distinct letters and build our vocabulary from that.

In [11]:
common_voice = common_voice.rename_column('sentence', 'text')
common_voice_test = common_voice_test.rename_column('sentence', 'text')

In [12]:
vocab_train = common_voice.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice.column_names)
vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Now we will create the union of all distinct letters from both dataset. We will do the same thing as when we are dealing with translation / generation task.

In [13]:
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{' ': 27,
 'a': 6,
 'b': 37,
 'd': 21,
 'e': 2,
 'f': 15,
 'h': 8,
 'i': 26,
 'j': 36,
 'k': 19,
 'l': 30,
 'm': 1,
 'n': 28,
 'o': 12,
 'p': 34,
 'r': 35,
 's': 23,
 't': 0,
 'u': 4,
 'v': 20,
 'w': 16,
 'x': 5,
 'y': 25,
 'z': 10,
 'ɑ': 29,
 'ɕ': 32,
 'ə': 24,
 'ɛ': 11,
 'ɡ': 7,
 'ɪ': 13,
 'ɭ': 9,
 'ɹ': 18,
 'ʃ': 33,
 'ʊ': 22,
 'ʌ': 17,
 'ʒ': 14,
 'ʲ': 31,
 'ː': 3}

In [14]:
# Adding the blank token, the unknown token and the padding token
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
print(f"Our final layer will have as output dimension {len(vocab_dict)}")

Our final layer will have as output dimension 40


In [15]:
# Now let's save our dictionary
parent_dir = ['/content/drive/MyDrive/speech_w2v', '/content/drive/MyDrive/3A/MVA/Speech & NLP/speech_w2v']
i = 0

with open(os.path.join(parent_dir[i], 'ukrainian_phonem_vocab.json'), 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

## XLSR Wav2Vec 2.0 Features Extractor

In [16]:
# Now we are going to open and store the audio file (represented as a numpy array)
common_voice = common_voice.map(speech_file_to_array_fn, remove_columns=common_voice.column_names)
common_voice_test = common_voice_test.map(speech_file_to_array_fn, remove_columns=common_voice_test.column_names)

HBox(children=(FloatProgress(value=0.0, max=7271.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3235.0), HTML(value='')))




In [17]:
# First we have to downsampled the original sample from 48 kHZ to 16kHZ
common_voice = common_voice.map(resample, num_proc=4)
common_voice_test = common_voice_test.map(resample, num_proc=4)

    

HBox(children=(FloatProgress(value=0.0, description='#3', max=1817.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='#1', max=1818.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='#2', max=1818.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='#0', max=1818.0, style=ProgressStyle(description_width='i…





    

HBox(children=(FloatProgress(value=0.0, description='#3', max=808.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='#1', max=809.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='#0', max=809.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='#2', max=809.0, style=ProgressStyle(description_width='in…







# Load locally if already saved the preprocess file

# Split in 10mn, 1h, 8h

In [7]:
# Loading tokenizer
tokenizer = Wav2Vec2CTCTokenizer(os.path.join(parent_dir[i], 'ukrainian_phonem_vocab.json'), unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
# Load Feature Extractor
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
# Wrap the feature_extractor and the tokenizer into one class (thanks so much HuggingFace)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

NameError: ignored

In [19]:
# Split into train/dev
np.random.seed(42)
data = common_voice.train_test_split(test_size=0.2, seed=42)
common_voice_train, common_voice_validation = data['train'], data['test']

In [20]:
# Now let's shuffle data
common_voice_train = common_voice_train.shuffle(seed=42)

In [21]:
total_len_seconds = 0
indices_10mn = []
indices_1h = []
indices_8h = []
for i in tqdm(range(len(common_voice_train))):
  speech_array, sampling_rate = common_voice_train[i]["speech"], common_voice_train[i]["sampling_rate"]
  duration_audio = len(speech_array) * (1/sampling_rate)
  if total_len_seconds <= 600:
    indices_10mn.append(i)
  if total_len_seconds <= 3600:
    indices_1h.append(i)
  if total_len_seconds <= 36000:
    indices_8h.append(i)
  total_len_seconds += duration_audio
  if total_len_seconds > 36000:
    break

HBox(children=(FloatProgress(value=0.0, max=5816.0), HTML(value='')))




In [22]:
common_voice_train_10mn = common_voice_train.select(indices_10mn)
common_voice_train_1h = common_voice_train.select(indices_1h)
common_voice_train_8h = common_voice_train.select(indices_8h)

In [24]:
common_voice_train_10mn = common_voice_train_10mn.map(prepare_dataset, remove_columns=common_voice_train_10mn.column_names, batch_size=8, num_proc=4, batched=True)
common_voice_train_1h = common_voice_train_1h.map(prepare_dataset, remove_columns=common_voice_train_1h.column_names, batch_size=8, num_proc=4, batched=True)
common_voice_train_8h = common_voice_train_8h.map(prepare_dataset, remove_columns=common_voice_train_8h.column_names, batch_size=8, num_proc=4, batched=True)
common_voice_validation = common_voice_validation.map(prepare_dataset, remove_columns=common_voice_validation.column_names, batch_size=8, num_proc=4, batched=True)
common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names, batch_size=8, num_proc=4, batched=True)

    

HBox(children=(FloatProgress(value=0.0, description='#0', max=182.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='#1', max=182.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='#3', max=182.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='#2', max=182.0, style=ProgressStyle(description_width='in…

  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)






    

HBox(children=(FloatProgress(value=0.0, description='#0', max=46.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description='#1', max=46.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description='#3', max=46.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description='#2', max=46.0, style=ProgressStyle(description_width='ini…

  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)






    

HBox(children=(FloatProgress(value=0.0, description='#0', max=102.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='#1', max=102.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='#3', max=101.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='#2', max=102.0, style=ProgressStyle(description_width='in…

  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)








In [25]:
common_voice_train_10mn.save_to_disk('/content/drive/MyDrive/speech_w2v/train_ukr_phonem_10mn.files')
common_voice_train_1h.save_to_disk('/content/drive/MyDrive/speech_w2v/train_ukr_phonem_1h.files')
common_voice_train_8h.save_to_disk('/content/drive/MyDrive/speech_w2v/train_ukr_phonem_8h.files')
common_voice_validation.save_to_disk('/content/drive/MyDrive/speech_w2v/validation_ukr_phonem.files')
common_voice_test.save_to_disk('/content/drive/MyDrive/speech_w2v/test_ukr_phonem.files')

# Training

In [8]:
common_voice_train_10mn = load_from_disk('/content/drive/MyDrive/speech_w2v/train_ukr_phonem_10mn.files')
common_voice_train_1h = load_from_disk('/content/drive/MyDrive/speech_w2v/train_ukr_phonem_1h.files')
common_voice_train_8h = load_from_disk('/content/drive/MyDrive/speech_w2v/train_ukr_phonem_8h.files')
common_voice_validation = load_from_disk('/content/drive/MyDrive/speech_w2v/validation_ukr_phonem.files')
common_voice_test = load_from_disk('/content/drive/MyDrive/speech_w2v/test_ukr_phonem.files')

In [9]:
parent_dir = ['/content/drive/MyDrive/speech_w2v', '/content/drive/MyDrive/3A/MVA/Speech & NLP/speech_w2v']
i = 0
# Loading tokenizer
tokenizer = Wav2Vec2CTCTokenizer(os.path.join(parent_dir[i], 'czeck_phonem_vocab.json'), unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
# Load Feature Extractor
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
# Wrap the feature_extractor and the tokenizer into one class (thanks so much HuggingFace)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [10]:
# Prepare our data collator
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
# Prepare our metric (wer_metric)
wer_metric = load_metric("wer")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1764.0, style=ProgressStyle(description…




# 1h

The first component of XLSR-Wav2Vec2 consists of a stack of CNN layers that are used to extract acoustically meaningful - but contextually independent - features from the raw speech signal. This part of the model has already been sufficiently trained during pretraining and as stated in the [paper](https://arxiv.org/pdf/2006.13979.pdf) does not need to be fine-tuned anymore. 
Thus, we can set the `requires_grad` to `False` for all parameters of the *feature extraction* part.

Therefore, I had to play around a bit with different values for dropout, SpecAugment's masking dropout rate, layer dropout, and the learning rate until training seemed to be stable enough.

In [11]:
# Cell for training

# Set seed
set_seed(42)

# fname = '/content/wav2vec_small_960h.pt'
# checkpoint = torch.load(fname)
# args = checkpoint["args"]

# Load model
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53", 
    # "facebook/wav2vec2-base-960h",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    gradient_checkpointing=True, 
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

# Freeze the feature extractor
model.freeze_feature_extractor()

#for param in model.wav2vec2.feature_projection.parameters():
    #param.requires_grad = False
#for param in model.wav2vec2.encoder.parameters():
    #param.requires_grad = False 

# Set to GPU
model.cuda()

# Get sampler
model_input_name = processor.feature_extractor.model_input_names[0]
sampler_train = trainer_pt_utils.LengthGroupedSampler(common_voice_train_1h, batch_size=12, model_input_name=model_input_name)
sampler_val = trainer_pt_utils.LengthGroupedSampler(common_voice_validation, batch_size=12, model_input_name=model_input_name)

# Get Loader
train_loader = DataLoader(common_voice_train_1h, batch_size=12, sampler=sampler_train, collate_fn=data_collator, num_workers=4)
valid_loader = DataLoader(common_voice_validation, batch_size=12, sampler=sampler_val, collate_fn=data_collator, num_workers=4)

#
learning_rate = 4e-4
n_epochs = 350

num_update_steps_per_epoch = len(train_loader)
max_steps = math.ceil(n_epochs * num_update_steps_per_epoch)
validation_freq = int(1*num_update_steps_per_epoch)
print_freq = int(1*num_update_steps_per_epoch)
scheduler_on_plateau_freq = int(num_update_steps_per_epoch)

# Optimizer
decay_parameters = trainer_pt_utils.get_parameter_names(model, [torch.nn.LayerNorm])
decay_parameters = [name for name in decay_parameters if "bias" not in name]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if n in decay_parameters],
        "weight_decay": 0.0,
    },
    {
        "params": [p for n, p in model.named_parameters() if n not in decay_parameters],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

# Scheduler
num_warmup_steps = int(8.5 * num_update_steps_per_epoch) # Neccessary Number of steps to go from 0.0 to lr 
#warmup_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, max_steps)
warmup_scheduler = get_polynomial_decay_schedule_with_warmup(optimizer, num_warmup_steps, max_steps, lr_end=1e-7)
reduce_lr_plateau = None
## reduce_lr_plateau = ReduceLROnPlateau(optimizer, factor=0.6, patience=7) ## To define when warmup scheduler is finished

model.zero_grad() 
current_total_steps = 0
current_best_wer = 2.0

for epoch in range(n_epochs):
  print(f"EPOCH : {epoch}")
  tr_loss = 0.0

  epoch_step = 0
  for step, inputs in enumerate(train_loader):
    model.train()
    inputs = prepare_inputs(inputs)
    loss = compute_loss(model, inputs)

    loss.backward()

    tr_loss += loss.item()

    if hasattr(optimizer, "clip_grad_norm"):
      optimizer.clip_grad_norm(1.0)
    elif hasattr(model, "clip_grad_norm_"):
      model.clip_grad_norm_(1.0)
    
    optimizer.step()

    current_total_steps += 1
    epoch_step += 1
    #if current_total_steps < num_warmup_steps + 1:
      #warmup_scheduler.step()
    
    warmup_scheduler.step()

    if current_total_steps % print_freq == 0:
      print(f"Training Loss : {tr_loss/epoch_step}")
    
    # Initialize the lronplateau as soon as  we have finished the warmup 
    #if reduce_lr_plateau is None and current_total_steps > num_warmup_steps + 1:
      #reduce_lr_plateau = ReduceLROnPlateau(optimizer, factor=0.7, patience=5, verbose=1)
    
    model.zero_grad()

    if current_total_steps % validation_freq == 0:
      world_size = 1
      per_score = prediction_loop(valid_loader, model, world_size)
      eval_metric = per_score
      print(f"ACTUAL PER : {eval_metric}")

      if eval_metric < current_best_wer:
        print("Hooray! New best wer validation. Saving model")
        torch.save(model.state_dict(), os.path.join(parent_dir[i], 'ukr_per_1h.pt'))
        current_best_wer = eval_metric

      #if reduce_lr_plateau is not None:
        #reduce_lr_plateau.step(eval_metric)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1451.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1261920069.0, style=ProgressStyle(descr…




Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


EPOCH : 0
Training Loss : 15.85017663637797
ACTUAL PER : 0.976071858029117
Hooray! New best wer validation. Saving model
EPOCH : 1
Training Loss : 4.711503624916077
ACTUAL PER : 0.9762634853702475
EPOCH : 2
Training Loss : 3.2654444058736165
ACTUAL PER : 0.9761689897790949
EPOCH : 3
Training Loss : 3.1779544075330097
ACTUAL PER : 0.9762678676826443
EPOCH : 4
Training Loss : 3.141517734527588
ACTUAL PER : 0.976168578126784
EPOCH : 5
Training Loss : 3.13096969127655
ACTUAL PER : 0.9761691665442181
EPOCH : 6
Training Loss : 3.118327724933624
ACTUAL PER : 0.9761759883292088
EPOCH : 7
Training Loss : 3.1018651525179544
ACTUAL PER : 0.9761996234978667
EPOCH : 8
Training Loss : 3.0698891321818036
ACTUAL PER : 0.9761678505603104
EPOCH : 9
Training Loss : 3.0624303181966144
ACTUAL PER : 0.9760607392417805
Hooray! New best wer validation. Saving model
EPOCH : 10
Training Loss : 3.0543327768643698
ACTUAL PER : 0.9761677304264031
EPOCH : 11
Training Loss : 3.034120774269104
ACTUAL PER : 0.97607056

# Test

In [12]:
model.load_state_dict(torch.load(os.path.join(parent_dir[i], 'ukr_per_1h.pt')))
sampler_test = trainer_pt_utils.LengthGroupedSampler(common_voice_test, batch_size=16, model_input_name=model_input_name)
test_loader = DataLoader(common_voice_test, batch_size=16, sampler=sampler_test, collate_fn=data_collator, num_workers=4)
per_score = prediction_loop(test_loader, model, world_size)
print(f"The final PER score on the test set is {per_score}")

The final PER score on the test set is 0.2142302041576593


Valid PER : 0.20076321384832446 / Test PER : 0.2142302041576593