# Speech recognition for spoken Afrikaans/isiXhosa

This notebook is based on the *XLS-R fine-tuning* [notebook](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_Tune_XLSR_Wav2Vec2_on_Turkish_ASR_with_%F0%9F%A4%97_Transformers.ipynb#scrollTo=1XZ-kjweyTy_).

Author: Lucas Meyer

### Install Python libraries and Git Large File Storage (git-lfs)

In [None]:
# %%capture
!pip3 install -r requirements.txt
!apt install git-lfs

In [9]:
import json
import torch
import numpy as np
import IPython.display as ipd

from dataclasses import dataclass
from typing import Dict, List, Union

import evaluate

from transformers import Trainer, TrainingArguments
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers import AutoModelForCTC, Wav2Vec2CTCTokenizer

2023-08-29 00:43:41.600739: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 1. Log in to the Hugging Face Hub

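Log in with a Hugging Face access token that has **write** permissions (create one under *Settings → Access Tokens* on huggingface.co):
 - Do not store the token in this notebook. The token that was previously published in this cell must be treated as compromised and revoked.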

In [1]:
from huggingface_hub import notebook_login

# Show the interactive Hugging Face login widget; enter the access token there rather
# than writing it into the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## 2. Data

### 2.1 Load and preprocess data

In [2]:
import os
import pandas as pd
from load_Fleurs_ASR import load_Fleurs_ASR
from load_High_Quality_TTS import load_High_Quality_TTS
from load_Lwazi_ASR import load_Lwazi_ASR
from load_NCHLT import load_NCHLT

# Gather the rows produced by every corpus loader, in the same order as before.
# NOTE(review): each entry is presumably a (file_name, transcription) pair, matching the
# DataFrame columns below — confirm in the loader modules.
csv_entries = []
for load_corpus in (load_Fleurs_ASR, load_High_Quality_TTS, load_Lwazi_ASR, load_NCHLT):
    csv_entries += load_corpus()

DATA_DIR = "asr_dataset"
# Make sure the target directory exists; to_csv raises if it is missing.
os.makedirs(DATA_DIR, exist_ok=True)

# Write all rows to a single metadata.csv inside DATA_DIR.
metadata = pd.DataFrame(csv_entries, columns=['file_name', 'transcription'])
metadata.to_csv(path_or_buf=os.path.join(DATA_DIR, "metadata.csv"), sep=",", index=False)

2927
2420


100%|██████████| 200/200 [00:00<00:00, 8269.12it/s]
100%|██████████| 210/210 [00:00<00:00, 4283.84it/s]
100%|██████████| 210/210 [00:00<00:00, 921.89it/s] 


63131


100%|██████████| 210/210 [00:00<00:00, 939.99it/s]


3002


100%|██████████| 209/209 [00:00<00:00, 511.14it/s]


43881


100%|██████████| 209/209 [00:00<00:00, 1148.75it/s]


2770


In [3]:
from datasets import load_dataset

# Build the dataset from the files under DATA_DIR using the "audiofolder" loader.
# NOTE(review): presumably the transcriptions come from the metadata.csv written into
# DATA_DIR earlier in this notebook — confirm.
dataset = load_dataset("audiofolder", data_dir=DATA_DIR)

Resolving data files:   0%|          | 0/104164 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/23804 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/8835 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/104165 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Downloading data files:   0%|          | 0/23805 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Downloading data files:   0%|          | 0/8836 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [4]:
# Upload the dataset to the Hugging Face Hub repository named below.
dataset.push_to_hub("lucas-meyer/afrikaans-isixhosa-asr-dataset")

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/184 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

RuntimeError: Error while uploading 'data/train-00018-of-00184-91959b4a7daca36f.parquet' to the Hub.

In [19]:
from datasets import Audio

# For every split: resample the audio column to 16 kHz and rename the
# "transcription" column to "sentence".
train_set, val_set, test_set = (
    dataset[split_name]
    .cast_column("audio", Audio(sampling_rate=16_000))
    .rename_column("transcription", "sentence")
    for split_name in ("train", "validation", "test")
)

In [20]:
# Display the first training example (audio entry plus its sentence) as a sanity check.
train_set[0]

{'audio': {'path': '/home/kiff/Desktop/Speech-Recognition-Afrikaans-isiXhosa/src/asr_dataset/data/train/10001513089895169437.wav',
  'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         -3.58223915e-05, -8.10623169e-06,  1.15036964e-05]),
  'sampling_rate': 16000},
 'sentence': 'utshintsho olucetywayo sele lwapasiswa zizo zombini izindlu zowiso mthetho ngo2011'}

## 3 Prepare for training
### 3.1 Create tokenizer for our data
#### 3.1.1 Create vocabulary

In [21]:
def extract_all_chars(batch):
    """Join the sentences of a batch and list the distinct characters they contain.

    Args:
        batch: mapping whose "sentence" entry is an iterable of strings.

    Returns:
        A dict of two single-element lists: "vocab" wraps the list of unique
        characters (in no particular order) and "all_text" wraps the sentences
        joined by single spaces.
    """
    sentences = batch["sentence"]
    all_text = " ".join(sentences)
    unique_chars = set(all_text)
    return {
        "vocab": [list(unique_chars)],
        "all_text": [all_text],
    }

# Run extract_all_chars once per split; batch_size=-1 passes the whole split to the
# function as a single batch, and the original columns are dropped from the result.
vocab_train, vocab_val, vocab_test = (
    split.map(
        extract_all_chars,
        batched=True,
        batch_size=-1,
        keep_in_memory=True,
        remove_columns=split.column_names,
    )
    for split in (train_set, val_set, test_set)
)

# Union of the characters seen in train/val/test
vocab_list = list(
    set(vocab_train["vocab"][0])
    | set(vocab_test["vocab"][0])
    | set(vocab_val["vocab"][0])
)

# Give every character an integer id, following sorted character order
vocab_dict = {char: idx for idx, char in enumerate(sorted(vocab_list))}
# Store the space under the visible "|" key, keeping the id it already had
vocab_dict["|"] = vocab_dict.pop(" ")
# The special tokens take the next free ids
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

Map:   0%|          | 0/104164 [00:00<?, ? examples/s]

Map:   0%|          | 0/23804 [00:00<?, ? examples/s]

Map:   0%|          | 0/8835 [00:00<?, ? examples/s]

#### 3.1.2 Save vocabulary and create tokenizer

In [23]:
# Write the vocabulary to disk so the tokenizer can be built from the file
with open("vocab.json", "w") as vocab_file:
    vocab_file.write(json.dumps(vocab_dict))

# Character-level CTC tokenizer; "|" is the word delimiter stored in the vocabulary
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json",
    word_delimiter_token="|",
    unk_token="[UNK]",
    pad_token="[PAD]",
)

# Repository name, later used as the training output directory
repo_name = "wav2vec2-xls-r-300m-af-xh-all"
# repo_name = input("To what directory would you like to save your tokenizer?")
# tokenizer.push_to_hub(repo_name)

### 3.2 Prepare dataset using the Wav2Vec2 processor

In [24]:
# Feature extractor for raw waveforms sampled at 16 kHz: inputs are normalized,
# padded with 0.0 and returned together with an attention mask.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

# Bundle the feature extractor and the tokenizer into one processor object.
processor = Wav2Vec2Processor(
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
)

def prepare_dataset(batch):
    """Convert one example into model inputs and labels.

    Reads the "audio" entry (its "array" and "sampling_rate") and the "sentence"
    entry of the example, and adds three keys to it:

    * "input_values": first item of the processor's `input_values` for the audio,
    * "input_length": length of "input_values",
    * "labels": `input_ids` the processor produces for the sentence text.

    The example is modified in place and returned.
    """
    waveform = batch["audio"]
    encoded_audio = processor(waveform["array"], sampling_rate=waveform["sampling_rate"])

    batch["input_values"] = encoded_audio.input_values[0]
    batch["input_length"] = len(batch["input_values"])

    encoded_text = processor(text=batch["sentence"])
    batch["labels"] = encoded_text.input_ids
    return batch

# Apply prepare_dataset to every split and drop the original columns, so only the
# keys added by prepare_dataset remain.
# NOTE: each split variable is overwritten with its processed version, so this cell
# cannot be re-run without first recreating the raw splits (the "audio" column is gone).
train_set = train_set.map(prepare_dataset, remove_columns=train_set.column_names)
val_set = val_set.map(prepare_dataset, remove_columns=val_set.column_names)
test_set = test_set.map(prepare_dataset, remove_columns=test_set.column_names)

Map:   0%|          | 0/104164 [00:00<?, ? examples/s]

Exception ignored from cffi callback <function SoundFile._init_virtual_io.<locals>.vio_read at 0x7f988cb2fd00>:
Traceback (most recent call last):
  File "/home/kiff/.local/lib/python3.10/site-packages/soundfile.py", line 1246, in vio_read
    data_read = file.readinto(buf)
KeyboardInterrupt: 


### 3.3 Create collator with padding

In [None]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the inputs and labels of a batch.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples into a single batch.

        Args:
            features: examples that each provide "input_values" and "labels".

        Returns:
            The padded input batch (PyTorch tensors) with an added "labels" entry
            in which padded label positions are set to -100.
        """
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        # Pad the labels through the `labels=` argument instead of the deprecated
        # `as_target_processor()` context manager; this matches the `processor(text=...)`
        # call style used when the labels are created.
        labels_batch = self.processor.pad(
            labels=label_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

## 4. Load pretrained model

### 4.1 Create and download the model

In [None]:
# Download the pretrained checkpoint and configure it for CTC fine-tuning
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-xls-r-300m",
    attention_dropout=0.0,
    hidden_dropout=0.0,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.0,
    ctc_loss_reduction="mean",
    # padding id and vocabulary size are taken from the tokenizer inside the processor
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)

# Freeze feature extraction weights
model.freeze_feature_encoder()

# Enable gradient checkpointing
model.gradient_checkpointing_enable()

# Same repository name as the one set when the tokenizer was created
repo_name = "wav2vec2-xls-r-300m-af-xh-all"

Some weights of the model checkpoint at facebook/wav2vec2-xls-r-300m were not used when initializing Wav2Vec2ForCTC: ['quantizer.codevectors', 'quantizer.weight_proj.bias', 'project_q.bias', 'project_q.weight', 'quantizer.weight_proj.weight', 'project_hid.weight', 'project_hid.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it 

### 4.2 Prepare model for training

In [None]:
# Cache for the WER metric so that evaluate.load("wer") runs once instead of at every
# evaluation step.
_wer_metric = None


def compute_metrics(pred):
    """Compute the word error rate (WER) for one evaluation pass.

    Args:
        pred: object exposing `predictions` (logits whose last axis indexes the
            vocabulary) and `label_ids` (label ids with -100 at ignored positions).

    Returns:
        A dict with the single key "wer".
    """
    global _wer_metric
    if _wer_metric is None:
        _wer_metric = evaluate.load("wer")

    # greedy decoding: most likely token id at every position
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Put the pad token back where the collator wrote -100. np.where builds a new
    # array, so the caller's label_ids are left untouched.
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = _wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

# Training configuration. Checkpoints are written to `repo_name` and pushed to the Hub.
training_args = TrainingArguments(
    output_dir=repo_name,
    group_by_length=True,
    # Use the "input_length" column computed in prepare_dataset for length grouping,
    # so the Trainer does not have to read every "input_values" entry at startup.
    length_column_name="input_length",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    evaluation_strategy="steps",
    num_train_epochs=20,
    gradient_checkpointing=True,
    # fp16=True,
    fp16=False,
    # save, evaluate and log on the same 400-step schedule
    save_steps=400,
    eval_steps=400,
    logging_steps=400,
    learning_rate=3e-4,
    warmup_steps=500,
    save_total_limit=2,
    push_to_hub=True,
)

# Assemble the Trainer from the model, the training configuration, the two dataset
# splits, the padding collator and the WER metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

Cloning https://huggingface.co/lucas-meyer/wav2vec2-xls-r-300m-af-xh-all into local empty directory.


### 4.4 TRAIN

In [None]:
# Run the fine-tuning loop with the Trainer configured above.
trainer.train()

ValueError: Can only automatically infer lengths for datasets whose items are dictionaries with an 'input_values' key.

### 4.5 Load the fine-tuned model from the Hub

In [None]:
# Load the model and processor stored in the Hub repository "lucas-meyer/<repo_name>".
# NOTE: this overwrites the `model` and `processor` variables defined earlier in the notebook.
repo_name = "wav2vec2-xls-r-300m-af-xh-all"
model = AutoModelForCTC.from_pretrained(f"lucas-meyer/{repo_name}")
processor = Wav2Vec2Processor.from_pretrained(f"lucas-meyer/{repo_name}")

### 4.6 Use model for test predictions

In [None]:
# NOTE: this prediction loop is disabled. It reads `test_set_copy`, which is not defined
# in any visible cell; an unprocessed copy of the test split (still containing "sentence")
# would have to be kept before this code can be re-enabled.
# for i in range(20):
#     input_dict = processor(test_set[i]["input_values"],
#                             sampling_rate=16000,
#                             return_tensors="pt",
#                             padding=True)

#     logits = model(input_dict.input_values).logits
#     logits = logits.detach()
#     pred_ids = torch.argmax(logits, dim=-1)[0]
    
#     pred = processor.decode(pred_ids)
#     true = test_set_copy[i]["sentence"].lower()
    
#     print(f"Test {i}:")
#     print(f"  - pred: {pred}")
#     print(f"  - true: {true}\n")

In [None]:
# ipd.Audio(data=test_set_copy[15]["audio"]["array"], autoplay=False, rate=16000)