The steps are mostly based on these refrences:
- [Fine-tuning XLS-R for Multi-Lingual ASR with 🤗 Transformers](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2)
- [Fine_Tune_XLSR_Wav2Vec2_on_Persian_ShEMO](https://github.com/m3hrdadfi/notebooks/blob/main/Fine_Tune_XLSR_Wav2Vec2_on_Persian_ShEMO_ASR_with_%F0%9F%A4%97_Transformers_ipynb.ipynb)

You can consult them if you want to see a more detailed procedure.

The second one also contains valuable hints on how to preprocess the persian text for our purpose.

In [1]:
# Download nessecary libraries
!pip install datasets==2.10.0 --quiet
!pip install transformers --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h

## Loading the dataset

In [3]:
# If you wish your data to persist even when you shutdown colab, save your intermediate results to your google drive
# Then in the code you can change the saving path of files to ./drive/MyDrive/ path, which is your google drive disk
from os import path,system,mkdir
from google.colab import drive

drive.mount('/content/drive/')
if not path.exists('./drive/MyDrive/ASR_Colab'):
  mkdir('./drive/MyDrive/ASR_Colab')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [5]:
# modify the path to dataset if nessecary
dataet_path = './drive/MyDrive/ML_Project_ASR/dataset.zip'
if not path.exists('dataset'):
  system(f'unzip -qq "{dataet_path}" -d "/content/"')

In [6]:
# Load csv
# We use pandas for data import, and datasets lib to prepear our data.
# These two libs (pandas.Dataframe & datasets.Dataset) are convertable as shown bellow, so use whichever you find more convenient
import pandas as pd
from datasets import Dataset

transcripts = pd.read_csv('dataset/transcripts.csv')
ds = Dataset.from_pandas(transcripts)
transcripts.head()

Unnamed: 0,voice_filename,transcript,accent,gender,tone
0,voice_1.mp3,چرا این‌‌‌‌طور فکر می‌‌‌‌کنی؟,فارسی,male,question
1,voice_2.mp3,همیشه من و تو راجع به آن با هم صحبت کرده‌‌‌‌ایم,فارسی,male,normal
2,voice_3.mp3,دنیا در حال گذار به‌‌‌‌سمت پایداری است,فارسی,male,normal
3,voice_4.mp3,شاخصی که باید عملکرد تسلا را با آن اندازه بگیریم,فارسی,male,normal
4,voice_5.mp3,باید تعداد واقعاً غیرقابل‌‌‌‌تصوری باتری تولید...,فارسی,male,normal


In [7]:
# Take a look at unique letters in our dataset
from functools import reduce
present_chars = reduce(lambda a, b: set((*a,*b)), list(ds['transcript']))
print(present_chars)

{'ﯿ', 'ﺛ', '۶', 'ق', '8', 'خ', '؟', 'ﺕ', 'ذ', 'ﮐ', 'ﺁ', 'ـ', 'ﻮ', '.', 'ﺖ', 'ﯾ', ',', 'ﺪ', 'م', 'ی', '۳', 'ﺍ', 'ج', 'ﺎ', 'ﻬ', 'ﻏ', 'ﻒ', 'ﻨ', 'س', 'ﭘ', '۰', 'ﻫ', '/', ')', '٥', 'ژ', 'ﺴ', 'M', 'ﮔ', 'ض', '٪', 'ُ', '«', 'ﺣ', 'ؤ', 'ل', 'پ', 'ﮏ', '-', 'ﻌ', 'ﻃ', 'ﺼ', 'ﻠ', 'V', '۴', 'ط', 'ﻓ', '۹', 'ﮑ', 'ّ', 'و', '…', 'أ', '6', '\u202b', 'ﻥ', 'ﺑ', 'ﺭ', 'ﻦ', 'چ', 'ﺲ', 'ﺩ', '۵', 'ر', 'آ', '(', 'ﺫ', 'ﻭ', 'َ', ' ', 'ﺯ', 'ﻧ', 'د', 'ا', 'ﻔ', 'ظ', 'ِ', 'ٔ', 'ئ', 'ى', 'ۀ', 'S', '1', 'ﻝ', 'ح', '\u200c', 'ن', '۷', '4', 'ْ', 'ﻣ', 'ه', 'ء', '!', '"', '7', 'غ', 'ﺘ', 'ز', 'ث', '\xa0', 'ﺥ', 'ف', 'ﻩ', '2', 'ﻢ', '“', '\xad', '٬', '5', '\u202c', 'ك', 'ﺤ', 'ش', '–', '٫', '؛', '\n', 'ٍ', '۸', 'ي', '9', 'ص', ':', 'ت', 'ﺟ', 'ﯽ', 'ﺶ', '»', '٨', 'ﻤ', '0', 'ﺗ', '۱', '۲', 'ﺮ', 'ﯼ', 'ﺨ', 'ک', '،', '”', 'ﭽ', 'ﻪ', 'ب', 'ﺷ', '\t', '3', 'ﺳ', 'گ', 'ﻡ', 'ع', 'ﻖ', 'ً'}


## Preprocessing text & audio

In [8]:
# Some of the listed chars are the same, but have different representations(like 'ب' & 'ﺑ')
# They should get combined(one of them gets mapped to the other)
# complete the following dict:
import re

char_mappings = {'ك': 'ک', 'دِ': 'د', 'بِ': 'ب', 'زِ': 'ز', 'ذِ': 'ذ', 'شِ': 'ش', 'سِ': 'س', 'ى': 'ی',
                'ي': 'ی', 'أ': 'ا', 'ؤ': 'و', "ے": "ی", "ۀ": "ه", "ﭘ": "پ", "ﮐ": "ک", "ﯽ": "ی",
                "ﺎ": "ا", "ﺑ": "ب", "ﺘ": "ت", "ﺧ": "خ", "ﺩ": "د", "ﺱ": "س", "ﻀ": "ض", "ﻌ": "ع",
                "ﻟ": "ل", "ﻡ": "م", "ﻢ": "م", "ﻪ": "ه", "ﻮ": "و", 'ﺍ': "ا", 'ة': "ه",
                'ﯾ': "ی", 'ﯿ': "ی", 'ﺒ': "ب", 'ﺖ': "ت", 'ﺪ': "د", 'ﺮ': "ر", 'ﺴ': "س", 'ﺷ': "ش",
                'ﺸ': "ش", 'ﻋ': "ع", 'ﻤ': "م", 'ﻥ': "ن", 'ﻧ': "ن", 'ﻭ': "و", 'ﺭ': "ر", "ﮔ": "گ"}

def multiple_replace(batch, chars_to_mapping):
    pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
    batch['transcript'] = re.sub(pattern, lambda m: chars_to_mapping[m.group()], batch['transcript'])
    return batch

ds = ds.map(lambda batch: multiple_replace(batch,char_mappings))

Map:   0%|          | 0/6042 [00:00<?, ? examples/s]

In [9]:
# Some chars don't have any sound, so they should get removed
# Don't remove the ' ' (space) though, as the model should learn to predict when each word ends
# handle the transcripts containing numbers as you deem nessecary
# complete the following list:
import string

char_removals = ['ِ','\u200c','(',')','!','،','\u202c','«','…','ٍ','\n','ـ'] + list(string.ascii_letters + string.digits)
def remove_special_characters(batch,char_removals):
    chars_to_ignore_regex = f"""[{"".join(char_removals)}]"""
    batch['transcript'] = re.sub(chars_to_ignore_regex, '', batch['transcript']).lower() + " "
    return batch

ds = ds.map(lambda batch: remove_special_characters(batch,char_removals))

Map:   0%|          | 0/6042 [00:00<?, ? examples/s]

In [10]:
# The resulting vocab(list of letters):
vocab = reduce(lambda a, b: set((*a,*b)), ds['transcript'])
print(vocab)

{'ﺛ', '۶', 'ق', 'خ', '؟', 'ﺕ', 'ذ', 'ﺁ', '.', ',', 'م', 'ی', '۳', 'ج', 'ﻬ', 'ﻏ', 'ﻒ', 'ﻨ', 'س', '۰', 'ﻫ', '/', '٥', 'ژ', '٪', 'ض', 'ُ', 'ل', 'ﺣ', 'پ', 'ﮏ', '-', 'ﻃ', 'ﺼ', 'ﻠ', 'ط', '۴', 'ﻓ', '۹', 'ﮑ', 'ّ', 'و', '\u202b', 'ﻦ', 'چ', 'ﺲ', '۵', 'ر', 'آ', 'ﺫ', 'َ', ' ', 'ﺯ', 'د', 'ا', 'ﻔ', 'ظ', 'ٔ', 'ئ', 'ﻝ', 'ح', 'ن', '۷', 'ْ', 'ﻣ', 'ه', 'ء', '"', 'غ', 'ز', 'ث', '\xa0', 'ﺥ', 'ف', 'ﻩ', '“', '\xad', '٬', 'ﺤ', 'ش', '–', '٫', '؛', '۸', ':', 'ت', 'ص', 'ﺟ', 'ﺶ', '»', '٨', 'ﺗ', '۱', '۲', 'ﺨ', 'ﯼ', 'ک', '”', 'ﭽ', 'ب', '\t', 'ﺳ', 'گ', 'ع', 'ﻖ', 'ً'}


In [11]:
# Wav2Vec requires some special tokens to be added to vocab
# We also replace ' '(space) with '|' for more visibility
# The vocab should get saved as a json file and later get used by the model
vocab_dict = {v: k for k, v in enumerate(vocab)}

vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

import json
with open('./drive/MyDrive/ASR_Colab/vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

To know what is the role of tokenizer, feature extractor, data collator & etc. in this model, visit https://huggingface.co/blog/fine-tune-xlsr-wav2vec2

In [12]:
from transformers import Wav2Vec2CTCTokenizer,Wav2Vec2FeatureExtractor,Wav2Vec2Processor

tokenizer = Wav2Vec2CTCTokenizer("./drive/MyDrive/ASR_Colab/vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [13]:
# Tokenizing the transcripts and then load,convert to mono channel and resample audio files at 16 KHz
import librosa
import warnings

def prepare_dataset(batch):
  file_path = path.join('dataset','voices',batch['voice_filename'])
  with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    speech_array, sampling_rate = librosa.load(file_path,mono=True,sr=16000)

    batch["input_values"] = processor(speech_array, sampling_rate=16000).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    with processor.as_target_processor():
        batch["labels"] = processor(batch["transcript"]).input_ids

  return batch

ds = ds.map(prepare_dataset)

Map:   0%|          | 0/6042 [00:00<?, ? examples/s]

In [34]:
# To reduce GPU memory usage, filter out voice samples that are too long:
max_input_length_in_sec = 15
ds = ds.filter(lambda x: x < max_input_length_in_sec * processor.feature_extractor.sampling_rate, input_columns=["input_length"])

Filter:   0%|          | 0/6042 [00:00<?, ? examples/s]

In [35]:
ds = ds.train_test_split(0.2)

# A report on dataset length:
ds

DatasetDict({
    train: Dataset({
        features: ['voice_filename', 'transcript', 'accent', 'gender', 'tone', 'input_values', 'input_length', 'labels'],
        num_rows: 4611
    })
    test: Dataset({
        features: ['voice_filename', 'transcript', 'accent', 'gender', 'tone', 'input_values', 'input_length', 'labels'],
        num_rows: 1153
    })
})

In [37]:
# Save for later use
ds.save_to_disk("./drive/MyDrive/ASR_Colab/dataset.hf")

Flattening the indices:   0%|          | 0/4611 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/4611 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/1153 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/1153 [00:00<?, ? examples/s]