In [None]:
%%capture
!pip install datasets==2.8.0
!pip install transformers==4.26
!pip install librosa
!pip install evaluate>=0.30
!pip install audiomentations
!pip install jiwer
!pip install gradio
!pip install torchaudio
!pip install tensorboardX
!pip install accelerate -U
!pip install hazm==0.7.0

In [None]:
!huggingface-cli login --token <"YOUR_HF_TOKEN">

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/jupyter/.cache/huggingface/token
Login successful


In [None]:
import re
import hazm
import string
import os
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset, load_metric, Dataset, concatenate_datasets, load_from_disk

In [None]:
# Single hazm normalizer instance, reused by normalizer() for every row.
_normalizer = hazm.Normalizer()

# Characters to strip from transcripts. normalizer() concatenates these entries,
# unescaped, into one regex character class "[...]". Consequences of that:
#   * multi-character entries simply contribute each of their characters;
#   * duplicate entries are harmless;
#   * every "-" entry sits between "!" and ";" in the joined string, so the
#     regex reads it as the range "!-;" rather than as a literal hyphen.
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
    "#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬",'ٔ', ",", "?",
    ".", "!", "-", ";", ":",'"',"“", "%", "‘", "”", "�", "–", "…", "_", "”", '“', '„',
    'ā', 'š',
]

# ASCII lowercase letters and ASCII digits are added to the strip list as well.
chars_to_ignore = chars_to_ignore + list(string.ascii_lowercase + string.digits)

# Substring -> replacement table applied by multiple_replace() inside
# normalizer(), before the characters above are stripped.
# First group: variant letter forms mapped to one canonical letter, plus one
# special-cased token.
chars_to_mapping = {
    'ك': 'ک', 'دِ': 'د', 'بِ': 'ب', 'زِ': 'ز', 'ذِ': 'ذ', 'شِ': 'ش', 'سِ': 'س', 'ى': 'ی',
    'ي': 'ی', 'أ': 'ا', 'ؤ': 'و', "ے": "ی", "ۀ": "ه", "ﭘ": "پ", "ﮐ": "ک", "ﯽ": "ی",
    "ﺎ": "ا", "ﺑ": "ب", "ﺘ": "ت", "ﺧ": "خ", "ﺩ": "د", "ﺱ": "س", "ﻀ": "ض", "ﻌ": "ع",
    "ﻟ": "ل", "ﻡ": "م", "ﻢ": "م", "ﻪ": "ه", "ﻮ": "و", 'ﺍ': "ا", 'ة': "ه",
    'ﯾ': "ی", 'ﯿ': "ی", 'ﺒ': "ب", 'ﺖ': "ت", 'ﺪ': "د", 'ﺮ': "ر", 'ﺴ': "س", 'ﺷ': "ش",
    'ﺸ': "ش", 'ﻋ': "ع", 'ﻤ': "م", 'ﻥ': "ن", 'ﻧ': "ن", 'ﻭ': "و", 'ﺭ': "ر", "ﮔ": "گ",
    "۱۴ام": "۱۴ ام",

    # Each ASCII lowercase letter is replaced by a space-padded Persian string.
    "a": " ای ", "b": " بی ", "c": " سی ", "d": " دی ", "e": " ایی ", "f": " اف ",
    "g": " جی ", "h": " اچ ", "i": " آی ", "j": " جی ", "k": " کی ", "l": " ال ",
    "m": " ام ", "n": " ان ", "o": " او ", "p": " پی ", "q": " کیو ", "r": " آر ",
    "s": " اس ", "t": " تی ", "u": " یو ", "v": " وی ", "w": " دبلیو ", "x": " اکس ",
    "y": " وای ", "z": " زد ",
    # The code points U+200C, U+200D, U+200E, U+200F and U+FEFF become a space.
    "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
}


def multiple_replace(text, chars_to_mapping):
    """Replace every occurrence of each mapping key in ``text`` in a single pass.

    Args:
        text: Value to process; it is converted with ``str()`` first.
        chars_to_mapping: Dict mapping a substring to its replacement.

    Returns:
        The converted string with all keys replaced. When one key is a prefix
        of another, the longer key wins. An empty mapping returns the string
        unchanged.
    """
    text = str(text)
    if not chars_to_mapping:
        # An empty alternation would match the empty string at every position
        # and then fail the dictionary lookup, so return early.
        return text

    # Regex alternation picks the first alternative that matches, so longer
    # keys must come first or a short key would shadow a longer key that
    # starts with it. sorted() is stable, so equal-length keys keep dict order.
    keys = sorted(chars_to_mapping, key=len, reverse=True)
    pattern = "|".join(map(re.escape, keys))
    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], text)

def remove_special_characters(text, chars_to_ignore_regex):
    """Delete all matches of a regex from ``text`` and lowercase the rest.

    Args:
        text: String to clean.
        chars_to_ignore_regex: Regular expression whose matches are removed.

    Returns:
        The cleaned, lowercased string followed by one trailing space.
    """
    stripped = re.compile(chars_to_ignore_regex).sub("", text)
    lowered = stripped.lower()
    return lowered + " "

def normalizer(row, chars_to_ignore=chars_to_ignore, chars_to_mapping=chars_to_mapping):
    """Normalize the ``sentence`` field of one dataset row.

    Steps, in order: lowercase and strip, hazm normalization, substring
    mapping via ``multiple_replace``, removal of the ignored characters,
    whitespace collapsing, and conversion of all-digit tokens to words when a
    converter is available.

    Args:
        row: Mapping with a ``'sentence'`` string.
        chars_to_ignore: Strings joined into one regex character class whose
            matches are removed.
        chars_to_mapping: Substring -> replacement table.

    Returns:
        ``row`` with ``row['sentence']`` replaced by the normalized text, or
        ``None`` (row left untouched) when the normalized text is empty.
        NOTE(review): this notebook passes the function to ``map``; a ``None``
        return presumably leaves the row unchanged rather than dropping it -
        confirm that is the intent.
    """
    # NOTE(review): the entries are interpolated unescaped, so a "-" entry that
    # lands between two other characters is read as a range operator. The
    # pattern is kept exactly as it was so existing transcripts do not change.
    chars_to_ignore_regex = f"""[{"".join(chars_to_ignore)}]"""

    text = row['sentence'].lower().strip()
    text = _normalizer.normalize(text)
    text = multiple_replace(text, chars_to_mapping)
    text = remove_special_characters(text, chars_to_ignore_regex)

    # NOTE(review): `words` is not defined or imported anywhere in the visible
    # notebook (presumably a number-to-words helper - confirm and import it).
    # It is looked up defensively so that a missing helper keeps the digits
    # instead of leaving an int in the token list, which breaks " ".join().
    number_to_words = globals().get("words")

    tokens = []
    # str.split() with no argument already collapses runs of whitespace.
    for token in text.split():
        try:
            number = int(token)
        except ValueError:
            # Not an all-digit token: keep it as is.
            tokens.append(token)
            continue
        if callable(number_to_words):
            tokens.append(number_to_words(number))
        else:
            tokens.append(token)

    text = " ".join(tokens)

    if not text:
        return None

    row['sentence'] = text
    return row

In [None]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor

# Preprocessing objects for the "openai/whisper-small" checkpoint. The feature
# extractor is loaded without language/task arguments; the tokenizer and the
# processor are both loaded with language="persian" and task="transcribe".
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer, processor = [
    loader.from_pretrained("openai/whisper-small", language="persian", task="transcribe")
    for loader in (WhisperTokenizer, WhisperProcessor)
]

In [None]:
from datasets import load_dataset, DatasetDict

# Load the "fa" configuration of Common Voice 11.0, one entry per split.
common_voice = DatasetDict(
    {
        split: load_dataset("mozilla-foundation/common_voice_11_0", "fa", split=split)
        for split in ("train", "validation")
    }
)

# Normalize every transcript, then drop the metadata columns that are not used
# afterwards.
common_voice = common_voice.map(normalizer).remove_columns(
    [
        "accent",
        "age",
        "client_id",
        "down_votes",
        "gender",
        "locale",
        "path",
        "segment",
        "up_votes",
    ]
)

print(common_voice)

Found cached dataset common_voice_11_0 (/home/jupyter/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/fa/11.0.0/3f27acf10f303eac5b6fbbbe02495aeddb46ecffdb0a2fe3507fcfbf89094631)
Found cached dataset common_voice_11_0 (/home/jupyter/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/fa/11.0.0/3f27acf10f303eac5b6fbbbe02495aeddb46ecffdb0a2fe3507fcfbf89094631)


  0%|          | 0/26951 [00:00<?, ?ex/s]

  0%|          | 0/10288 [00:00<?, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 26951
    })
    validation: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 10288
    })
})


In [None]:
from datasets import Audio
# Re-declare the "audio" column as an Audio feature with a 16000 Hz sampling
# rate (prepare_dataset later reads audio["sampling_rate"] from each example).
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
def prepare_dataset(batch):
    """Add model inputs and target ids to one example.

    Writes two keys into ``batch`` and returns it:
      * ``"input_features"``: first entry of the feature extractor's
        ``input_features`` for the example's audio array and sampling rate.
      * ``"labels"``: ``input_ids`` produced by the tokenizer for the
        example's ``"sentence"``.
    """
    clip = batch["audio"]

    # Audio -> input features.
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Transcript -> label ids.
    encoded = tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids

    return batch


# Apply the preparation to every split using 6 worker processes.
common_voice = common_voice.map(prepare_dataset, num_proc=6)

        

#3:   0%|          | 0/4492 [00:00<?, ?ex/s]

#1:   0%|          | 0/4492 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/4492 [00:00<?, ?ex/s]

   

#0:   0%|          | 0/4492 [00:00<?, ?ex/s]

#5:   0%|          | 0/4491 [00:00<?, ?ex/s]

#4:   0%|          | 0/4492 [00:00<?, ?ex/s]

        

#3:   0%|          | 0/1715 [00:00<?, ?ex/s]

#0:   0%|          | 0/1715 [00:00<?, ?ex/s]

  

#2:   0%|          | 0/1715 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/1714 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/1714 [00:00<?, ?ex/s]

#1:   0%|          | 0/1715 [00:00<?, ?ex/s]

In [None]:
# Drop the raw 'audio' and 'sentence' columns; the data collator only reads
# "input_features" and "labels".
# NOTE(review): re-running this cell without re-running the ones above will
# presumably fail because the columns are already gone.
common_voice = common_voice.remove_columns(['audio', 'sentence'])

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Union


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate prepared examples into one padded batch of tensors.

    Audio features and label ids are padded separately because they have
    different lengths and need different padding methods.

    Attributes:
        processor: Object exposing ``feature_extractor.pad`` and
            ``tokenizer.pad``.
        decoder_start_token_id: Token id that is removed from the front of the
            labels when every sequence in the batch starts with it. When left
            as ``None`` the tokenizer's ``bos_token_id`` is used, which is the
            behaviour this class had before the field existed.
    """

    processor: Any
    decoder_start_token_id: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build the batch.

        Args:
            features: Examples, each with ``"input_features"`` and ``"labels"``.

        Returns:
            The padded feature batch with an added ``"labels"`` tensor in
            which padded positions are set to -100.
        """
        # Audio inputs: hand them to the feature extractor to get torch tensors.
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Label ids: pad them to the longest sequence in the batch.
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding with -100 so those positions are ignored by the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # If the tokenizer already put the start token in front of every label
        # sequence, cut it here because it is prepended again later. The
        # tokenizer's bos_token_id is not guaranteed to be that start token,
        # so an explicitly supplied id takes precedence.
        start_token_id = self.decoder_start_token_id
        if start_token_id is None:
            start_token_id = self.processor.tokenizer.bos_token_id
        if (labels[:, 0] == start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


# Whisper label sequences begin with "<|startoftranscript|>". Its id is looked
# up from the tokenizer because the model is only loaded in a later cell.
# NOTE(review): confirm this id equals model.config.decoder_start_token_id.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=processor.tokenizer.convert_tokens_to_ids("<|startoftranscript|>"),
)

In [None]:
import evaluate
metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute the word error rate, in percent, for one evaluation pass.

    Args:
        pred: Object with ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 at ignored
            positions).

    Returns:
        ``{"wer": value}`` where ``value`` is 100 times the metric's result.
    """
    pred_ids = pred.predictions

    # Swap -100 for the pad token id so the labels can be decoded. np.where
    # builds a new array, so the caller's ``pred.label_ids`` is left intact
    # instead of being overwritten in place.
    label_ids = np.where(pred.label_ids == -100, tokenizer.pad_token_id, pred.label_ids)

    # Decode both sides to text, dropping special tokens.
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [None]:
from transformers import WhisperForConditionalGeneration
# Load the pretrained "openai/whisper-small" weights, passing use_cache=False.
# NOTE(review): presumably the cache is disabled because the training
# arguments enable gradient checkpointing - confirm.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small", use_cache = False)

In [None]:
# Clear the forced decoder ids and empty the suppressed-token list on the
# model config.
# NOTE(review): presumably so that generation during evaluation is not
# constrained by the checkpoint's preset tokens - confirm.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [None]:
from transformers import Seq2SeqTrainingArguments

# NOTE(review): the training log recorded in this notebook reports a per-device
# batch size of 4, 1 gradient-accumulation step and a different checkpoint
# path, so it was presumably produced with other values than the ones below.
training_args = Seq2SeqTrainingArguments(
    # Output location and logging backend.
    output_dir="whisper_small-fa_v01",
    report_to=["tensorboard"],
    # Optimisation schedule.
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=5000,
    # Batch sizes.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    # Memory and precision settings.
    gradient_checkpointing=True,
    fp16=True,
    # Evaluate, save and log every 500 steps.
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=500,
    # Evaluation uses generated sequences.
    predict_with_generate=True,
    generation_max_length=225,
    # Lower WER is better.
    # NOTE(review): load_best_model_at_end is not set - confirm whether the
    # best checkpoint is meant to be reloaded after training.
    metric_for_best_model="wer",
    greater_is_better=False,
)

PyTorch: setting up devices


In [None]:
from transformers import Seq2SeqTrainer

# Assemble the trainer from the objects built in the cells above.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    # Data.
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["validation"],
    data_collator=data_collator,
    # Evaluation.
    compute_metrics=compute_metrics,
    # The feature extractor is passed in the `tokenizer` slot; the recorded log
    # shows it being saved next to each checkpoint.
    tokenizer=processor.feature_extractor,
)

max_steps is given, it will override any value given in num_train_epochs
Using cuda_amp half precision backend


In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell's result.
trainer.train()

***** Running training *****
  Num examples = 26951
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 5000
  Number of trainable parameters = 241734912


Step,Training Loss,Validation Loss,Wer
500,1.1796,0.504016,57.715051
1000,0.3412,0.414977,45.9097
1500,0.2699,0.378401,41.061453
2000,0.252,0.342005,39.632134
2500,0.2229,0.313812,37.457937
3000,0.2011,0.300456,34.493123
3500,0.1918,0.276289,32.461041
4000,0.1732,0.262007,31.850109
4500,0.1634,0.254489,31.000686
5000,0.161,0.250763,30.688686


***** Running Evaluation *****
  Num examples = 10288
  Batch size = 4
Saving model checkpoint to mohammadh128/whisper_small-fa_v01/checkpoint-500
Configuration saved in mohammadh128/whisper_small-fa_v01/checkpoint-500/config.json
Configuration saved in mohammadh128/whisper_small-fa_v01/checkpoint-500/generation_config.json
Model weights saved in mohammadh128/whisper_small-fa_v01/checkpoint-500/pytorch_model.bin
Feature extractor saved in mohammadh128/whisper_small-fa_v01/checkpoint-500/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 10288
  Batch size = 4
Saving model checkpoint to mohammadh128/whisper_small-fa_v01/checkpoint-1000
Configuration saved in mohammadh128/whisper_small-fa_v01/checkpoint-1000/config.json
Configuration saved in mohammadh128/whisper_small-fa_v01/checkpoint-1000/generation_config.json
Model weights saved in mohammadh128/whisper_small-fa_v01/checkpoint-1000/pytorch_model.bin
Feature extractor saved in mohammadh128/whisper_small-fa_v01/ch

TrainOutput(global_step=5000, training_loss=0.3156125183105469, metrics={'train_runtime': 47569.336, 'train_samples_per_second': 0.42, 'train_steps_per_second': 0.105, 'total_flos': 5.7717080064e+18, 'train_loss': 0.3156125183105469, 'epoch': 0.74})