In [None]:
%%capture
!pip install datasets==2.8.0
!pip install transformers==4.25.1
!pip install librosa
!pip install evaluate>=0.30
!pip install audiomentations
!pip install jiwer
!pip install gradio
!pip install torchaudio
!pip install tensorboardX
!pip install accelerate -U
!pip install hazm==0.7.0

In [None]:
!huggingface-cli login --token <"YOUR_HF_TOKEN">

In [None]:
import re
import hazm
import string
import os
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset, load_metric, Dataset, concatenate_datasets

In [None]:
# hazm normalizer instance; normalizer() calls its .normalize() on every sentence.
_normalizer = hazm.Normalizer()

# Characters stripped from transcripts. normalizer() joins these entries
# (unescaped) into a single regex character class.
# NOTE(review): because the entries are not escaped, the "-" sitting between
# "!" and ";" is interpreted as a range inside the character class, so more
# characters than the ones listed here get removed — confirm this is intended.
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
    "#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬",'ٔ', ",", "?",
    ".", "!", "-", ";", ":",'"',"“", "%", "‘", "”", "�", "–", "…", "_", "”", '“', '„',
    'ā', 'š',
]

# Also strip ASCII lowercase letters and ASCII digits.
chars_to_ignore = chars_to_ignore + list(string.ascii_lowercase + string.digits)

# Substring -> replacement table applied by multiple_replace() before the
# ignore list is applied. The first group maps variant / presentation-form
# characters (and a few letter+diacritic pairs) to a single canonical letter.
chars_to_mapping = {
    'ك': 'ک', 'دِ': 'د', 'بِ': 'ب', 'زِ': 'ز', 'ذِ': 'ذ', 'شِ': 'ش', 'سِ': 'س', 'ى': 'ی',
    'ي': 'ی', 'أ': 'ا', 'ؤ': 'و', "ے": "ی", "ۀ": "ه", "ﭘ": "پ", "ﮐ": "ک", "ﯽ": "ی",
    "ﺎ": "ا", "ﺑ": "ب", "ﺘ": "ت", "ﺧ": "خ", "ﺩ": "د", "ﺱ": "س", "ﻀ": "ض", "ﻌ": "ع",
    "ﻟ": "ل", "ﻡ": "م", "ﻢ": "م", "ﻪ": "ه", "ﻮ": "و", 'ﺍ': "ا", 'ة': "ه",
    'ﯾ': "ی", 'ﯿ': "ی", 'ﺒ': "ب", 'ﺖ': "ت", 'ﺪ': "د", 'ﺮ': "ر", 'ﺴ': "س", 'ﺷ': "ش",
    'ﺸ': "ش", 'ﻋ': "ع", 'ﻤ': "م", 'ﻥ': "ن", 'ﻧ': "ن", 'ﻭ': "و", 'ﺭ': "ر", "ﮔ": "گ",
    "۱۴ام": "۱۴ ام",

    # ASCII lowercase letters are replaced by space-padded Persian-script
    # strings (apparently the spoken letter names — confirm with the author).
    "a": " ای ", "b": " بی ", "c": " سی ", "d": " دی ", "e": " ایی ", "f": " اف ",
    "g": " جی ", "h": " اچ ", "i": " آی ", "j": " جی ", "k": " کی ", "l": " ال ",
    "m": " ام ", "n": " ان ", "o": " او ", "p": " پی ", "q": " کیو ", "r": " آر ",
    "s": " اس ", "t": " تی ", "u": " یو ", "v": " وی ", "w": " دبلیو ", "x": " اکس ",
    "y": " وای ", "z": " زد ",
    # Zero-width joiner/non-joiner, directional marks and BOM become a plain space.
    "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
}


def multiple_replace(text, chars_to_mapping):
    """Replace every occurrence of each mapping key in ``text`` in one pass.

    Args:
        text: Value to process; it is converted with ``str()`` first.
        chars_to_mapping: Dict of literal substring -> replacement string.

    Returns:
        The text with all keys replaced. With an empty mapping the text is
        returned unchanged.
    """
    text = str(text)
    # An empty mapping would yield an empty pattern that matches everywhere
    # and then fails the dict lookup, so short-circuit instead.
    if not chars_to_mapping:
        return text

    # Regex alternation takes the first alternative that matches, so longer
    # keys go first; otherwise a key that is a prefix of another key would
    # shadow it. sorted() is stable, so equal-length keys keep dict order.
    keys = sorted(chars_to_mapping, key=len, reverse=True)
    pattern = "|".join(map(re.escape, keys))
    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], text)

def remove_special_characters(text, chars_to_ignore_regex):
    """Delete everything matching ``chars_to_ignore_regex`` from ``text``.

    The result is lower-cased and a single trailing space is appended.
    """
    stripped = re.sub(chars_to_ignore_regex, '', text)
    return f"{stripped.lower()} "

def normalizer(row, chars_to_ignore=chars_to_ignore, chars_to_mapping=chars_to_mapping):
    """Normalize the ``sentence`` field of a dataset row.

    Steps: lower-case and strip, run the hazm normalizer, apply the
    substring replacements from ``chars_to_mapping``, remove the characters in
    ``chars_to_ignore``, collapse repeated spaces and, when a number-to-words
    converter named ``words`` is available, spell out integer tokens.

    Args:
        row: Mapping with a ``'sentence'`` string; it is updated in place.
        chars_to_ignore: Iterable of characters joined into a regex character
            class and removed from the text.
        chars_to_mapping: Dict of substring -> replacement.

    Returns:
        The same ``row`` with the normalized sentence, or ``None`` when
        nothing is left after normalization.
    """
    text = row['sentence']
    # NOTE(review): entries are joined unescaped, so a "-" between two other
    # entries acts as a range inside the character class.
    chars_to_ignore_regex = f"""[{"".join(chars_to_ignore)}]"""
    text = text.lower().strip()

    text = _normalizer.normalize(text)
    text = multiple_replace(text, chars_to_mapping)
    text = remove_special_characters(text, chars_to_ignore_regex)
    text = re.sub(" +", " ", text)

    # `words` is not defined or imported anywhere in this notebook. Look it up
    # lazily so the function works with or without it (e.g. after the user
    # runs `from num2fawords import words`). Previously the resulting
    # NameError was swallowed by a bare `except`, which then appended the
    # already-converted int and made the join below raise TypeError.
    number_to_words = globals().get("words")

    tokens = []
    for word in text.split():
        spelled = word
        if number_to_words is not None:
            try:
                # int() also accepts non-ASCII decimal digits such as Persian ones.
                number = int(word)
            except ValueError:
                number = None  # not an integer token
            if number is not None:
                try:
                    spelled = str(number_to_words(number))
                except Exception:
                    # Best effort: keep the original token if the converter fails.
                    spelled = word
        tokens.append(spelled)

    text = " ".join(tokens).strip()

    if not text:
        return None

    row['sentence'] = text
    return row

In [None]:
from datasets import load_dataset, DatasetDict

# Load the Persian ("fa") train and validation splits of Common Voice 11.0.
common_voice = DatasetDict({
    split_name: load_dataset("mozilla-foundation/common_voice_11_0", "fa", split=split_name)
    for split_name in ("train", "validation")
})

# Normalize every transcript, then reuse the normalized training split as the
# starting point for the augmented copy.
common_voice = common_voice.map(normalizer)
common_voice_augmented = DatasetDict({"train": common_voice["train"]})

print(common_voice)

Found cached dataset common_voice_11_0 (/home/jupyter/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/fa/11.0.0/3f27acf10f303eac5b6fbbbe02495aeddb46ecffdb0a2fe3507fcfbf89094631)
Found cached dataset common_voice_11_0 (/home/jupyter/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/fa/11.0.0/3f27acf10f303eac5b6fbbbe02495aeddb46ecffdb0a2fe3507fcfbf89094631)


  0%|          | 0/26951 [00:00<?, ?ex/s]

  0%|          | 0/10288 [00:00<?, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 26951
    })
    validation: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 10288
    })
})


In [None]:
# Keep only the audio and its transcript; every other column is dropped.
_metadata_columns = [
    "accent", "age", "client_id", "down_votes", "gender",
    "locale", "path", "segment", "up_votes",
]
common_voice = common_voice.remove_columns(_metadata_columns)
common_voice_augmented = common_voice_augmented.remove_columns(_metadata_columns)

print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 26951
    })
    validation: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 10288
    })
})


In [None]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor

# All three components come from the same checkpoint; the tokenizer and the
# processor are additionally configured for Persian transcription.
_checkpoint = "openai/whisper-small"
_decoding_options = {"language": "persian", "task": "transcribe"}

feature_extractor = WhisperFeatureExtractor.from_pretrained(_checkpoint)
tokenizer = WhisperTokenizer.from_pretrained(_checkpoint, **_decoding_options)
processor = WhisperProcessor.from_pretrained(_checkpoint, **_decoding_options)

In [None]:
# Sanity check: encode the first training transcript and decode it again,
# with and without the special tokens.
input_str = common_voice["train"][0]["sentence"]
labels = tokenizer(input_str).input_ids
decoded_with_special, decoded_str = (
    tokenizer.decode(labels, skip_special_tokens=skip_special)
    for skip_special in (False, True)
)

print(
    f"Input:                 {input_str}",
    f"Decoded w/ special:    {decoded_with_special}",
    f"Decoded w/out special: {decoded_str}",
    f"Are equal:             {input_str == decoded_str}",
    sep="\n",
)

Input:                 من برات یادداشت گذاشتم
Decoded w/ special:    <|startoftranscript|><|fa|><|transcribe|><|notimestamps|>من برات یادداشت گذاشتم<|endoftext|>
Decoded w/out special: من برات یادداشت گذاشتم
Are equal:             True


In [None]:
from datasets import Audio

# Declare the audio column of both dataset dicts as 16 kHz audio.
_target_sampling_rate = 16000
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=_target_sampling_rate))
common_voice_augmented = common_voice_augmented.cast_column(
    "audio", Audio(sampling_rate=_target_sampling_rate)
)

In [None]:
def prepare_dataset(batch):
    """Add Whisper model inputs to a single example.

    Reads ``batch["audio"]`` and ``batch["sentence"]`` and stores the
    extracted features under ``"input_features"`` and the tokenized
    transcript under ``"labels"``. Returns the same ``batch`` object.
    """
    waveform = batch["audio"]

    # Log-Mel features computed from the raw samples at their sampling rate.
    extracted = feature_extractor(waveform["array"], sampling_rate=waveform["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Token ids of the target transcript.
    encoded = tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids
    return batch

In [None]:
from audiomentations import (
    Compose,
    AddGaussianNoise,
    TimeStretch,
    PitchShift,
    AddGaussianSNR,
    GainTransition,
    HighPassFilter,
    LowPassFilter)

def prepare_dataset_augmented(batch):
    """Augment one example's audio and add Whisper model inputs.

    A random chain of audiomentations transforms is applied to
    ``batch["audio"]["array"]``; the augmented samples replace the original
    array, and features/labels are then computed as in ``prepare_dataset``.

    Args:
        batch: Example with an ``"audio"`` dict (``"array"``,
            ``"sampling_rate"``) and a ``"sentence"`` string.

    Returns:
        The same ``batch`` with ``"input_features"`` and ``"labels"`` added
        and the audio array replaced by its augmented version.
    """
    # Each transform fires independently with probability ``p``.
    # NOTE(review): the +/-100 dB gain range and the identical 1000 Hz
    # high-pass / low-pass cutoffs look very aggressive — confirm intended.
    augment = Compose([
        AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.7),
        TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
        PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
        AddGaussianSNR(min_snr_in_db=5.0, max_snr_in_db=40.0, p=0.5),
        GainTransition(min_gain_in_db=-100, max_gain_in_db=100, p=0.5),
        HighPassFilter(min_cutoff_freq=1000, max_cutoff_freq=1000, p=0.5),
        LowPassFilter(min_cutoff_freq=1000, max_cutoff_freq=1000, min_rolloff=24, max_rolloff=24, p=0.5),
    ])

    audio = batch["audio"]
    # Use the rate reported by the decoded audio rather than a hard-coded
    # 16000 so augmentation and feature extraction always agree.
    sampling_rate = audio["sampling_rate"]

    # Replace the samples with their augmented version.
    audio["array"] = augment(samples=audio["array"], sample_rate=sampling_rate)

    # compute log-Mel input features from the augmented audio array
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=sampling_rate).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

In [None]:
# Feature-extract the un-augmented splits. Later cells read `_common_voice`,
# so this line must run; with it commented out the notebook fails on a fresh
# kernel with a NameError.
_common_voice = common_voice.map(prepare_dataset, num_proc=2)

# Feature-extract an augmented copy of the training split.
_common_voice_augmented = DatasetDict()
_common_voice_augmented["train"] = common_voice_augmented["train"].map(prepare_dataset_augmented, num_proc=2)

    

#1:   0%|          | 0/13475 [00:00<?, ?ex/s]

#0:   0%|          | 0/13476 [00:00<?, ?ex/s]

In [None]:
import IPython.display as ipd
import numpy as np
import random

# Look at (and listen to) the first un-augmented training example.
original_sample = _common_voice['train'][0]
original_waveform = np.asarray(original_sample["audio"]["array"])

print("Target text:", original_sample["sentence"])
print("Input array shape:", original_waveform.shape)
print("Sampling rate:", original_sample['audio']["sampling_rate"])

ipd.Audio(data=original_waveform, autoplay=True, rate=16000)

Target text: من برات یادداشت گذاشتم
Input array shape: (62976,)
Sampling rate: 16000


In [None]:
import IPython.display as ipd
import numpy as np
import random

# Look at (and listen to) the first augmented training example.
augmented_sample = _common_voice_augmented['train'][0]
augmented_waveform = np.asarray(augmented_sample["audio"]["array"])

print("Target text:", augmented_sample["sentence"])
print("Input array shape:", augmented_waveform.shape)
print("Sampling rate:", augmented_sample['audio']["sampling_rate"])

ipd.Audio(data=augmented_waveform, autoplay=True, rate=16000)

Target text: من برات یادداشت گذاشتم
Input array shape: (62976,)
Sampling rate: 16000


In [None]:
# Print the DatasetDict summary (splits, columns, row counts) of the prepared data.
print(_common_voice)

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence', 'input_features', 'labels'],
        num_rows: 26951
    })
    validation: Dataset({
        features: ['audio', 'sentence', 'input_features', 'labels'],
        num_rows: 10288
    })
})


In [None]:
# The raw audio and transcript are dropped from both dataset dicts; only the
# remaining (model input) columns are kept.
_raw_columns = ['audio', 'sentence']
_common_voice = _common_voice.remove_columns(_raw_columns)
_common_voice_augmented = _common_voice_augmented.remove_columns(_raw_columns)

print(_common_voice)

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 26951
    })
    validation: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 10288
    })
})


In [None]:
# Build the final dataset in a NEW DatasetDict. Assigning into an alias of
# `_common_voice` would overwrite its "train" split in place, so re-running
# this cell would append the augmented data a second time.
_common_voice_final = DatasetDict(_common_voice)  # shallow copy of the split mapping
_common_voice_final['train'] = concatenate_datasets([_common_voice['train'], _common_voice_augmented['train']])
# Shuffle every split with a fixed seed.
_common_voice_final = _common_voice_final.shuffle(seed=42)

In [None]:
# Display the combined (original + augmented) and shuffled dataset summary.
_common_voice_final

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 53902
    })
    validation: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 10288
    })
})

In [None]:
# Upload every split of the final dataset to the given Hugging Face Hub dataset repo.
# NOTE(review): presumably relies on the login performed earlier in the notebook — confirm.
_common_voice_final.push_to_hub("mohammadh128/common_voice_fa_preprocessed_and_augmented_training_and_evaluation_11_0")

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/104 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/20 [00:00<?, ?it/s]