In [1]:
%%capture
!pip install jiwer
!pip install gradio
!pip install librosa
!pip install fuzzywuzzy
!pip install torchaudio
!pip install tensorboardX
!pip install sentencepiece
!pip install accelerate -U
!pip install evaluate>=0.30
!pip install audiomentations
!pip install datasets==2.8.0
!pip install transformers==4.25.1
!pip install hazm==0.7.0

In [2]:
import os
import re
import hazm
import string
import torch
import warnings
import evaluate
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from transformers import pipeline
from torch.utils.data import Dataset
from transformers.pipelines.pt_utils import KeyDataset
from sklearn.model_selection import train_test_split
from datasets import load_dataset, load_metric, Dataset, concatenate_datasets, load_from_disk

# Word-error-rate scorer; `metric.compute(...)` is called in the final cell.
metric = evaluate.load("wer")

# Silence UserWarning instances whose originating module name matches "transformers".
warnings.filterwarnings(
    action="ignore",
    category=UserWarning,
    module="transformers",
)

In [3]:
# Shared hazm normalizer instance; `normalizer` calls its `.normalize()` on every sentence.
_normalizer = hazm.Normalizer()

# Strings that `normalizer` concatenates (unescaped) into a single regex
# character class and deletes from each sentence. Duplicates and multi-character
# entries are harmless there: only the set of individual characters matters.
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
    "#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬",'ٔ', ",", "?", 
    ".", "!", "-", ";", ":",'"',"“", "%", "‘", "”", "�", "–", "…", "_", "”", '“', '„',
    'ā', 'š',
#     "ء",
]

# The transcripts are Farsi, so ASCII lowercase letters and ASCII digits are
# added to the characters to delete as well.
chars_to_ignore = chars_to_ignore + list(string.ascii_lowercase + string.digits)

# Substring -> replacement table consumed by `multiple_replace`; `normalizer`
# applies it before the characters in `chars_to_ignore` are removed.
chars_to_mapping = {
    # Letter variants, letter+diacritic pairs and presentation forms, each folded
    # onto a single plain letter.
    'ك': 'ک', 'دِ': 'د', 'بِ': 'ب', 'زِ': 'ز', 'ذِ': 'ذ', 'شِ': 'ش', 'سِ': 'س', 'ى': 'ی',
    'ي': 'ی', 'أ': 'ا', 'ؤ': 'و', "ے": "ی", "ۀ": "ه", "ﭘ": "پ", "ﮐ": "ک", "ﯽ": "ی",
    "ﺎ": "ا", "ﺑ": "ب", "ﺘ": "ت", "ﺧ": "خ", "ﺩ": "د", "ﺱ": "س", "ﻀ": "ض", "ﻌ": "ع",
    "ﻟ": "ل", "ﻡ": "م", "ﻢ": "م", "ﻪ": "ه", "ﻮ": "و", 'ﺍ': "ا", 'ة': "ه",
    'ﯾ': "ی", 'ﯿ': "ی", 'ﺒ': "ب", 'ﺖ': "ت", 'ﺪ': "د", 'ﺮ': "ر", 'ﺴ': "س", 'ﺷ': "ش",
    'ﺸ': "ش", 'ﻋ': "ع", 'ﻤ': "م", 'ﻥ': "ن", 'ﻧ': "ن", 'ﻭ': "و", 'ﺭ': "ر", "ﮔ": "گ",
        
    # "ها": "  ها", "ئ": "ی",
    # One hard-coded multi-character fix: insert a space inside this exact token.
    "۱۴ام": "۱۴ ام",
        
    # Each Latin lowercase letter becomes a space-padded Farsi string
    # (these look like the spoken letter names — confirm).
    "a": " ای ", "b": " بی ", "c": " سی ", "d": " دی ", "e": " ایی ", "f": " اف ",
    "g": " جی ", "h": " اچ ", "i": " آی ", "j": " جی ", "k": " کی ", "l": " ال ",
    "m": " ام ", "n": " ان ", "o": " او ", "p": " پی ", "q": " کیو ", "r": " آر ",
    "s": " اس ", "t": " تی ", "u": " یو ", "v": " وی ", "w": " دبلیو ", "x": " اکس ",
    "y": " وای ", "z": " زد ",
    # U+200C/U+200D (zero-width non-joiner/joiner), U+200E/U+200F (directional
    # marks) and U+FEFF (BOM / zero-width no-break space) become a plain space.
    "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
}


def multiple_replace(text, chars_to_mapping):
    """Replace every occurrence of each mapping key in ``text`` in a single pass.

    Args:
        text: Value to process; it is converted with ``str()`` first.
        chars_to_mapping: Dict of literal substring -> replacement string.

    Returns:
        The converted text with all non-overlapping key occurrences replaced.
        With an empty mapping the text is returned unchanged.
    """
    text = str(text)
    # An empty mapping would produce the empty pattern, which matches at every
    # position and then fails the dict lookup with KeyError('').
    if not chars_to_mapping:
        return text

    # Try longer keys first so a short key cannot shadow a longer key that
    # starts with it (regex alternation picks the first alternative that matches).
    keys = sorted(chars_to_mapping, key=len, reverse=True)
    pattern = "|".join(map(re.escape, keys))
    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], text)

def remove_special_characters(text, chars_to_ignore_regex):
    """Delete every match of ``chars_to_ignore_regex`` from ``text``.

    The remaining text is lower-cased and returned with one trailing space
    appended.
    """
    stripped = re.compile(chars_to_ignore_regex).sub("", text)
    return f"{stripped.lower()} "

def normalizer(row, chars_to_ignore=chars_to_ignore, chars_to_mapping=chars_to_mapping):
    """Normalize the ``sentence`` entry of one dataset row.

    Steps, in order: lower-case and strip, ``_normalizer.normalize``,
    substring replacement via ``multiple_replace``, deletion of the
    ``chars_to_ignore`` characters, whitespace collapsing, then optional
    spelling-out of all-digit tokens.

    Args:
        row: Mapping with a string under ``'sentence'``; it is updated in place.
        chars_to_ignore: Strings concatenated into one regex character class
            whose matches are deleted.
        chars_to_mapping: Dict of substring -> replacement.

    Returns:
        ``row`` with the normalized sentence, or ``None`` when nothing is left
        after normalization.
    """
    text = row['sentence']
    # NOTE(review): entries are joined unescaped, so a "-" sitting between two
    # other entries (as in the default list's "!", "-", ";") is read by `re` as
    # a character range. Left untouched so the set of stripped characters does
    # not change.
    chars_to_ignore_regex = f"""[{"".join(chars_to_ignore)}]"""
    text = text.lower().strip()

    text = _normalizer.normalize(text)
    text = multiple_replace(text, chars_to_mapping)
    text = remove_special_characters(text, chars_to_ignore_regex)
    text = re.sub(" +", " ", text)

    # The original called a global `words(...)` that this notebook never
    # defines. The resulting NameError was swallowed by a bare `except`, but
    # only after the token had been rebound to an int, which then broke
    # `" ".join`. Use the converter only if the user has defined one (e.g. a
    # number-to-words function imported under that name); otherwise keep the
    # token as text.
    number_to_words = globals().get("words")

    tokens = []
    for word in text.split():
        spelled = word
        if callable(number_to_words):
            try:
                # int() accepts any Unicode decimal digits, not just ASCII.
                spelled = str(number_to_words(int(word)))
            except ValueError:
                # Not an integer token (or not convertible): keep it verbatim.
                spelled = word
        tokens.append(spelled)

    text = " ".join(tokens).strip()

    if not text:
        return None

    row['sentence'] = text
    return row

In [4]:
# Load the Farsi test split of Common Voice 11.
# The access token is read from the HF_TOKEN environment variable instead of
# being hard-coded in the notebook; when it is unset, `True` makes the library
# use the token stored by `huggingface-cli login`.
# NOTE(review): the token that was previously committed here should be revoked.
common_voice = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    "fa",
    split="test",
    use_auth_token=os.environ.get("HF_TOKEN") or True,
)
# Keep only the columns the evaluation needs (path, audio, sentence).
common_voice = common_voice.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])
# Normalize every reference transcript with `normalizer`.
common_voice = common_voice.map(normalizer)

common_voice

Found cached dataset common_voice_11_0 (/home/jupyter/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/fa/11.0.0/3f27acf10f303eac5b6fbbbe02495aeddb46ecffdb0a2fe3507fcfbf89094631)


  0%|          | 0/10288 [00:00<?, ?ex/s]

Dataset({
    features: ['path', 'audio', 'sentence'],
    num_rows: 10288
})

In [5]:
from datasets import Audio

# Re-declare the "audio" column as an Audio feature with sampling_rate=16000.
# NOTE(review): the evaluation loop passes each row's 'path' to the pipeline
# rather than this column, so this cast may not influence the predictions — confirm.
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

In [13]:
# Speech-recognition pipeline: the "mohammadh128/whisper_small-fa_v03" checkpoint
# combined with the "openai/whisper-small" tokenizer, placed on device 0.
pipe_whisper_small_persian = pipeline(
    task="automatic-speech-recognition",
    model="mohammadh128/whisper_small-fa_v03",
    tokenizer="openai/whisper-small",
    device=0,
)

In [None]:
# Transcribe every clip and collect (reference, prediction) pairs for WER.
reference = []
prediction = []

# Read the two needed columns once. Indexing `common_voice['sentence'][i]`
# inside the loop rebuilt the whole column on every iteration, and
# `common_voice[i]` decoded the row's audio just to read its 'path'.
audio_paths = common_voice['path']
sentences = common_voice['sentence']

# NOTE(review): references were passed through `normalizer` but the model
# output is appended as-is; confirm that scoring raw predictions is intended.
for audio_path, sentence in tqdm(zip(audio_paths, sentences), total=len(sentences)):
    result = pipe_whisper_small_persian(audio_path)['text']

    # Append both together so the lists stay aligned if the loop is interrupted.
    reference.append(sentence)
    prediction.append(result)

In [12]:
# Word error rate over all collected pairs, scaled by 100 for display.
print(
    100 * metric.compute(predictions=prediction, references=reference)
)

27.151572423185428
