In [1]:
# !wget https://huggingface.co/datasets/huseinzol05/malaya-speech-stt-test-set/resolve/main/fleurs-ms-my/fleurs-ms-my.tar.gz
# !wget https://huggingface.co/datasets/huseinzol05/malaya-speech-stt-test-set/resolve/main/fleurs-ms-my/malay-asr-test.json
# !tar -zxf fleurs-ms-my.tar.gz

In [2]:
from glob import glob
import json
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline
from datasets import Audio
from tqdm import tqdm
import malaya_speech
import unicodedata
import re
import itertools

sr = 16000
audio = Audio(sampling_rate=sr)

vocabs = [" ", "a", "e", "n", "i", "t", "o", "u", "s", "k", "r", "l", "h", "d", "m", "g", "y", "b", "p", "w", "c", "f", "j", "v", "z", "0", "1", "x", "2", "q", "5", "3", "4", "6", "9", "8", "7"]

def preprocessing_text(string):
    
    string = unicodedata.normalize('NFC', string.lower())
    string = ''.join([c if c in vocabs else ' ' for c in string])
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = (
        ''.join(''.join(s)[:2] for _, s in itertools.groupby(string))
    )
    return string

def calculate_cer(actual, hyp):
    """
    Calculate CER using `python-Levenshtein`.
    """
    import Levenshtein as Lev

    actual = actual.replace(' ', '')
    hyp = hyp.replace(' ', '')
    return Lev.distance(actual, hyp) / len(actual)


def calculate_wer(actual, hyp):
    """
    Calculate WER using `python-Levenshtein`.
    """
    import Levenshtein as Lev

    b = set(actual.split() + hyp.split())
    word2char = dict(zip(b, range(len(b))))

    w1 = [chr(word2char[w]) for w in actual.split()]
    w2 = [chr(word2char[w]) for w in hyp.split()]

    return Lev.distance(''.join(w1), ''.join(w2)) / len(actual.split())

`pyaudio` is not available, `malaya_speech.streaming.pyaudio` is not able to use.


In [3]:
processor = AutoProcessor.from_pretrained('openai/whisper-small')
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    'openai/whisper-small',
    use_flash_attention_2 = True,
    torch_dtype = torch.bfloat16
)

The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [4]:
_ = model.cuda()

In [5]:
with open('malay-asr-test.json') as fopen:
    data = json.load(fopen)
    
X = data['X']
Y = data['Y']

In [6]:
wer, cer = [], []

for i in tqdm(range(len(X))):
    
    
    y, _ = malaya_speech.load(X[i])
    actual = Y[i]
    
    inputs = processor([y], return_tensors = 'pt', sampling_rate = 16000)
    inputs['input_features'] = inputs['input_features'].type(torch.bfloat16).cuda()
    r = model.generate(inputs['input_features'], language='ms', return_timestamps=True)
    out = processor.tokenizer.decode(r[0], skip_special_tokens = True).strip()
    out = preprocessing_text(out)
        
    actual = actual.lower()
    out = out.lower()
    
    wer_ = calculate_wer(actual, out)
    if wer_ > 1:
        wer_ = 1.0
    
    cer_ = calculate_cer(actual, out)
    if cer_ > 1:
        cer_ = 1.0
    
    wer.append(wer_)
    cer.append(cer_)

100%|██████████| 622/622 [08:10<00:00,  1.27it/s]


In [7]:
import numpy as np

np.mean(wer), np.mean(cer)

(0.21953142859857497, 0.06781078047622793)