In [1]:
# !pip3 install git+https://github.com/mesolitica/malaya-speech@6aea111d8f17ceeaf53e5f95de8887610a93577c
# !pip3 install torchaudio --index-url https://download.pytorch.org/whl/cu118
# !wget https://huggingface.co/datasets/huseinzol05/malaya-speech-stt-test-set/resolve/main/malaya-speech/malay-test.tar.gz
# !tar -zxf malay-test.tar.gz
# !wget https://huggingface.co/datasets/huseinzol05/malaya-speech-stt-test-set/resolve/main/malaya-speech/malaya-malay-test-set.json

In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoTokenizer
import malaya_speech
import json
import numpy as np
import unicodedata
import re
import itertools
from tqdm import tqdm

# Characters kept by `preprocessing_text`: the space, lowercase a-z and the digits 0-9.
vocabs = [
    " ", "a", "e", "n", "i", "t", "o", "u", "s", "k", "r", "l", "h",
    "d", "m", "g", "y", "b", "p", "w", "c", "f", "j", "v", "z", "0",
    "1", "x", "2", "q", "5", "3", "4", "6", "9", "8", "7",
]

def preprocessing_text(string):
    """
    Normalise a transcript so it only contains characters from `vocabs`.

    Steps, in order:
      1. lowercase the text and apply Unicode NFC normalisation;
      2. replace every character that is not in `vocabs` with a space;
      3. collapse runs of spaces into one and strip both ends;
      4. shorten every run of the same character to at most two characters.

    Parameters
    ----------
    string : str
        Raw text.

    Returns
    -------
    str
        The normalised text.
    """
    normalised = unicodedata.normalize('NFC', string.lower())

    kept = []
    for char in normalised:
        kept.append(char if char in vocabs else ' ')
    squeezed = re.sub(r'[ ]+', ' ', ''.join(kept)).strip()

    # groupby yields one group per run of identical characters; keep at most
    # the first two characters of each run.
    pieces = []
    for _, run in itertools.groupby(squeezed):
        pieces.append(''.join(run)[:2])
    return ''.join(pieces)

def calculate_cer(actual, hyp):
    """
    Calculate CER using `python-Levenshtein`.

    Spaces are removed from both strings, then the character-level edit
    distance is divided by the number of characters left in the reference.

    Parameters
    ----------
    actual : str
        Reference transcript.
    hyp : str
        Hypothesis transcript.

    Returns
    -------
    float
        Edit distance divided by the reference length. When the reference
        has no characters after removing spaces, the denominator is clamped
        to 1, so the result is the raw edit count (0.0 if the hypothesis is
        empty as well) instead of a ZeroDivisionError.
    """
    import Levenshtein as Lev

    reference = actual.replace(' ', '')
    hypothesis = hyp.replace(' ', '')
    # Clamp the denominator so one empty reference cannot abort a whole
    # evaluation run; non-empty references are unaffected.
    return Lev.distance(reference, hypothesis) / max(len(reference), 1)


def calculate_wer(actual, hyp):
    """
    Calculate WER using `python-Levenshtein`.

    Both strings are split on whitespace. Every distinct word is mapped to
    its own single character so that the string edit distance counts word
    substitutions, insertions and deletions.

    Parameters
    ----------
    actual : str
        Reference transcript.
    hyp : str
        Hypothesis transcript.

    Returns
    -------
    float
        Word-level edit distance divided by the number of reference words.
        When the reference has no words, the denominator is clamped to 1,
        so the result is the raw edit count (0.0 if the hypothesis is empty
        as well) instead of a ZeroDivisionError.
    """
    import Levenshtein as Lev

    ref_words = actual.split()
    hyp_words = hyp.split()

    # One stand-in character per distinct word; which character a word gets
    # is irrelevant as long as the mapping is shared by both sides.
    symbol = {
        word: chr(index)
        for index, word in enumerate(set(ref_words + hyp_words))
    }
    ref_encoded = ''.join(symbol[word] for word in ref_words)
    hyp_encoded = ''.join(symbol[word] for word in hyp_words)

    # Clamp the denominator so one empty reference cannot abort a whole
    # evaluation run; non-empty references are unaffected.
    return Lev.distance(ref_encoded, hyp_encoded) / max(len(ref_words), 1)

`pyaudio` is not available, `malaya_speech.streaming.pyaudio` is not able to use.


In [3]:
# Build the speech-to-text model under evaluation through malaya-speech's
# `stt.transducer.huggingface` loader, selecting the checkpoint by name.
model = malaya_speech.stt.transducer.huggingface(
    model='mesolitica/conformer-medium-malay-whisper',
)

In [4]:
# Call the model's `eval()` and `cuda()` methods. The return values are bound
# to `_` so the cell does not echo them.
# NOTE(review): presumably a PyTorch-style API (inference mode, then move the
# weights to the GPU), which would make a CUDA device mandatory -- confirm.
_ = model.eval()
_ = model.cuda()

In [5]:
# Load the test-set manifest. The encoding is stated explicitly because
# `open()` otherwise falls back to the platform's locale encoding, which is
# not UTF-8 everywhere and can fail or mis-decode non-ASCII transcripts.
with open('malaya-malay-test-set.json', encoding='utf-8') as fopen:
    data = json.load(fopen)

# Number of entries in the manifest (shown as the cell output).
len(data)

765

In [6]:
# Per-utterance error rates, one value per scored entry.
wer, cer = [], []

for i, entry in enumerate(tqdm(data)):

    # Entries whose 'accept' flag is falsy are left out of the scores.
    if not entry['accept']:
        continue

    reference = entry['cleaned']

    # The audio file is named after the entry's position in the manifest.
    waveform, _ = malaya_speech.load(f'malay-test/{i}.wav')

    # Decode a batch of one and normalise the text before scoring.
    hypothesis = preprocessing_text(model.beam_decoder([waveform])[0])

    reference, hypothesis = reference.lower(), hypothesis.lower()

    wer.append(calculate_wer(reference, hypothesis))
    cer.append(calculate_cer(reference, hypothesis))

100%|██████████| 765/765 [11:38<00:00,  1.10it/s]


In [8]:
import numpy as np

# Average the per-utterance scores; the trailing tuple is the cell output,
# in (WER, CER) order.
mean_wer, mean_cer = np.mean(wer), np.mean(cer)
mean_wer, mean_cer

(0.09315638444736804, 0.024955598713609053)