In [1]:
# !pip3 install git+https://github.com/mesolitica/malaya-speech@6aea111d8f17ceeaf53e5f95de8887610a93577c
# !pip3 install torchaudio --index-url https://download.pytorch.org/whl/cu118
# !wget https://huggingface.co/datasets/huseinzol05/malaya-speech-stt-test-set/resolve/main/fleurs-ms-my/fleurs-ms-my.tar.gz
# !wget https://huggingface.co/datasets/huseinzol05/malaya-speech-stt-test-set/resolve/main/fleurs-ms-my/malay-asr-test.json
# !tar -zxf fleurs-ms-my.tar.gz

In [7]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoTokenizer
import malaya_speech
import json
import numpy as np
import unicodedata
import re
import itertools
from tqdm import tqdm

# Characters kept by `preprocessing_text`; everything else becomes a space.
vocabs = [
    " ", "a", "e", "n", "i", "t", "o", "u", "s", "k", "r", "l", "h",
    "d", "m", "g", "y", "b", "p", "w", "c", "f", "j", "v", "z", "0",
    "1", "x", "2", "q", "5", "3", "4", "6", "9", "8", "7",
]

def preprocessing_text(string):
    """
    Normalise a transcript before scoring.

    Steps, in order:
      1. lower-case and apply Unicode NFC normalisation,
      2. replace every character that is not in `vocabs` with a space,
      3. collapse runs of spaces into one and strip both ends,
      4. shorten any run of the same character to at most two.

    Returns the normalised string.
    """
    lowered = unicodedata.normalize('NFC', string.lower())

    filtered = []
    for ch in lowered:
        filtered.append(ch if ch in vocabs else ' ')
    squeezed = re.sub(r'[ ]+', ' ', ''.join(filtered)).strip()

    # groupby yields one (char, run) pair per maximal run of equal characters
    pieces = []
    for ch, run in itertools.groupby(squeezed):
        run_length = sum(1 for _ in run)
        pieces.append(ch * min(run_length, 2))
    return ''.join(pieces)

def calculate_cer(actual, hyp):
    """
    Calculate CER using `python-Levenshtein`.

    Space characters are removed from both strings, then the character-level
    edit distance is divided by the number of reference characters.

    Parameters
    ----------
    actual : str
        Reference transcript.
    hyp : str
        Hypothesis transcript.

    Returns
    -------
    float
        Edit distance / number of reference characters. When the reference
        has no characters left after removing spaces, the number of
        hypothesis characters is returned instead (0.0 if both are empty),
        i.e. the distance normalised by max(1, reference length), rather
        than raising ZeroDivisionError.
    """
    reference = actual.replace(' ', '')
    hypothesis = hyp.replace(' ', '')

    if not reference:
        # Against an empty reference every hypothesis character is an
        # insertion, so the distance is simply len(hypothesis).
        return float(len(hypothesis))

    # Imported here so the package is only required when there is
    # something to compare.
    import Levenshtein as Lev

    return Lev.distance(reference, hypothesis) / len(reference)


def calculate_wer(actual, hyp):
    """
    Calculate WER using `python-Levenshtein`.

    Both strings are split on whitespace. Each distinct word is mapped to a
    unique single character so that the character-level edit distance of
    the encoded strings equals the word-level edit distance.

    Parameters
    ----------
    actual : str
        Reference transcript.
    hyp : str
        Hypothesis transcript.

    Returns
    -------
    float
        Word edit distance / number of reference words. When the reference
        contains no words, the number of hypothesis words is returned
        instead (0.0 if both are empty), i.e. the distance normalised by
        max(1, reference length), rather than raising ZeroDivisionError.
    """
    ref_words = actual.split()
    hyp_words = hyp.split()

    if not ref_words:
        # Against an empty reference every hypothesis word is an insertion,
        # so the distance is simply len(hyp_words).
        return float(len(hyp_words))

    # Imported here so the package is only required when there is
    # something to compare.
    import Levenshtein as Lev

    # Assign code points in first-seen order; which character a word gets
    # does not affect the distance, only that distinct words differ.
    word2char = {}
    for word in ref_words + hyp_words:
        if word not in word2char:
            word2char[word] = chr(len(word2char))

    encoded_ref = ''.join(word2char[w] for w in ref_words)
    encoded_hyp = ''.join(word2char[w] for w in hyp_words)

    return Lev.distance(encoded_ref, encoded_hyp) / len(ref_words)

In [3]:
# Identifier passed to malaya-speech's transducer `huggingface` loader.
MODEL_NAME = 'mesolitica/conformer-large-malay-whisper'
model = malaya_speech.stt.transducer.huggingface(model=MODEL_NAME)

In [4]:
# Switch the model to eval mode and call .cuda() on it (presumably moves it
# to the GPU, so a CUDA device is needed -- confirm). The trailing `;` keeps
# the cell from echoing the return value.
model.eval()
model.cuda();

In [5]:
# Load the test-set manifest. The encoding is given explicitly so decoding
# does not depend on the platform's locale default.
with open('malay-asr-test.json', encoding='utf-8') as fopen:
    data = json.load(fopen)

# X entries are passed to malaya_speech.load and Y entries are used as the
# reference transcripts in the evaluation loop.
X = data['X']
Y = data['Y']

In [12]:
# Per-utterance scores; averaged in a later cell.
wer = []
cer = []

for idx in tqdm(range(len(X))):
    # malaya_speech.load returns a pair; only the first element is used.
    audio, _ = malaya_speech.load(X[idx])

    # Decode a batch of one and normalise the text. preprocessing_text
    # already lower-cases, so no further .lower() is needed here.
    hypothesis = preprocessing_text(model.beam_decoder([audio])[0])

    # NOTE(review): the reference is only lower-cased, not passed through
    # preprocessing_text -- presumably Y is already normalised; confirm.
    reference = Y[idx].lower()

    wer.append(calculate_wer(reference, hypothesis))
    cer.append(calculate_cer(reference, hypothesis))

100%|██████████| 622/622 [19:35<00:00,  1.89s/it]


In [13]:
import numpy as np

# Average the per-utterance scores and show them as a (WER, CER) pair.
mean_wer = np.mean(wer)
mean_cer = np.mean(cer)
mean_wer, mean_cer

(0.08376713097429746, 0.02548791948171514)