In [1]:
import os

# GPU-related environment variables, set in the first cell so they are in place
# before the later cells import whisper / malaya_speech.
os.environ.update({
    # Expose only the CUDA device with index 1 to this process.
    'CUDA_VISIBLE_DEVICES': '1',
    # NOTE(review): TensorFlow memory-growth switch; presumably relevant to the
    # malaya / malaya_speech imports rather than to Whisper itself - confirm.
    'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
})

In [2]:
import whisper

# Load the 'small' Whisper checkpoint; every evaluation loop below reuses this object.
model = whisper.load_model(name = 'small')

In [3]:
import malaya_speech
import numpy as np

def calculate_cer(actual, hyp):
    """
    Calculate the character error rate (CER) using `python-Levenshtein`.

    ASCII spaces are removed from both strings first, so only the remaining
    characters take part in the comparison.

    Parameters
    ----------
    actual : str
        Reference (ground-truth) transcript.
    hyp : str
        Hypothesis (predicted) transcript.

    Returns
    -------
    float
        Character edit distance divided by the number of reference characters.
        If the reference has no characters left after removing spaces, returns
        0.0 when the hypothesis is empty as well and 1.0 otherwise, instead of
        dividing by zero.
    """
    actual = actual.replace(' ', '')
    hyp = hyp.replace(' ', '')

    # An empty reference would make the denominator zero; score it explicitly
    # so a single blank label cannot abort a long evaluation run.
    if not actual:
        return 1.0 if hyp else 0.0

    import Levenshtein as Lev

    return Lev.distance(actual, hyp) / len(actual)


def calculate_wer(actual, hyp):
    """
    Calculate the word error rate (WER) using `python-Levenshtein`.

    Both strings are split on whitespace. Every distinct word is mapped to a
    single character so that a character-level edit distance between the two
    encoded strings equals the word-level edit distance.

    Parameters
    ----------
    actual : str
        Reference (ground-truth) transcript.
    hyp : str
        Hypothesis (predicted) transcript.

    Returns
    -------
    float
        Word edit distance divided by the number of reference words.
        If the reference contains no words, returns 0.0 when the hypothesis
        has no words either and 1.0 otherwise, instead of dividing by zero.
    """
    ref_words = actual.split()
    hyp_words = hyp.split()

    # An empty reference would make the denominator zero; score it explicitly
    # so a single blank label cannot abort a long evaluation run.
    if not ref_words:
        return 1.0 if hyp_words else 0.0

    import Levenshtein as Lev

    # Which character a word receives is irrelevant; it only has to be unique
    # per distinct word across both transcripts.
    word2char = {
        word: chr(index)
        for index, word in enumerate(set(ref_words + hyp_words))
    }
    ref_encoded = ''.join(word2char[w] for w in ref_words)
    hyp_encoded = ''.join(word2char[w] for w in hyp_words)

    return Lev.distance(ref_encoded, hyp_encoded) / len(ref_words)

`pyaudio` is not available, `malaya_speech.streaming.pyaudio` is not able to use.


In [4]:
import malaya
from malaya.text.normalization import cardinal
import unicodedata
import re
import itertools

# malaya tokenizer with each of the listed optional flags set to False.
# NOTE(review): presumably this stops such expressions (times, dates, money, ...)
# from being grouped into special tokens - confirm against the malaya docs.
tokenizer = malaya.tokenizer.Tokenizer(
    hypen = False,
    parliament = False,
    time = False,
    time_pukul = False,
    temperature = False,
    distance = False,
    volume = False,
    duration = False,
    weight = False,
    date = False,
    money = False,
)


# Characters that preprocessing_text keeps; every other character is replaced
# with a space there.
vocabs = [
    " ", "a", "e", "n", "i", "t", "o", "u", "s", "k", "r", "l", "h",
    "d", "m", "g", "y", "b", "p", "w", "c", "f", "j", "v", "z", "0",
    "1", "x", "2", "q", "5", "3", "4", "6", "9", "8", "7",
]

def preprocessing_text(string):
    """
    Normalise a transcript before it is scored.

    Steps, in order: tokenize with the module-level `tokenizer` and re-join
    with single spaces, lowercase and NFC-normalise, replace every character
    not listed in `vocabs` with a space, collapse runs of spaces and trim the
    ends, then shorten any run of the same character to at most two.
    """
    rejoined = ' '.join(tokenizer.tokenize(string))
    lowered = unicodedata.normalize('NFC', rejoined.lower())

    # Characters outside the allowed vocabulary become spaces.
    filtered = []
    for char in lowered:
        filtered.append(char if char in vocabs else ' ')
    squeezed = re.sub(r'[ ]+', ' ', ''.join(filtered)).strip()

    # Keep at most the first two characters of every run of identical characters.
    pieces = []
    for _, run in itertools.groupby(squeezed):
        pieces.append(''.join(itertools.islice(run, 2)))
    return ''.join(pieces)

In [5]:
import json

# Load the Malay ASR test set; the evaluation loop below reads its 'X' and 'Y' entries.
# The encoding is stated explicitly so decoding does not depend on the
# machine's locale default.
with open('/home/husein/ssd1/speech-bahasa/malay-asr-test.json', encoding = 'utf-8') as fopen:
    test_set = json.load(fopen)

In [6]:
from tqdm import tqdm

# Per-utterance WER / CER for the Malay ASR test set.
wer, cer = [], []

for i in tqdm(range(len(test_set['X']))):
    reference = test_set['Y'][i]
    # Only element 0 of what malaya_speech.load returns is used as the audio array.
    y = malaya_speech.load(test_set['X'][i])[0]
    o = model.transcribe(y.astype('float32'), task = 'transcribe')
    # Only the prediction is normalised here; the reference is scored as stored.
    # NOTE(review): presumably the stored references are already cleaned - confirm.
    pred = preprocessing_text(o['text'])
    wer.append(calculate_wer(reference, pred))
    cer.append(calculate_cer(reference, pred))

100%|█████████████████████████████████████████| 739/739 [07:52<00:00,  1.57it/s]


In [7]:
# (mean WER, mean CER) over the utterances scored in the loop above.
tuple(np.mean(scores) for scores in (wer, cer))

(0.2436472703347558, 0.09136925680411448)

In [8]:
# Load the Malaya Malay test-set metadata; the loop below reads the 'accept'
# and 'cleaned' fields of each entry.
# The encoding is stated explicitly so decoding does not depend on the
# machine's locale default.
with open('/home/husein/malaya-speech/postprocess-malaya-malay-test-set.json', encoding = 'utf-8') as fopen:
    malaya_malay = json.load(fopen)

In [9]:
# Fresh accumulators for the Malaya Malay test set.
wer, cer = [], []

for i, sample in enumerate(tqdm(malaya_malay)):
    # Entries not marked as accepted are left out of the scores.
    if sample['accept']:
        # The wav file name is the entry's position in the full list.
        y = malaya_speech.load(f'/home/husein/malaya-speech/malay-test/{i}.wav')[0]
        o = model.transcribe(y.astype('float32'), task = 'transcribe')
        pred = preprocessing_text(o['text'])

        wer.append(calculate_wer(sample['cleaned'], pred))
        cer.append(calculate_cer(sample['cleaned'], pred))

100%|█████████████████████████████████████████| 765/765 [05:11<00:00,  2.46it/s]


In [10]:
# (mean WER, mean CER) over the accepted entries scored in the loop above.
tuple(np.mean(scores) for scores in (wer, cer))

(0.2818371132600382, 0.09588120693804351)

In [11]:
# Load the Singlish test set; this rebinds `test_set`, replacing the Malay ASR data.
# The encoding is stated explicitly so decoding does not depend on the
# machine's locale default.
with open('/home/husein/malaya-speech/singlish-stt-test.json', encoding = 'utf-8') as fopen:
    test_set = json.load(fopen)

In [12]:
# Per-utterance WER / CER for the Singlish test set.
wer, cer = [], []

for i in tqdm(range(len(test_set['X']))):
    reference = test_set['Y'][i]
    # Only element 0 of what malaya_speech.load returns is used as the audio array.
    y = malaya_speech.load(test_set['X'][i])[0]
    o = model.transcribe(y.astype('float32'), task = 'transcribe')
    # Only the prediction is normalised here; the reference is scored as stored.
    # NOTE(review): presumably the stored references are already cleaned - confirm.
    pred = preprocessing_text(o['text'])

    wer.append(calculate_wer(reference, pred))
    cer.append(calculate_cer(reference, pred))

100%|███████████████████████████████████████| 3579/3579 [53:37<00:00,  1.11it/s]


In [13]:
# (mean WER, mean CER) over the utterances scored in the loop above.
tuple(np.mean(scores) for scores in (wer, cer))

(0.5971608337432316, 0.500389060166355)