In [2]:
# !wget https://huggingface.co/datasets/huseinzol05/malaya-speech-stt-test-set/resolve/main/malaya-speech/malay-test.tar.gz
# !tar -zxf malay-test.tar.gz
# !wget https://huggingface.co/datasets/huseinzol05/malaya-speech-stt-test-set/resolve/main/malaya-speech/malaya-malay-test-set.json

In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoTokenizer
from transformers import AutoModel
import malaya_speech
import json
import numpy as np
import unicodedata
import re
import itertools
from tqdm import tqdm

# Characters kept by `preprocessing_text`: space, the 26 lowercase ASCII letters and
# the ten digits. Order is preserved from the original definition.
vocabs = list(" aenitouskrlhdmgybpwcfjvz01x2q5346987")

def preprocessing_text(string):
    """
    Normalise a transcript before scoring.

    Steps, in order:
      1. lowercase, then apply Unicode NFC normalisation;
      2. replace every character that is not in `vocabs` with a space;
      3. collapse runs of spaces to one space and strip both ends;
      4. truncate every run of an identical character to at most two
         (e.g. 'oooo' -> 'oo').

    Returns the normalised string.
    """
    lowered = unicodedata.normalize('NFC', string.lower())
    kept = ''.join(ch if ch in vocabs else ' ' for ch in lowered)
    collapsed = re.sub(r'[ ]+', ' ', kept).strip()

    pieces = []
    for _, run in itertools.groupby(collapsed):
        # keep at most the first two characters of each run of repeats
        pieces.append(''.join(run)[:2])
    return ''.join(pieces)

def calculate_cer(actual, hyp):
    """
    Character error rate of `hyp` against the reference `actual`.

    Spaces are removed from both strings, then the edit distance from
    `python-Levenshtein` is divided by the number of remaining reference
    characters.

    Raises ZeroDivisionError when `actual` contains nothing but spaces
    (or is empty), because the reference length is then zero.
    """
    import Levenshtein

    ref_chars = ''.join(actual.split(' '))
    hyp_chars = ''.join(hyp.split(' '))
    n_edits = Levenshtein.distance(ref_chars, hyp_chars)
    return n_edits / len(ref_chars)


def calculate_wer(actual, hyp):
    """
    Word error rate of `hyp` against the reference `actual`.

    Both strings are split on whitespace. Every distinct word is assigned
    its own single character so that the character-level edit distance from
    `python-Levenshtein` counts word insertions/deletions/substitutions.
    The distance is divided by the number of reference words.

    Raises ZeroDivisionError when `actual` contains no words.
    """
    import Levenshtein

    ref_words = actual.split()
    hyp_words = hyp.split()

    # one unique code point per distinct word (which code a word gets does not
    # affect the distance, only that the mapping is one-to-one)
    codebook = {}
    for word in ref_words + hyp_words:
        if word not in codebook:
            codebook[word] = chr(len(codebook))

    ref_encoded = ''.join(codebook[word] for word in ref_words)
    hyp_encoded = ''.join(codebook[word] for word in hyp_words)

    return Levenshtein.distance(ref_encoded, hyp_encoded) / len(ref_words)

  def backtrace(trace: np.ndarray):
`pyaudio` is not available, `malaya_speech.streaming.pyaudio` is not able to use.


In [4]:
# Load the `mesolitica/conformer-tiny-ctc` checkpoint through transformers' AutoModel.
# NOTE(review): trust_remote_code=True allows Python modelling code shipped with that
# repository to be imported and executed locally — confirm the repo is trusted and
# consider pinning a `revision` so the executed code cannot change underneath you.
model = AutoModel.from_pretrained('mesolitica/conformer-tiny-ctc', trust_remote_code = True)

In [5]:
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download
import kenlm
import torchaudio
import torch
import math

In [6]:
# Label set handed to the CTC decoder, in index order:
#   0       -> '' (empty string)
#   1..26   -> 'a'..'z'
#   27..36  -> '0'..'9'
#   37..39  -> ' ', '?', '_'
# The decoder further down is built with ctc_token_idx = len(HF_CTC_VOCAB) - 1,
# i.e. the trailing '_'.
HF_CTC_VOCAB = (
    ['']
    + list('abcdefghijklmnopqrstuvwxyz')
    + list('0123456789')
    + [' ', '?', '_']
)

# index -> symbol
HF_CTC_VOCAB_INDEX = dict(enumerate(HF_CTC_VOCAB))
# symbol -> index
HF_CTC_VOCAB_REV = {symbol: idx for idx, symbol in enumerate(HF_CTC_VOCAB)}

# 40 * log10(int16 max): twice the 20*log10 level of the largest int16 value.
DECIBEL = 40 * math.log10(torch.iinfo(torch.int16).max)
# Linear factor for that level: 10 ** (DECIBEL / 20), i.e. int16_max ** 2 up to rounding.
GAIN = 10 ** (0.05 * DECIBEL)

# Mel-spectrogram front end: 80 mel bins, n_fft of 400 samples and a hop of 160 samples,
# which at sample_rate=16000 corresponds to 25 ms transforms advanced every 10 ms.
# NOTE(review): win_length is not passed; presumably it defaults to n_fft — confirm
# against the installed torchaudio version.
spectrogram_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=16000, n_fft=400, n_mels=80, hop_length=160)


def piecewise_linear_log(x):
    """
    Compress `x` with a log above e and a linear ramp at or below e.

    The input is first multiplied by `GAIN` (this produces a new tensor, so
    the caller's tensor is not modified). Then, element-wise and in this order:

      1. entries greater than e are replaced by their natural log;
      2. entries that are then <= e are divided by e.

    Step 2 is evaluated on the result of step 1, so an entry in (e, e**e]
    is logged *and* divided by e. This ordering is kept on purpose.
    NOTE(review): presumably the pretrained model expects features computed
    exactly this way — do not "fix" the overlap without confirming.

    Returns the transformed tensor.
    """
    scaled = x * GAIN

    log_region = scaled > math.e
    scaled[log_region] = torch.log(scaled[log_region])

    # mask deliberately computed after the log step (see docstring)
    linear_region = scaled <= math.e
    scaled[linear_region] = scaled[linear_region] / math.e

    return scaled


def melspectrogram(x):
    """
    Turn a waveform into the compressed mel features fed to the model.

    A numpy array is first wrapped with `torch.Tensor(x)`; any other input is
    passed through unchanged. The waveform goes through `spectrogram_transform`,
    dimensions 0 and 1 of the result are swapped (presumably from
    (n_mels, frames) to (frames, n_mels) — confirm against torchaudio), and
    the result is compressed with `piecewise_linear_log`.
    """
    waveform = torch.Tensor(x) if isinstance(x, np.ndarray) else x
    frames = spectrogram_transform(waveform).transpose(1, 0)
    return piecewise_linear_log(frames)

# Fetch the file `out.binary` from the `mesolitica/kenlm-pseudolabel-whisper-large-v3`
# repo on the Hugging Face Hub and load it as a KenLM language model.
lm = hf_hub_download('mesolitica/kenlm-pseudolabel-whisper-large-v3', 'out.binary')
kenlm_model = kenlm.Model(lm)
# Beam-search CTC decoder over HF_CTC_VOCAB, rescored with the KenLM model.
# NOTE(review): passing an already-loaded kenlm.Model and the `ctc_token_idx` keyword do
# not look like upstream pyctcdecode's documented `build_ctcdecoder` signature —
# presumably a fork is installed; confirm which package/version this notebook needs.
decoder = build_ctcdecoder(
    HF_CTC_VOCAB,
    kenlm_model,
    alpha=0.2,  # presumably the language-model weight — confirm against the decoder docs
    beta=1.0,  # presumably the word-insertion bonus — confirm against the decoder docs
    ctc_token_idx=len(HF_CTC_VOCAB) - 1  # index of the last label, '_'
)

In [7]:
# Smoke test: run 16000 * 5 zero samples (5 s of silence at the 16 kHz rate used by the
# feature extractor) through the front end and the model before the real evaluation.
mel = melspectrogram(np.zeros((16000 * 5,)))
inputs = {
    'inputs': mel.unsqueeze(0),  # adds a leading size-1 dimension (presumably the batch axis)
    'lengths': torch.tensor([len(mel)])  # size of mel's first dimension
}
# Bare last expression so the notebook displays whatever the model returns.
model(**inputs)

(tensor([[[-15.2894,  -0.8834,   0.9386,  ...,  -3.7483, -15.3320,   3.7056],
          [-16.8735,   2.9311,  -1.9171,  ...,  -1.3749, -16.8303,   4.7868],
          [-16.8715,   0.9151,  -1.1347,  ...,  -0.3149, -16.8906,   4.5809],
          ...,
          [-14.9632,   0.6289,  -1.2577,  ...,  -2.8886, -15.0917,   3.8999],
          [-16.8317,   2.8282,  -3.0190,  ...,  -2.0569, -16.8586,   4.4275],
          [-17.3873,   0.4810,  -2.5573,  ...,  -0.3014, -17.4308,   4.5122]]],
        grad_fn=<ViewBackward0>),
 tensor([125]))

In [8]:
# Load the test-set manifest. The encoding is pinned to UTF-8 so the result does not
# depend on the platform's locale default (JSON text is UTF-8 by specification).
with open('malaya-malay-test-set.json', encoding='utf-8') as fopen:
    data = json.load(fopen)

# Number of entries in the manifest (displayed as the cell output).
len(data)

765

In [10]:
# Per-utterance word / character error rates, one entry per accepted utterance.
wer, cer = [], []

# `data` is indexed by position, and the audio for entry i lives at malay-test/{i}.wav.
for i, sample in enumerate(tqdm(data)):

    # Skip entries whose `accept` flag is falsy.
    if not sample['accept']:
        continue

    f = f'malay-test/{i}.wav'
    # NOTE(review): the reference is only lowercased, not passed through
    # preprocessing_text — presumably `cleaned` is already normalised; confirm.
    actual = sample['cleaned'].lower()

    y, _ = malaya_speech.load(f)

    mel = melspectrogram(y)
    inputs = {
        'inputs': mel.unsqueeze(0),
        'lengths': torch.tensor([len(mel)])
    }

    # Inference only: disable autograd so no graph is built for each utterance
    # (the original ran the forward pass with gradients on and detached afterwards).
    with torch.no_grad():
        r = model(**inputs)
    logits = r[0].numpy()

    # Take the text of the first returned beam.
    beams = decoder.decode_beams(logits[0], prune_history=True)
    # preprocessing_text already lowercases, so no further .lower() is needed.
    out = preprocessing_text(beams[0][0])

    wer.append(calculate_wer(actual, out))
    cer.append(calculate_cer(actual, out))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 765/765 [00:26<00:00, 28.95it/s]


In [11]:
# Unweighted mean of the per-utterance WER / CER lists (every utterance counts equally,
# whatever its length). This is not the corpus-level rate
# total_edits / total_reference_length.
np.mean(wer), np.mean(cer)

(0.21302693966628394, 0.0612581761581601)