In [1]:
# !wget https://huggingface.co/huseinzol05/language-model-bahasa-manglish-combined/resolve/main/model.klm -O language-model-bahasa-manglish-combined.kelm
# !pip3 install pyctcdecode==0.1.0 pypi-kenlm==0.1.20210121

In [2]:
import os

# Expose only GPU 0 to CUDA-aware libraries in this process.
# NOTE(review): this is set in a cell that runs before `import torch`, presumably
# so it takes effect before CUDA is initialised — keep that order if cells move.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [3]:
import torch

In [4]:
import transformers
from transformers import (
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    is_apex_available,
    set_seed,
    AutoModelForCTC,
    TFWav2Vec2ForCTC,
    TFWav2Vec2PreTrainedModel,
    Wav2Vec2PreTrainedModel,
    AutoConfig,
)

In [5]:
# Load the pretrained wav2vec2 CTC checkpoint identified by this name; the
# resulting `model` is used for every forward pass later in the notebook.
model = Wav2Vec2ForCTC.from_pretrained('mesolitica/wav2vec2-xls-r-300m-mixed')

In [6]:
import string
import json

# Character inventory, in id order: the empty string (id 0), a-z, 0-9, then the
# word separator.
CTC_VOCAB = ['', *string.ascii_lowercase, *string.digits, ' ']
vocab_dict = {token: index for index, token in enumerate(CTC_VOCAB)}

# The tokenizer below uses "|" as its word delimiter, so the space entry is
# re-keyed to "|" while keeping its id (and its position as the last key so far).
vocab_dict["|"] = vocab_dict.pop(" ")

# The two special tokens take the next free ids, in this order.
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

# The tokenizer reads its vocabulary from disk, so export the mapping first.
with open("ctc-vocab-export.json", "w") as vocab_file:
    vocab_file.write(json.dumps(vocab_dict))

tokenizer = Wav2Vec2CTCTokenizer(
    "ctc-vocab-export.json",
    word_delimiter_token="|",
    unk_token="[UNK]",
    pad_token="[PAD]",
)

In [7]:
import soundfile as sf
import librosa
from glob import glob
import numpy as np

In [8]:
# Load the evaluation manifest; later cells read its 'X' (audio paths) and
# 'Y' (reference transcripts) entries.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's locale default.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
with open('/home/husein/ssd1/speech-bahasa/malay-asr-test.json', encoding='utf-8') as fopen:
    test_set = json.load(fopen)

In [9]:
# Waveforms of the first three test recordings, loaded at 16 kHz. Each
# `librosa.load` call yields a (samples, sample_rate) pair; only the samples
# are kept.
ys = [
    samples
    for samples, _rate in (
        librosa.load(audio_path, sr=16000) for audio_path in test_set['X'][:3]
    )
]

In [10]:
# Load the demo recording at 16 kHz; the second returned value is discarded via `_`.
y, _ = librosa.load('husein-zolkepli.wav', sr = 16000)

In [11]:
def norm_audio(x):
    """
    Standardise a signal: subtract its mean and divide by its standard deviation.

    The variance is offset by 1e-7 before the square root, so a constant
    signal yields zeros instead of a division by zero.
    """
    centred = x - x.mean()
    scale = np.sqrt(x.var() + 1e-7)
    return centred / scale

def sequence_1d(
    seq, maxlen=None, padding: str = 'post', pad_int=0, return_len=False
):
    """
    Pad a collection of 1-D sequences to a common length.

    Parameters
    ----------
    seq : iterable of sequences
        Lists, tuples, numpy arrays or any other iterables. A one-shot
        iterator is accepted as well, because the input is read only once.
    maxlen : int, optional
        Target length. When falsy (``None`` or ``0``) the longest sequence
        sets the length. Sequences longer than ``maxlen`` are not truncated.
    padding : {'post', 'pre'}
        Append (``'post'``) or prepend (``'pre'``) the padding value.
    pad_int : scalar
        Value used to fill the padding positions.
    return_len : bool
        When true, also return the original length of every sequence.

    Returns
    -------
    numpy.ndarray, or (numpy.ndarray, list of int) when ``return_len`` is true.
    An empty ``seq`` gives an empty array (and an empty length list).

    Raises
    ------
    ValueError
        If ``padding`` is neither ``'post'`` nor ``'pre'``.
    """
    if padding not in ['post', 'pre']:
        raise ValueError('padding only supported [`post`, `pre`]')

    # Turn every item into a plain list exactly once. `tolist()` converts numpy
    # scalars to Python scalars; `list()` covers tuples, ranges and the like,
    # which cannot be concatenated with the padding list directly.
    rows = [s.tolist() if isinstance(s, np.ndarray) else list(s) for s in seq]
    length = [len(row) for row in rows]

    if not maxlen:
        # `default=0` keeps an empty batch from raising inside max().
        maxlen = max(length, default=0)

    padded_seqs = []
    for row in rows:
        # A negative count yields an empty filler, so over-long rows pass through.
        filler = [pad_int] * (maxlen - len(row))
        padded_seqs.append(row + filler if padding == 'post' else filler + row)

    if return_len:
        return np.array(padded_seqs), length
    return np.array(padded_seqs)

# Pad the demo recording and the three test recordings to a common length, and
# build a 0/1 mask of the same shape marking the real (non-padded) samples.
batch, lens = sequence_1d([y] + ys, return_len=True)
attentions = sequence_1d([[1] * n_samples for n_samples in lens])

# Standardise every row using the mean/variance of its real samples only, then
# force the padded tail back to zero.
normed_rows = []
for row, n_valid in zip(batch, attentions.sum(axis=-1)):
    real_part = row[:n_valid]
    scaled = (row - real_part.mean()) / np.sqrt(real_part.var() + 1e-7)
    scaled[n_valid:] = 0.0  # no-op when the row has no padding
    normed_rows.append(scaled)

normed_input_values = np.array(normed_rows)

In [12]:
# Put the model in evaluation mode before inference. The result is bound to
# `_` so the cell does not echo the returned object.
_ = model.eval()

In [14]:
# Forward pass on CPU (the model is only moved to the GPU in a later cell).
# This is inference only, so run it under `torch.no_grad()`: no autograd graph
# is recorded, which saves memory and time.
with torch.no_grad():
    o_pt = model(
        torch.from_numpy(normed_input_values.astype(np.float32)),
        attention_mask=torch.from_numpy(attentions),
    )

In [15]:
# Greedy decoding: take the highest-scoring token id per frame and let the
# tokenizer turn the id sequences into text.
# NOTE: `o_pt` is rebound from the model output to a plain numpy logits array,
# so this cell cannot be re-run without re-running the forward pass first.
o_pt = o_pt.logits.detach().numpy()
pred_ids = o_pt.argmax(axis=-1)
tokenizer.batch_decode(pred_ids)

['testing nama saya hussein bin zolkaple',
 'ms ialah penyakit yang mempengaruhi sistem saraf pusat yang dibentuk daripada otak saraf tunjang dan saraf optik',
 'ms ialah penyakit yang mempengaruhi sistem saraf pusat yang dibentuk daripada otak saraf tunjang dan saraf oftik',
 'gymnastik s dan joas mempunyai matlamat yang sama menjadikan sukan gybnastik dan lain lain selamat bagi para atlet untuk mengejar impian mereka dalam persekitaran yang selamat positif dan berdaya maju']

In [16]:
from pyctcdecode import build_ctcdecoder
import kenlm

# Label list for the beam-search decoder: the vocabulary keys in insertion
# order, i.e. position in the list == token id.
unique_vocab = list(vocab_dict)

# The last three keys are "|", "[UNK]" and "[PAD]" (in that order, as inserted
# when `vocab_dict` was built); swap them for ' ', '?' and '_' respectively.
unique_vocab[-3:] = [' ', '?', '_']

kenlm_model = kenlm.Model('language-model-bahasa-manglish-combined.kelm')

# alpha / beta are passed through to pyctcdecode unchanged; the CTC token index
# is the tokenizer's pad token id.
decoder = build_ctcdecoder(
    unique_vocab,
    kenlm_model,
    alpha=0.2,
    beta=1.0,
    ctc_token_idx=tokenizer.pad_token_id,
)

In [17]:
# Beam-search decode every utterance and print the text of the first returned
# beam next to the utterance index.
for idx, utterance_logits in enumerate(o_pt):
    beams = decoder.decode_beams(utterance_logits, prune_history=True)
    best_text, _lm_state, _timesteps, _logit_score, _lm_score = beams[0]
    print(idx, best_text)

0 testing nama saya hussein bin zolkapli
1 ms ialah penyakit yang mempengaruhi sistem saraf pusat yang dibentuk daripada otak saraf tunjang dan saraf optik
2 ms ialah penyakit yang mempengaruhi sistem saraf pusat yang dibentuk daripada otak saraf tunjang dan saraf optik
3 gymnastik as dan joas mempunyai matlamat yang sama menjadikan sukan gimnastik dan lain lain selamat bagi para atlet untuk mengejar impian mereka dalam persekitaran yang selamat positif dan berdaya maju


In [18]:
def calculate_cer(actual, hyp):
    """
    Calculate CER using `python-Levenshtein`.

    Spaces are removed from both strings, then the character edit distance is
    divided by the number of reference characters.

    Parameters
    ----------
    actual : str
        Reference transcript.
    hyp : str
        Hypothesis transcript.

    Returns
    -------
    float
        Character error rate. When the reference has no non-space characters
        the ratio is undefined; in that case 0.0 is returned if the hypothesis
        is empty as well and 1.0 otherwise, instead of raising
        ZeroDivisionError.
    """
    actual = actual.replace(' ', '')
    hyp = hyp.replace(' ', '')

    # Guard the division below against an empty reference.
    if not actual:
        return 0.0 if not hyp else 1.0

    # Imported lazily, only when a distance actually has to be computed.
    import Levenshtein as Lev

    return Lev.distance(actual, hyp) / len(actual)


def calculate_wer(actual, hyp):
    """
    Calculate WER using `python-Levenshtein`.

    Both strings are split on whitespace. Every distinct word is mapped to a
    single character, so the character-level edit distance of the encoded
    strings equals the word-level edit distance.

    Parameters
    ----------
    actual : str
        Reference transcript.
    hyp : str
        Hypothesis transcript.

    Returns
    -------
    float
        Word edit distance divided by the number of reference words. When the
        reference contains no words the ratio is undefined; in that case 0.0
        is returned if the hypothesis is empty as well and 1.0 otherwise,
        instead of raising ZeroDivisionError.
    """
    ref_words = actual.split()
    hyp_words = hyp.split()

    # Guard the division below against an empty reference.
    if not ref_words:
        return 0.0 if not hyp_words else 1.0

    # Imported lazily, only when a distance actually has to be computed.
    import Levenshtein as Lev

    # Give each distinct word its own code point, in order of first appearance.
    word2char = {}
    for word in ref_words + hyp_words:
        word2char.setdefault(word, chr(len(word2char)))

    ref_encoded = ''.join(word2char[w] for w in ref_words)
    hyp_encoded = ''.join(word2char[w] for w in hyp_words)

    return Lev.distance(ref_encoded, hyp_encoded) / len(ref_words)

In [19]:
# Move the model to the GPU; the evaluation loops below send their inputs to
# the GPU as well. The result is bound to `_` so the cell does not echo it.
_ = model.cuda()

In [20]:
import malaya_speech
from scipy.special import log_softmax

In [23]:
from tqdm import tqdm

# Per-utterance error rates on `test_set`:
#   wer / cer       -> greedy (argmax) CTC decoding
#   wer_lm / cer_lm -> beam search with the KenLM-backed decoder
wer, cer = [], []
wer_lm, cer_lm = [], []

batch_size = 2
for i in tqdm(range(0, len(test_set['X']), batch_size)):
    batch_y = test_set['Y'][i: i + batch_size]
    ys = [malaya_speech.load(f)[0] for f in test_set['X'][i: i + batch_size]]

    # Pad the waveforms to a common length and build the matching 0/1 mask.
    batch, lens = sequence_1d(ys, return_len=True)
    attentions = sequence_1d([[1] * l for l in lens])

    # Standardise each row with the statistics of its real samples only, then
    # zero the padded tail.
    normed_input_values = []
    for vector, length in zip(batch, attentions.sum(-1)):
        normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
        if length < normed_slice.shape[0]:
            normed_slice[length:] = 0.0
        normed_input_values.append(normed_slice)
    normed_input_values = np.array(normed_input_values)

    # Inference only: run under `no_grad` so no autograd graph is kept on the
    # GPU for each batch.
    with torch.no_grad():
        o_pt = model(
            torch.from_numpy(normed_input_values.astype(np.float32)).cuda(),
            attention_mask=torch.from_numpy(attentions).cuda(),
        )
    o_pt = o_pt.logits.detach().cpu().numpy()
    o_pt = log_softmax(o_pt, axis=-1)

    # Greedy hypotheses for the whole batch.
    pred_ids = np.argmax(o_pt, axis=-1)
    pred = tokenizer.batch_decode(pred_ids)

    for k in range(len(o_pt)):
        # First returned beam of the LM-backed decoder.
        out = decoder.decode_beams(o_pt[k], prune_history=True)
        d_lm2, lm_state, timesteps, logit_score, lm_score = out[0]

        wer.append(calculate_wer(batch_y[k], pred[k]))
        cer.append(calculate_cer(batch_y[k], pred[k]))

        wer_lm.append(calculate_wer(batch_y[k], d_lm2))
        cer_lm.append(calculate_cer(batch_y[k], d_lm2))

100%|█████████████████████████████████████████| 370/370 [01:36<00:00,  3.85it/s]


In [24]:
# Mean of the per-utterance scores, in the order (WER, CER, WER with LM, CER with LM).
tuple(np.mean(scores) for scores in (wer, cer, wer_lm, cer_lm))

(0.23738612595212136,
 0.07055478006684142,
 0.1716938954303812,
 0.05916313167919988)

In [25]:
# Load the second evaluation set; the loop below reads each entry's 'accept'
# and 'cleaned' fields.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's locale default.
with open('postprocess-malaya-malay-test-set.json', encoding='utf-8') as fopen:
    malaya_malay = json.load(fopen)

In [26]:
# Per-utterance error rates on `malaya_malay` (one recording per step):
#   wer / cer       -> greedy (argmax) CTC decoding
#   wer_lm / cer_lm -> beam search with the KenLM-backed decoder
wer, cer = [], []
wer_lm, cer_lm = [], []

for i in tqdm(range(len(malaya_malay))):
    # Entries not flagged as accepted are skipped.
    if not malaya_malay[i]['accept']:
        continue

    batch_y = [malaya_malay[i]['cleaned']]
    # The audio file is addressed by the entry's position in the list.
    ys = [malaya_speech.load(f'malay-test/{i}.wav')[0]]

    # Same preparation as for batched input: pad, build the mask, then
    # standardise each row over its real samples and zero any padded tail.
    batch, lens = sequence_1d(ys, return_len=True)
    attentions = sequence_1d([[1] * l for l in lens])

    normed_input_values = []
    for vector, length in zip(batch, attentions.sum(-1)):
        normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
        if length < normed_slice.shape[0]:
            normed_slice[length:] = 0.0
        normed_input_values.append(normed_slice)
    normed_input_values = np.array(normed_input_values)

    # Inference only: run under `no_grad` so no autograd graph is kept on the
    # GPU for each utterance.
    with torch.no_grad():
        o_pt = model(
            torch.from_numpy(normed_input_values.astype(np.float32)).cuda(),
            attention_mask=torch.from_numpy(attentions).cuda(),
        )
    o_pt = o_pt.logits.detach().cpu().numpy()
    o_pt = log_softmax(o_pt, axis=-1)

    # Greedy hypotheses.
    pred_ids = np.argmax(o_pt, axis=-1)
    pred = tokenizer.batch_decode(pred_ids)

    for k in range(len(o_pt)):
        # First returned beam of the LM-backed decoder.
        out = decoder.decode_beams(o_pt[k], prune_history=True)
        d_lm2, lm_state, timesteps, logit_score, lm_score = out[0]

        wer.append(calculate_wer(batch_y[k], pred[k]))
        cer.append(calculate_cer(batch_y[k], pred[k]))

        wer_lm.append(calculate_wer(batch_y[k], d_lm2))
        cer_lm.append(calculate_cer(batch_y[k], d_lm2))

100%|█████████████████████████████████████████| 765/765 [00:20<00:00, 37.30it/s]


In [27]:
# Mean of the per-utterance scores, in the order (WER, CER, WER with LM, CER with LM).
tuple(np.mean(scores) for scores in (wer, cer, wer_lm, cer_lm))

(0.1946551289436665,
 0.04775798989091143,
 0.12849904267888457,
 0.0357602212596816)

In [28]:
# Load the third evaluation set; the loop below uses each entry directly as the
# reference transcript.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's locale default.
with open('singlish-test.json', encoding='utf-8') as fopen:
    singlish = json.load(fopen)

In [30]:
# Per-utterance error rates on `singlish` (one recording per step):
#   wer / cer       -> greedy (argmax) CTC decoding
#   wer_lm / cer_lm -> beam search with the KenLM-backed decoder
wer, cer = [], []
wer_lm, cer_lm = [], []

for i in tqdm(range(len(singlish))):
    batch_y = [singlish[i]]
    # The audio file is addressed by the entry's position in the list.
    ys = [malaya_speech.load(f'singlish-test/{i}.wav')[0]]

    # Same preparation as for batched input: pad, build the mask, then
    # standardise each row over its real samples and zero any padded tail.
    batch, lens = sequence_1d(ys, return_len=True)
    attentions = sequence_1d([[1] * l for l in lens])

    normed_input_values = []
    for vector, length in zip(batch, attentions.sum(-1)):
        normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
        if length < normed_slice.shape[0]:
            normed_slice[length:] = 0.0
        normed_input_values.append(normed_slice)
    normed_input_values = np.array(normed_input_values)

    # Inference only: run under `no_grad` so no autograd graph is kept on the
    # GPU for each utterance.
    with torch.no_grad():
        o_pt = model(
            torch.from_numpy(normed_input_values.astype(np.float32)).cuda(),
            attention_mask=torch.from_numpy(attentions).cuda(),
        )
    o_pt = o_pt.logits.detach().cpu().numpy()
    o_pt = log_softmax(o_pt, axis=-1)

    # Greedy hypotheses.
    pred_ids = np.argmax(o_pt, axis=-1)
    pred = tokenizer.batch_decode(pred_ids)

    for k in range(len(o_pt)):
        # First returned beam of the LM-backed decoder.
        out = decoder.decode_beams(o_pt[k], prune_history=True)
        d_lm2, lm_state, timesteps, logit_score, lm_score = out[0]

        wer.append(calculate_wer(batch_y[k], pred[k]))
        cer.append(calculate_cer(batch_y[k], pred[k]))

        wer_lm.append(calculate_wer(batch_y[k], d_lm2))
        cer_lm.append(calculate_cer(batch_y[k], d_lm2))

100%|███████████████████████████████████████| 3579/3579 [01:44<00:00, 34.27it/s]


In [31]:
# Mean of the per-utterance scores, in the order (WER, CER, WER with LM, CER with LM).
tuple(np.mean(scores) for scores in (wer, cer, wer_lm, cer_lm))

(0.1275885951545911,
 0.049492497930946455,
 0.09682029107142659,
 0.042727603734778574)