In [1]:
# !pip3.8 install pyctcdecode==0.1.0 pypi-kenlm==0.1.20220713

In [2]:
import random
import torch
from itertools import groupby
import numpy as np
import malaya_speech
from malaya_speech.utils.char import decode as char_decode
from transformers import AutoModel
from conformer import HF_CTC_VOCAB, melspectrogram, ConformerConfig, ConformerEncoder
from dataclasses import dataclass, field
from huggingface_hub import hf_hub_download
from pyctcdecode import build_ctcdecoder
import kenlm

# id -> token lookup, used to turn argmax ids back into characters.
HF_CTC_VOCAB_INDEX = dict(enumerate(HF_CTC_VOCAB))
# token -> id, the inverse of the table above.
HF_CTC_VOCAB_REV = {token: idx for idx, token in HF_CTC_VOCAB_INDEX.items()}

# Register the custom config and encoder classes with the transformers
# auto-class mechanism (needed for AutoModel loading with remote code).
ConformerConfig.register_for_auto_class()
ConformerEncoder.register_for_auto_class()

  def backtrace(trace: np.ndarray):
`pyaudio` is not available, `malaya_speech.streaming.pyaudio` is not able to use.


In [3]:
# Fetch the binary KenLM language model from the Hugging Face Hub;
# `lm` is the local path of the downloaded (cached) file.
lm = hf_hub_download(
    repo_id='mesolitica/kenlm-pseudolabel-whisper-large-v3',
    filename='out.binary',
)

In [4]:
# Load the downloaded KenLM binary and build a beam-search CTC decoder on top
# of the acoustic vocabulary.
kenlm_model = kenlm.Model(lm)
decoder = build_ctcdecoder(
    HF_CTC_VOCAB, kenlm_model,
    # the last vocabulary entry is passed as the CTC blank index
    ctc_token_idx=len(HF_CTC_VOCAB) - 1,
    # language-model weight and length bonus (see pyctcdecode docs)
    alpha=0.2, beta=1.0,
)

In [5]:
from transformers.trainer_utils import get_last_checkpoint

# Resolve the newest `checkpoint-*` folder inside the training output
# directory '2M'.
latest = get_last_checkpoint('2M')
# get_last_checkpoint returns None when the directory contains no checkpoint;
# fail here with a clear message instead of later inside from_pretrained.
if latest is None:
    raise FileNotFoundError("no checkpoint-* directory found under '2M'")
latest

'2M/checkpoint-121000'

In [6]:
# Load the model from the checkpoint path held in `latest`.
# trust_remote_code=True allows transformers to import and execute the custom
# model code associated with the checkpoint, so only use trusted checkpoints.
model = AutoModel.from_pretrained(latest, trust_remote_code=True)

In [7]:
# Switch the model to evaluation mode. The return value is bound to `_` so the
# notebook does not echo it as the cell output.
_ = model.eval()

In [8]:
# NOTE(review): presumably the audio sample rate in Hz; it is not referenced
# by any of the visible cells — confirm whether it is still needed.
SR = 16000

In [9]:
from glob import glob

# Load every example recording. `files` keeps only the paths that were loaded
# successfully so that `files` and `ys` stay index-aligned; later cells zip the
# two lists together, and a silently skipped file would otherwise pair each
# remaining path with the wrong waveform.
candidates = glob('/home/husein/dev/malaya-speech/speech/example-speaker/*')
files = []
ys = []
for f in candidates:
    try:
        y, sr = malaya_speech.load(f)
    except Exception as e:
        # Best-effort loading: the folder may contain unreadable or non-audio
        # entries. Skip them, but report it instead of hiding it.
        print(f'skipping {f}: {e!r}')
        continue
    files.append(f)
    ys.append(y)

In [10]:
@dataclass
class DataCollatorCTCWithPadding:
    """Collate per-utterance feature dicts into one padded batch.

    Calling the instance with a list of dicts returns a dict with:
      * 'inputs'  - the 'inputs' tensors stacked batch-first and padded to the
                    longest one (pad_sequence default padding value),
      * 'lengths' - a tensor with the unpadded length of each 'inputs' entry,
      * 'labels'  - the 'labels' sequences padded with -100, or None when the
                    first feature dict carries no 'labels' key.
    """

    def __call__(self, features):
        sequences = []
        sizes = []
        for item in features:
            sequences.append(item['inputs'])
            sizes.append(len(item['inputs']))

        batch = {
            'inputs': torch.nn.utils.rnn.pad_sequence(sequences, batch_first = True),
            'lengths': torch.tensor(sizes),
            'labels': None,
        }
        # Only the first element is inspected to decide whether labels exist.
        if 'labels' in features[0]:
            targets = [torch.tensor(item['labels']) for item in features]
            batch['labels'] = torch.nn.utils.rnn.pad_sequence(
                targets, batch_first = True, padding_value = -100
            )
        return batch


collator = DataCollatorCTCWithPadding()

In [13]:
# Compute the `melspectrogram` features for each loaded waveform, wrap them in
# the dict layout the collator expects, and pad them into a single batch.
features = [{'inputs': melspectrogram(waveform)} for waveform in ys]
batch = collator(features)

In [14]:
%%time

r = model(**batch)

CPU times: user 580 ms, sys: 70.6 ms, total: 651 ms
Wall time: 56.5 ms


In [15]:
# Greedy (best-path) CTC decoding of the batched forward pass: take the argmax
# id per frame, map ids to characters, merge consecutive repeats and drop the
# '_' symbol.
# NOTE(review): `batch` is padded to the longest utterance and every frame of
# every row is decoded here, so padded frames may add spurious trailing
# characters — confirm, and trim with per-utterance output lengths if the
# model exposes them.
logits = r[0].detach().numpy()
argmax = np.argmax(logits, axis=-1)
results = []
for i in range(len(argmax)):
    tokens = ''.join([HF_CTC_VOCAB_INDEX[k] for k in argmax[i]])
    grouped_tokens = [token_group[0] for token_group in groupby(tokens)]
    filtered_tokens = list(filter(lambda token: token != '_', grouped_tokens))
    # Use a dedicated name: rebinding `r` (the model output) to a string would
    # make this cell fail on `r[0].detach()` when it is run a second time.
    text = ''.join(filtered_tokens).strip()
    results.append(text)
results

['nma saya suptar ida nn',
 'scebut pecataan ani oke',
 'testin nama saya usin binzo kpl k',
 'takkan orang yang seperti abanm fakar itu mahu juga di menjaganya bai baik i orang yang tidak bertimbangberasa tu nnn',
 'sebagai pembangkan yang matang dan sejahtera pas akan menghadapiplihan raya umu dan tidak menumbangkerajaan dari pintu belakangkne',
 'pengatu caraan adalah swuatu keajah memberi arahan atau perinta kepada konpouter untuk menjalankan sesuatu jugas atau manda mana misin dali teran niekennn',
 'tolonm sebul ati sata kn',
 'apa kaba semua saya dowakan sedara dan setari sihat wala fiat hari i saya sekal lagi menemai searada']

In [16]:
# Per-file comparison of the greedy CTC transcript against the best beam from
# the KenLM-backed decoder. Each file is run on its own (batch of one).
# Assumes `files` and `ys` are index-aligned.
for f, y in zip(files, ys):
    mel = melspectrogram(y)
    inputs = {
        'inputs': mel.unsqueeze(0),
        'lengths': torch.tensor([len(mel)])
    }
    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        r = model(**inputs)
    logits = r[0].detach().numpy()
    argmax = np.argmax(logits, axis=-1)

    # Greedy path: ids -> characters, merge repeats, drop the '_' symbol.
    tokens = ''.join([HF_CTC_VOCAB_INDEX[k] for k in argmax[0]])
    grouped_tokens = [token_group[0] for token_group in groupby(tokens)]
    filtered_tokens = list(filter(lambda token: token != '_', grouped_tokens))
    # Keep the transcript in its own name instead of overwriting the model
    # output `r` with a string.
    greedy = ''.join(filtered_tokens).strip()

    # Beam search with the language model; only the text of the first beam is
    # needed, the remaining tuple entries are not used.
    beams = decoder.decode_beams(logits[0], prune_history=True)
    d_lm = beams[0][0]
    print(f, greedy, d_lm)

/home/husein/dev/malaya-speech/speech/example-speaker/shafiqah-idayu.wav nama saya suptar idau nama saya suptaridayu
/home/husein/dev/malaya-speech/speech/example-speaker/mas-aisyah.wav scebut pecataan antir sebut perkataan anti
/home/husein/dev/malaya-speech/speech/example-speaker/husein-zolkepli.wav testing nama saya usin binzo kpl testing nama saya usin binzokpli
/home/husein/dev/malaya-speech/speech/example-speaker/female.wav takkan orang yang seperti abanm fakar itu mahu juga di menjaganya bai baik i orang yang tidak bertimbangberasa tu takkan orang yang seperti abang fakar itu mau juga dia menjaganya baik baik i orang yang tidak bertimbang rasa tu
/home/husein/dev/malaya-speech/speech/example-speaker/haqkiem.wav sebagai pembangkan yang matang dan sejahtera pas akan menghadapiplihan raya umu medan tidak menumbangkerajaan dari pintu belakang sebagai pembangkang yang matang dan sejahtera pas akan menghadapi pilihan raya umum dan tidak menumbang kerajaan dari pintu belakang
/home/hus

In [17]:
# Upload the loaded model to the 'mesolitica/conformer-2M-ctc' repository on
# the Hugging Face Hub, with weights serialized as safetensors.
model.push_to_hub('mesolitica/conformer-2M-ctc', safe_serialization = True)

model.safetensors:   0%|          | 0.00/7.99M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/conformer-2M-ctc/commit/09ee6bda31271c0af3b6c76a1ebbfe8949812d63', commit_message='Upload ConformerEncoder', commit_description='', oid='09ee6bda31271c0af3b6c76a1ebbfe8949812d63', pr_url=None, pr_revision=None, pr_num=None)