In [1]:
from malaya_speech.utils import torch_featurization
import random
import torch
import malaya_speech
from malaya_speech.utils.char import HF_CTC_VOCAB
from conformer import ConformerConfig, ConformerEncoder
from dataclasses import dataclass, field

# Extend the imported vocabulary with '_' as its final entry; later cells use
# len(HF_CTC_VOCAB) - 1 (i.e. this entry) as the pad token id.
HF_CTC_VOCAB = [*HF_CTC_VOCAB, '_']
# NOTE: despite the names, HF_CTC_VOCAB_INDEX maps position -> symbol and
# HF_CTC_VOCAB_REV maps symbol -> position.
HF_CTC_VOCAB_INDEX = dict(enumerate(HF_CTC_VOCAB))
HF_CTC_VOCAB_REV = {symbol: position for position, symbol in HF_CTC_VOCAB_INDEX.items()}

# Register the custom config/model classes for auto-class loading.
# NOTE(review): presumably this is why conformer.py shows up in the saved
# directory later in the notebook — confirm.
ConformerConfig.register_for_auto_class()
ConformerEncoder.register_for_auto_class()

  def backtrace(trace: np.ndarray):
`pyaudio` is not available, `malaya_speech.streaming.pyaudio` is not able to use.


In [2]:
# The output layer has one unit per vocabulary entry; the last vocabulary
# index is used as the padding token id.
vocab_size = len(HF_CTC_VOCAB)

config = ConformerConfig(
    input_dim=80,
    output_dim=vocab_size,
    time_reduction_stride=4,
    conformer_input_dim=144,
    conformer_ffn_dim=576,
    conformer_num_layers=2,
    conformer_num_heads=4,
    conformer_depthwise_conv_kernel_size=31,
    conformer_dropout=0.0,
    pad_token_id=vocab_size - 1,
    ctc_loss_reduction='mean',
    ctc_zero_infinity=True,
)

In [3]:
# Build the encoder directly from `config`; no checkpoint is loaded here.
# NOTE(review): weights are presumably randomly initialised and no seed is
# set in this notebook, so the recorded outputs will differ between runs.
encoder = ConformerEncoder(config)

In [4]:
# Build a GlobalStatsNormalization from the stats JSON file.
# NOTE(review): `global_stats` is not referenced by any later cell visible in
# this notebook — confirm whether the features were meant to be normalised.
global_stats = torch_featurization.GlobalStatsNormalization('../../../malay-stats.json')

In [5]:
# Two example utterances, one per speaker. `sr` is rebound by the second
# load, so afterwards it holds the value returned for the second file.
husein_wav = '../../../speech/example-speaker/husein-zolkepli.wav'
shafiqah_wav = '../../../speech/example-speaker/shafiqah-idayu.wav'

y, sr = malaya_speech.load(husein_wav)
y2, sr = malaya_speech.load(shafiqah_wav)

In [6]:
# Same two-step pipeline for each waveform: mel spectrogram first, then the
# piecewise-linear log transform applied to that spectrogram.
mel = torch_featurization.piecewise_linear_log(
    torch_featurization.melspectrogram(y)
)
mel2 = torch_featurization.piecewise_linear_log(
    torch_featurization.melspectrogram(y2)
)

In [7]:
# Reference transcripts, one per loaded utterance.
transcripts = ['nama saya husein bin zolkepli', 'nama saya shafiqah idayu']
# Encode character by character: each character is looked up in
# HF_CTC_VOCAB_REV (symbol -> vocabulary position).
text = [
    [HF_CTC_VOCAB_REV[char] for char in sentence]
    for sentence in transcripts
]

In [8]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Collate variable-length examples into a single padded batch.

    Each element of ``features`` is a mapping with:

    - ``'inputs'``: a tensor whose first dimension is the sequence length.
    - ``'labels'``: integer token ids (list or tensor); may be empty.

    Calling the collator returns a dict with:

    - ``'inputs'``: the inputs zero-padded to the longest sequence, batch first.
    - ``'lengths'``: the unpadded length of every input.
    - ``'labels'``: int64 label ids padded with -100, batch first.
    """

    def __call__(self, features):
        inputs = [f['inputs'] for f in features]
        # Record the true lengths before padding hides them.
        lengths = torch.tensor([len(f['inputs']) for f in features])
        inputs = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True)

        # Force int64 explicitly: torch.tensor([]) infers float32, so an empty
        # transcript could turn the padded label batch into floats.
        # as_tensor also accepts labels that are already tensors without the
        # copy-construct warning that torch.tensor(tensor) emits.
        labels = [
            torch.as_tensor(f['labels'], dtype=torch.long) for f in features
        ]
        # -100 marks padded label positions.
        # NOTE(review): presumably the model masks these out when computing
        # the CTC loss — confirm against ConformerEncoder.
        labels = torch.nn.utils.rnn.pad_sequence(
            labels, batch_first=True, padding_value=-100
        )
        return {
            'inputs': inputs,
            'lengths': lengths,
            'labels': labels,
        }

In [9]:
# Instantiate the padding collator defined above; it takes no arguments.
collator = DataCollatorCTCWithPadding()

In [10]:
# Pair each feature tensor with its encoded transcript (mel <-> text[0],
# mel2 <-> text[1]) and collate the pair into one padded batch.
features = [
    {'inputs': spectrogram, 'labels': token_ids}
    for spectrogram, token_ids in zip((mel, mel2), text)
]
batch = collator(features)
batch

{'inputs': tensor([[[ 7.5720,  8.8585, 11.1099,  ..., 14.2707, 14.0441, 14.6217],
          [11.6262, 12.9127, 15.0601,  ..., 14.3029, 13.7059, 14.6847],
          [15.4666, 16.7531, 17.1559,  ..., 12.7871, 13.4699, 14.2108],
          ...,
          [19.2897, 20.5761, 18.5907,  ..., 13.4430, 14.3961, 14.1888],
          [19.9591, 21.2456, 20.8646,  ..., 13.4250, 13.9565, 14.0654],
          [15.8479, 17.1344, 15.3638,  ..., 12.1170, 12.7841, 11.9480]],
 
         [[13.5830, 14.8695, 16.0703,  ..., 16.1097, 15.8851, 15.7696],
          [17.2102, 18.4967, 19.5340,  ..., 16.7923, 16.7401, 17.0030],
          [19.9245, 21.2110, 21.4556,  ..., 17.6996, 17.4917, 17.0634],
          ...,
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]]),
 'lengths': tensor([564, 352]),
 'labels': tensor([[  14,    1,   13,    1,   37,   19,   

In [11]:
%%time

# Forward pass: the batch dict is unpacked into keyword arguments
# (inputs, lengths, labels). The recorded output is a 3-tuple: a scalar
# tensor, a 3-D tensor, and tensor([141, 88]) — the latter matches the input
# lengths 564 and 352 divided by time_reduction_stride=4.
# NOTE(review): presumably (loss, logits, output lengths) — confirm.
encoder(**batch)

CPU times: user 101 ms, sys: 0 ns, total: 101 ms
Wall time: 10.6 ms


(tensor(12.3804, grad_fn=<MeanBackward0>),
 tensor([[[-0.2900,  0.1902,  0.7856,  ...,  0.2212, -0.7918, -0.4279],
          [-0.2372,  0.2666,  0.7868,  ...,  0.2798, -0.7645, -0.3327],
          [-0.2177,  0.2638,  0.7733,  ...,  0.3479, -0.7976, -0.3448],
          ...,
          [-0.1245,  0.6188,  0.6288,  ...,  0.6743, -0.5478, -0.4321],
          [-0.0517,  0.6362,  0.6020,  ...,  0.6486, -0.5827, -0.3974],
          [ 0.0039,  0.4909,  0.6196,  ...,  0.5959, -0.6213, -0.3628]],
 
         [[-0.3260,  0.2339,  0.8372,  ...,  0.2022, -0.7868, -0.4075],
          [-0.2500,  0.2438,  0.8036,  ...,  0.2405, -0.7820, -0.2837],
          [-0.2043,  0.2306,  0.7805,  ...,  0.2932, -0.7887, -0.3557],
          ...,
          [ 0.2808,  0.3881,  0.1645,  ..., -0.1860, -0.6413, -0.2572],
          [ 0.3108,  0.2726,  0.2344,  ..., -0.2173, -0.6638, -0.3587],
          [ 0.3807,  0.3251,  0.2512,  ..., -0.3206, -0.6454, -0.4048]]],
        grad_fn=<AddBackward0>),
 tensor([141,  88]))

In [12]:
# Count every element of every parameter tensor in the encoder.
total_params = sum(p.numel() for p in encoder.parameters())
total_params

1023448

In [13]:
# Write the encoder to ./out; the directory listing recorded in the next cell
# shows config.json, conformer.py and model.safetensors.
encoder.save_pretrained('./out')

In [14]:
# List the files written to ./out by save_pretrained.
!ls -lh out

total 4.0M
-rw-r--r-- 1 husein husein  600 Jan  27 18:13 config.json
-rw-rw-r-- 1 husein husein 2.5K Jan  27 18:13 conformer.py
-rw-r--r-- 1 husein husein 4.0M Jan  27 18:13 model.safetensors


In [15]:
# Upload the encoder to the Hugging Face Hub repository named below, stored
# as safetensors (the upload log shows model.safetensors).
# NOTE(review): presumably requires prior Hub authentication with write
# access to this repository — confirm before re-running.
encoder.push_to_hub('huseinzol05/conformer-super-super-tiny', safe_serialization = True)

model.safetensors:   0%|          | 0.00/4.10M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/huseinzol05/conformer-super-super-tiny/commit/7757057bfba5d0a88dcee47ef10f08758acfe535', commit_message='Upload ConformerEncoder', commit_description='', oid='7757057bfba5d0a88dcee47ef10f08758acfe535', pr_url=None, pr_revision=None, pr_num=None)