In [1]:
from malaya_speech.utils import torch_featurization
import random
import torch
import malaya_speech
from malaya_speech.utils.char import HF_CTC_VOCAB
from conformer import ConformerConfig, ConformerEncoder
from dataclasses import dataclass, field

# Append '_' as the final vocabulary entry; its index is used as pad_token_id in the config cell.
HF_CTC_VOCAB = HF_CTC_VOCAB + ['_']
# index -> character
HF_CTC_VOCAB_INDEX = dict(enumerate(HF_CTC_VOCAB))
# character -> index
HF_CTC_VOCAB_REV = {char: idx for idx, char in enumerate(HF_CTC_VOCAB)}

# Register the custom classes for the auto-class machinery
# (presumably why conformer.py shows up next to the weights in ./out -- confirm).
ConformerConfig.register_for_auto_class()
ConformerEncoder.register_for_auto_class()

  def backtrace(trace: np.ndarray):
`pyaudio` is not available, `malaya_speech.streaming.pyaudio` is not able to use.


In [2]:
# Hyperparameters for the Conformer encoder instantiated in the next cell.
config = ConformerConfig(
    # assumes the mel features have 80 bins per frame -- TODO confirm against torch_featurization
    input_dim=80,
    # One output class per vocabulary entry, including the appended '_'.
    output_dim=len(HF_CTC_VOCAB),
    # The sample run below maps input lengths 564/352 to output lengths 141/88.
    time_reduction_stride=4,
    conformer_input_dim=144,
    conformer_ffn_dim=576,
    conformer_num_layers=4,
    conformer_num_heads=4,
    conformer_depthwise_conv_kernel_size=31,
    conformer_dropout=0.0,
    # Index of the last vocabulary entry, i.e. the '_' appended to HF_CTC_VOCAB.
    pad_token_id=len(HF_CTC_VOCAB) - 1,
    ctc_loss_reduction='mean',
    ctc_zero_infinity=True,
)

In [3]:
# Build the encoder from the config; no weights are loaded in this notebook before it is saved.
encoder = ConformerEncoder(config)

In [4]:
# Build a GlobalStatsNormalization object from the stats JSON file.
# NOTE(review): `global_stats` is not applied to `mel` / `mel2` in any visible cell -- confirm this is intentional.
global_stats = torch_featurization.GlobalStatsNormalization('../../../malay-stats.json')

In [5]:
# Load the two example recordings. `sr` is rebound by the second call,
# so afterwards it holds the value returned for the second file.
y, sr = malaya_speech.load('../../../speech/example-speaker/husein-zolkepli.wav')
y2, sr = malaya_speech.load('../../../speech/example-speaker/shafiqah-idayu.wav')

In [6]:
# Candidate intermediate sample rates for the round-trip resampling below.
srs = [4400, 5100, 6000, 8000, 10000]


def downsample(y, sr):
    """Resample `y` from `sr` to a rate drawn at random from `srs`, then back to `sr`.

    Args:
        y: audio signal accepted by `malaya_speech.resample`.
        sr: sample rate of `y`; also the rate of the returned signal.

    Returns:
        The signal after the down-and-up resampling round trip.
    """
    intermediate_sr = random.choice(srs)
    lowered = malaya_speech.resample(y, sr, intermediate_sr)
    restored = malaya_speech.resample(lowered, intermediate_sr, sr)
    return restored

In [7]:
# Features per recording: melspectrogram followed by piecewise_linear_log.
mel = torch_featurization.piecewise_linear_log(torch_featurization.melspectrogram(y))
mel2 = torch_featurization.piecewise_linear_log(torch_featurization.melspectrogram(y2))

In [8]:
# Transcripts for the two recordings, in the same order as mel / mel2.
text = [
    'nama saya husein bin zolkepli',
    'nama saya shafiqah idayu',
]
# Encode each transcript character by character into vocabulary indices.
text = [[HF_CTC_VOCAB_REV[char] for char in sentence] for sentence in text]

In [9]:
@dataclass
class DataCollatorCTCWithPadding:
    """Collate variable-length examples into one padded batch.

    Each element of `features` is a mapping with:
      - 'inputs': a tensor whose first dimension is the sequence length,
      - 'labels': a sequence of integer label ids.

    Returns a dict with 'inputs' (zero-padded, batch first), 'lengths'
    (the unpadded first-dimension length of every input) and 'labels'
    (padded with -100, batch first).
    """

    def __call__(self, features):
        pad = torch.nn.utils.rnn.pad_sequence
        sequences = [example['inputs'] for example in features]
        targets = [torch.tensor(example['labels']) for example in features]
        return {
            'inputs': pad(sequences, batch_first=True),
            'lengths': torch.tensor([len(seq) for seq in sequences]),
            'labels': pad(targets, batch_first=True, padding_value=-100),
        }

In [10]:
# The collator takes no constructor arguments.
collator = DataCollatorCTCWithPadding()

In [11]:
# Pair each recording's features with its encoded transcript, then collate
# the two examples into a single padded batch and display it.
features = [
    {'inputs': mel, 'labels': text[0]},
    {'inputs': mel2, 'labels': text[1]}
]
batch = collator(features)
batch

{'inputs': tensor([[[ 7.5720,  8.8585, 11.1099,  ..., 14.2707, 14.0441, 14.6217],
          [11.6262, 12.9127, 15.0601,  ..., 14.3029, 13.7059, 14.6847],
          [15.4666, 16.7531, 17.1559,  ..., 12.7871, 13.4699, 14.2108],
          ...,
          [19.2897, 20.5761, 18.5907,  ..., 13.4430, 14.3961, 14.1888],
          [19.9591, 21.2456, 20.8646,  ..., 13.4250, 13.9565, 14.0654],
          [15.8479, 17.1344, 15.3638,  ..., 12.1170, 12.7841, 11.9480]],
 
         [[13.5830, 14.8695, 16.0703,  ..., 16.1097, 15.8851, 15.7696],
          [17.2102, 18.4967, 19.5340,  ..., 16.7923, 16.7401, 17.0030],
          [19.9245, 21.2110, 21.4556,  ..., 17.6996, 17.4917, 17.0634],
          ...,
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]]),
 'lengths': tensor([564, 352]),
 'labels': tensor([[  14,    1,   13,    1,   37,   19,   

In [12]:
%%time

# Forward pass: the batch keys ('inputs', 'lengths', 'labels') are passed as keyword arguments.
encoder(**batch)

CPU times: user 301 ms, sys: 49.8 ms, total: 351 ms
Wall time: 38.2 ms


(tensor(10.8064, grad_fn=<MeanBackward0>),
 tensor([[[ 0.9041, -0.0273,  0.0188,  ..., -0.3469, -0.2756, -0.1187],
          [ 0.8908,  0.0109,  0.0936,  ..., -0.2962, -0.2617, -0.1145],
          [ 0.9789,  0.0587,  0.0310,  ..., -0.3359, -0.1292, -0.0338],
          ...,
          [ 0.6303, -0.1596,  0.4581,  ..., -0.6418, -0.3822, -0.1001],
          [ 0.6831, -0.1638,  0.3191,  ..., -0.7935, -0.3135, -0.0199],
          [ 0.7537, -0.2565,  0.3535,  ..., -0.7956, -0.3523, -0.1815]],
 
         [[ 0.8917, -0.0764,  0.0790,  ..., -0.3307, -0.2304, -0.2191],
          [ 0.8815,  0.0104,  0.1193,  ..., -0.2910, -0.1961, -0.1912],
          [ 0.9370, -0.0449,  0.0748,  ..., -0.3335, -0.1081, -0.1031],
          ...,
          [ 0.0496,  0.0847, -0.4263,  ..., -0.6085, -0.0200, -0.5530],
          [ 0.0962,  0.1990, -0.6072,  ..., -0.6245,  0.0495, -0.4702],
          [ 0.0914,  0.2125, -0.6547,  ..., -0.6186,  0.0089, -0.4559]]],
        grad_fn=<AddBackward0>),
 tensor([141,  88]))

In [13]:
# Total number of elements across all of the encoder's parameter tensors.
total_params = sum(p.numel() for p in encoder.parameters())
total_params

1994872

In [14]:
# Write the model to ./out (the listing in the next cell shows the resulting files).
encoder.save_pretrained('./out')

In [15]:
# Inspect what save_pretrained wrote to ./out.
!ls -lh out

total 7.7M
-rw-r--r-- 1 husein husein  600 Jan  27 18:11 config.json
-rw-rw-r-- 1 husein husein 2.5K Jan  27 18:11 conformer.py
-rw-r--r-- 1 husein husein 7.7M Jan  27 18:11 model.safetensors


In [16]:
# Upload the model to the 'huseinzol05/conformer-super-tiny' Hub repository;
# safe_serialization = True is passed, and the upload log below shows model.safetensors.
encoder.push_to_hub('huseinzol05/conformer-super-tiny', safe_serialization = True)

model.safetensors:   0%|          | 0.00/7.99M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/huseinzol05/conformer-super-tiny/commit/31e0842733d74a95690d988eef8386ad376f439f', commit_message='Upload ConformerEncoder', commit_description='', oid='31e0842733d74a95690d988eef8386ad376f439f', pr_url=None, pr_revision=None, pr_num=None)