In [2]:
import random
from dataclasses import dataclass, field

import malaya_speech
import torch
from conformer import HF_CTC_VOCAB, melspectrogram, ConformerConfig, ConformerEncoder
from malaya_speech.utils import torch_featurization

# Forward lookup: integer id -> vocabulary token, in the order given by HF_CTC_VOCAB.
HF_CTC_VOCAB_INDEX = dict(enumerate(HF_CTC_VOCAB))
# Reverse lookup: vocabulary token -> integer id (a repeated token keeps its last id).
HF_CTC_VOCAB_REV = {
    token: token_id for token_id, token in HF_CTC_VOCAB_INDEX.items()
}

# Register the custom config/model classes for the transformers Auto* API.
# The saved config shown later in this notebook carries an `auto_map` pointing
# at `conformer.ConformerConfig` / `conformer.ConformerEncoder`.
ConformerConfig.register_for_auto_class()
ConformerEncoder.register_for_auto_class()

  def backtrace(trace: np.ndarray):
`pyaudio` is not available, `malaya_speech.streaming.pyaudio` is not able to use.


In [3]:
# Display the full CTC vocabulary imported from `conformer` (40 entries in the recorded output).
HF_CTC_VOCAB

['',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ' ',
 '?',
 '_']

In [3]:
# Hyper-parameters for the small Conformer CTC encoder built in the next cell.
# NOTE(review): this cell sets `conformer_dropout=0.0`, but the `model.config`
# output recorded later in this notebook shows `"conformer_dropout": 0.1`, and
# the two recorded `encoder(**batch)` outputs differ from each other. The cell
# was presumably edited after the outputs were produced — confirm the intended
# value and re-run from a fresh kernel.
config = ConformerConfig(
    input_dim=80,
    # One output unit per vocabulary entry (40 in the recorded config output).
    output_dim=len(HF_CTC_VOCAB),
    # Recorded output lengths (564 -> 141, 352 -> 88) are the input lengths divided by 4.
    time_reduction_stride=4,
    conformer_input_dim=144,
    conformer_ffn_dim=576,
    conformer_num_layers=8,
    conformer_num_heads=4,
    conformer_depthwise_conv_kernel_size=31,
    conformer_dropout=0.0,
    # Last vocabulary index (39, the '_' entry in the vocabulary listing).
    pad_token_id=len(HF_CTC_VOCAB) - 1,
    ctc_loss_reduction='mean',
    ctc_zero_infinity=True,
)

In [4]:
# Build the encoder directly from `config`.
encoder = ConformerEncoder(config)

In [5]:
# Construct a GlobalStatsNormalization object from the statistics JSON file.
# NOTE(review): `global_stats` is not referenced by any later cell visible in
# this notebook — confirm whether it was meant to be applied to the mel features.
global_stats = torch_featurization.GlobalStatsNormalization('../../../malay-stats.json')

In [6]:
# Load the two example utterances used to build a toy batch.
# `sr` is assigned by both calls, so only the value returned for the second
# file is kept (presumably both files share the same sample rate — TODO confirm).
y, sr = malaya_speech.load('../../../speech/example-speaker/husein-zolkepli.wav')
y2, sr = malaya_speech.load('../../../speech/example-speaker/shafiqah-idayu.wav')

In [7]:
# Candidate intermediate rates that `downsample` picks from at random.
srs = [4400, 5100, 6000, 8000, 10000]

def downsample(y, sr):
    """Resample `y` from `sr` to a randomly chosen rate in `srs` and back to `sr`.

    The intermediate rate is drawn with `random.choice`, so results vary between
    calls unless the `random` module is seeded by the caller.

    Args:
        y: signal accepted by `malaya_speech.resample`.
        sr: rate of `y`; also the rate of the returned signal.

    Returns:
        The output of the second `malaya_speech.resample` call (back at `sr`).
    """
    intermediate_rate = random.choice(srs)
    reduced = malaya_speech.resample(y, sr, intermediate_rate)
    restored = malaya_speech.resample(reduced, intermediate_rate, sr)
    return restored

In [8]:
# Run the same two-step featurization (`melspectrogram`, then
# `piecewise_linear_log`) on each utterance, in the order y, y2.
mel, mel2 = (
    torch_featurization.piecewise_linear_log(torch_featurization.melspectrogram(wave))
    for wave in (y, y2)
)

In [9]:
# Reference transcripts for the two utterances, in the same order as the audio.
text = ['nama saya husein bin zolkepli', 'nama saya shafiqah idayu']
# Replace each transcript with its per-character vocabulary ids; a character
# missing from HF_CTC_VOCAB_REV raises KeyError.
text = [list(map(HF_CTC_VOCAB_REV.__getitem__, sentence)) for sentence in text]

In [10]:
@dataclass
class DataCollatorCTCWithPadding:
    """Collate variable-length examples into one padded batch.

    Each example is a mapping with an 'inputs' tensor and a 'labels' sequence
    of integer ids. The instance holds no state.
    """

    def __call__(self, features):
        """Pad and stack `features`.

        Args:
            features: sequence of dicts with keys 'inputs' (tensor whose first
                dimension is the length) and 'labels' (sequence of ints).

        Returns:
            dict with:
              'inputs'  - inputs padded with zeros, batch first,
              'lengths' - unpadded first-dimension length of each input,
              'labels'  - labels padded with -100, batch first.
        """
        pad = torch.nn.utils.rnn.pad_sequence

        frames, frame_counts, targets = [], [], []
        for example in features:
            frames.append(example['inputs'])
            frame_counts.append(len(example['inputs']))
            targets.append(torch.tensor(example['labels']))

        return {
            'inputs': pad(frames, batch_first=True),
            'lengths': torch.tensor(frame_counts),
            # -100 marks label positions that are padding rather than real targets.
            'labels': pad(targets, batch_first=True, padding_value=-100),
        }

In [11]:
# The collator takes no constructor arguments; one instance is reused for batching.
collator = DataCollatorCTCWithPadding()

In [13]:
# Pair each utterance's features with its encoded transcript (same order),
# then collate the two examples into a single padded batch and display it.
features = [
    {'inputs': feats, 'labels': label_ids}
    for feats, label_ids in zip((mel, mel2), (text[0], text[1]))
]
batch = collator(features)
batch

{'inputs': tensor([[[ 7.5720,  8.8585, 11.1099,  ..., 14.2707, 14.0441, 14.6217],
          [11.6262, 12.9127, 15.0601,  ..., 14.3029, 13.7059, 14.6847],
          [15.4666, 16.7531, 17.1559,  ..., 12.7871, 13.4699, 14.2108],
          ...,
          [19.2897, 20.5761, 18.5907,  ..., 13.4430, 14.3961, 14.1888],
          [19.9591, 21.2456, 20.8646,  ..., 13.4250, 13.9565, 14.0654],
          [15.8479, 17.1344, 15.3638,  ..., 12.1170, 12.7841, 11.9480]],
 
         [[13.5830, 14.8695, 16.0703,  ..., 16.1097, 15.8851, 15.7696],
          [17.2102, 18.4967, 19.5340,  ..., 16.7923, 16.7401, 17.0030],
          [19.9245, 21.2110, 21.4556,  ..., 17.6996, 17.4917, 17.0634],
          ...,
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]]),
 'lengths': tensor([564, 352]),
 'labels': tensor([[  14,    1,   13,    1,   37,   19,   

In [15]:
%%time

# Timed forward pass of the freshly built encoder on the toy batch. The recorded
# output is a 3-tuple: a scalar tensor with a grad_fn (presumably the CTC loss),
# a 3-D tensor, and the lengths tensor([141, 88]).
encoder(**batch)

CPU times: user 1min 31s, sys: 392 ms, total: 1min 32s
Wall time: 15.2 s


(tensor(12.7206, grad_fn=<MeanBackward0>),
 tensor([[[-0.2438,  0.0271, -1.7649,  ..., -0.2794, -0.4674, -0.0894],
          [ 0.2894,  0.1072, -1.9631,  ..., -0.4485, -0.7009, -0.1556],
          [ 0.0792, -0.1594, -1.7579,  ...,  0.1081, -0.4064, -0.1305],
          ...,
          [ 0.0305, -0.4869, -1.2690,  ..., -0.1699, -0.1268,  0.1189],
          [ 0.1104, -0.1468, -1.1248,  ..., -0.0366,  0.0917,  0.0816],
          [-0.0622, -0.1821, -1.5775,  ..., -0.1951, -0.1715,  0.1808]],
 
         [[-0.0265,  0.1000, -1.8042,  ..., -0.6123, -0.5839, -0.1483],
          [-0.3110, -0.0191, -1.4859,  ..., -0.2943, -0.1898, -0.2667],
          [-0.4550, -0.3622, -1.7746,  ..., -0.2279, -0.3193, -0.2250],
          ...,
          [ 0.2609,  0.7337, -0.9314,  ...,  0.4050,  0.0939, -0.2808],
          [-0.4696,  0.5692, -0.6225,  ...,  0.5003,  0.3942, -0.2715],
          [-0.4144,  0.1144, -0.7467,  ...,  0.8171, -0.1737, -0.2093]]],
        grad_fn=<AddBackward0>),
 tensor([141,  88]))

In [16]:
# Total number of elements across all encoder parameters.
total_params = sum(map(torch.numel, encoder.parameters()))
total_params

3937720

In [17]:
# Export the model to ./out; the recorded directory listing shows config.json,
# conformer.py and model.safetensors.
encoder.save_pretrained('./out')

In [18]:
# List the exported files with human-readable sizes.
!ls -lh out

total 16M
-rw-r--r-- 1 husein husein  600 Jan  27 17:51 config.json
-rw-rw-r-- 1 husein husein 2.5K Jan  27 17:51 conformer.py
-rw-r--r-- 1 husein husein  16M Jan  27 17:51 model.safetensors


In [19]:
# Upload the model to the Hub repository; the recorded progress bar shows a
# model.safetensors upload. Presumably requires Hugging Face credentials with
# write access to be configured beforehand — TODO confirm.
encoder.push_to_hub('huseinzol05/conformer-tiny', safe_serialization = True)

model.safetensors:   0%|          | 0.00/15.8M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/huseinzol05/conformer-tiny/commit/a406da2418c6cf1ff251ca6a585c18c3e5465682', commit_message='Upload ConformerEncoder', commit_description='', oid='a406da2418c6cf1ff251ca6a585c18c3e5465682', pr_url=None, pr_revision=None, pr_num=None)

In [20]:
from transformers import AutoConfig, AutoModel

In [21]:
# Reload the uploaded checkpoint through the Auto* API. `trust_remote_code=True`
# downloads and runs conformer.py from the Hub repository (see the warning in
# the recorded output, which also suggests pinning a revision).
model = AutoModel.from_pretrained('huseinzol05/conformer-tiny', trust_remote_code = True)

config.json:   0%|          | 0.00/600 [00:00<?, ?B/s]

conformer.py:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/huseinzol05/conformer-tiny:
- conformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/15.8M [00:00<?, ?B/s]

In [22]:
# Display the config of the reloaded model.
# NOTE(review): the recorded output shows "conformer_dropout": 0.1, while the
# config cell earlier in this notebook passes conformer_dropout=0.0 — confirm
# which value is intended.
model.config

ConformerConfig {
  "_name_or_path": "huseinzol05/conformer-tiny",
  "architectures": [
    "ConformerEncoder"
  ],
  "auto_map": {
    "AutoConfig": "huseinzol05/conformer-tiny--conformer.ConformerConfig",
    "AutoModel": "huseinzol05/conformer-tiny--conformer.ConformerEncoder"
  },
  "conformer_depthwise_conv_kernel_size": 31,
  "conformer_dropout": 0.1,
  "conformer_ffn_dim": 576,
  "conformer_input_dim": 144,
  "conformer_num_heads": 4,
  "conformer_num_layers": 8,
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": true,
  "input_dim": 80,
  "model_type": "conformer",
  "output_dim": 40,
  "pad_token_id": 39,
  "time_reduction_stride": 4,
  "torch_dtype": "float32",
  "transformers_version": "4.35.2"
}

In [23]:
%%time

encoder(**batch)

CPU times: user 1min 31s, sys: 320 ms, total: 1min 31s
Wall time: 15.6 s


(tensor(12.6361, grad_fn=<MeanBackward0>),
 tensor([[[ 0.2286,  0.0527, -1.7409,  ..., -0.2106, -0.2741,  0.0191],
          [ 0.0960,  0.2160, -1.7904,  ...,  0.1017, -0.6706, -0.3078],
          [ 0.1767,  0.1828, -1.2174,  ..., -0.1274, -0.2793, -0.2420],
          ...,
          [-0.0634, -0.1707, -1.4293,  ..., -0.4739, -0.1507, -0.0820],
          [-0.2105, -0.0901, -1.5113,  ..., -0.2044, -0.0864,  0.3093],
          [ 0.2034, -0.0491, -1.4539,  ...,  0.0712, -0.5379, -0.2450]],
 
         [[ 0.2148,  0.1968, -1.6583,  ..., -0.6372, -0.2747, -0.2175],
          [-0.3147,  0.0349, -1.9579,  ..., -0.3642, -0.4155, -0.2894],
          [ 0.2023,  0.3942, -1.9186,  ..., -0.1949, -0.2248, -0.1704],
          ...,
          [ 0.2204, -0.0192, -0.5200,  ...,  0.9203,  0.4959,  0.5512],
          [-0.2399,  0.0558, -1.2872,  ...,  0.6316,  0.2812, -0.7079],
          [ 0.3227,  0.2585, -1.1684,  ...,  0.5002,  0.1720, -0.4194]]],
        grad_fn=<AddBackward0>),
 tensor([141,  88]))