In [1]:
from torchaudio.models import Conformer
from malaya_speech.utils import torch_featurization
import torch
import malaya_speech
from typing import List, Tuple, Optional
from torchaudio.models.rnnt import _TimeReduction
from malaya_speech.utils.char import HF_CTC_VOCAB
from transformers import PretrainedConfig, PreTrainedModel

# Extend the imported vocabulary with '_' as its final entry; the CTC loss
# cell in this notebook uses the last index as the blank token.
HF_CTC_VOCAB = [*HF_CTC_VOCAB, '_']
# index -> character
HF_CTC_VOCAB_INDEX = dict(enumerate(HF_CTC_VOCAB))
# character -> index
HF_CTC_VOCAB_REV = {char: idx for idx, char in HF_CTC_VOCAB_INDEX.items()}

  def backtrace(trace: np.ndarray):
`pyaudio` is not available, `malaya_speech.streaming.pyaudio` is not able to use.


In [2]:
class ConformerConfig(PretrainedConfig):
    """Configuration holding the hyperparameters read by ``ConformerEncoder``.

    Every hyperparameter is an explicit keyword argument with a default, so a
    bare ``ConformerConfig()`` yields a config the encoder can be built from.
    The defaults mirror the values used in this notebook. Any additional
    keyword arguments are forwarded to ``PretrainedConfig``.

    Args:
        input_dim: Feature dimension of each input frame.
        output_dim: Size of the final projection (number of output classes).
        time_reduction_stride: Stride passed to the time-reduction layer.
        conformer_input_dim: Model dimension fed to the Conformer stack.
        conformer_ffn_dim: Hidden dimension of the Conformer feed-forward modules.
        conformer_num_layers: Number of Conformer layers.
        conformer_num_heads: Number of attention heads per Conformer layer.
        conformer_depthwise_conv_kernel_size: Kernel size of the depthwise
            convolution inside each Conformer layer.
        conformer_dropout: Dropout probability used by the Conformer stack.
    """

    model_type = 'conformer'

    def __init__(
        self,
        input_dim=80,
        output_dim=40,
        time_reduction_stride=4,
        conformer_input_dim=144,
        conformer_ffn_dim=576,
        conformer_num_layers=2,
        conformer_num_heads=4,
        conformer_depthwise_conv_kernel_size=31,
        conformer_dropout=0.1,
        **kwargs,
    ):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.time_reduction_stride = time_reduction_stride
        self.conformer_input_dim = conformer_input_dim
        self.conformer_ffn_dim = conformer_ffn_dim
        self.conformer_num_layers = conformer_num_layers
        self.conformer_num_heads = conformer_num_heads
        self.conformer_depthwise_conv_kernel_size = conformer_depthwise_conv_kernel_size
        self.conformer_dropout = conformer_dropout
        # Remaining keyword arguments are handled by the base configuration.
        super().__init__(**kwargs)

class ConformerEncoder(PreTrainedModel):
    """Conformer encoder packaged as a Hugging Face ``PreTrainedModel``.

    Processing order: time reduction -> linear projection to the Conformer
    dimension -> torchaudio ``Conformer`` stack -> linear projection to
    ``config.output_dim``.
    """

    # Tells `from_pretrained` which configuration class to instantiate from a
    # saved `config.json` when no config object is passed explicitly.
    config_class = ConformerConfig

    def __init__(
        self,
        config,
    ) -> None:
        """Build the sub-modules from the hyperparameters stored on ``config``.

        Args:
            config: Configuration object exposing ``input_dim``, ``output_dim``,
                ``time_reduction_stride`` and the ``conformer_*`` attributes.
        """
        super().__init__(config)
        self.time_reduction = _TimeReduction(config.time_reduction_stride)
        # The projection consumes `input_dim * stride` features per frame,
        # i.e. the feature width after time reduction.
        self.input_linear = torch.nn.Linear(
            config.input_dim * config.time_reduction_stride,
            config.conformer_input_dim,
        )
        self.conformer = Conformer(
            num_layers=config.conformer_num_layers,
            input_dim=config.conformer_input_dim,
            ffn_dim=config.conformer_ffn_dim,
            num_heads=config.conformer_num_heads,
            depthwise_conv_kernel_size=config.conformer_depthwise_conv_kernel_size,
            dropout=config.conformer_dropout,
            use_group_norm=True,
            convolution_first=True,
        )
        self.output_linear = torch.nn.Linear(config.conformer_input_dim, config.output_dim)

    def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Encode a batch of feature sequences.

        Args:
            input: Feature tensor; this notebook calls it with shape
                (batch, frames, input_dim).
            lengths: Number of valid frames for each sequence in ``input``.

        Returns:
            A tuple ``(output, output_lengths)``: the projected encoder output
            with last dimension ``config.output_dim``, and the sequence lengths
            reported by the Conformer stack after time reduction.
        """
        time_reduction_out, time_reduction_lengths = self.time_reduction(input, lengths)
        input_linear_out = self.input_linear(time_reduction_out)
        x, lengths = self.conformer(input_linear_out, time_reduction_lengths)
        output_linear_out = self.output_linear(x)
        return output_linear_out, lengths

In [3]:
# Encoder hyperparameters, gathered in one mapping and unpacked as keyword
# arguments. The output size follows the vocabulary length.
conformer_hparams = {
    'input_dim': 80,
    'output_dim': len(HF_CTC_VOCAB),
    'time_reduction_stride': 4,
    'conformer_input_dim': 144,
    'conformer_ffn_dim': 576,
    'conformer_num_layers': 2,
    'conformer_num_heads': 4,
    'conformer_depthwise_conv_kernel_size': 31,
    'conformer_dropout': 0.1,
}
config = ConformerConfig(**conformer_hparams)

In [4]:
# Instantiate the encoder from the configuration built in the previous cell.
encoder = ConformerEncoder(config=config)

In [5]:
# Build the global-statistics normalizer from 'malay-stats.json'.
# NOTE(review): `global_stats` is not referenced by any later cell visible in
# this notebook; the mel features are passed to the encoder without it.
# Confirm whether normalization was meant to be applied.
global_stats = torch_featurization.GlobalStatsNormalization('malay-stats.json')

In [6]:
# Load the example utterance, then display the number of samples divided by
# `sr` (presumably the sample rate, which would make this the duration in seconds).
y, sr = malaya_speech.load('speech/example-speaker/husein-zolkepli.wav')
len(y) / sr

5.630625

In [7]:
# Mel spectrogram of the waveform followed by the piecewise-linear log
# transform, composed in a single expression.
mel = torch_featurization.piecewise_linear_log(
    torch_featurization.melspectrogram(y)
)

In [8]:
%%time

# Add a leading batch axis to the features and pass the frame count as the
# only entry of the lengths tensor.
logits, lengths = encoder(
    input=mel[None],
    lengths=torch.tensor([mel.size(0)]),
)

CPU times: user 4.67 s, sys: 36.3 ms, total: 4.71 s
Wall time: 868 ms


In [9]:
# Raw transcripts for the batch; each character is mapped to its vocabulary id.
transcripts = ['nama saya husein bin zolkepli']
text = [[HF_CTC_VOCAB_REV[c] for c in t] for t in transcripts]
# Pad to a rectangular (batch, max_len) tensor so transcripts of different
# lengths are supported. The padding value is negative, so the `labels >= 0`
# mask used later selects only the real targets. For a single transcript the
# result equals `torch.tensor(text)`.
labels = torch.nn.utils.rnn.pad_sequence(
    [torch.tensor(ids, dtype=torch.long) for ids in text],
    batch_first=True,
    padding_value=-100,
)

In [10]:
from torch import nn

In [11]:
# Log-softmax over the vocabulary axis in float32, then swap the first two
# axes so the result is laid out as (frames, batch, vocab).
log_probs = torch.log_softmax(logits, dim=-1, dtype=torch.float32).permute(1, 0, 2)

In [12]:
# Display the shape of the log-probabilities after the axis swap.
log_probs.shape

torch.Size([141, 1, 40])

In [13]:
# Display the shape of the raw encoder output for comparison with `log_probs`.
logits.shape

torch.Size([1, 141, 40])

In [14]:
# Entries with a non-negative id are real targets; negative ids count as padding.
labels_mask = labels.ge(0)
# Number of real targets per transcript.
target_lengths = labels_mask.sum(dim=-1)
# All real targets concatenated into one 1-D tensor.
flattened_targets = labels[labels_mask]

In [15]:
# Recompute the log-probabilities so this cell does not depend on an earlier
# cell having produced `log_probs`: log-softmax over the vocabulary axis in
# float32, laid out as (frames, batch, vocab).
log_probs = logits.log_softmax(dim=-1, dtype=torch.float32).transpose(0, 1)

# cuDNN is switched off while the loss is computed.
with torch.backends.cudnn.flags(enabled=False):
    loss = nn.functional.ctc_loss(
        log_probs=log_probs,
        targets=flattened_targets,
        input_lengths=lengths,
        target_lengths=target_lengths,
        # The blank token is the last vocabulary entry.
        blank=len(HF_CTC_VOCAB_INDEX) - 1,
        reduction='mean',
        zero_infinity=True,
    )

loss

tensor(11.8623, grad_fn=<MeanBackward0>)

In [16]:
# Count the scalar parameters across every parameter tensor of the encoder.
total_params = sum(p.numel() for p in encoder.parameters())
total_params

1023448

In [17]:
# Write the model configuration and weights into the ./out directory.
encoder.save_pretrained(save_directory='./out')

In [18]:
# List the files written by `save_pretrained`, with human-readable sizes.
!ls -lh out

total 4.0M
-rw-r--r-- 1 husein husein  403 Jan  17 13:06 config.json
-rw-r--r-- 1 husein husein 4.0M Jan  17 13:06 model.safetensors
