In [1]:
import os

# Hide every CUDA device so the models below run on CPU.
# This is set before the torch import in the next cell.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import transformers
from transformers import (
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    is_apex_available,
    set_seed,
    AutoModelForCTC,
    TFWav2Vec2ForCTC,
    TFWav2Vec2PreTrainedModel,
    Wav2Vec2PreTrainedModel,
)

In [4]:
import string
import json

# CTC symbol table: an empty-string entry at index 0, then a-z, then 0-9,
# and finally the space character.
CTC_VOCAB = ['', *string.ascii_lowercase, *string.digits, ' ']

In [5]:
# Map every symbol in CTC_VOCAB to its position in the list.
vocab_dict = {token: index for index, token in enumerate(CTC_VOCAB)}

# The tokenizer uses "|" as the word delimiter, so the space entry is
# re-keyed to "|" while keeping the same id.
vocab_dict["|"] = vocab_dict.pop(" ")

# Special tokens take the next free ids, in this order.
for special_token in ("[UNK]", "[PAD]"):
    vocab_dict[special_token] = len(vocab_dict)

# Wav2Vec2CTCTokenizer is constructed from a vocab file on disk.
with open("ctc-vocab.json", "w") as vocab_file:
    json.dump(vocab_dict, vocab_file)

tokenizer = Wav2Vec2CTCTokenizer(
    "ctc-vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

In [6]:
import soundfile as sf
import librosa
from glob import glob
import numpy as np

In [7]:
# Load the two test recordings at a 16 kHz sample rate; the sample rate
# returned by librosa.load is discarded.
y, _ = librosa.load('shafiqah-idayu.wav', sr = 16000)
y2, _ = librosa.load('husein-zolkepli.wav', sr = 16000)

In [8]:
def norm_audio(x):
    """Standardise `x` to zero mean and (approximately) unit variance.

    A small constant (1e-7) is added to the variance before taking the
    square root so that a constant signal does not cause a division by zero.
    """
    centred = x - x.mean()
    scale = np.sqrt(x.var() + 1e-7)
    return centred / scale

def sequence_1d(
    seq, maxlen=None, padding: str = 'post', pad_int=0, return_len=False
):
    """
    Pad a batch of 1-D sequences to a common length and stack them into one array.

    Parameters
    ----------
    seq : iterable of sequences
        Each item may be a list, a tuple, an ``np.ndarray`` or any other
        iterable; it is converted to a plain list before padding.
    maxlen : int, optional
        Target length of every row. When falsy (``None`` or ``0``) the length
        of the longest item is used. Items longer than ``maxlen`` are NOT
        truncated, so the rows will be ragged in that case.
    padding : str
        ``'post'`` appends the padding, ``'pre'`` prepends it.
    pad_int : scalar
        Value used to fill the padded positions.
    return_len : bool
        When True, also return the original (unpadded) length of every item.

    Returns
    -------
    np.ndarray
        The padded batch; an empty array when ``seq`` is empty.
    (np.ndarray, list of int)
        Padded batch and original lengths, when ``return_len`` is True.

    Raises
    ------
    ValueError
        If ``padding`` is neither ``'post'`` nor ``'pre'``.
    """
    if padding not in ['post', 'pre']:
        raise ValueError('padding only supported [`post`, `pre`]')

    # Materialise every row exactly once: this lets generators be measured and
    # then padded, and lets tuples be concatenated with the list of pad values.
    rows = [s.tolist() if isinstance(s, np.ndarray) else list(s) for s in seq]
    length = [len(row) for row in rows]

    if not maxlen:
        # default=0 keeps an empty batch from raising inside max().
        maxlen = max(length, default=0)

    padded_seqs = []
    for row in rows:
        # A negative count yields an empty filler, so over-long rows are kept as-is.
        filler = [pad_int] * (maxlen - len(row))
        padded_seqs.append(row + filler if padding == 'post' else filler + row)

    if return_len:
        return np.array(padded_seqs), length
    return np.array(padded_seqs)

# Zero-pad both clips to a common length and keep each clip's true sample count.
batch, lens = sequence_1d([y, y2], return_len=True)

# Attention mask: 1 over real samples, 0 over the padded tail.
attentions = sequence_1d([[1] * n_samples for n_samples in lens])

# Standardise every clip with the mean/variance of its real samples only,
# then reset the padded tail to exactly zero.
normed_input_values = []
for padded_clip, n_valid in zip(batch, attentions.sum(-1)):
    real_part = padded_clip[:n_valid]
    standardised = (padded_clip - real_part.mean()) / np.sqrt(real_part.var() + 1e-7)
    # No-op when the clip already fills the whole row (the slice is empty).
    standardised[n_valid:] = 0.0
    normed_input_values.append(standardised)

normed_input_values = np.array(normed_input_values)

In [10]:
# List the files saved in every checkpoint directory under wav2vec2-mixed/.
!ls wav2vec2-mixed/checkpoint-*

wav2vec2-mixed/checkpoint-52500:
config.json		  pytorch_model.bin  scheduler.pt
optimizer.pt		  rng_state.pth      trainer_state.json
preprocessor_config.json  scaler.pt	     training_args.bin

wav2vec2-mixed/checkpoint-55000:
config.json		  pytorch_model.bin  scheduler.pt
optimizer.pt		  rng_state.pth      trainer_state.json
preprocessor_config.json  scaler.pt	     training_args.bin

wav2vec2-mixed/checkpoint-57500:
config.json		  pytorch_model.bin  scheduler.pt
optimizer.pt		  rng_state.pth      trainer_state.json
preprocessor_config.json  scaler.pt	     training_args.bin


In [11]:
# Keyword overrides forwarded to from_pretrained so the CTC head settings
# agree with the tokenizer built earlier in this notebook.
ctc_overrides = dict(
    ctc_loss_reduction="mean",
    pad_token_id=tokenizer.pad_token_id,
    vocab_size=len(tokenizer),
)

# Load the highest-numbered checkpoint shown in the listing above.
model = AutoModelForCTC.from_pretrained(
    './wav2vec2-mixed/checkpoint-57500', **ctc_overrides
)

In [12]:
# Put the model in evaluation mode; eval() returns the module, so its repr
# is what this cell displays.
model.eval()

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (2): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (3): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elemen

In [13]:
# Forward pass through the PyTorch model. This is inference only, so it runs
# under torch.no_grad() to avoid building an autograd graph and holding on to
# intermediate activations. Inputs are cast to float32 before being handed
# to torch.
with torch.no_grad():
    o_pt = model(
        torch.from_numpy(normed_input_values.astype(np.float32)),
        attention_mask=torch.from_numpy(attentions),
    )

In [14]:
# Pull the logits out of the model output and convert them to a NumPy array.
# NOTE(review): this rebinds `o_pt` from the model output object to an ndarray,
# so re-running this cell on its own fails; re-run the forward-pass cell first.
o_pt = o_pt.logits.detach().numpy()

In [15]:
o_pt

array([[[-16.23078   ,  -2.1071203 ,  -3.3782234 , ...,  -0.91776204,
         -16.008984  ,   8.199033  ],
        [-16.324038  ,  -2.164919  ,  -3.44497   , ...,  -0.9603425 ,
         -16.101015  ,   8.311853  ],
        [-16.363346  ,  -2.1864486 ,  -3.4725559 , ...,  -0.9760332 ,
         -16.13961   ,   8.3553705 ],
        ...,
        [-16.16172   ,  -2.0588074 ,  -3.3299544 , ...,  -0.8789096 ,
         -15.94082   ,   8.109013  ],
        [-16.16172   ,  -2.0588074 ,  -3.3299544 , ...,  -0.8789096 ,
         -15.94082   ,   8.109013  ],
        [-16.16172   ,  -2.0588074 ,  -3.3299544 , ...,  -0.8789096 ,
         -15.94082   ,   8.109013  ]],

       [[-16.297354  ,  -2.1470547 ,  -3.4280746 , ...,  -0.94634223,
         -16.07446   ,   8.2803    ],
        [-16.391087  ,  -2.2045054 ,  -3.4949186 , ...,  -0.9892204 ,
         -16.166883  ,   8.393223  ],
        [-16.399641  ,  -2.2067604 ,  -3.5011625 , ...,  -0.9908891 ,
         -16.175293  ,   8.400339  ],
        ...,


In [16]:
# Pick the highest-scoring token id along the last axis of the logits.
pred_ids = o_pt.argmax(axis=-1)

In [17]:
# Turn the predicted token ids into text, one transcript per clip.
tokenizer.batch_decode(pred_ids)

['nama saya syafikah idayu', 'testing nama saya husin bin zolkepli']

In [18]:
# Upload the PyTorch weights to the Hugging Face Hub repository.
# NOTE(review): newer transformers releases appear to deprecate `organization=`
# in favour of a full "org/name" repo id; confirm against the installed version.
model.push_to_hub('wav2vec2-xls-r-300m-mixed', organization='malay-huggingface')

Upload file pytorch_model.bin:  99%|█████████▉| 1.17G/1.18G [02:28<00:00, 8.55MB/s]remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/malay-huggingface/wav2vec2-xls-r-300m-mixed
   cc0be82..8fe3344  main -> main

Upload file pytorch_model.bin: 100%|██████████| 1.18G/1.18G [02:32<00:00, 8.26MB/s]


'https://huggingface.co/malay-huggingface/wav2vec2-xls-r-300m-mixed/commit/8fe3344caaa550e5514236a59a9ba6949b923037'

In [19]:
# Same checkpoint and CTC head settings as the PyTorch load; from_pt=True
# asks transformers to convert the PyTorch weights into the TensorFlow model.
tf_load_kwargs = dict(
    ctc_loss_reduction="mean",
    pad_token_id=tokenizer.pad_token_id,
    vocab_size=len(tokenizer),
    from_pt=True,
)

model_tf = TFWav2Vec2ForCTC.from_pretrained(
    './wav2vec2-mixed/checkpoint-57500', **tf_load_kwargs
)


TFWav2Vec2ForCTC has backpropagation operations that are NOT supported on CPU. If you wish to train/fine-tine this model, you need a GPU or a TPU
All PyTorch model weights were used when initializing TFWav2Vec2ForCTC.

All the weights of TFWav2Vec2ForCTC were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFWav2Vec2ForCTC for predictions without further training.


In [21]:
# Run the same padded, normalised batch through the TensorFlow model.
# NOTE(review): unlike the PyTorch cell, the inputs are passed without an
# explicit float32 cast; confirm the dtype TensorFlow receives is intended.
o = model_tf(normed_input_values, attention_mask = attentions)

In [22]:
# Pick the highest-scoring token id along the last axis of the TensorFlow
# logits, then decode the ids into transcripts (displayed as the cell output).
tf_logits = o.logits.numpy()
pred_ids = tf_logits.argmax(axis=-1)
tokenizer.batch_decode(pred_ids)

['nama saya syafikah idayu', 'testing nama saya husin bin zolkepli']

In [23]:
# Upload the converted TensorFlow weights to the same Hub repository.
# NOTE(review): newer transformers releases appear to deprecate `organization=`
# in favour of a full "org/name" repo id; confirm against the installed version.
model_tf.push_to_hub('wav2vec2-xls-r-300m-mixed', organization='malay-huggingface')

Upload file tf_model.h5: 100%|█████████▉| 1.18G/1.18G [02:32<00:00, 8.83MB/s]remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/malay-huggingface/wav2vec2-xls-r-300m-mixed
   8fe3344..230c71e  main -> main

Upload file tf_model.h5: 100%|██████████| 1.18G/1.18G [02:36<00:00, 8.06MB/s]


'https://huggingface.co/malay-huggingface/wav2vec2-xls-r-300m-mixed/commit/230c71ee381907bc5ea96809e40cdeb2250f416b'

In [37]:
# `processor` is not created in any visible cell (execution jumps from
# In [23] to In [37]), so a fresh Restart & Run All would stop here with a
# NameError. Rebuild it from the feature-extractor config saved with the
# checkpoint (preprocessor_config.json) and the tokenizer defined earlier.
# NOTE(review): confirm this matches the processor that was originally pushed.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    './wav2vec2-mixed/checkpoint-57500'
)
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

# Upload the feature-extractor config and tokenizer files to the Hub repository.
processor.push_to_hub('wav2vec2-xls-r-300m-mixed', organization='malay-huggingface')

remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/malay-huggingface/wav2vec2-xls-r-300m-mixed
   c4c38fd..816c09b  main -> main



'https://huggingface.co/malay-huggingface/wav2vec2-xls-r-300m-mixed/commit/816c09b7362218b0e2a7ce7e79391f660407648c'