In [1]:
import os

# Expose no CUDA devices to this process by setting the variable to an empty string.
os.environ.update(CUDA_VISIBLE_DEVICES='')

In [2]:
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import transformers
from transformers import (
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    is_apex_available,
    set_seed,
    AutoModelForCTC,
    TFWav2Vec2ForCTC,
    TFWav2Vec2PreTrainedModel,
    Wav2Vec2PreTrainedModel,
)

In [4]:
import string
import json

# CTC vocabulary: an empty-string entry first, then a-z, then 0-9, then the space.
CTC_VOCAB = ['', *string.ascii_lowercase, *string.digits, ' ']

In [5]:
# Map every vocabulary token to its position in CTC_VOCAB.
vocab_dict = {token: index for index, token in enumerate(CTC_VOCAB)}
# Re-key the space entry as "|" while keeping its id; the entry stays last in the dict.
vocab_dict["|"] = vocab_dict.pop(" ")
vocab_dict

{'': 0,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '0': 27,
 '1': 28,
 '2': 29,
 '3': 30,
 '4': 31,
 '5': 32,
 '6': 33,
 '7': 34,
 '8': 35,
 '9': 36,
 '|': 37}

In [6]:
# Serialize the token -> id mapping and write it to the vocabulary file.
vocab_json = json.dumps(vocab_dict)
with open("vocab-huggingface-ctc.json", "w") as vocab_file:
    vocab_file.write(vocab_json)

In [7]:
# Build the CTC tokenizer from the vocabulary file written above.
tokenizer = Wav2Vec2CTCTokenizer(vocab_file='vocab-huggingface-ctc.json')

In [8]:
# Sanity check: encode a sample phrase with the CTC tokenizer and display the result.
tokenizer('saya suka')

{'input_ids': [19, 1, 25, 1, 37, 19, 21, 11, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
# Feature extractor for single-channel 16 kHz audio: normalizes the waveform,
# pads with 0.0 and returns an attention mask.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16_000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)
# Bundle the tokenizer and the feature extractor into a single processor.
processor = Wav2Vec2Processor(
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
)

In [10]:
import soundfile as sf
import librosa
from glob import glob
# glob() returns matches in filesystem-dependent order; sort them so that
# wavs[0] (the sample loaded below) is the same file on every run.
wavs = sorted(glob('*.wav'))
wavs[:1]

['125-y_.wav']

In [11]:
# Load the first wav file at 16 kHz; the returned sampling rate is discarded.
y, _ = librosa.load(path=wavs[0], sr=16000)

In [12]:
# Pass the sampling rate explicitly (the audio was loaded at 16 kHz and the
# feature extractor is configured for 16 kHz); omitting it triggers the
# processor's "strongly recommended to pass sampling_rate" warning.
processor(y, sampling_rate=16000)

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


{'input_values': [array([-5.7139091e-02,  2.1197377e-01,  4.7779882e-01, ...,
       -1.4552543e+00,  6.8315369e-01,  3.5841382e-05], dtype=float32)], 'attention_mask': [array([1, 1, 1, ..., 1, 1, 1], dtype=int32)]}

In [13]:
# Encode a sample transcript while the processor is switched to its target mode,
# then print the resulting token ids.
with processor.as_target_processor():
    t = processor('hello saya busuk')["input_ids"]
print(t)

[8, 5, 12, 12, 15, 37, 19, 1, 25, 1, 37, 2, 21, 19, 21, 11]


In [14]:
# Load the pretrained XLS-R checkpoint with a CTC head sized to the tokenizer;
# the pad token id and vocabulary size are taken from the processor's tokenizer.
model = AutoModelForCTC.from_pretrained(
    'facebook/wav2vec2-xls-r-300m',
    vocab_size=len(processor.tokenizer),
    pad_token_id=processor.tokenizer.pad_token_id,
    ctc_loss_reduction="mean",
)

Some weights of the model checkpoint at facebook/wav2vec2-xls-r-300m were not used when initializing Wav2Vec2ForCTC: ['quantizer.codevectors', 'quantizer.weight_proj.bias', 'quantizer.weight_proj.weight', 'project_q.weight', 'project_hid.weight', 'project_hid.bias', 'project_q.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it 

In [26]:
# Save the PyTorch model to disk so it can be reloaded from this directory.
model.save_pretrained(save_directory='out-hf-wav2vec2')

In [27]:
# Load the saved PyTorch weights into the TensorFlow CTC model (from_pt=True),
# using the same head settings as the PyTorch model.
model_tf = TFWav2Vec2ForCTC.from_pretrained(
    './out-hf-wav2vec2',
    from_pt=True,
    vocab_size=len(processor.tokenizer),
    pad_token_id=processor.tokenizer.pad_token_id,
    ctc_loss_reduction="mean",
)


TFWav2Vec2ForCTC has backpropagation operations that are NOT supported on CPU. If you wish to train/fine-tine this model, you need a GPU or a TPU
All PyTorch model weights were used when initializing TFWav2Vec2ForCTC.

All the weights of TFWav2Vec2ForCTC were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFWav2Vec2ForCTC for predictions without further training.


In [None]:
# Freeze the PyTorch model's feature encoder (per the transformers docs this
# disables gradient updates for it during training).
# NOTE(review): this cell has no execution count, so it was not run in the saved
# session; it also sits after the model was already saved and converted.
model.freeze_feature_encoder()

In [17]:
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the audio inputs and the CTC labels of a batch.

    Audio inputs and labels have different lengths, so they are padded in two
    separate ``processor.pad`` calls; padded label positions are then replaced
    with ``-100``.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`):
            The processor used for processing the data. Its ``pad`` method is called once for the
            audio inputs and once, inside ``as_target_processor()``, for the labels.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`'longest'`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'` (default): Pad to the longest sequence in the batch (or no padding
              if only a single sequence is provided).
            * :obj:`'max_length'`: Pad to the maximum acceptable input length for the model.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set, pad the ``input_values`` to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            If set, pad the ``labels`` to a multiple of the provided value.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = "longest"
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """
        Collate per-example features into one padded batch.

        Args:
            features: One dict per example, each holding the audio under ``"input_values"``
                and the label token ids under ``"labels"``.

        Returns:
            The padded batch returned by ``processor.pad`` for the audio inputs, with an
            added ``"labels"`` entry whose padded positions are set to ``-100``.
        """
        # Split inputs and labels since they have different lengths and need
        # different padding methods. Only the audio goes into the input features:
        # the labels must not be passed along as an extra, unpadded "input_ids" entry.
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Inside this context the processor's pad() is applied to the labels.
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # Replace padding with -100 so those positions are ignored by the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [18]:
# Instantiate the collator with its default settings (padding="longest",
# no pad-to-multiple constraints).
data_collator = DataCollatorCTCWithPadding(processor=processor)

In [19]:
# Extract features from the audio. Pass the sampling rate explicitly (the audio
# was loaded at 16 kHz) so the processor does not emit its missing
# ``sampling_rate`` warning.
y_ = processor(y, sampling_rate=16000)
# Encode the transcript while the processor is in target mode.
with processor.as_target_processor():
    t = processor('hello saya busuk').input_ids
# Attach the label ids to the processed audio features.
y_['labels'] = t

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [None]:
# Display the processed features together with the labels attached above.
y_

In [None]:
# Display the first (and only) entry of the batched ``input_values`` list.
y_['input_values'][0]

In [None]:
# The collator expects one dict per example, but ``y_["input_values"]`` is a
# list holding a single array (the processor adds a batch dimension). Pass the
# unbatched array together with the label ids instead of the batched output.
data_collator([{"input_values": y_["input_values"][0], "labels": y_["labels"]}])

In [32]:
import numpy as np
import tensorflow as tf

In [29]:
# Run the PyTorch model for inference only: no_grad() avoids building an
# autograd graph, so the logits can be converted to NumPy directly.
with torch.no_grad():
    pt_outputs = model(torch.from_numpy(np.array(y_['input_values'])))
# Keep the model output and the logits array under separate names.
o_pt = pt_outputs.logits.numpy()

In [30]:
# Display the PyTorch logits array.
o_pt

array([[[ 0.00495598,  0.05559233,  0.14905328, ...,  0.13361752,
         -0.08622889, -0.03216675],
        [-0.03061592,  0.04589646,  0.08363733, ..., -0.0310466 ,
          0.2074907 , -0.08179823],
        [-0.02435615,  0.02852438,  0.06701366, ..., -0.04197481,
          0.24878341, -0.07059664],
        ...,
        [ 0.00149082, -0.01015435,  0.08006284, ..., -0.09033568,
          0.2353898 , -0.00996168],
        [-0.02471039, -0.00597402,  0.07611306, ..., -0.04074758,
          0.11411414,  0.0073104 ],
        [ 0.03524744, -0.00614068,  0.11885092, ...,  0.0342891 ,
         -0.03239059,  0.0335536 ]]], dtype=float32)

In [33]:
# Feed the same input values to the TensorFlow model and keep its logits as a NumPy array.
tf_inputs = tf.convert_to_tensor(np.array(y_['input_values']))
o_tf = model_tf(tf_inputs).logits.numpy()

In [34]:
# Highest-scoring token id per frame, from the PyTorch logits.
o_pt.argmax(axis=-1)

array([[30, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
        36, 36, 36, 28, 28, 36, 10, 36, 36, 36, 36, 36, 36, 36, 30, 10,
        36, 36, 36, 36, 36, 36, 36, 36, 36, 30, 30, 30, 27, 36, 36, 36,
        36, 36, 36, 36, 36, 36, 36, 30, 36, 36, 36, 36, 36, 36, 36, 36,
        36, 36, 36, 28, 28, 30, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
        36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
        36, 36, 28, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
        36, 22, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
        36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
        36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 30, 30, 36, 36,
        36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
        36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
        36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
        36, 36, 12, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 

In [35]:
# Highest-scoring token id per frame, from the TensorFlow logits (compare with the PyTorch result).
o_tf.argmax(axis=-1)

array([[30, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
        36, 36, 36, 28, 28, 36, 10, 36, 36, 36, 36, 36, 36, 36, 30, 10,
        36, 36, 36, 36, 36, 36, 36, 36, 36, 30, 30, 30, 27, 36, 36, 36,
        36, 36, 36, 36, 36, 36, 36, 30, 36, 36, 36, 36, 36, 36, 36, 36,
        36, 36, 36, 28, 28, 30, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
        36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
        36, 36, 28, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
        36, 22, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
        36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
        36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 30, 30, 36, 36,
        36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
        36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
        36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
        36, 36, 12, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 