In [2]:
# Shell escape: run `nvidia-smi` to show the GPU/driver status of the machine this kernel runs on.
!nvidia-smi

Wed Apr  6 14:59:03 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.00    Driver Version: 470.82.00    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0  On |                  N/A |
| N/A   58C    P8    20W /  N/A |    718MiB / 16125MiB |      9%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Project-local decoder module. `BeamCTCDecoder` and `GreedyDecoder`, used when
# the decoders are built, are presumably provided by this star import.
from decoder import *
import utils
# Imported explicitly: `torch` is referenced right below, and relying on the
# star import above to leak it into the namespace breaks as soon as `decoder`
# changes its imports or defines `__all__`.
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2CTCTokenizer

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [3]:
# Restore the processor and the fine-tuned CTC model from their local
# checkpoint directories (the two loads are independent of each other).
processor = Wav2Vec2Processor.from_pretrained('./processor/')
model = Wav2Vec2ForCTC.from_pretrained('./saved_model/')

In [4]:
# Token -> index mapping as reported by the processor's tokenizer.
vocab_dict = processor.tokenizer.get_vocab()
print('Vocab Dict:', vocab_dict)

Vocab Dict: {'G': 0, 'A': 1, 'E': 2, '-': 3, "'": 4, 'Y': 5, 'S': 6, 'K': 7, 'O': 8, 'L': 9, 'Z': 10, 'H': 11, 'Q': 12, 'W': 13, 'T': 14, 'J': 15, 'B': 16, 'D': 17, 'U': 18, 'M': 19, 'F': 21, 'C': 22, 'P': 23, 'I': 24, '#': 25, 'R': 26, 'V': 27, 'X': 28, 'N': 29, '|': 20, '[UNK]': 30, '[PAD]': 31, '<s>': 32, '</s>': 33}


In [5]:
# Flip the mapping into (index, token) pairs and order them by ascending index.
sort_vocab = [(index, token) for token, index in vocab_dict.items()]
sort_vocab.sort()
print(sort_vocab)

[(0, 'G'), (1, 'A'), (2, 'E'), (3, '-'), (4, "'"), (5, 'Y'), (6, 'S'), (7, 'K'), (8, 'O'), (9, 'L'), (10, 'Z'), (11, 'H'), (12, 'Q'), (13, 'W'), (14, 'T'), (15, 'J'), (16, 'B'), (17, 'D'), (18, 'U'), (19, 'M'), (20, '|'), (21, 'F'), (22, 'C'), (23, 'P'), (24, 'I'), (25, '#'), (26, 'R'), (27, 'V'), (28, 'X'), (29, 'N'), (30, '[UNK]'), (31, '[PAD]'), (32, '<s>'), (33, '</s>')]


In [6]:
# Drop the indices and keep only the tokens, in ascending index order.
# The tokens themselves are copied as-is (no case conversion happens here).
vocab = [token for _, token in sort_vocab]

print(vocab)

['G', 'A', 'E', '-', "'", 'Y', 'S', 'K', 'O', 'L', 'Z', 'H', 'Q', 'W', 'T', 'J', 'B', 'D', 'U', 'M', '|', 'F', 'C', 'P', 'I', '#', 'R', 'V', 'X', 'N', '[UNK]', '[PAD]', '<s>', '</s>']


In [7]:
# Inspect which token the tokenizer uses to mark word boundaries.
processor.tokenizer.word_delimiter_token

'|'

In [8]:
# Replace the tokenizer's word-delimiter token with a white space, since the
# white space is what the decoders use as word separator.
# The list is rebuilt with a comprehension so the cell can be re-run safely:
# once the delimiter has been replaced nothing matches any more and the list is
# left unchanged, whereas `vocab.index(delimiter)` would raise ValueError on a
# second run.
word_delimiter = processor.tokenizer.word_delimiter_token
vocab = [' ' if token == word_delimiter else token for token in vocab]
print(vocab)

['G', 'A', 'E', '-', "'", 'Y', 'S', 'K', 'O', 'L', 'Z', 'H', 'Q', 'W', 'T', 'J', 'B', 'D', 'U', 'M', ' ', 'F', 'C', 'P', 'I', '#', 'R', 'V', 'X', 'N', '[UNK]', '[PAD]', '<s>', '</s>']


In [20]:
# Number of entries in the label list that is handed to the decoders.
# NOTE(review): this should presumably equal the size of the model's output (last logits dimension) — confirm.
len(vocab)

34

In [9]:
# Path of the language-model file passed to the beam-search decoder as `lm_path`
# (a gzipped ARPA file, judging by its name).
# NOTE(review): the loader's log message suggests a binary LM file would load faster.
lm_path = "lm/4gram_big.arpa.gz" 

In [10]:
# alpha, beta, and beam_width SHOULD be tuned on the dev-set to get the best settings
# Feel free to check other inputs of the BeamCTCDecoder
# NOTE(review): with alpha = beta = 0 the language model presumably has no
# influence on the beam scores — confirm against BeamCTCDecoder before relying on it.
alpha=0
beta=0
beam_width = 1024

# The decoders need exactly one label per model output column. The tokenizer
# vocabulary can be longer than the model's output layer (special tokens such as
# '<s>' / '</s>' appended after training), so the label list is cut to the
# model's own vocabulary size. When the sizes already agree the slice is a no-op.
decoder_vocab = vocab[:model.config.vocab_size]

# CTC blank symbol: the tokenizer's padding token. `.index` raises ValueError if
# that token is not among the model's labels, which is the desired loud failure.
blank_index = decoder_vocab.index(processor.tokenizer.pad_token)

beam_decoder = BeamCTCDecoder(decoder_vocab, lm_path=lm_path,
                                 alpha=alpha, beta=beta,
                                 cutoff_top_n=40, cutoff_prob=1.0,
                                 beam_width=beam_width, num_processes=16,
                                 blank_index=blank_index)


greedy_decoder = GreedyDecoder(decoder_vocab, blank_index=blank_index)

Loading the LM will be faster if you build a binary file.
Reading lm/4gram_big.arpa.gz
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************


In [11]:
# Audio files to transcribe; they are loaded and decoded together as one batch.
audio_files_paths = [
    './datasets/magister_data_flac_16000/test/11039/2614000/11039-2614000-0000.flac',
    './datasets/magister_data_flac_16000/test/11039/2614000/11039-2614000-0001.flac',
]

In [12]:
# Read all listed files through the project helper; it hands back the loaded
# batch together with a sampling rate.
print('Load audio files: "{}"'.format(audio_files_paths))
batch_audio_files, sampling_rate = utils.load_audio_files(audio_files_paths)



Load audio files: "['./datasets/magister_data_flac_16000/test/11039/2614000/11039-2614000-0000.flac', './datasets/magister_data_flac_16000/test/11039/2614000/11039-2614000-0001.flac']"


In [13]:
# Run the loaded audio batch through the model via the project helper.
# It returns the logits tensor and `max_signal_length`, which is passed on to
# utils.get_segments later.
# NOTE(review): presumably the helper moves model and inputs to `device` and disables gradients — confirm in utils.py.
print('Get logits from the Wav2Vec2ForCTC model....')
logits, max_signal_length = utils.get_logits(batch_audio_files, model, processor, device)



Get logits from the Wav2Vec2ForCTC model....


In [14]:
# Display the logits tensor returned by utils.get_logits.
logits

tensor([[[-2.2786, -0.8382, -0.4542,  ..., -1.3852, -4.3337,  9.8525],
         [-2.2902, -0.8152, -0.4573,  ..., -1.3913, -4.3236,  9.8550],
         [-2.2897, -0.8344, -0.4595,  ..., -1.3903, -4.3181,  9.8491],
         ...,
         [-2.3060, -0.9024, -0.4289,  ..., -1.4616, -4.3394,  9.8318],
         [-2.2989, -0.9266, -0.4229,  ..., -1.4819, -4.3421,  9.8193],
         [-2.3130, -0.8996, -0.4738,  ..., -1.4627, -4.3276,  9.8367]],

        [[-2.2377, -0.7514, -0.5989,  ..., -1.3682, -4.4281,  9.9067],
         [-2.0940, -0.9037, -0.5309,  ..., -1.3416, -4.4453,  9.7667],
         [-2.2401, -0.7409, -0.6070,  ..., -1.3521, -4.4228,  9.8985],
         ...,
         [-2.2740, -0.8447, -0.5058,  ..., -1.3561, -4.4058,  9.8855],
         [-2.2687, -0.8514, -0.5020,  ..., -1.3600, -4.4019,  9.8827],
         [-2.2638, -0.8597, -0.5046,  ..., -1.3732, -4.3914,  9.8798]]],
       device='cuda:0')

In [21]:
# Shape of the logits tensor — presumably (audio files, time steps, model vocabulary entries); confirm.
logits.shape

torch.Size([2, 195, 32])

In [19]:
# Logit vector at position 3 of the first batch item (presumably one score per vocabulary entry for that time step).
logits[0][3]

tensor([-2.3117, -0.8338, -0.4529, -4.2246, -4.4419, -2.5173, -1.2600, -2.6937,
        -0.6669, -1.7660, -2.5445, -1.8209, -3.8375, -1.4566, -0.9534, -3.6862,
        -2.6184, -2.0723, -1.9897, -2.3756, -0.3536, -2.5347, -1.5390, -1.5076,
        -0.9918, -0.3115, -1.2481, -3.2484, -3.4582, -1.3820, -4.3160,  9.8487],
       device='cuda:0')

In [None]:
# Decode the whole batch with the beam-search decoder; yields the decoded
# hypotheses and their offsets.
# NOTE(review): raw logits are passed in — confirm that BeamCTCDecoder.decode normalises them (softmax / log-softmax) itself.
print('Decoding using the Beam Search Decoder....')
beam_decoded_output, beam_decoded_offsets = beam_decoder.decode(logits)


In [None]:
# Decode the same batch with the greedy decoder; yields the decoded output and
# its offsets, mirroring the beam-search call.
print('Decoding using the Greedy Decoder....')
greedy_decoded_output, greedy_decoded_offsets = greedy_decoder.decode(logits)

In [None]:
# Report the transcription and the first word segment for one file of the batch.
# The messages below announce the *first* audio file, so batch index 0 is
# selected; index 1 would show the second file instead.
# NOTE(review): assumes decoder outputs and segment lists are indexed
# [file][item], with item 0 being the top hypothesis / first segment — confirm
# against decoder.py and utils.get_segments.
file_idx = 0

print('Printing the output of the first audio file...\n')

print('Greedy Decoding Output:', greedy_decoded_output[file_idx][0])
print()
print('#'*85)
print()
print('Beam Search Decoding Output:', beam_decoded_output[file_idx][0]) # print the top prediction of the beam search

# Derive segments from each decoder's output for every file in the batch.
print('Compute Segments....')
batch_segments_list_greedy = utils.get_segments(logits, greedy_decoded_output, max_signal_length, sampling_rate, vocab)
batch_segments_list_beam = utils.get_segments(logits, beam_decoded_output, max_signal_length, sampling_rate, vocab)

print('Printing the first segment (word) of the first audio file...')
print()
print('#'*85)
print()
print('Greedy Decoding Output:', batch_segments_list_greedy[file_idx][0])
print()
print('Beam Search Decoding Output:', batch_segments_list_beam[file_idx][0])

print('Done!!')