In [None]:
import glob
import os

import numpy as np
import torch
import torchaudio
import librosa

#### This notebook also runs ASR; the transcripts are saved in the embeddings directory

Adapted from:
- https://pytorch.org/audio/0.10.0/pipelines.html#wav2vec-2-0-hubert-representation-learning 
- https://pytorch.org/audio/0.10.0/pipelines.html#wav2vec-2-0-hubert-fine-tuned-asr 
- https://pytorch.org/audio/main/tutorials/asr_inference_with_ctc_decoder_tutorial.html 

In [None]:
# Fetch the pretrained HuBERT ASR pipeline and switch the network to evaluation mode
bundle = torchaudio.pipelines.HUBERT_ASR_LARGE # or HUBERT_ASR_XLARGE 

model = bundle.get_model().eval()

# # Set to GPU or CPU
# device = "cuda"
# model = model.eval()
# model = model.to(device)

# encoder_embed_dim = 768 # HUBERT_BASE (see https://pytorch.org/audio/0.10.0/_modules/torchaudio/pipelines/_wav2vec2.html#Wav2Vec2Bundle)
# encoder_embed_dim = 1024 # HUBERT_LARGE

#### Register forward hooks to capture only the acoustic features and the output of the encoder

The function `extract_features()` returns too many layers, so forward hooks are used instead.

In [None]:
def get_features(name, store=None):
    """Build a forward hook that records a module's output as a NumPy array.

    Args:
        name: Key under which the captured output is stored. The special
            value ``"wave_encoder"`` marks a module whose forward output is a
            pair; only its first element is kept.
        store: Optional dict to write into. When ``None`` (the default) the
            hook writes into the module-level ``features`` dict, which is
            looked up every time the hook fires, so it may be rebound between
            forward passes.

    Returns:
        A ``hook(model, input, output)`` callable that can be passed to
        ``register_forward_hook``.
    """
    def hook(model, input, output):
        if name == "wave_encoder":
            output, _ = output # second output is optional
        target = features if store is None else store
        # .cpu() is a no-op for CPU tensors and keeps .numpy() from raising
        # when the model has been moved to a GPU.
        target[name] = output.detach().cpu().numpy().squeeze() # type: ignore (pylance bug)
    return hook

# One hook per sub-module; each stores its output under its own key
wave_encoder_hook = get_features("wave_encoder")
transformer_hook = get_features("transformer")

model.feature_extractor.register_forward_hook(wave_encoder_hook)
model.encoder.register_forward_hook(transformer_hook)

In [None]:
# Sanity check: decode one mp3 with librosa before the CTC decoder gets built
annotated_mp3s = glob.glob("/homes/lm004/commercials/annotated_commercials/*.mp3")
test_audio = annotated_mp3s[0]
_, _ = librosa.core.load(
    test_audio,
    sr=bundle.sample_rate,
    res_type='kaiser_fast',
)

In [None]:
# Keep this import down here: moving it to the top of the notebook breaks
# librosa.core.load() for some obscure reasons
from torchaudio.models.decoder import ctc_decoder, download_pretrained_files

# Language-model weight and word score handed to the beam search decoder
LM_WEIGHT = 3.23
WORD_SCORE = -0.26

# Pretrained "librispeech-4-gram" files: lexicon, tokens and language model
files = download_pretrained_files("librispeech-4-gram")
# print(files)

decoder_options = dict(
    lexicon=files.lexicon,
    tokens=files.tokens,
    lm=files.lm,
    nbest=3,
    beam_size=1500,
    lm_weight=LM_WEIGHT,
    word_score=WORD_SCORE,
)
beam_search_decoder = ctc_decoder(**decoder_options)

In [None]:
# Transcribe every annotated commercial, write the transcript to disk and
# sanity-check the number of frames captured by the forward hooks.
OUTPUT_DIR = "embeddings_hubert"
os.makedirs(OUTPUT_DIR, exist_ok=True) # open() below fails if the directory is missing

# sorted() makes the processing order reproducible (glob order is arbitrary)
for audio_fn in sorted(glob.glob("/homes/lm004/commercials/annotated_commercials/*.mp3")):
    features = {} # refilled by the forward hooks during the model call below

    y, _ = librosa.core.load(audio_fn, sr=bundle.sample_rate, res_type='kaiser_fast')
    y = torch.unsqueeze(torch.from_numpy(y), dim=0) # .to(device)

    # Inference only: do not build an autograd graph for the forward pass
    with torch.inference_mode():
        emission, _ = model(y) # type: ignore (pylance bug)

    beam_search_result = beam_search_decoder(emission)
    # Only the best hypothesis of the first (and only) batch item is kept
    beam_search_transcript = " ".join(beam_search_result[0][0].words).strip()

    # basename() instead of split('/') so the directory is stripped on any OS
    stimulus_id = os.path.basename(audio_fn).replace('_trimmed.mp3','')

    with open(os.path.join(OUTPUT_DIR, f"{stimulus_id}.txt"), "w", encoding="utf-8") as text_file:
        text_file.write(beam_search_transcript)

    # NB: not saving for safety, uncomment to save

    # np.save(open(f"embeddings_hubert/{stimulus_id}_wave_encoder.npy", 'wb'), features['wave_encoder'])
    # np.save(open(f"embeddings_hubert/{stimulus_id}_transformer.npy", 'wb'), features['transformer'])

    framesrate = 0.02 # 20ms frames rate, see paper
    expected_frames = int(y.shape[1]//(framesrate*bundle.sample_rate))
    for feature_name in ("transformer", "wave_encoder"):
        n_frames = features[feature_name].shape[0]
        # NOTE(review): the convolutional front end presumably uses no padding,
        # so a trailing partial window can yield one frame fewer than
        # n_samples // hop -- TODO confirm. Exact matches still pass.
        assert 0 <= expected_frames - n_frames <= 1, (
            f"{stimulus_id}: {feature_name} has {n_frames} frames, "
            f"expected about {expected_frames}"
        )