In [2]:
from transformers import AutoModel, AutoProcessor
import torch
import torchaudio

# Checkpoint id shared by the processor and the model, and the sampling rate
# used both for resampling and for the processor call.
GIGAAM_CHECKPOINT = "waveletdeboshir/gigaam-ctc"
TARGET_SAMPLE_RATE = 16000

# Read the test clip; `sr` is the sampling rate reported by torchaudio.
wav, sr = torchaudio.load("dataset/test_wavs/bronya.wav")
# Convert the clip from its native rate to TARGET_SAMPLE_RATE.
wav = torchaudio.functional.resample(wav, orig_freq=sr, new_freq=TARGET_SAMPLE_RATE)

# Processor and model both come from the same checkpoint; the checkpoint
# ships its own code, hence trust_remote_code=True.
processor = AutoProcessor.from_pretrained(GIGAAM_CHECKPOINT, trust_remote_code=True)
model = AutoModel.from_pretrained(GIGAAM_CHECKPOINT, trust_remote_code=True)
model.eval()

# Only the first row of `wav` is passed on to the processor.
first_channel = wav[0]
input_features = processor(
    first_channel,
    sampling_rate=TARGET_SAMPLE_RATE,
    return_tensors="pt",
    padding=True,
)

# Forward pass without gradient tracking.
with torch.no_grad():
    ctc_outputs = model(**input_features)
    logits = ctc_outputs.logits

# Greedy decoding: pick the highest-scoring token id along the last axis.
greedy_ids = torch.argmax(logits, dim=-1)
# Turn the token ids into text and keep the first (only) item of the batch.
transcription = processor.batch_decode(greedy_ids)[0]


You are using a model of type gigaam-ctc to instantiate a model of type . This is not supported for all configurations of models and can yield errors.


In [45]:
# Inspect what the GigaAM processor returned. The recorded output shows two
# entries: 'input_features' (a float tensor) and 'input_lengths' (tensor([1112])).
input_features

{'input_features': tensor([[[-17.0600,  -8.9474,  -9.2287,  ...,  -8.3567,  -8.5132,  -7.9249],
         [-11.9721,  -9.2135,  -9.5129,  ...,  -8.6416,  -8.7919,  -8.1884],
         [-10.6638, -11.7523, -13.9064,  ..., -13.2384, -12.2041, -10.6144],
         ...,
         [-16.0292, -17.7381, -17.8500,  ..., -16.6101, -16.8950, -15.9967],
         [-16.3504, -18.3377, -18.0335,  ..., -16.9564, -17.1942, -16.4797],
         [-16.4160, -17.6819, -17.7075,  ..., -17.2120, -17.6381, -17.3802]]]), 'input_lengths': tensor([1112])}

In [48]:
# Run only the encoder sub-module of the GigaAM model on the processor output.
encoder = model.model.encoder
encoder.eval()

# Inference only: disable gradient tracking so no autograd graph is built and
# intermediate activations are not kept alive.
with torch.no_grad():
    out = encoder(input_features['input_features'], length=input_features['input_lengths'])

# Shape of the first encoder output; the recorded run shows
# torch.Size([1, 768, 278]) for an input length of 1112.
out[0].shape

torch.Size([1, 768, 278])

In [62]:
# Load model directly
from transformers import AutoProcessor, AutoModel, Wav2Vec2FeatureExtractor

# HuBERT checkpoint used for both the model weights and the feature extractor.
HUBERT_CHECKPOINT = "facebook/hubert-large-ll60k"

model_hubert = AutoModel.from_pretrained(HUBERT_CHECKPOINT)
# The feature extractor is loaded through its concrete class, so no
# repository-provided code has to be executed: trust_remote_code is not needed
# and is deliberately left off.
# NOTE(review): this rebinds `processor`, which previously held the GigaAM
# processor; re-running the GigaAM cell after this one requires reloading it.
processor = Wav2Vec2FeatureExtractor.from_pretrained(HUBERT_CHECKPOINT)

In [63]:
# Options for the feature-extractor call: PyTorch tensors, padding enabled,
# attention mask requested, audio declared as 16 kHz.
extractor_options = dict(
    sampling_rate=16000,
    padding=True,
    return_attention_mask=True,
    return_tensors='pt',
)
# Only the first row of `wav` is passed to the extractor.
alt_inputs = processor(wav[0], **extractor_options)

In [64]:
# Number of valid (unpadded) samples per item, taken from the attention mask.
valid_samples = alt_inputs.attention_mask.sum(-1)
# Let the model compute how many output frames its convolutional front end
# produces for those sample counts. The nominal ratio is 320 samples per frame
# (50 Hz at 16 kHz), but `samples // 320` can overshoot the real frame count,
# and a later clamp to the output length would only correct the longest item.
feature_lens = model_hubert._get_feat_extract_output_lengths(valid_samples)

In [65]:
# Inference only: run HuBERT without gradient tracking so the forward pass does
# not build an autograd graph or retain intermediate activations.
with torch.no_grad():
    output_hubert = model_hubert(alt_inputs.input_values, attention_mask=alt_inputs.attention_mask)

In [66]:
# Final-layer hidden states returned by the HuBERT forward pass.
hubert_hidden = output_hubert.last_hidden_state
# Keep at most as many frames (dim 1) as the longest entry in feature_lens.
frame_cap = feature_lens.max()
hubert_hidden = hubert_hidden[:, :frame_cap, :]
# Make sure no recorded length exceeds the number of frames actually present.
feature_lens = torch.clamp(feature_lens, max=hubert_hidden.size(1))
# Swap dims 1 and 2 so the frame axis comes last.
last_hidden_states = torch.transpose(hubert_hidden, 1, 2)

In [80]:
# Show the per-item frame counts after clamping; the recorded run gives tensor([555]).
feature_lens

tensor([555])

In [79]:
# Shape after the transpose of dims 1 and 2; the recorded run gives
# torch.Size([1, 1024, 555]), whose last dimension matches feature_lens above.
last_hidden_states.shape

torch.Size([1, 1024, 555])