In [59]:
from transformers import AutoModel, AutoProcessor
import torch
import torchaudio

# Transcribe one audio file with the GigaAM CTC checkpoint using greedy decoding.
# The model id and sample rate are named once so the two `from_pretrained` calls
# and the resample/processor calls cannot drift apart.
GIGAAM_MODEL_ID = "waveletdeboshir/gigaam-ctc"
TARGET_SAMPLE_RATE = 16000  # Hz; rate the audio is resampled to before feature extraction

# load audio
wav, sr = torchaudio.load("dataset/test_wavs/bronya.wav")
# bring the audio to the target sample rate
wav = torchaudio.functional.resample(wav, sr, TARGET_SAMPLE_RATE)

# load model and processor
# NOTE(review): trust_remote_code=True runs Python code shipped in the Hub repo;
# consider pinning a `revision` once a vetted commit is known.
processor = AutoProcessor.from_pretrained(GIGAAM_MODEL_ID, trust_remote_code=True)
model = AutoModel.from_pretrained(GIGAAM_MODEL_ID, trust_remote_code=True)
model.eval()

# Only wav[0] is passed on — presumably the first channel; confirm for multi-channel files.
input_features = processor(wav[0], sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt", padding=True)

# predict (no_grad: inference only, no autograd graph is recorded)
with torch.no_grad():
    logits = model(**input_features).logits
# greedy decoding: take the highest-scoring token id along the last axis
greedy_ids = logits.argmax(dim=-1)
# decode token ids to text; keep the first (only) item of the batch
transcription = processor.batch_decode(greedy_ids)[0]


You are using a model of type gigaam-ctc to instantiate a model of type . This is not supported for all configurations of models and can yield errors.


In [60]:
# Shape of the tensor stored under the 'input_features' key (saved run: [1, 64, 1112]).
input_features['input_features'].shape

torch.Size([1, 64, 1112])

In [83]:
# Display the loaded model's module tree via its repr.
model

GigaAMCTCHF(
  (model): GigaAMCTC(
    (encoder): ConformerEncoder(
      (pre_encode): StridingSubsampling(
        (out): Linear(in_features=12288, out_features=768, bias=True)
        (conv): Sequential(
          (0): Conv2d(1, 768, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (1): ReLU()
          (2): Conv2d(768, 768, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (3): ReLU()
        )
      )
      (pos_enc): RotaryPositionalEmbedding()
      (layers): ModuleList(
        (0-15): 16 x ConformerLayer(
          (norm_feed_forward1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (feed_forward1): ConformerFeedForward(
            (linear1): Linear(in_features=768, out_features=3072, bias=True)
            (activation): SiLU()
            (linear2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (norm_conv): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (conv): ConformerConvolution(
      

In [75]:
# Forward pass used only to inspect the logits; no_grad keeps it from
# recording an autograd graph, matching the prediction step in the first cell.
with torch.no_grad():
    out = model(**input_features).logits

In [81]:
# Shape of the logits from the forward pass (saved run: [1, 278, 34]).
out.shape

torch.Size([1, 278, 34])

In [82]:
# Length of the second-to-last logits axis (278 in the saved run).
out.shape[-2]

278

In [62]:
# Load model directly
from transformers import AutoProcessor, AutoModel, Wav2Vec2FeatureExtractor

# Load the pretrained HuBERT encoder and its feature extractor from one checkpoint id.
# The model already loaded without `trust_remote_code`, and the extractor class is
# imported directly from transformers, so no remote-code opt-in is requested.
HUBERT_MODEL_ID = "facebook/hubert-large-ll60k"
model_hubert = AutoModel.from_pretrained(HUBERT_MODEL_ID)
# NOTE: this rebinds `processor`, which an earlier cell bound to the GigaAM processor;
# the following cell relies on `processor` being the HuBERT feature extractor.
processor = Wav2Vec2FeatureExtractor.from_pretrained(HUBERT_MODEL_ID)

In [63]:
# Build model inputs from wav[0]: padded values plus an attention mask, as PyTorch tensors.
alt_inputs = processor(
    wav[0], sampling_rate=16000, padding=True,
    return_attention_mask=True, return_tensors='pt',
)

In [64]:
# Waveform samples per HuBERT output frame.
HUBERT_SAMPLES_PER_FRAME = 320  # frame rate of hubert is 50 Hz
# Count the unmasked samples of each item and convert them to an estimated frame count.
feature_lens = alt_inputs.attention_mask.sum(-1) // HUBERT_SAMPLES_PER_FRAME
# Kept as the final expression so the notebook actually displays the input shape
# (as a mid-cell statement it produced no output).
alt_inputs.input_values.shape

In [65]:
# Inference only: no_grad keeps the HuBERT forward pass from recording an autograd graph.
with torch.no_grad():
    output_hubert = model_hubert(
        alt_inputs.input_values,
        attention_mask=alt_inputs.attention_mask,
    )

In [66]:
# Cut the encoder output at the longest estimated length along axis 1.
last_hidden_states = output_hubert.last_hidden_state[:, :feature_lens.max(), :]
# Cap the estimated lengths at the number of frames that actually remain.
feature_lens = feature_lens.clamp(max=last_hidden_states.shape[1])
# Swap the last two axes (saved run ends up as [1, 1024, 555]).
last_hidden_states = last_hidden_states.permute(0, 2, 1)

In [80]:
# Show the per-item frame counts (saved run: tensor([555])).
feature_lens

tensor([555])

In [79]:
# Shape of the trimmed, axis-swapped hidden states (saved run: [1, 1024, 555]).
last_hidden_states.shape

torch.Size([1, 1024, 555])