In [10]:
import torch
import torchaudio
from transformers import Wav2Vec2Processor, HubertForCTC

# Single checkpoint id shared by the processor and the CTC model so the two
# are always loaded from the same pretrained weights/configuration.
_CHECKPOINT = "facebook/hubert-large-ls960-ft"
processor = Wav2Vec2Processor.from_pretrained(_CHECKPOINT)
model = HubertForCTC.from_pretrained(_CHECKPOINT)

# Function to transcribe audio using the Hubert model
def transcribe_audio(file_path):
    """Transcribe a single audio file with the module-level HuBERT CTC model.

    The file is decoded with ``torchaudio.load``, mixed down to one channel,
    resampled to the rate the processor's feature extractor is configured
    for, run through ``model`` and decoded by taking the arg-max token at
    every frame (greedy CTC decoding via ``processor.batch_decode``).

    Args:
        file_path: Location of the audio file, in any form accepted by
            ``torchaudio.load``.

    Returns:
        The transcription of the file as returned by
        ``processor.batch_decode`` for the single input.
    """
    # Load audio file
    waveform, sample_rate = torchaudio.load(file_path)

    # Mix down to mono *before* resampling: averaging and resampling are both
    # linear, so the result is equivalent but only one channel is resampled.
    if waveform.ndim > 1:
        waveform = waveform.mean(dim=0)

    # Use the rate the feature extractor was configured with instead of
    # repeating a hard-coded literal in two places.
    target_rate = processor.feature_extractor.sampling_rate
    if sample_rate != target_rate:
        waveform = torchaudio.functional.resample(
            waveform, orig_freq=sample_rate, new_freq=target_rate
        )

    # The feature extractor's documented input types are numpy arrays / lists,
    # so hand it a numpy view of the (CPU) waveform rather than a torch tensor.
    inputs = processor(
        waveform.numpy(),
        sampling_rate=target_rate,
        return_tensors="pt",
        padding=True,
    )
    # Keep inputs on the same device as the model so this also works when the
    # model has been moved to an accelerator.
    inputs = inputs.to(model.device)

    # Perform inference without tracking gradients
    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy decoding: most likely token id per frame, brought back to CPU
    predicted_ids = torch.argmax(logits, dim=-1).cpu()
    transcription = processor.batch_decode(predicted_ids)

    return transcription[0]

# Example usage: transcribe one clip and print the result.
# NOTE(review): the clip path looks like a Polish Common Voice recording while
# the checkpoint name suggests LibriSpeech fine-tuning -- confirm the model
# matches the language of the audio before trusting the output.
audio_file = "corpus/clips/common_voice_pl_20547774.mp3"
transcription = transcribe_audio(audio_file)
print(f"Transcription: {transcription}")

Some weights of the model checkpoint at facebook/hubert-large-ls960-ft were not used when initializing HubertForCTC: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForCTC were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Transcription: IDO HIEGO U NACHUCIAN SAMOTENE ICHERPIOS PAVODUS FOYI SA MOTNASHCI
