In [None]:
import torch
import torchaudio
from transformers import Wav2Vec2Processor, HubertForCTC, Wav2Vec2ForCTC

# Load two pretrained CTC speech models plus one shared processor.
# The processor (feature extractor + tokenizer) comes from the Wav2Vec2
# checkpoint named by `model_name`; the HuBERT model is loaded from its own
# fine-tuned checkpoint.
model_name = "facebook/wav2vec2-large-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
modelW = Wav2Vec2ForCTC.from_pretrained(model_name)
modelH = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")


# Transcribe a single audio file with a CTC speech model (HuBERT by default)
def transcribe_audio(file_path, model=None):
    """Transcribe one audio file to text using greedy (argmax) decoding.

    The clip is loaded with torchaudio, resampled to the sampling rate the
    module-level ``processor`` is configured for, mixed down to mono, and
    passed through the model.

    Args:
        file_path: Path to an audio file readable by ``torchaudio.load``.
        model: CTC model to run. Defaults to the module-level ``modelH`` so
            existing single-argument calls behave as before. Pass another
            model (e.g. ``modelW``) to transcribe with it instead. The model
            is switched to eval mode as a side effect.

    Returns:
        str: The decoded transcription of the clip.
    """
    if model is None:
        model = modelH

    # Take the expected rate from the processor instead of hard-coding it,
    # so resampling and feature extraction can never disagree.
    target_rate = processor.feature_extractor.sampling_rate

    # Load audio file
    waveform, sample_rate = torchaudio.load(file_path)

    # Resample when the file's rate differs from what the processor expects
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
        waveform = resampler(waveform)

    # Collapse the channel dimension (dim 0) by averaging to get a 1-D signal
    if waveform.ndim > 1:
        waveform = waveform.mean(dim=0)

    # The feature extractor is documented for NumPy arrays / float lists,
    # so hand it a NumPy array rather than a torch tensor.
    inputs = processor(
        waveform.numpy(),
        sampling_rate=target_rate,
        return_tensors="pt",
        padding=True,
    )
    # Keep inputs on the same device as the model (it may live on a GPU,
    # e.g. after being handed to a Trainer).
    inputs = inputs.to(model.device)

    # Inference only: disable dropout/masking and gradient tracking
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy decoding: most likely token per frame, then collapse to text
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)

    return transcription[0]

# Smoke test: transcribe one clip from the local corpus and show the result
audio_file = "corpus/clips/common_voice_pl_20547774.mp3"
transcription = transcribe_audio(file_path=audio_file)
print("Transcription:", transcription)

In [None]:
from datasets import load_dataset

# Training split of the "pl" configuration of the Common Voice 17.0 dataset
cv = load_dataset(
    path="mozilla-foundation/common_voice_17_0",
    name="pl",
    split="train",
)

In [None]:
def speech_file_to_array_fn(batch, target_sampling_rate=16000):
    """Load one example's audio and attach the fields used for training.

    The audio is resampled to ``target_sampling_rate`` and mixed down to mono
    by averaging channels, matching what ``transcribe_audio`` does at
    inference time.

    Args:
        batch: A single dataset example; must contain ``"path"`` (audio file
            location) and ``"sentence"`` (reference transcript).
        target_sampling_rate: Sampling rate, in Hz, of the stored waveform.
            Defaults to 16000, the rate ``transcribe_audio`` resamples to.

    Returns:
        The same ``batch`` with three keys added: ``"speech"`` (1-D NumPy
        array), ``"sampling_rate"`` (rate of ``"speech"``) and
        ``"target_text"`` (copy of ``"sentence"``).
    """
    speech_array, sampling_rate = torchaudio.load(batch["path"])

    # Bring every clip to the common rate so all examples are comparable
    if sampling_rate != target_sampling_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sampling_rate, new_freq=target_sampling_rate
        )
        speech_array = resampler(speech_array)
        sampling_rate = target_sampling_rate

    # Average over the channel dimension; for mono clips this equals channel 0
    batch["speech"] = speech_array.mean(dim=0).numpy()
    batch["sampling_rate"] = sampling_rate
    batch["target_text"] = batch["sentence"]
    return batch

# Run the audio-loading step over every example of the training split
dataset = cv.map(function=speech_file_to_array_fn)

In [None]:
from transformers import TrainingArguments, Trainer

# Define training arguments
training_args = TrainingArguments(
    output_dir="results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    logging_strategy="steps",
    logging_steps=1000,
    save_steps=500,
    save_total_limit=3,  # keep only the three most recent checkpoints
    # No eval_dataset is passed to the Trainer below, so step-wise evaluation
    # cannot run. Re-enable with evaluation_strategy="steps", eval_steps=1000
    # once an eval_dataset is supplied.
    evaluation_strategy="no",
)

# Initialize Trainer for the Wav2Vec2 model.
# NOTE(review): `dataset` carries "speech" / "target_text" columns; the model
# presumably needs "input_values" / "labels" plus a padding data collator --
# confirm the preprocessing is complete before starting a full run.
trainer = Trainer(
    model=modelW,
    args=training_args,
    train_dataset=dataset,
    tokenizer=processor,
)

In [None]:
# Run fine-tuning with the Trainer configured above; as the last expression
# of the cell, the call's return value is shown as the cell output.
trainer.train()

In [None]:
import IPython

# Compare the reference sentence of one example with the model transcription.
# NOTE: transcribe_audio runs `modelH` by default, so this shows the
# pretrained HuBERT output, not the `modelW` model handed to the Trainer.
sample = cv[1]  # Change index as needed

print(f"Sentence: {sample['sentence']}")

audio_file = sample['path']
transcription = transcribe_audio(audio_file)
print("Transcription:", transcription)

# Play the clip straight from the file. `filename=` states the intent
# explicitly; the sampling rate is only needed for raw sample arrays, so it
# is not passed here.
IPython.display.Audio(filename=audio_file)