In [None]:
%pip uninstall torch torchaudio -y
%pip install torch==2.5.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/cu121
%pip install 'accelerate>=0.26.0'
%pip install librosa soundfile

In [1]:
import os
import torch
import torchaudio
from transformers import AutoProcessor, WhisperForConditionalGeneration
from tqdm.notebook import tqdm

In [2]:
# Processor for the Whisper checkpoint; used below both to turn raw audio into
# model input features and to decode generated token ids back into text.
processor = AutoProcessor.from_pretrained("openai/whisper-large-v3-turbo")

# Load the weights in half precision with PyTorch SDPA attention and place the
# model on the GPU with ONE explicit move.
# NOTE: this cell previously passed `device_map="auto"` *and* called
# `.to("cuda")`. Those are two competing placement mechanisms: a model that
# accelerate has dispatched should not be moved again with `.to()`, so only
# the explicit move is kept.
model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-large-v3-turbo",
    torch_dtype=torch.float16,
    attn_implementation="sdpa",
).to("cuda")

Tensor Parallel requires torch.distributed to be initialized first.


In [3]:
def mp3_2_waveform(mp3_path, target_sr = 16000):
    """Load an audio file as a mono, 1-D waveform sampled at ``target_sr``.

    Args:
        mp3_path: Path of the audio file to decode with ``torchaudio.load``.
        target_sr: Sample rate (Hz) the returned waveform is resampled to.

    Returns:
        A ``(waveform, sample_rate, duration)`` tuple where ``waveform`` is a
        1-D tensor of samples (channels averaged together), ``sample_rate`` is
        the rate of the returned waveform (``target_sr`` whenever resampling
        was needed) and ``duration`` is the length of the decoded audio in
        seconds, computed before resampling.
    """
    # torchaudio.load reads the whole file by default, so no separate
    # torchaudio.info() call is needed just to obtain the frame count.
    waveform, sample_rate = torchaudio.load(mp3_path)
    duration = waveform.shape[1] / sample_rate

    # Always hand back a 1-D tensor. Previously a single-channel file kept its
    # (1, samples) shape while multi-channel files were reduced to (samples,),
    # so callers that index/len() along axis 0 treated mono files as one row.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0)
    else:
        waveform = waveform.squeeze(0)

    # Resample after the downmix: both steps are linear, and resampling one
    # channel instead of several avoids redundant work.
    if sample_rate != target_sr:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)
        waveform = resampler(waveform)
        sample_rate = target_sr

    return waveform, sample_rate, duration


def inference(model, processor, waveform, sample_rate, batch_duration=30):
    """Transcribe a waveform by running the model on consecutive fixed-size chunks.

    The audio is cut into back-to-back chunks of ``batch_duration`` seconds;
    each chunk is converted to input features by ``processor``, passed to
    ``model.generate`` and decoded. Chunk transcripts are joined with newlines.

    Args:
        model: Generation model exposing ``generate``, ``device`` and ``dtype``.
        processor: Processor that returns ``input_features`` and provides
            ``batch_decode``.
        waveform: Tensor of audio samples, either 1-D ``(samples,)`` or 2-D
            ``(channels, samples)``; a 2-D input is averaged over channels.
        sample_rate: Sample rate of ``waveform`` in Hz.
        batch_duration: Length of each chunk in seconds.

    Returns:
        The transcript as a single string, one line per chunk (empty string
        for empty audio).

    Raises:
        ValueError: If ``batch_duration * sample_rate`` is not positive.
    """
    # A (channels, samples) tensor would make len()/slicing below operate on
    # the channel axis, so the whole recording would be treated as one chunk.
    if waveform.ndim == 2:
        waveform = waveform.mean(dim=0)
    audio_array = waveform.detach().cpu().numpy()

    # int() keeps range() working when batch_duration is given as a float.
    batch_samples = int(batch_duration * sample_rate)
    if batch_samples <= 0:
        raise ValueError("batch_duration * sample_rate must be a positive number of samples")
    total_batches = (len(audio_array) + batch_samples - 1) // batch_samples

    # Follow wherever the model actually lives instead of assuming cuda/fp16.
    device, dtype = model.device, model.dtype

    transcription = []

    for start in tqdm(range(0, len(audio_array), batch_samples), total=total_batches, desc="Transcribing"):
        chunk = audio_array[start:start + batch_samples]

        features = processor(
            chunk,
            sampling_rate=sample_rate,
            return_tensors="pt"
        ).input_features.to(device, dtype=dtype)

        ids = model.generate(features, cache_implementation="static")
        # A single chunk is submitted per call, so take the only decoded entry.
        text = processor.batch_decode(ids, skip_special_tokens=True)[0]
        transcription.append(text)

    return "\n".join(transcription)


In [4]:
# Directory scanned (non-recursively) for input recordings.
data_dir = '../data'

# os.listdir returns entries in arbitrary order; sort so every run processes
# the files in the same sequence and the progress output is comparable.
mp3_list = sorted(f for f in os.listdir(data_dir) if f.endswith('.mp3'))
pbar = tqdm(mp3_list, desc="Processing MP3 files")

# mp3_list is already filtered to .mp3 files, so no per-item extension check.
for filename in pbar:
    waveform, sample_rate, duration = mp3_2_waveform(
        mp3_path = os.path.join(data_dir,filename),
        target_sr = 16000 #whisper was trained on 16kHz audio
    )
    pbar.set_postfix({"MP3 duration": f"{round(duration,4)} sec"})

    transcript = inference(
        model = model,
        processor = processor,
        waveform = waveform,
        sample_rate = sample_rate,
        batch_duration = 30 #whisper was trained on 30-second audio segments
    )

    # The transcript is written to the current working directory as
    # "<original file name>.txt" (e.g. "talk.mp3.txt").
    with open(f"{os.path.basename(filename)}.txt", "w", encoding="utf-8") as f:
        f.write(transcript)

Processing MP3 files:   0%|          | 0/14 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/110 [00:00<?, ?it/s]

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Transcribing:   0%|          | 0/136 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/32 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/42 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/32 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/115 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/50 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/110 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/45 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/121 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/125 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/41 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/169 [00:00<?, ?it/s]