In [None]:
# Environment setup: remove any existing torch/torchaudio, then install the
# pinned 2.5.0 builds from the PyTorch CUDA 12.1 (cu121) wheel index.
%pip uninstall torch torchaudio -y
%pip install torch==2.5.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/cu121
# accelerate backs the `device_map=...` model loading used further down.
%pip install 'accelerate>=0.26.0'
# librosa is used for silence detection when splitting audio; soundfile is
# presumably needed as an audio I/O backend — TODO confirm.
# NOTE(review): `transformers` and `tqdm` are imported below but not installed
# here — presumably already present in the environment; confirm.
%pip install librosa soundfile

In [None]:
import os
import torch
import torchaudio
from transformers import AutoProcessor, WhisperForConditionalGeneration
from tqdm.notebook import tqdm
import librosa
import numpy as np

In [None]:
# Processor: turns raw audio into model input features and decodes token ids
# back to text.
processor = AutoProcessor.from_pretrained("openai/whisper-large-v3-turbo")

# Half-precision weights with SDPA attention. `device_map="auto"` already lets
# accelerate place the weights on the available device(s), so the model is NOT
# moved again with `.to("cuda")`: that call is redundant on a single GPU and
# conflicts with accelerate's placement when the weights are spread over
# several devices.
model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-large-v3-turbo",
    torch_dtype=torch.float16,
    device_map="auto",
    attn_implementation="sdpa",
)

In [None]:
def mp3_2_waveform(mp3_path, target_sr = 16000):
    """Load an audio file as a mono waveform resampled to ``target_sr``.

    Args:
        mp3_path: Path of the audio file, decoded with ``torchaudio.load``.
        target_sr: Sample rate (Hz) of the returned waveform.

    Returns:
        A tuple ``(waveform, sample_rate, duration)`` where ``waveform`` is a
        1-D tensor of mono samples (for mono and multi-channel files alike),
        ``sample_rate`` is the rate of the returned samples (``target_sr``),
        and ``duration`` is the length of the source audio in seconds.
    """
    # `torchaudio.load` reads the whole file by default, so there is no need
    # to query the frame count first (container metadata may also be inexact
    # for compressed formats).
    waveform, sample_rate = torchaudio.load(mp3_path)
    duration = waveform.shape[1] / sample_rate

    # Downmix before resampling: averaging over the channel axis always yields
    # a 1-D tensor (a mono file previously stayed 2-D, unlike a stereo one),
    # and only a single channel has to be resampled afterwards.
    waveform = torch.mean(waveform, dim=0)

    if sample_rate != target_sr:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)
        waveform = resampler(waveform)
        sample_rate = target_sr

    return waveform, sample_rate, duration

def split_into_batches(waveform, sample_rate, batch_duration=30, split_in_silence=False):
    """Cut a mono waveform into chunks of at most ``batch_duration`` seconds.

    Args:
        waveform: Mono audio as a torch tensor or array-like, shaped ``(T,)``
            or with extra singleton dimensions such as ``(1, T)``.
        sample_rate: Number of samples per second of ``waveform``.
        batch_duration: Maximum chunk length in seconds.
        split_in_silence: If False, return consecutive fixed-size slices. If
            True, drop the silent stretches found by ``librosa.effects.split``
            (``top_db=30``) and greedily pack the remaining non-silent
            intervals into chunks; an interval longer than one chunk is cut
            into chunk-sized pieces.

    Returns:
        A list of 1-D numpy arrays, each holding at most
        ``batch_duration * sample_rate`` samples. Empty audio gives ``[]``.

    Raises:
        ValueError: If the chunk length is not positive or the waveform has
            more than one non-singleton dimension (i.e. is not mono).
    """
    # int() keeps slicing valid when batch_duration is a float.
    batch_samples = int(batch_duration * sample_rate)
    if batch_samples <= 0:
        raise ValueError("batch_duration * sample_rate must be a positive number of samples")

    if torch.is_tensor(waveform):
        audio_array = waveform.detach().cpu().numpy()
    else:
        audio_array = np.asarray(waveform)

    # Work on a flat sample axis: with a (1, T) input, len() and slicing would
    # otherwise act on the channel axis instead of time.
    if audio_array.ndim > 1:
        if audio_array.size != audio_array.shape[-1]:
            raise ValueError(f"expected mono audio, got shape {audio_array.shape}")
        audio_array = audio_array.reshape(-1)

    if audio_array.size == 0:
        return []

    if not split_in_silence:
        return [audio_array[i:i + batch_samples] for i in range(0, len(audio_array), batch_samples)]

    non_silent_intervals = librosa.effects.split(audio_array, top_db=30)

    batches = []
    current_batch = []
    current_size = 0

    for start, end in non_silent_intervals:
        start, end = int(start), int(end)
        # Walk the interval in pieces of at most one batch so that a long
        # stretch without silence never produces an oversize chunk.
        for piece_start in range(start, end, batch_samples):
            piece = audio_array[piece_start:min(piece_start + batch_samples, end)]

            # A piece never exceeds batch_samples, so this can only trigger
            # when current_batch already holds data — it is never flushed empty.
            if current_size + len(piece) > batch_samples:
                batches.append(np.concatenate(current_batch))
                current_batch = []
                current_size = 0

            current_batch.append(piece)
            current_size += len(piece)

    if current_batch:
        batches.append(np.concatenate(current_batch))

    return batches
        
def inference(model, processor, waveform, sample_rate, batch_duration=30):
    """Transcribe a waveform chunk by chunk and return the joined text.

    The waveform is split with ``split_into_batches(..., split_in_silence=True)``
    and every chunk is run through ``model.generate`` separately.

    Args:
        model: Speech-to-text model exposing ``generate``, ``device`` and ``dtype``.
        processor: Processor used to build ``input_features`` and to decode
            the generated token ids.
        waveform: Mono audio samples to transcribe.
        sample_rate: Sample rate (Hz) of ``waveform``.
        batch_duration: Maximum length in seconds of each chunk.

    Returns:
        The chunk transcriptions joined by single spaces.
    """
    batches = split_into_batches(
        waveform,
        sample_rate,
        batch_duration,
        split_in_silence=True
    )

    transcription = []
    for batch in tqdm(batches, desc="Transcribing"):
        features = processor(
            batch,
            sampling_rate=sample_rate,
            return_tensors="pt"
        ).input_features

        # Follow the model's own device and precision instead of assuming
        # "cuda"/float16, so the function keeps working if the model is
        # loaded differently.
        features = features.to(model.device, dtype=model.dtype)

        ids = model.generate(features, cache_implementation="static")
        text = processor.batch_decode(ids, skip_special_tokens=True)[0].strip()

        # Stripped, non-empty pieces only: avoids doubled spaces in the join.
        if text:
            transcription.append(text)

    return " ".join(transcription)

In [None]:
# Transcribe every MP3 in the data directory; each transcript is written to
# "<mp3 file name>.txt" in the current working directory.
data_dir = '../data'

mp3_list = list(filter(lambda name: name.endswith('.mp3'), os.listdir(data_dir)))
pbar = tqdm(mp3_list, desc="Processing MP3 files")

for filename in pbar:
    # mp3_list only holds '.mp3' names, so no per-file extension check is needed.
    # Whisper was trained on 16 kHz audio.
    waveform, sample_rate, duration = mp3_2_waveform(
        os.path.join(data_dir, filename),
        target_sr=16000,
    )
    pbar.set_postfix({"MP3 duration": f"{round(duration,4)} sec"})

    # Whisper was trained on 30-second audio segments.
    transcript = inference(model, processor, waveform, sample_rate, batch_duration=30)

    transcript_name = f"{os.path.basename(filename)}.txt"
    with open(transcript_name, "w", encoding="utf-8") as f:
        f.write(transcript)

In [None]:
# Load a single episode and split it on silence so the resulting chunks can be
# listened to in the cells below.
sample_mp3_path = '../data/2019-06-21 2020 Beto O’Rourke on Biden, Iran and the puppy primary.mp3'
waveform, sample_rate, duration = mp3_2_waveform(sample_mp3_path)

batches = split_into_batches(
    waveform,
    sample_rate,
    batch_duration=30,
    split_in_silence=True,
)

In [None]:
from IPython.display import Audio


In [None]:
# Show an audio player for each chunk, numbered from 1.
for batch_number, batch in enumerate(batches, start=1):
    print(f"▶️ Playing batch {batch_number}")
    display(Audio(batch, rate=sample_rate))