In [None]:
# Environment setup: remove any preinstalled torch/torchaudio, reinstall both
# pinned to 2.5.0 from the PyTorch cu121 wheel index, then install accelerate,
# librosa, soundfile and nltk.
%pip uninstall torch torchaudio -y
%pip install torch==2.5.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/cu121
%pip install 'accelerate>=0.26.0'
%pip install librosa soundfile
%pip install nltk

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torchaudio
from transformers import AutoProcessor, WhisperForConditionalGeneration
import librosa
import nltk
# Keep the NLTK resources in ./utils and make NLTK search that directory.
nltk.data.path.append("./utils")
# `sent_tokenize` loads the 'punkt' models on older NLTK releases and the
# 'punkt_tab' tables on NLTK >= 3.9. The nltk install in this notebook is not
# pinned, so fetch both to avoid a LookupError on whichever version is present.
nltk.download('punkt', download_dir="./utils")
nltk.download('punkt_tab', download_dir="./utils")
from nltk.tokenize import sent_tokenize
from tqdm.notebook import tqdm

In [None]:
# The processor bundles the feature extractor and tokenizer of the checkpoint.
processor = AutoProcessor.from_pretrained("openai/whisper-large-v3-turbo")

# Load the half-precision weights directly onto the GPU. A single explicit
# placement (`device_map="cuda"`) replaces the previous combination of
# `device_map="auto"` followed by `.to("cuda")`: moving a model after Accelerate
# has dispatched it is redundant on one GPU and conflicts with the dispatch
# when the weights were spread over several devices.
model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-large-v3-turbo",
    torch_dtype=torch.float16,
    device_map="cuda",
    attn_implementation="sdpa"
)

In [None]:
def mp3_2_waveform(mp3_path, target_sr = 16000):
    """Decode an audio file into a mono waveform at ``target_sr``.

    Args:
        mp3_path: Path of the audio file to decode with ``torchaudio.load``.
        target_sr: Sample rate (Hz) of the returned waveform.

    Returns:
        A ``(waveform, sample_rate, duration)`` tuple where ``waveform`` is a
        1-D tensor (channels averaged), ``sample_rate`` equals ``target_sr``
        and ``duration`` is the length of the decoded audio in seconds,
        computed from the frame count before resampling.
    """
    # Load every frame (the default). The previous extra `torchaudio.info`
    # pass was redundant and made the decoded length depend on the frame count
    # reported by the file metadata.
    waveform, sample_rate = torchaudio.load(mp3_path)
    duration = waveform.shape[1] / sample_rate

    # Always collapse the channel axis so mono and multi-channel files yield
    # the same 1-D shape (previously mono stayed (1, N) while stereo became
    # (N,)). Downmixing first also means only one channel is resampled.
    waveform = waveform.mean(dim=0)

    if sample_rate != target_sr:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)
        waveform = resampler(waveform)
        sample_rate = target_sr

    return waveform, sample_rate, duration

def split_into_batches(waveform, sample_rate, batch_duration=30, split_in_silence=False, top_db=30):
    """Cut a mono waveform into chunks of at most ``batch_duration`` seconds.

    Args:
        waveform: Mono audio as a torch tensor or array-like, shaped ``(N,)``
            or with extra singleton axes such as ``(1, N)``.
        sample_rate: Sample rate of ``waveform`` in Hz.
        batch_duration: Maximum chunk length in seconds.
        split_in_silence: If False, cut fixed-size consecutive windows. If
            True, detect non-silent intervals with ``librosa.effects.split``
            and pack them (silence dropped) into chunks that fit the budget.
        top_db: Threshold passed to ``librosa.effects.split``.

    Returns:
        A list of 1-D numpy arrays, each holding at most
        ``batch_duration * sample_rate`` samples.

    Raises:
        ValueError: If the chunk size is not positive or the waveform still
            has more than one axis after dropping singleton axes.
    """
    batch_samples = int(batch_duration * sample_rate)
    if batch_samples <= 0:
        raise ValueError("batch_duration * sample_rate must be a positive number of samples")

    if isinstance(waveform, torch.Tensor):
        waveform = waveform.detach().cpu().numpy()
    # Drop singleton axes for BOTH modes: a (1, N) waveform used to reach the
    # fixed-window path unsqueezed, where len() is 1 and the whole recording
    # came back as a single "batch".
    audio_array = np.atleast_1d(np.squeeze(np.asarray(waveform)))
    if audio_array.ndim != 1:
        raise ValueError(
            f"expected mono audio, got an array of shape {audio_array.shape}"
        )
    if audio_array.size == 0:
        return []

    if not split_in_silence:
        return [audio_array[i:i + batch_samples] for i in range(0, len(audio_array), batch_samples)]

    non_silent_intervals = librosa.effects.split(audio_array, top_db=top_db)

    batches = []
    current_batch = []
    current_size = 0

    for start, end in non_silent_intervals:
        # An interval longer than the budget is cut into budget-sized pieces,
        # so no chunk exceeds the limit and an over-long first interval no
        # longer triggers np.concatenate([]) on an empty batch.
        for piece_start in range(start, end, batch_samples):
            piece = audio_array[piece_start:min(piece_start + batch_samples, end)]

            if current_batch and current_size + len(piece) > batch_samples:
                batches.append(np.concatenate(current_batch))
                current_batch = []
                current_size = 0

            current_batch.append(piece)
            current_size += len(piece)

    if current_batch:
        batches.append(np.concatenate(current_batch))

    return batches
        
def inference(model, processor, waveform, sample_rate, batch_duration=30):
    """Transcribe a waveform chunk by chunk and return the joined text.

    Args:
        model: Model whose ``generate`` method produces token ids from the
            processor's ``input_features``.
        processor: Processor used to build the input features and to decode
            the generated ids.
        waveform: Mono audio accepted by ``split_into_batches``.
        sample_rate: Sample rate of ``waveform`` in Hz.
        batch_duration: Maximum chunk length in seconds.

    Returns:
        The chunk transcriptions joined with single spaces; chunks that
        decode to empty text are left out.
    """
    batches = split_into_batches(
        waveform,
        sample_rate,
        batch_duration,
        split_in_silence=True
    )

    transcription = []
    for batch in tqdm(batches, total=len(batches), desc="Transcribing"):
        features = processor(
            batch,
            sampling_rate=sample_rate,
            return_tensors="pt"
        ).input_features
        # Follow the model's own device and dtype instead of hard-coding
        # "cuda"/float16, so the inputs always match the model passed in.
        features = features.to(model.device, dtype=model.dtype)

        ids = model.generate(features, cache_implementation="static")
        # Strip surrounding whitespace so joining with " " does not produce
        # double spaces, and skip chunks that produced no text.
        text = processor.batch_decode(ids, skip_special_tokens=True)[0].strip()
        if text:
            transcription.append(text)

    return " ".join(transcription)

In [None]:
data_dir = '../data'

# Gather the MP3 names first so the progress bar can show a total.
mp3_list = [f for f in os.listdir(data_dir) if f.endswith('.mp3')]
pbar = tqdm(mp3_list, desc="Processing MP3 files")

for filename in pbar:
    # Guard clause: anything that is not an MP3 is skipped.
    if not filename.endswith('.mp3'):
        continue

    # Whisper was trained on 16kHz audio, so resample while loading.
    waveform, sample_rate, duration = mp3_2_waveform(os.path.join(data_dir, filename), 16000)
    pbar.set_postfix({"MP3 duration": f"{round(duration,4)} sec"})

    # Whisper was trained on 30-second audio segments.
    transcript = inference(model, processor, waveform, sample_rate, 30)

    # The transcript is written to the current working directory.
    with open(f"{os.path.basename(filename)}.txt", "w", encoding="utf-8") as f:
        f.write(transcript)

In [None]:
data_dir = '../data'

# Create the output folders up front: open(..., "w") and DataFrame.to_csv do
# not create missing parent directories and would raise FileNotFoundError.
os.makedirs("output/txts", exist_ok=True)
os.makedirs("output/csvs", exist_ok=True)

mp3_list = [f for f in os.listdir(data_dir) if f.endswith('.mp3')]
pbar = tqdm(mp3_list, desc="Processing MP3 files")

# NOTE(review): out_dict is never filled or read in this cell; kept only in
# case another cell relies on the name.
out_dict = {}

# mp3_list is already filtered by extension, so no second check is needed.
for filename in pbar:
    waveform, sample_rate, duration = mp3_2_waveform(
        mp3_path = os.path.join(data_dir,filename),
        target_sr = 16000 #whisper was trained on 16kHz audio
    )
    pbar.set_postfix({"MP3 duration": f"{round(duration,4)} sec"})

    transcript = inference(
        model = model,
        processor = processor,
        waveform = waveform,
        sample_rate = sample_rate,
        batch_duration = 30 #whisper was trained on 30-second audio segments
    )

    # Raw transcript, one text file per MP3.
    with open(f"output/txts/{os.path.basename(filename)}.txt", "w", encoding="utf-8") as f:
        f.write(transcript)

    # One row per sentence, with ids numbered from 1.
    sentences = sent_tokenize(transcript)

    pd.DataFrame({
        "sentence_id": range(1, len(sentences) + 1),
        "sentence": sentences
    }).to_csv(f"output/csvs/{os.path.basename(filename)}.csv", index = False)
        

In [None]:
# Load a single recording and segment it on silence with a 30-second budget.
waveform, sample_rate, duration = mp3_2_waveform(
    '../data/2019-06-21 2020 Beto O’Rourke on Biden, Iran and the puppy primary.mp3'
)

batches = split_into_batches(
    waveform=waveform,
    sample_rate=sample_rate,
    batch_duration=30,
    split_in_silence=True,
)

In [None]:
from IPython.display import Audio

In [None]:
# Show an inline audio player for every chunk, numbered from 1.
for batch_number, batch in enumerate(batches, start=1):
    print(f"▶️ Playing batch {batch_number}")
    display(Audio(batch, rate=sample_rate))

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import csv

# Sentence CSVs are written to output/csvs by the transcription loop, so read
# them from there (the working directory was listed before, which does not
# contain them).
csv_dir = 'output/csvs'
output_dir = 'cap-preds'
batch_size = 40
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
maxlen = 512

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')
# Use a dedicated name for the classifier: rebinding `model` would silently
# replace the Whisper model that the transcription cells depend on.
cap_model = AutoModelForSequenceClassification.from_pretrained(
    'poltextlab/xlm-roberta-large-english-cap-v3'
).to(device)
cap_model.eval()

# Class id -> label name. The softmax columns are indexed by class id, so the
# header must list the ids in ascending order rather than in dict order.
label_map = {v: k for k, v in cap_model.config.label2id.items()}
labels = sorted(label_map)

os.makedirs(output_dir, exist_ok=True)

csv_files = [filename for filename in os.listdir(csv_dir) if filename.endswith('.csv')]
for csv_file_name in tqdm(csv_files, desc="Processing CSV files"):
    # Row index to restart from after an interruption; 0 starts a fresh
    # output file with a header, any other value appends to the existing one.
    resume_batch = 0

    csv_file_path = os.path.join(csv_dir, csv_file_name)
    test_data = pd.read_csv(csv_file_path)

    output_path = os.path.join(output_dir, f"{os.path.basename(csv_file_path)[:-4]}_preds.csv")

    if resume_batch == 0:
        with open(output_path, "w", newline='', encoding="utf-8") as f_out:
            writer = csv.writer(f_out)
            header = ["sentence_id","sentence","pred","pred_name"] + [f"softmax_{label}" for label in labels]
            writer.writerow(header)

    for i in tqdm(range(resume_batch, len(test_data), batch_size), desc="Processing Batches"):
        batch = test_data.iloc[i:i + batch_size]
        # Empty cells are read back as NaN floats, which the tokenizer cannot
        # handle; treat them as empty strings.
        texts = batch["sentence"].fillna("").astype(str).tolist()
        sentence_ids = batch["sentence_id"].tolist()

        inputs = tokenizer(texts, max_length=maxlen, truncation=True, padding=True, return_tensors="pt").to(device)

        with torch.no_grad():
            logits = cap_model(**inputs).logits
            prob_tensor = torch.nn.functional.softmax(logits, dim=1)
            # Take the argmax on the tensor directly instead of converting
            # to a list and back.
            predicted_classes = prob_tensor.argmax(dim=1).tolist()
            probs = prob_tensor.cpu().tolist()

        predicted_label_names = [label_map[pred] for pred in predicted_classes]

        # Append after every batch so finished rows survive an interruption.
        with open(output_path, "a", newline='', encoding="utf-8") as f_out:
            writer = csv.writer(f_out)
            for sentence_id, text, predicted_label, predicted_label_name, prob in zip(
                sentence_ids,
                texts,
                predicted_classes,
                predicted_label_names,
                probs
            ):
                prob_str = [f"{p:.6f}" for p in prob]
                writer.writerow([sentence_id,text,predicted_label,predicted_label_name] + prob_str)

    print(f"Predictions written to: {output_path}")

In [None]:
# Names of the plain-text files found in the current working directory.
raw_text_filenames = list(
    filter(lambda name: name.endswith('.txt'), os.listdir('./'))
)

# Print the contents of the first file only, as a quick look at the output.
for raw_text_filename in raw_text_filenames[:1]:
    with open(raw_text_filename, "r", encoding="utf-8") as f:
        print(f.read())

In [None]:
# Display the list of .txt filenames collected from the working directory.
raw_text_filenames