In [None]:
import torchaudio
import torch
from transformers import pipeline
from time import perf_counter

In [11]:
# --- Benchmark configuration -------------------------------------------------
# Model checkpoint identifier (Hugging Face Hub name).
MODEL_PATH = "openai/whisper-large-v2"

# Directory the input audio files are read from.
AUDIO_PATH = "./audio"

# Directories the results are written to: one transcript per (file, batch size)
# under OUTPUT_PATH, one timing/memory report per batch size under REPORT_PATH.
OUTPUT_PATH = "./output/transformers/"
REPORT_PATH = "./reports"

In [None]:
# Release cached CUDA blocks left over from earlier runs in this kernel so the
# model load starts from as clean an allocator state as possible.
torch.cuda.empty_cache()

# Build the speech-recognition pipeline once; it is reused for every file and
# batch size in the benchmark below.
pipe = pipeline(
    "automatic-speech-recognition",
    # Use the configured checkpoint so changing MODEL_PATH actually takes effect
    # (the id was previously duplicated here as a literal).
    model=MODEL_PATH,
    # Long recordings are processed in 30-second windows.
    chunk_length_s=30,
    device="cuda",
    # Keep in sync with the "float32" suffix used in the output/report file names.
    torch_dtype=torch.float32,
)

In [14]:
# Audio file names (looked up under AUDIO_PATH) to benchmark.
# Empty by default: add the file names here before running the benchmark.
files = []

# Pipeline batch sizes to compare; the benchmark makes one full pass over
# `files` for each value.
batch_sizes = [1, 2, 4, 8]

In [15]:
# Accumulates one {file name: "time / peak memory"} entry per benchmarked file.
# NOTE: the benchmark loop re-creates this list at the start of every batch size,
# so this initial value only matters if the loop is never run.
reports = []

In [None]:
# Benchmark: for every batch size, transcribe every file, record wall-clock time
# and peak CUDA memory per file, save each transcript, then append the
# per-batch-size summary to its report file.
for batch_size in batch_sizes:
    reports = []  # fresh per batch size; written to its own report file below
    torch.cuda.reset_peak_memory_stats()
    for file_name in files:
        print(f"Starting file {file_name} @ {batch_size} batch size")

        # Load and resample to 16 kHz *before* starting the timer, so only the
        # pipeline call itself is measured.
        waveform, sample_rate = torchaudio.load(f"{AUDIO_PATH}/{file_name}")
        waveform = torchaudio.functional.resample(
            waveform, orig_freq=sample_rate, new_freq=16000
        ).numpy()

        start = perf_counter()
        prediction = pipe(
            # Only the first channel is transcribed; .copy() hands the pipeline
            # its own array rather than a view into `waveform`.
            waveform[0].copy(), batch_size=batch_size, return_timestamps=True,
            generate_kwargs={
                "task": "transcribe", "language": "polish",
                "do_sample": False
            }
        )["chunks"]
        elapsed = perf_counter() - start
        print(f"It took {elapsed}")

        # Peak allocation since the last reset, converted from bytes to MB.
        memory = torch.cuda.max_memory_allocated() / 1024 / 1024
        reports.append(
            {f"{file_name}": f"{elapsed:.2f} s with {memory:.2f} MB used"}
        )

        # Reset allocator state so the next file's peak is measured on its own.
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

        # Distinct handle name: the original `as f` overwrote the file-name loop
        # variable with a (closed) file object.
        with open(
                f"{OUTPUT_PATH}/{file_name}-{batch_size}-float32.txt",
                "w", encoding="utf-8"
        ) as transcript_file:
            transcript_file.write(
                "".join([chunk["text"] for chunk in prediction])
            )

    # Append mode: re-running the benchmark adds to an existing report instead of
    # replacing it. UTF-8 is set explicitly (matching the transcript files) so
    # non-ASCII file names are written correctly regardless of the OS locale.
    with open(
            f"{REPORT_PATH}/transformers-{batch_size}-float32.txt",
            "a", encoding="utf-8"
    ) as report_file:
        for entry in reports:
            for name, summary in entry.items():
                report_file.write(f"{name}: {summary}\n")