In [None]:
!pip install datasets transformers seqeval evaluate
!pip install transformers[torch]
!pip install accelerate

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━

In [None]:
import os

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


# Transcribe a local audio file with Whisper large-v3 via the Hugging Face
# automatic-speech-recognition pipeline and display the resulting dict.

# Path of the audio file to transcribe; change this to point at your own
# recording. Validated first so a missing file fails immediately instead of
# after the model checkpoint has been downloaded and loaded.
sample = '/content/friends-romans-countrymen.wav'
if not os.path.isfile(sample):
    raise FileNotFoundError(
        f"Audio file not found: {sample!r}. Upload it or update `sample`."
    )

# Use the first CUDA device with half precision when a GPU is present;
# otherwise fall back to CPU with full precision.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3"

# Load the checkpoint in the selected dtype, then move it to the device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, use_safetensors=True
)
model.to(device)

# The processor supplies both the tokenizer and the audio feature extractor
# that are handed to the pipeline below.
processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,  # presumably splits long audio into 30 s chunks -- see pipeline docs
    batch_size=16,
    return_timestamps=False,
    torch_dtype=torch_dtype,
    device=device,
)

# NOTE(review): `dataset` is not used anywhere in this cell; it is kept because
# later cells may rely on it. Drop this line if nothing else reads it, since it
# triggers a download.
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")

result = pipe(sample)
result

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'text': " Friends, Romans, countrymen, lend me your ears. I come to bury Caesar, not to praise him. The evil that men do lives after them. The good is often turd with their bones. So let it be with Caesar. The noble Brutus hath told you Caesar was ambitious. If it were so, it was a grievous fault, and grievously hath Caesar answered it. Here, under leave of Brutus and the rest, for Brutus is an honorable man, so are they all, all honorable men, come I to speak in Caesar's funeral. He was my friend, faithful and just to me. But Brutus says he was ambitious, and Brutus is an honorable man. He hath brought many captives home to Rome, whose ransoms did the general coffers fill. Did this in Caesar seem ambitious? When that the poor have cried, Caesar hath wept. Ambition should be made of sterner stuff. Yet Brutus says he was ambitious, and Brutus is an honourable man. You all did see that on the Lupercal I thrice presented him a kingly crown, which he did thrice refuse. Was this ambition? 