In [2]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset, Audio
import os
import pandas as pd

In [3]:
# taken from https://huggingface.co/openai/whisper-large-v3

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=5,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
path = "../data/bagoflies/BagOfLies/audio"

file_names = []
for filename in os.listdir(path):
    if filename.endswith(".wav"):
        file_names.append(os.path.join(path, filename))
        
dataset = load_dataset("audiofolder", data_files=file_names, split="train")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))



Resolving data files:   0%|          | 0/325 [00:00<?, ?it/s]

Dataset({
    features: ['audio'],
    num_rows: 325
})


In [None]:
transcriptions = []
batch_size = 5
for i in range(220, len(dataset), batch_size):
    batch = dataset[i:i+batch_size]["audio"]
    results = pipe(batch, generate_kwargs={"language": "english"})
    batch_transcriptions = [result["text"] for result in results]
    print(batch_transcriptions)
    transcriptions.extend(batch_transcriptions)

In [5]:
transcriptions.to_csv("../data/bagoflies/BagOfLies/transcription/transcriptions.csv")

In [23]:
transcriptions = pd.read_csv("../data/bagoflies/BagOfLies/transcription/transcriptions.csv")
# Add the corresponding file names
transcriptions["file_name"] = file_names
transcriptions.to_csv("../data/bagoflies/BagOfLies/transcription/transcriptions.csv")
transcriptions.head()

Unnamed: 0.1,Unnamed: 0,text,file_name
0,0,There is a river and a bridge crossing over i...,../data/bagoflies/BagOfLies/audio\.._data_bago...
1,1,A Labrador has jumped to catch a Frisbee and ...,../data/bagoflies/BagOfLies/audio\.._data_bago...
2,2,A plane is trying to land in the river. The p...,../data/bagoflies/BagOfLies/audio\.._data_bago...
3,3,There is a girl wearing a blue dress and she ...,../data/bagoflies/BagOfLies/audio\.._data_bago...
4,4,A boy is drinking coffee and reading a newspa...,../data/bagoflies/BagOfLies/audio\.._data_bago...
