In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict, load_from_disk
import os
import librosa
from transformers import WhisperFeatureExtractor, WhisperForConditionalGeneration, WhisperTokenizer, WhisperProcessor
import torch
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from datasets import Audio

def process_audio_file(file_path, base_dir=None):
    """Load one audio clip from disk and describe it as a dict.

    Args:
        file_path: Clip file name (or relative path) that is joined onto the
            clips directory.
        base_dir: Directory the clip lives in. When omitted, the module-level
            ``folder_name`` is used, so it must be defined before the call.

    Returns:
        A dict with the keys ``'path'`` (the joined path), ``'array'`` (the
        samples returned by ``librosa.load``) and ``'sampling_rate'``.
    """
    if base_dir is None:
        # Backward-compatible fallback: the global is looked up at call time,
        # so existing one-argument callers keep working unchanged.
        base_dir = folder_name
    audio_path = os.path.join(base_dir, file_path)
    # sr=None asks librosa to keep the file's native sampling rate.
    audio_array, sampling_rate = librosa.load(audio_path, sr=None)
    return {
        'path': audio_path,
        'array': audio_array,
        'sampling_rate': sampling_rate
    }

def prepare_dataset(batch, feature_extractor, tokenizer):
    """Add model inputs and label ids to a single example, in place.

    Args:
        batch: Mapping with an ``"audio"`` entry (providing ``"array"`` and
            ``"sampling_rate"``) and a ``"sentence"`` entry.
        feature_extractor: Callable invoked with the audio array and a
            ``sampling_rate`` keyword; its result exposes ``input_features``.
        tokenizer: Callable invoked with the sentence; its result exposes
            ``input_ids``.

    Returns:
        The same ``batch`` object, with ``"input_features"`` and ``"labels"``
        set.
    """
    clip = batch["audio"]

    # Only the first entry of the extractor's output is stored.
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    encoded = tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids

    return batch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Clips directory and the validated-transcripts table for the corpus.
folder_name = r"Datasets\cv-corpus-17.0-delta-2024-03-15\pl\clips"
df = pd.read_csv(r"Datasets\cv-corpus-17.0-delta-2024-03-15\pl\validated.tsv", sep="\t")

# Load every clip referenced by the table into an audio dict.
df['audio'] = df['path'].apply(process_audio_file)
data_dict = {
    'sentence': df['sentence'].tolist(),
    'audio': df['audio'].tolist()
}

dataset = Dataset.from_dict(data_dict)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

# Inspect the first 'audio' entry now: the map() below removes every original
# column (including 'audio'), so indexing it afterwards raises KeyError.
print(dataset[0]['audio'])

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="Polish", task="transcribe")
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny", language="Polish", task="transcribe")

# Replace the raw columns with 'input_features' and 'labels'.
dataset = dataset.map(
    prepare_dataset,
    fn_kwargs={"feature_extractor": feature_extractor, "tokenizer": tokenizer},
    remove_columns=dataset.column_names,
    num_proc=2,
)

dataset.save_to_disk("processed_dataset")

Map (num_proc=2): 100%|██████████| 510/510 [13:55<00:00,  1.64s/ examples] 
Saving the dataset (1/1 shards): 100%|██████████| 510/510 [00:06<00:00, 83.91 examples/s]


In [19]:
# Reload the saved dataset from disk and list the fields of its first example.
dataset = load_from_disk("processed_dataset")
first_example = dataset[0]
print(first_example.keys())

dict_keys(['input_features', 'labels'])


In [2]:
import pandas as pd
# Read the validated-transcripts table (tab-separated) and show its column names.
df = pd.read_csv(
    r"Datasets\cv-corpus-17.0-delta-2024-03-15\pl\validated.tsv",
    sep="\t",
)
print(df.columns)

Index(['client_id', 'path', 'sentence_id', 'sentence', 'sentence_domain',
       'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant',
       'locale', 'segment'],
      dtype='object')
