In [1]:
import random
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from datasets import load_dataset, DatasetDict, Audio, Dataset, concatenate_datasets
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline
import evaluate
import jiwer
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import gradio as gr
from tqdm import tqdm
import gc
import shutil
import os

# Assemble the Polish Common Voice 17.0 splits. The published "train" and
# "validation" splits are loaded together and re-split 80/20 with a fixed seed;
# the published "test" split is used unchanged.
common_voice = DatasetDict()
resplit = load_dataset(
    "mozilla-foundation/common_voice_17_0",
    "pl",
    split="train+validation",
    trust_remote_code=True,
).train_test_split(test_size=0.2, seed=1)
# train_test_split names its held-out part "test"; it is stored as "val" here.
common_voice["train"] = resplit["train"]
common_voice["val"] = resplit["test"]
common_voice["test"] = load_dataset(
    "mozilla-foundation/common_voice_17_0",
    "pl",
    split="test",
    trust_remote_code=True,
)

# Drop every metadata column listed below from all three splits.
metadata_columns = [
    "accent",
    "age",
    "client_id",
    "down_votes",
    "gender",
    "locale",
    "path",
    "segment",
    "variant",
    "up_votes",
]
common_voice2 = common_voice.remove_columns(metadata_columns)

# Report the size of each split.
for split_key, caption in (
    ("train", "train samples"),
    ("val", "validation samples"),
    ("test", "test samples"),
):
    print(common_voice2[split_key].num_rows, caption)

# Whisper preprocessing components, all loaded from the same checkpoint.
whisper_checkpoint = "openai/whisper-tiny"
feature_extractor = WhisperFeatureExtractor.from_pretrained(whisper_checkpoint)
tokenizer = WhisperTokenizer.from_pretrained(whisper_checkpoint, language="Polish", task="transcribe")
processor = WhisperProcessor.from_pretrained(whisper_checkpoint, language="Polish", task="transcribe")


23967 train samples
5992 validation samples
9230 test samples


In [2]:
import random

def reduce_dataset(dataset, retain_percentage, seed=None):
    """Return a random subset of ``dataset`` holding a fraction of its rows.

    Args:
        dataset: Object exposing ``num_rows`` and ``select(indices=...)``.
        retain_percentage: Fraction of rows to keep; must satisfy
            ``0 < retain_percentage <= 1``.
        seed: Optional seed for a private random generator. When given, the
            same indices are drawn on every call with the same arguments.
            When ``None`` (the default) indices are drawn from the
            module-level ``random`` generator, exactly as before this
            parameter existed.

    Returns:
        The result of ``dataset.select`` for the sampled indices. The indices
        are passed in sampled (not sorted) order.

    Raises:
        ValueError: If ``retain_percentage`` is outside ``(0, 1]``, if the
            dataset has no rows, or if the fraction truncates to zero rows.
    """
    if not (0 < retain_percentage <= 1):
        raise ValueError("retain_percentage must be between 0 and 1")
    original_num_rows = dataset.num_rows
    if original_num_rows == 0:
        raise ValueError("The dataset is empty")
    # int() truncates, so small fractions of small datasets can reach zero.
    new_num_rows = int(original_num_rows * retain_percentage)
    if new_num_rows == 0:
        raise ValueError("retain_percentage is too low; no rows would be retained")
    # A private generator keeps a seeded call from disturbing (or depending on)
    # the global random state; unseeded calls keep using the global state.
    rng = random if seed is None else random.Random(seed)
    return dataset.select(
        indices=rng.sample(range(original_num_rows), new_num_rows)
    )

# Re-declare the audio column with a 16000 Hz sampling rate for every split.
common_voice2 = common_voice2.cast_column("audio", Audio(sampling_rate=16000))

# reduce_dataset draws its indices from the module-level `random` generator.
# Seeding it here makes the selected rows (and their order) identical on every
# Restart & Run All, in line with the fixed seed used for the train/val split.
random.seed(1)

# Fraction of each split to keep (1 keeps every row, in sampled order).
reduce = 1
common_voice = DatasetDict({
    "train": reduce_dataset(common_voice2["train"], reduce),
    "test": reduce_dataset(common_voice2["test"], reduce),
    "val": reduce_dataset(common_voice2["val"], reduce)
})

print(common_voice)


DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 23967
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 9230
    })
    val: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 5992
    })
})


In [None]:
def split_dataset(dataset, batch_size):
    """Lazily yield consecutive slices of ``dataset``, ``batch_size`` rows each.

    Args:
        dataset: Object supporting ``len()`` and ``select(range)``.
        batch_size: Number of rows per slice; the final slice may be shorter.

    Yields:
        The result of ``dataset.select`` for each contiguous index range, in
        order from the first row to the last.
    """
    total_rows = len(dataset)
    for start in range(0, total_rows, batch_size):
        stop = start + batch_size
        # Clamp the last slice so it never runs past the end of the dataset.
        if stop > total_rows:
            stop = total_rows
        yield dataset.select(range(start, stop))

def save_batch_to_disk(batch_data, file_path, batch_index):
    """Persist one batch of processed examples as its own on-disk dataset.

    The batch is written to ``Prepared_Datasets/{file_path}_batch_{batch_index}``.
    If something already exists at that path it is removed first.

    Args:
        batch_data: Mapping of column name to list of values, passed to
            ``Dataset.from_dict``.
        file_path: Name prefix of the batch directory.
        batch_index: Index appended to the directory name.
    """
    target_dir = f"Prepared_Datasets/{file_path}_batch_{batch_index}"
    dataset_for_batch = Dataset.from_dict(batch_data)

    # Clear any earlier output for this batch before saving.
    if os.path.exists(target_dir):
        shutil.rmtree(target_dir)
    dataset_for_batch.save_to_disk(target_dir)

    # Drop this function's references, then run a garbage-collection pass.
    del batch_data, dataset_for_batch
    gc.collect()

train_file_path = "processed_train"
test_file_path = "processed_test"
val_file_path = "processed_val"

batch_size = 1000
# exist_ok=True keeps the cell safe to re-run.
os.makedirs("Prepared_Datasets", exist_ok=True)


def _encode_example(example):
    """Convert one raw example into the values stored for training.

    Args:
        example: Mapping with an "audio" entry (providing "array" and
            "sampling_rate") and a "sentence" entry.

    Returns:
        A ``(input_features, labels)`` pair of plain Python lists: the first
        item of the feature extractor's ``input_features`` and the first item
        of the tokenizer's ``input_ids``.
    """
    audio = example["audio"]
    input_features = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features[0]
    labels = tokenizer(example["sentence"], return_tensors="pt").input_ids[0]
    return input_features.numpy().tolist(), labels.numpy().tolist()


def _process_split(dataset, file_path, description, rows_per_batch):
    """Encode ``dataset`` batch by batch and save every batch to disk.

    Args:
        dataset: Split to process; must support ``len()`` and ``select``.
        file_path: Name prefix handed to ``save_batch_to_disk``.
        description: Label shown on the progress bar.
        rows_per_batch: Maximum number of examples per saved batch.
    """
    # Ceiling division: the generator from split_dataset has no length, so the
    # batch count is given to tqdm explicitly to get a real progress bar.
    num_batches = (len(dataset) + rows_per_batch - 1) // rows_per_batch
    batches = split_dataset(dataset, rows_per_batch)
    for batch_index, batch in enumerate(tqdm(batches, total=num_batches, desc=description, unit="batch")):
        batch_data = {"input_features": [], "labels": []}
        for example in batch:
            input_features, labels = _encode_example(example)
            batch_data["input_features"].append(input_features)
            batch_data["labels"].append(labels)

        save_batch_to_disk(batch_data, file_path, batch_index)


# Same order as before: training, then testing, then validation data.
for split_name, split_file_path, split_description in (
    ("train", train_file_path, "Processing Training Data"),
    ("test", test_file_path, "Processing Testing Data"),
    ("val", val_file_path, "Processing Validation Data"),
):
    _process_split(common_voice[split_name], split_file_path, split_description, batch_size)
