In [1]:
import random
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from datasets import load_dataset, DatasetDict, Audio, Dataset, concatenate_datasets
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline
import evaluate
import jiwer
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from tqdm import tqdm
import gc
import shutil
import os
import soundfile as sf
import pandas as pd
import librosa
import torchaudio

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Polish", task="transcribe")
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Polish", task="transcribe")


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
data = pd.read_csv('corpus/pl/validated.tsv', sep='\t')
data = data[data["down_votes"] < data["up_votes"]]
data = Dataset.from_pandas(data)

data = data.remove_columns([
    "sentence_domain", "age", "client_id", "down_votes", 
    "gender", "locale", "segment", "variant", "up_votes", "accents", "sentence_id"
])
print(data[0])


  data = pd.read_csv('corpus/pl/validated.tsv', sep='\t')


{'path': 'common_voice_pl_24005493.mp3', 'sentence': 'Ostatnie słowa wymówił syczącym szeptem i ze złośliwym uśmiechem.'}


In [None]:
import os
from datasets import Dataset, DatasetDict, concatenate_datasets
import gc

target_sr = 16000
output_dir = "processed_data"

os.makedirs(output_dir, exist_ok=True)

def load_audio(example):
    audio_array, _ = librosa.load(f"cv-corupus-21/pl/clips/{example['path']}", sr=target_sr, mono=True)
    example["audio"] = audio_array
    return example

def process_and_save_in_chunks(dataset, chunk_size=0.1, output_dir=output_dir):
    total_size = len(dataset)
    chunk_size = int(chunk_size * total_size)

    processed_chunks = []

    for i in range(0, total_size, chunk_size):
        chunk = dataset.select(range(i, min(i + chunk_size, total_size)))
        
        # Apply mapping to load audio
        processed_chunk = chunk.map(load_audio, batched=False)
        
        processed_chunks.append(processed_chunk)

        # Clear memory after each chunk is processed
        del processed_chunk
        import gc
        gc.collect()

    # Concatenate all the processed chunks
    final_dataset = concatenate_datasets(processed_chunks)

    # Save the final dataset to disk
    final_dataset.save_to_disk(output_dir)
    print(f"Dataset saved to {output_dir}")

split_data = data.train_test_split(test_size=0.2, seed=42)
test_valid = split_data["test"].train_test_split(test_size=0.5, seed=42)

process_and_save_in_chunks(split_data["train"], chunk_size=0.1, output_dir=os.path.join(output_dir, "train"))
process_and_save_in_chunks(test_valid["test"], chunk_size=0.1, output_dir=os.path.join(output_dir, "test"))
process_and_save_in_chunks(test_valid["train"], chunk_size=0.1, output_dir=os.path.join(output_dir, "val"))


In [None]:
from datasets import DatasetDict

target_sr = 16000

def load_audio(files):
    return [librosa.load(f"cv-corupus-21/pl/clips/{file}", sr=target_sr, mono=True)[0] for file in files]

split_data = data.train_test_split(test_size=0.2, seed=42)
test_valid = split_data["test"].train_test_split(test_size=0.5, seed=42)

common_voice = DatasetDict({
    "train": split_data["train"].map(lambda x: {"audio": load_audio([x["path"]])[0]}),
    "test": test_valid["test"].map(lambda x: {"audio": load_audio([x["path"]])[0]}),
    "val": test_valid["train"].map(lambda x: {"audio": load_audio([x["path"]])[0]})
})


In [34]:
def split_dataset(dataset, batch_size):
    for i in range(0, len(dataset), batch_size):
        yield dataset.select(range(i, min(i + batch_size, len(dataset))))

def save_batch_to_disk(batch_data, file_path, batch_index):
    batch_dataset = Dataset.from_dict(batch_data)
    batch_file_path = f"Prepared_Datasets/{file_path}_batch_{batch_index}"
    if os.path.exists(batch_file_path):
        shutil.rmtree(batch_file_path)
    batch_dataset.save_to_disk(batch_file_path)
    del batch_data
    del batch_dataset
    gc.collect()

train_file_path = "processed_train"
test_file_path = "processed_test"
val_file_path = "processed_val"

batch_size = 1000
if not os.path.exists("Prepared_Datasets"):
    os.makedirs("Prepared_Datasets")

for batch_index, batch in enumerate(tqdm(split_dataset(common_voice["train"], batch_size), desc="Processing Training Data", unit="batch")):
    batch_data = {"input_features": [], "labels": []}
    for example in batch:
        audio = example["audio"]
        sentence = example["sentence"]
        input_features = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features[0]
        labels = tokenizer(sentence, return_tensors="pt").input_ids[0]
        batch_data["input_features"].append(input_features.numpy().tolist())
        batch_data["labels"].append(labels.numpy().tolist())
    
    save_batch_to_disk(batch_data, train_file_path, batch_index)

for batch_index, batch in enumerate(tqdm(split_dataset(common_voice["test"], batch_size), desc="Processing Testing Data", unit="batch")):
    batch_data = {"input_features": [], "labels": []}
    for example in batch:
        audio = example["audio"]
        sentence = example["sentence"]
        input_features = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features[0]
        labels = tokenizer(sentence, return_tensors="pt").input_ids[0]
        batch_data["input_features"].append(input_features.numpy().tolist())
        batch_data["labels"].append(labels.numpy().tolist())
    
    save_batch_to_disk(batch_data, test_file_path, batch_index)

for batch_index, batch in enumerate(tqdm(split_dataset(common_voice["val"], batch_size), desc="Processing Validation Data", unit="batch")):
    batch_data = {"input_features": [], "labels": []}
    for example in batch:
        audio = example["audio"]
        sentence = example["sentence"]
        input_features = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features[0]
        labels = tokenizer(sentence, return_tensors="pt").input_ids[0]
        batch_data["input_features"].append(input_features.numpy().tolist())
        batch_data["labels"].append(labels.numpy().tolist())
    
    save_batch_to_disk(batch_data, val_file_path, batch_index)


Processing Training Data: 0batch [00:00, ?batch/s]


KeyError: 'audio'

## Downloading locally

In [2]:
from datasets import load_dataset

data_dir = "cv-corupus-21/pl/"

# Load dataset from validated.tsv
common_voice = load_dataset("csv", data_files=f"{data_dir}validated.tsv", delimiter="\t")

# Split into train and validation sets
common_voice = common_voice["train"].train_test_split(test_size=0.2, seed=1)
common_voice["test"] = load_dataset("csv", data_files=f"{data_dir}test.tsv", delimiter="\t")

print(common_voice)


Generating train split: 90000 examples [00:00, 189157.75 examples/s]


DatasetGenerationError: An error occurred while generating the dataset

In [5]:
from datasets import load_dataset

# Define dataset path
data_dir = "cv-corupus-21/pl/"

# Load train, validation, and test sets
common_voice = {
    "train": load_dataset("csv", data_files=f"{data_dir}train.tsv", delimiter="\t"),
    "val": load_dataset("csv", data_files=f"{data_dir}dev.tsv", delimiter="\t"),
    "test": load_dataset("csv", data_files=f"{data_dir}test.tsv", delimiter="\t"),
}

print(common_voice)

# common_voice["train"] = load_dataset("mozilla-foundation/common_voice_17_0", "pl", split="train+validation", trust_remote_code=True)
# common_voice["train"], common_voice["val"] = common_voice["train"].train_test_split(test_size=0.2, seed=1).values()
# common_voice["test"] = load_dataset("mozilla-foundation/common_voice_17_0", "pl", split="test", trust_remote_code=True)
# common_voice2 = common_voice.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "variant", "up_votes"])

# print(common_voice2["train"].num_rows, "train samples")
# print(common_voice2["val"].num_rows, "validation samples")
# print(common_voice2["test"].num_rows, "test samples")

# feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
# tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="Polish", task="transcribe")
# processor = WhisperProcessor.from_pretrained("openai/whisper-tiny", language="Polish", task="transcribe")


Generating train split: 10000 examples [00:00, 92172.58 examples/s]


DatasetGenerationError: An error occurred while generating the dataset