In [1]:
import random
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from datasets import load_dataset, DatasetDict, Audio, Dataset, concatenate_datasets
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline
import evaluate
import jiwer
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from tqdm import tqdm
import gc
import shutil
import os
import json
import soundfile as sf
import pandas as pd
import librosa
import torchaudio

# Both preprocessing components are loaded from the same Whisper checkpoint,
# so the checkpoint id is spelled out once.
WHISPER_CHECKPOINT = "openai/whisper-small"

# Used further down to turn raw audio arrays into `input_features`.
feature_extractor = WhisperFeatureExtractor.from_pretrained(WHISPER_CHECKPOINT)

# Used further down to turn transcripts into `input_ids`; configured for
# Polish transcription.
tokenizer = WhisperTokenizer.from_pretrained(
    WHISPER_CHECKPOINT,
    language="Polish",
    task="transcribe",
)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Load the "polish" configuration of Multilingual LibriSpeech. The result is a
# DatasetDict; later cells read its 'train' split.
dataset1 = load_dataset(
    path="facebook/multilingual_librispeech",
    name="polish",
    trust_remote_code=True,
)

def convert_example(example):
    """Reduce one dataset row to the two fields used downstream.

    Args:
        example: Mapping with an ``'audio'`` entry (itself a mapping that has
            an ``'array'`` key) and a ``'transcript'`` entry.

    Returns:
        A new dict with exactly two keys: ``'array'`` (the audio samples,
        passed through unchanged) and ``'sentence'`` (the transcript).
    """
    audio = example['audio']
    converted = dict(array=audio['array'], sentence=example['transcript'])
    return converted

# Apply convert_example to every training row and drop all original columns,
# so only the keys returned by convert_example remain.
train_split = dataset1['train']
converted_train = train_split.map(
    convert_example,
    remove_columns=train_split.column_names,
)

# Sanity check: show which keys the first converted row carries.
print(converted_train[0].keys())

dict_keys(['array', 'sentence'])


In [None]:
# Output directory for the per-batch JSON files; created up front so the
# export loop can write into it. An already existing directory is fine.
train_file_path = "processed_train_data"
os.makedirs(name=train_file_path, exist_ok=True)

# Number of examples written into each JSON file.
batch_size = 1000

def split_dataset(dataset, batch_size):
    """Lazily yield consecutive slices of ``dataset``.

    Args:
        dataset: Any object supporting ``len()`` and slice indexing.
        batch_size: Step between slice starts, and the maximum slice length.

    Yields:
        ``dataset[start:start + batch_size]`` for ``start`` = 0, batch_size,
        2 * batch_size, ... while ``start < len(dataset)``. Each item is
        whatever slicing returns for the given container type.
    """
    starts = range(0, len(dataset), batch_size)
    yield from (dataset[lo:lo + batch_size] for lo in starts)

# Export the converted training set as one JSON file per batch. Each file holds
# two parallel lists: "input_features" (feature-extractor output per example)
# and "labels" (token ids of the transcript per example).

# Batch count (ceil division) so the progress bar can show a total and an ETA;
# a bare generator has no length for tqdm to read.
num_batches = -(-len(converted_train) // batch_size)

for batch_index, batch in enumerate(tqdm(split_dataset(converted_train, batch_size), total=num_batches, desc="Processing Training Data", unit="batch")):
    # Slicing a Hugging Face Dataset returns a dict of columns
    # ({"array": [...], "sentence": [...]}), not a list of rows. Iterating that
    # dict directly would yield the key strings and `example["array"]` would
    # raise TypeError, so pair the column values instead. A list of row dicts
    # (e.g. when the source is a plain list) is still accepted.
    if isinstance(batch, dict):
        examples = zip(batch["array"], batch["sentence"])
    else:
        examples = ((row["array"], row["sentence"]) for row in batch)

    batch_data = {"input_features": [], "labels": []}
    for audio_array, sentence in examples:
        # sampling_rate is hard-coded; the recorded dataset sample reports 16000.
        input_features = feature_extractor(audio_array, sampling_rate=16000, return_tensors="pt").input_features[0]
        labels = tokenizer(sentence, return_tensors="pt").input_ids[0]
        # Convert tensors to nested Python lists so json can serialize them.
        batch_data["input_features"].append(input_features.numpy().tolist())
        batch_data["labels"].append(labels.numpy().tolist())

    batch_file = os.path.join(train_file_path, f"batch_{batch_index}.json")
    with open(batch_file, "w") as f:
        json.dump(batch_data, f)

Processing Training Data: 0batch [00:00, ?batch/s]

In [17]:
# Inspect the loaded object: first its container type, then the first raw
# training row with all of its original columns.
print(type(dataset1))
first_raw_example = dataset1['train'][0]
print(first_raw_example)


<class 'datasets.dataset_dict.DatasetDict'>
{'audio': {'path': '6892_10350_000000.opus', 'array': array([-4.18551717e-05, -9.50167669e-05, -5.09495039e-05, ...,
       -1.45179487e-03, -1.45464053e-03, -1.39807537e-03]), 'sampling_rate': 16000}, 'original_path': 'http://www.archive.org/download/myszeis_1511_librivox/myszeis_02_krasicki_64kb.mp3', 'begin_time': 29.07, 'end_time': 40.65, 'transcript': 'drugi w mniemaniu że honory bliskie nędzny w istocie nadzieją się cieszy tymczasem kiedy los szczęścia zagrodzi i tron nie wesprze i mierność zaszkodzi', 'audio_duration': 11.579999999999998, 'speaker_id': '6892', 'chapter_id': '10350', 'file': '6892_10350_000000.opus', 'id': '6892_10350_000000'}


In [None]:
# `converted_dataset1` is not assigned in any visible cell, so this cell fails
# with NameError on a fresh kernel. Rebuild it from `dataset1`, keeping only the
# two columns that this cell's recorded output shows ('audio', 'transcript').
# NOTE(review): reconstructed from the recorded output — confirm this matches
# the original definition if it still exists elsewhere.
converted_dataset1 = dataset1.select_columns(['audio', 'transcript'])
converted_dataset1['train'][0]

{'audio': {'path': '6892_10350_000000.opus',
  'array': array([-4.18551717e-05, -9.50167669e-05, -5.09495039e-05, ...,
         -1.45179487e-03, -1.45464053e-03, -1.39807537e-03]),
  'sampling_rate': 16000},
 'transcript': 'drugi w mniemaniu że honory bliskie nędzny w istocie nadzieją się cieszy tymczasem kiedy los szczęścia zagrodzi i tron nie wesprze i mierność zaszkodzi'}

In [None]:
# Load the "pl" configuration of VoxPopuli. The recorded output below shows
# this triggers multi-gigabyte archive downloads on first run.
dataset2 = load_dataset(
    path="facebook/voxpopuli",
    name="pl",
    trust_remote_code=True,
)

n_files.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

asr_train.tsv:   0%|          | 0.00/14.0M [00:00<?, ?B/s]

asr_dev.tsv:   0%|          | 0.00/666k [00:00<?, ?B/s]

asr_test.tsv:   0%|          | 0.00/695k [00:00<?, ?B/s]

train_part_0.tar.gz:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

train_part_1.tar.gz:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

In [5]:
# Display the first VoxPopuli training row. `dataset2` must already be loaded
# in this kernel session, otherwise this raises NameError.
vox_train = dataset2['train']
vox_train[0]

NameError: name 'dataset2' is not defined