In [1]:
from dataset_generator import DatasetGenerator
from utils import extract_all_chars, save_dict_as_json

import pandas as pd
from sklearn.model_selection import train_test_split
import torchaudio
from transformers import (
    Wav2Vec2CTCTokenizer, 
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
    Wav2Vec2ForCTC,
    TrainingArguments,
    Trainer,
)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
AUDIO_DIR = 'dataset'
DATA_PATH = 'data.csv'
VOCAB_PATH = 'vocab.json'

In [3]:
word_character_map = {
    'iskljuci': 'isključi',
    'ukljuci': 'uključi'
}

dg = DatasetGenerator(word_character_map)


dg.generate(input_dir=AUDIO_DIR, output_file=DATA_PATH)

Dataset saved to data.csv


In [4]:
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,audio_filepath,text
0,dataset\iskljuci-19-21-1.wav,isključi
1,dataset\iskljuci-19-21-2.wav,isključi
2,dataset\iskljuci-19-21-3.wav,isključi
3,dataset\iskljuci-38-21-1.wav,isključi
4,dataset\iskljuci-38-21-2.wav,isključi


In [5]:
words = df['text'].unique()

vocab_list = extract_all_chars(words)

vocab_list.extend(['|', '[UNK]', '[PAD]'])
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{'t': 0,
 'i': 1,
 'l': 2,
 'u': 3,
 'r': 4,
 'a': 5,
 'č': 6,
 'e': 7,
 'j': 8,
 'k': 9,
 'o': 10,
 'v': 11,
 'z': 12,
 's': 13,
 '|': 14,
 '[UNK]': 15,
 '[PAD]': 16}

In [8]:
save_dict_as_json(VOCAB_PATH, vocab_dict)

In [9]:
tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [10]:
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, 
                                             sampling_rate=16000, 
                                             padding_value=0.0, 
                                             do_normalize=True, 
                                             return_attention_mask=True)

In [11]:
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)