# Imports

In [43]:
from dataset_generator import DatasetGenerator
from utils import extract_all_chars, save_dict_as_json
from data_preprocessor import Preprocessor

import os
import shutil
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import (
    Wav2Vec2CTCTokenizer, 
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
)


In [44]:
AUDIO_DIR = 'dataset'
DATA_PATH = 'data.csv'

word_character_map = {
    'iskljuci': 'isključi',
    'ukljuci': 'uključi'
}

OUT_DIR = 'torch_datasets'

# Gather from folder

In [45]:
dg = DatasetGenerator(word_character_map)


dg.generate(input_dir=AUDIO_DIR, output_file=DATA_PATH)

Dataset saved to data.csv


In [46]:
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,audio_filepath,text
0,dataset\iskljuci-19-21-1.wav,isključi
1,dataset\iskljuci-19-21-2.wav,isključi
2,dataset\iskljuci-19-21-3.wav,isključi
3,dataset\iskljuci-38-21-1.wav,isključi
4,dataset\iskljuci-38-21-2.wav,isključi


# Create vocabulary

In [47]:
VOCAB_PATH = 'vocab.json'

In [48]:
words = df['text'].unique()

vocab_list = extract_all_chars(words)

vocab_list.extend(['|', '[UNK]', '[PAD]'])
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{'s': 0,
 'č': 1,
 'a': 2,
 'l': 3,
 'u': 4,
 't': 5,
 'k': 6,
 'j': 7,
 'v': 8,
 'e': 9,
 'o': 10,
 'i': 11,
 'r': 12,
 'z': 13,
 '|': 14,
 '[UNK]': 15,
 '[PAD]': 16}

In [49]:
save_dict_as_json(VOCAB_PATH, vocab_dict)

# Loading the tokenizer, feature extractor and processor

In [50]:
tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [51]:
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, 
                                             sampling_rate=16000, 
                                             padding_value=0.0, 
                                             do_normalize=True, 
                                             return_attention_mask=True)

In [52]:
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# Preprocessing

In [53]:
preprocessor = Preprocessor(processor=processor, sr=16000)

df["processed"] = df.apply(preprocessor.preprocess, axis=1)

df.head()

Unnamed: 0,audio_filepath,text,processed
0,dataset\iskljuci-19-21-1.wav,isključi,"{'input_values': [tensor(0.1908), tensor(0.305..."
1,dataset\iskljuci-19-21-2.wav,isključi,"{'input_values': [tensor(0.0942), tensor(0.229..."
2,dataset\iskljuci-19-21-3.wav,isključi,"{'input_values': [tensor(0.2487), tensor(0.382..."
3,dataset\iskljuci-38-21-1.wav,isključi,"{'input_values': [tensor(-0.0043), tensor(-0.0..."
4,dataset\iskljuci-38-21-2.wav,isključi,"{'input_values': [tensor(-0.0048), tensor(-0.0..."


# Generate PyTorch dataset

In [54]:
# Split dataset
train_df, val_df = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)

# Create a PyTorch Dataset
class AudioDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        processed = self.data.iloc[idx]["processed"]
        return {
            "input_values": processed["input_values"],
            "labels": processed["labels"],
        }

# Prepare datasets
train_dataset = AudioDataset(train_df)
val_dataset = AudioDataset(val_df)

## Save the dataset

In [55]:
shutil.rmtree(OUT_DIR)
os.mkdir(OUT_DIR)

torch.save(train_dataset, 'torch_datasets/train.pt')
torch.save(val_dataset, 'torch_datasets/val.pt')