# Imports

In [1]:
from dataset_generator import DatasetGenerator
from utils import extract_all_chars, save_dict_as_json
from data_preprocessor import Preprocessor
from data_augmentation import AudioAugmentation
from data_collator import DataCollatorCTCWithPadding

import os
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
import evaluate
from transformers import (
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
    Wav2Vec2ForCTC,
    TrainingArguments,
    Trainer
)


In [2]:
# Directory scanned by DatasetGenerator.generate() for the source recordings.
AUDIO_DIR = 'dataset'
# CSV written by DatasetGenerator.generate() and read back with pandas below.
DATA_PATH = 'data.csv'

# Passed to DatasetGenerator: maps the ASCII spelling found in file names to
# the transcript spelling with diacritics (e.g. 'iskljuci' -> 'isključi').
word_character_map = {
    'iskljuci': 'isključi',
    'ukljuci': 'uključi'
}

# Directory where the serialized train/val datasets (train.pt, val.pt) are stored.
TORCH_DATASETS_DIR = 'torch_datasets'

# MODELS_DIR/MODEL_NAME is used as the Trainer output directory (checkpoints, logs).
MODEL_NAME = "wav2vec2-finetuned-voice-commands"
MODELS_DIR = 'models'

# Gather from folder

In [33]:
# Build the generator with the ASCII -> diacritic word mapping defined above.
dg = DatasetGenerator(word_character_map)


# Scan AUDIO_DIR and write the resulting table to DATA_PATH
# (the run below reports "Dataset saved to data.csv").
dg.generate(input_dir=AUDIO_DIR, output_file=DATA_PATH)

Dataset saved to data.csv


In [34]:
# Load the generated CSV; the columns used later are 'audio_filepath' and 'text'.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,audio_filepath,text
0,dataset/zatvori-38-21-1.wav,zatvori
1,dataset/zvuk-38-21-1.wav,zvuk
2,dataset/zvuk-38-21-3.wav,zvuk
3,dataset/zvuk-19-21-2.wav,zvuk
4,dataset/zvuk-38-21-2.wav,zvuk


# Create vocabulary

In [None]:
# File the character vocabulary is written to by save_dict_as_json() below.
VOCAB_PATH = 'vocab.json'

In [None]:
# Distinct transcripts present in the dataset.
words = df['text'].unique()

# Character inventory of those transcripts, followed by the three special
# tokens the tokenizer is configured with: word delimiter, unknown, padding.
# NOTE(review): the id order comes from extract_all_chars(); confirm that it is
# deterministic across runs, otherwise token ids can change between sessions.
vocab_list = extract_all_chars(words)
vocab_list += ['|', '[UNK]', '[PAD]']

# token -> integer id, numbered in list order.
vocab_dict = {token: index for index, token in enumerate(vocab_list)}
vocab_dict

{'i': 0,
 'a': 1,
 'e': 2,
 'k': 3,
 'č': 4,
 'j': 5,
 'u': 6,
 'z': 7,
 'l': 8,
 'o': 9,
 'v': 10,
 's': 11,
 'r': 12,
 't': 13,
 '|': 14,
 '[UNK]': 15,
 '[PAD]': 16}

In [None]:
# Persist the token -> id mapping to VOCAB_PATH.
save_dict_as_json(VOCAB_PATH, vocab_dict)

# Loading the tokenizer, feature extractor and processor

In [22]:
# Load the vocabulary from the same path it was saved to (VOCAB_PATH) rather
# than a second hard-coded file name, so the two cannot drift apart.
# The special tokens must match the entries appended to the vocabulary above.
tokenizer = Wav2Vec2CTCTokenizer(VOCAB_PATH, unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [23]:
# Raw-waveform feature extractor: one feature per sample, 16 kHz input,
# zero padding, normalization enabled, attention mask returned.
# The sampling rate here must agree with the `sr=16000` given to the Preprocessor instances.
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                             sampling_rate=16000,
                                             padding_value=0.0,
                                             do_normalize=True,
                                             return_attention_mask=True)

In [24]:
# Bundle the feature extractor and the tokenizer into a single processor object.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# Splitting, Preprocessing and Augmentation

In [31]:
# Augmentation settings (noise range, time-stretch rate, pitch-shift steps).
# NOTE(review): the exact meaning/units of these parameters are defined in data_augmentation.py.
aug = AudioAugmentation(min_noise=0, max_noise=.005, time_stretch_rate=.9, pitch_shift_n_steps=2)

# Only the training preprocessor gets augmentation; validation data is left un-augmented.
# augment_count=2 presumably means two augmented variants per clip - confirm in data_preprocessor.py.
train_preprocessor = Preprocessor(processor=processor, sr=16000, audio_augmentation=aug, augment_count=2)
val_preprocessor = Preprocessor(processor=processor, sr=16000)

We need to make sure that recordings from the same speaker never appear in both the training and the validation set. I also remove my own recordings from the dataset so that I can later test the model on my own voice.

In [36]:
def extract_parts(row):
    """Parse speaker, year and version out of a row's audio file name.

    File names are expected to look like
    ``<word>-<speaker>-<year>[-<version>].<ext>``, for example
    ``dataset/zvuk-38-21-1.wav`` or ``dataset/zatvori-148-22.wav``.

    Args:
        row: Mapping or pandas Series with an ``'audio_filepath'`` entry.

    Returns:
        pd.Series with three items ``[speaker, year, version]`` as strings;
        items missing from the file name (e.g. the version) are ``None``.

    Raises:
        IndexError: If the file name contains no ``-`` separator at all.
    """
    # Work on the bare file name so the result does not depend on how many
    # directories (if any) precede it or on '-' characters inside them.
    file_name = os.path.basename(row['audio_filepath'])
    # Strip the extension only at the end of the name, not anywhere inside it.
    stem, _extension = os.path.splitext(file_name)
    # Drop the leading command word; what remains is "<speaker>-<year>[-<version>]".
    remainder = stem.split('-', 1)[1]
    parts = remainder.split('-', 2)  # At most 3 parts (speaker, year, version)
    return pd.Series(parts + [None] * (3 - len(parts)))  # Pad with None if version is missing

# Parse speaker / year / version out of every file name into new columns.
df[['speaker', 'year', 'version']] = df.apply(extract_parts, axis=1)

# Files without a version suffix count as version 1. fillna() returns a new
# Series, so the result must be assigned back (the bare call was a no-op).
# The parsed versions are strings, hence the string fill value.
df['version'] = df['version'].fillna('1')
df[['speaker', 'year']] = df[['speaker', 'year']].astype(int)  # Convert to integer
# Remove files with speaker 38 and year 21 - This is my voice, I want to test it later on my voice
df = df[~((df['speaker'] == 38) & (df['year'] == 21))]

# Group by speaker-year so that one speaker's recordings stay together
groups = list(df.groupby(['speaker', 'year']).groups.keys())
np.random.seed(42)  # Fixed seed keeps the split reproducible
np.random.shuffle(groups)  # Shuffle the groups

# Split the groups (not the rows) into train (80%) and validation (20%)
split_idx = int(0.8 * len(groups))
train_groups = set(groups[:split_idx])
val_groups = set(groups[split_idx:])

# Assign rows to train and validation sets by their (speaker, year) key
train_df = df[df.set_index(['speaker', 'year']).index.isin(train_groups)]
val_df = df[df.set_index(['speaker', 'year']).index.isin(val_groups)]

# Every remaining row must land in exactly one of the two splits
assert len(train_df) + len(val_df) == len(df)

# Keep only the columns the preprocessors need
train_df = train_df[['audio_filepath', 'text']]
val_df = val_df[['audio_filepath', 'text']]

print("Train Set:")
print(train_df)
print("\nValidation Set:")
print(val_df)

Train Set:
                  audio_filepath     text
3       dataset/zvuk-19-21-2.wav     zvuk
6       dataset/zvuk-19-21-3.wav     zvuk
8    dataset/zatvori-19-21-3.wav  zatvori
9       dataset/zvuk-19-21-1.wav     zvuk
11    dataset/otvori-19-21-1.wav   otvori
..                           ...      ...
114  dataset/zatvori-78-22-1.wav  zatvori
115   dataset/zatvori-148-22.wav  zatvori
116     dataset/zvuk-37-21-2.wav     zvuk
117     dataset/zvuk-37-21-1.wav     zvuk
118     dataset/zvuk-37-21-3.wav     zvuk

[85 rows x 2 columns]

Validation Set:
                   audio_filepath      text
46       dataset/zvuk-63-21-1.wav      zvuk
55   dataset/iskljuci-63-21-1.wav  isključi
57       dataset/zvuk-89-22-1.wav      zvuk
63   dataset/iskljuci-89-22-1.wav  isključi
70     dataset/otvori-63-21-1.wav    otvori
78     dataset/otvori-89-22-1.wav    otvori
85     dataset/svetlo-89-22-1.wav    svetlo
86     dataset/svetlo-63-21-1.wav    svetlo
89    dataset/ukljuci-89-22-1.wav   uključi
98   

In [37]:
# Run each split through its own preprocessor. preprocess() yields a
# collection of samples per input row, so the results are flattened into one
# list per split.
preprocessed_train_data = [
    sample
    for _, row in train_df.iterrows()
    for sample in train_preprocessor.preprocess(row)
]
preprocessed_val_data = [
    sample
    for _, row in val_df.iterrows()
    for sample in val_preprocessor.preprocess(row)
]

# From here on train_df / val_df hold the model-ready samples
# ('input_values', 'labels') instead of file paths and transcripts.
train_df = pd.DataFrame(preprocessed_train_data)
val_df = pd.DataFrame(preprocessed_val_data)

train_df.head()

Unnamed: 0,input_values,labels
0,"[tensor(0.1722), tensor(0.2742), tensor(0.3092...","[tensor(7), tensor(10), tensor(6), tensor(3)]"
1,"[tensor(0.4646), tensor(0.7110), tensor(0.1649...","[tensor(7), tensor(10), tensor(6), tensor(3)]"
2,"[tensor(-0.0084), tensor(0.7243), tensor(0.259...","[tensor(7), tensor(10), tensor(6), tensor(3)]"
3,"[tensor(-0.0727), tensor(-0.9619), tensor(-0.0...","[tensor(7), tensor(10), tensor(6), tensor(3)]"
4,"[tensor(-0.2181), tensor(-0.8109), tensor(1.01...","[tensor(7), tensor(10), tensor(6), tensor(3)]"


# Generate PyTorch dataset

In [39]:
# PyTorch Dataset wrapper around a preprocessed DataFrame
class AudioDataset(Dataset):
    """Expose the rows of a DataFrame as ``{"input_values", "labels"}`` samples."""

    def __init__(self, dataframe):
        """Keep a reference to ``dataframe``; it needs 'input_values' and 'labels' columns."""
        self.data = dataframe

    def __len__(self):
        """Number of samples, i.e. rows of the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample stored at positional index ``idx`` as a dict."""
        sample = self.data.iloc[idx]
        return {key: sample[key] for key in ("input_values", "labels")}

# Prepare datasets: wrap the preprocessed train/validation frames for the Trainer
train_dataset = AudioDataset(train_df)
val_dataset = AudioDataset(val_df)

## Save the dataset

In [40]:
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(TORCH_DATASETS_DIR, exist_ok=True)

# NOTE(review): torch.save pickles the whole AudioDataset object, so loading
# these files later requires the AudioDataset class to be defined first, and
# they should only be loaded from a trusted location.
torch.save(train_dataset, os.path.join(TORCH_DATASETS_DIR, 'train.pt'))
torch.save(val_dataset, os.path.join(TORCH_DATASETS_DIR, 'val.pt'))

# Loading the model

In [None]:
# Hugging Face access token read from the environment; None when HF_TOKEN is not set.
HF_TOKEN = os.environ.get('HF_TOKEN')

In [25]:
# Load the pretrained XLSR-53 checkpoint with a CTC head. The dropout,
# masking and layerdrop values below override the checkpoint's configuration.
# pad_token_id and vocab_size are taken from the processor's tokenizer; the
# load log below reports lm_head.weight / lm_head.bias as newly initialized.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    attention_dropout=0.2,
    hidden_dropout=0.2,
    feat_proj_dropout=0.05,
    mask_time_prob=0.04,
    layerdrop=0.15,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
    token = HF_TOKEN
)

# NOTE(review): appears redundant with the vocab_size keyword passed above - confirm before removing.
model.config.vocab_size = len(processor.tokenizer)

# Freeze the feature encoder and enable gradient checkpointing before training.
model.freeze_feature_encoder()
model.gradient_checkpointing_enable()

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training

In [26]:
# Batch collator built on the processor; padding=True is forwarded to it.
# NOTE(review): the padding behavior itself is implemented in data_collator.py.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [27]:
# Checkpoints go to MODELS_DIR/MODEL_NAME, training logs to its 'logs' subfolder.
model_out_dir = os.path.join(MODELS_DIR, MODEL_NAME)
logs_dir = os.path.join(model_out_dir, 'logs')

# makedirs creates MODELS_DIR and model_out_dir on the way down to logs_dir;
# exist_ok=True keeps the cell safe to re-run.
os.makedirs(logs_dir, exist_ok=True)

# NOTE(review): the completed run below reports 1600 optimizer steps in total,
# so warmup_steps=500 covers roughly the first third of training.
training_args = TrainingArguments(
    output_dir=model_out_dir,                               # Directory to save model checkpoints
    overwrite_output_dir=True,                              # Allow writing into an existing output directory
    eval_strategy="steps",                                  # Evaluate every N steps
    per_device_train_batch_size=8,                          # Batch size for training
    per_device_eval_batch_size=8,                           # Batch size for evaluation
    gradient_accumulation_steps=2,                          # Gradient accumulation
    learning_rate=3e-4,                                     # Learning rate
    warmup_steps=500,                                       # Warmup steps for LR scheduler
    num_train_epochs=100,                                   # Number of epochs
    logging_dir=logs_dir,                                   # Directory for logging
    logging_steps=10,                                       # Log every N steps
    save_steps=50,                                          # Save checkpoint every N steps
    eval_steps=50,                                          # Evaluate every 50 steps (same cadence as save_steps)
    save_total_limit=2,                                     # Only keep the last 2 checkpoints
    fp16=True,                                              # Use mixed precision
    dataloader_num_workers=2,                               # Number of workers for DataLoader
    load_best_model_at_end=True,                            # Load the best model at the end
    metric_for_best_model="wer",                            # Metric to determine best model
    greater_is_better=False,                                # Smaller WER is better
    save_safetensors=False,                                 # Do not use the safetensors format for checkpoints
    seed=42,                                                # Random seed for reproducibility
)


In [28]:
# Define WER metric (word error rate), loaded through the `evaluate` library
# and used by compute_metrics() below.
wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: Prediction object from the Trainer with ``predictions``
            (logits) and ``label_ids`` (reference token ids).

    Returns:
        dict with a single ``"wer"`` entry.
    """
    # Greedy CTC decoding: most likely token id at every time step.
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Label padding is conventionally marked with -100 so the loss ignores it
    # (presumably what DataCollatorCTCWithPadding does - confirm). Map it back
    # to the pad token on a copy so it is dropped cleanly when decoding.
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    # Decode predictions and labels. Predictions are CTC output and need
    # repeated tokens collapsed (the default); the references are plain
    # transcripts, so grouping would wrongly merge doubled characters.
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True, group_tokens=False)

    # Compute WER
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}


In [41]:
# Initialize Trainer: ties together the model, the padding collator, the
# training configuration, both datasets and the WER metric function.
trainer = Trainer(
    model=model,                               # Wav2Vec2 model
    data_collator=data_collator,               # Data Collator
    args=training_args,                        # Training arguments
    train_dataset=train_dataset,               # Training dataset
    eval_dataset=val_dataset,                  # Validation dataset
    processing_class=processor,                # Processor
    compute_metrics=compute_metrics,           # WER metric
)

In [None]:
# Start training from the freshly loaded model.
# In the recorded run this call ended in a RuntimeError (see the output
# below); training is continued by the resume cell that follows.
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmihailo-radovic11[0m ([33mmihailo-radovic11-student[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss,Validation Loss,Wer
50,82.4204,36.380486,1.0
100,40.0761,11.197175,1.0


Step,Training Loss,Validation Loss,Wer
50,82.4204,36.380486,1.0
100,40.0761,11.197175,1.0
150,7.2235,3.168533,1.0
200,6.0628,2.939375,1.0
250,5.949,2.916493,1.0
300,5.8396,2.906181,1.0
350,5.8087,2.914468,1.0
400,5.7381,2.841084,1.0
450,5.0938,2.332966,1.0
500,4.6319,2.114949,1.0


RuntimeError: [enforce fail at inline_container.cc:603] . unexpected pos 2304414144 vs 2304414032

In [44]:
# Continue training from the most recent checkpoint saved in the output directory.
trainer.train(resume_from_checkpoint=True)

  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
[34m[1mwandb[0m: Currently logged in as: [33mmihailo-radovic11[0m ([33mmihailo-radovic11-student[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112468488893986, max=1.0…

  checkpoint_rng_state = torch.load(rng_file)


Step,Training Loss,Validation Loss,Wer
1250,0.2534,0.002069,0.0
1300,0.3443,0.001716,0.0
1350,0.2666,0.002186,0.0
1400,0.1565,0.001543,0.0


Step,Training Loss,Validation Loss,Wer
1250,0.2534,0.002069,0.0
1300,0.3443,0.001716,0.0
1350,0.2666,0.002186,0.0
1400,0.1565,0.001543,0.0
1450,0.2751,0.002841,0.0
1500,0.3315,0.001397,0.0
1550,0.1681,0.00124,0.0
1600,0.2431,0.001211,0.0


TrainOutput(global_step=1600, training_loss=0.06577926870435476, metrics={'train_runtime': 646.6687, 'train_samples_per_second': 39.433, 'train_steps_per_second': 2.474, 'total_flos': 2.692254420058742e+18, 'train_loss': 0.06577926870435476, 'epoch': 100.0})

In [45]:
# Final evaluation on the validation set; the returned dict includes
# 'eval_loss' and 'eval_wer' (see the output below).
final_metrics = trainer.evaluate(val_dataset)
print(final_metrics)

{'eval_loss': 0.019305603578686714, 'eval_wer': 0.0, 'eval_runtime': 0.4752, 'eval_samples_per_second': 29.46, 'eval_steps_per_second': 4.209, 'epoch': 100.0}


In [46]:
# The model weights and the processor are written to two sibling folders
# under MODELS_DIR.
best_model_dir = f"{MODELS_DIR}/best_model"
model.save_pretrained(best_model_dir)

best_processor_dir = f"{MODELS_DIR}/best_model_processor"
processor.save_pretrained(best_processor_dir)

[]