# Imports

In [2]:
from dataset_generator import DatasetGenerator
from utils import extract_all_chars, save_dict_as_json
from data_preprocessor import Preprocessor
from data_augmentation import AudioAugmentation
from data_collator import DataCollatorCTCWithPadding

import os
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
import torch
from torch.utils.data import Dataset
import evaluate
from transformers import (
    Wav2Vec2CTCTokenizer,
    Wav2Vec2Processor,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForCTC,
    TrainingArguments,
    Trainer
)


In [3]:
AUDIO_DIR = 'dataset'
DATA_PATH = 'data.csv'

word_character_map = {
    'iskljuci': 'isključi',
    'ukljuci': 'uključi'
}

TORCH_DATASETS_DIR = 'torch_datasets'

MODEL_NAME = "wav2vec2-finetuned-voice-commands"
MODELS_DIR = 'models'

# Gather from folder

In [12]:
dg = DatasetGenerator(word_character_map)


dg.generate(input_dir=AUDIO_DIR, output_file=DATA_PATH)

Dataset saved to data.csv


In [13]:
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,audio_filepath,text
0,dataset/zatvori-38-21-1.wav,zatvori
1,dataset/zvuk-19-21-3.wav,zvuk
2,dataset/zatvori-38-21-3.wav,zatvori
3,dataset/zatvori-19-21-3.wav,zatvori
4,dataset/zvuk-19-21-2.wav,zvuk


# Create vocabulary

In [14]:
VOCAB_PATH = 'vocab.json'

In [15]:
words = df['text'].unique()

vocab_list = extract_all_chars(words)

vocab_list.extend(['|', '[UNK]', '[PAD]'])
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{'j': 0,
 'e': 1,
 'u': 2,
 'z': 3,
 'č': 4,
 's': 5,
 't': 6,
 'k': 7,
 'v': 8,
 'r': 9,
 'l': 10,
 'i': 11,
 'o': 12,
 'a': 13,
 '|': 14,
 '[UNK]': 15,
 '[PAD]': 16}

In [16]:
save_dict_as_json(VOCAB_PATH, vocab_dict)

# Loading the tokenizer, feature extractor and processor

In [17]:
tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [18]:
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                             sampling_rate=16000,
                                             padding_value=0.0,
                                             do_normalize=True,
                                             return_attention_mask=True)

In [19]:
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# Preprocessing and Augmentation

In [20]:
aug = AudioAugmentation(min_noise=0, max_noise=.005, time_stretch_rate=.9, pitch_shift_n_steps=2)

train_preprocessor = Preprocessor(processor=processor, sr=16000, audio_augmentation=aug, augment_count=2)
val_preprocessor = Preprocessor(processor=processor, sr=16000)

In [21]:
# Split dataset into train and validation
train_df, val_df = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)

# Preprocess data
preprocessed_train_data = []
preprocessed_val_data = []

# Preprocess training data
for _, row in train_df.iterrows():
    preprocessed_train_data.extend(train_preprocessor.preprocess(row))

# Preprocess validation data
for _, row in val_df.iterrows():
    preprocessed_val_data.extend(val_preprocessor.preprocess(row))

# Create new dataframes for the preprocessed data
train_df = pd.DataFrame(preprocessed_train_data)
val_df = pd.DataFrame(preprocessed_val_data)

train_df.head()

Unnamed: 0,input_values,labels
0,"[tensor(-0.0017), tensor(-0.0017), tensor(-0.0...","[tensor(3), tensor(8), tensor(2), tensor(7)]"
1,"[tensor(-0.1495), tensor(-0.1662), tensor(0.02...","[tensor(3), tensor(8), tensor(2), tensor(7)]"
2,"[tensor(0.0037), tensor(0.1035), tensor(-0.038...","[tensor(3), tensor(8), tensor(2), tensor(7)]"
3,"[tensor(0.0942), tensor(0.2292), tensor(0.2020...","[tensor(11), tensor(5), tensor(7), tensor(10),..."
4,"[tensor(-0.7249), tensor(-0.0991), tensor(0.27...","[tensor(11), tensor(5), tensor(7), tensor(10),..."


# Generate PyTorch dataset

In [22]:
# Create a PyTorch Dataset class
class AudioDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return {
            "input_values": self.data.iloc[idx]["input_values"],
            "labels": self.data.iloc[idx]["labels"],
        }

# Prepare datasets
train_dataset = AudioDataset(train_df)
val_dataset = AudioDataset(val_df)

## Save the dataset

In [23]:
if not os.path.exists(TORCH_DATASETS_DIR):
    os.mkdir(TORCH_DATASETS_DIR)

torch.save(train_dataset, os.path.join(TORCH_DATASETS_DIR, 'train.pt'))
torch.save(val_dataset, os.path.join(TORCH_DATASETS_DIR, 'val.pt'))

# Loading the model

In [24]:
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    attention_dropout=0.2,
    hidden_dropout=0.2,
    feat_proj_dropout=0.05,
    mask_time_prob=0.04,
    layerdrop=0.15,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

model.config.vocab_size = len(processor.tokenizer)

model.freeze_feature_encoder()
model.gradient_checkpointing_enable()

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training

In [25]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [26]:
if not os.path.exists(MODELS_DIR):
    os.mkdir(MODELS_DIR)

model_out_dir = os.path.join(MODELS_DIR, MODEL_NAME)
if not os.path.exists(model_out_dir):
    os.mkdir(model_out_dir)

logs_dir = os.path.join(MODELS_DIR, MODEL_NAME, 'logs')
if not os.path.exists(logs_dir):
    os.mkdir(logs_dir)

training_args = TrainingArguments(
    output_dir=model_out_dir,                               # Directory to save model checkpoints
    overwrite_output_dir=True,
    eval_strategy="steps",                                  # Evaluate every N steps
    per_device_train_batch_size=8,                          # Batch size for training
    per_device_eval_batch_size=8,                           # Batch size for evaluation
    gradient_accumulation_steps=2,                          # Gradient accumulation
    learning_rate=3e-4,                                     # Learning rate
    warmup_steps=500,                                       # Warmup steps for LR scheduler
    num_train_epochs=200,                                   # Number of epochs
    logging_dir=logs_dir,                                   # Directory for logging
    logging_steps=10,                                       # Log every N steps
    save_steps=50,                                          # Save checkpoint every N steps
    eval_steps=50,
    save_total_limit=2,                                     # Only keep the last 2 checkpoints
    fp16=True,                                              # Use mixed precision
    dataloader_num_workers=2,                               # Number of workers for DataLoader
    load_best_model_at_end=True,                            # Load the best model at the end
    metric_for_best_model="wer",                            # Metric to determine best model
    greater_is_better=False,                                # Smaller WER is better
    save_safetensors=False,
    seed=42,                                                # Random seed for reproducibility
)


In [27]:
# Define WER metric
wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = torch.argmax(torch.tensor(pred_logits), dim=-1)

    # Decode predictions and labels
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(pred.label_ids, skip_special_tokens=True)

    # Compute WER
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}


In [28]:
# Initialize Trainer
trainer = Trainer(
    model=model,                               # Wav2Vec2 model
    data_collator=data_collator,               # Data Collator
    args=training_args,                        # Training arguments
    train_dataset=train_dataset,               # Training dataset
    eval_dataset=val_dataset,                  # Validation dataset
    processing_class=processor,                # Processor
    compute_metrics=compute_metrics,           # WER metric
)

In [29]:
# Start training
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmihailo-radovic11[0m ([33mmihailo-radovic11-student[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Wer
50,34.7862,19.011425,1.0
100,12.4882,3.884096,1.0
150,5.675,3.041412,1.0
200,5.497,2.966755,1.0
250,5.4038,3.003995,1.0
300,4.8829,2.615735,1.0
350,4.365,2.407569,1.0
400,3.8302,1.990558,1.0
450,3.2156,1.654664,1.0
500,2.6124,0.77146,0.666667


TrainOutput(global_step=1200, training_loss=4.505651850004991, metrics={'train_runtime': 1578.9943, 'train_samples_per_second': 12.54, 'train_steps_per_second': 0.76, 'total_flos': 7.602603206275048e+17, 'train_loss': 4.505651850004991, 'epoch': 171.46153846153845})

In [30]:
final_metrics = trainer.evaluate(val_dataset)
print(final_metrics)

{'eval_loss': 0.11185392737388611, 'eval_wer': 0.1111111111111111, 'eval_runtime': 0.6562, 'eval_samples_per_second': 13.716, 'eval_steps_per_second': 3.048, 'epoch': 171.46153846153845}


In [51]:
model.save_pretrained(f"{MODELS_DIR}/best_model")
processor.save_pretrained(f"{MODELS_DIR}/best_model_processor")

[]