In [1]:
import os
import re
import random
import numpy as np
import pandas as pd
import torch
import librosa
from datasets import Dataset
from transformers import (
    Wav2Vec2Processor,
    Wav2Vec2ForCTC,
    TrainingArguments,
    Trainer
)
import evaluate

def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Applies ``seed`` to Python's ``random`` module, NumPy's global generator
    and PyTorch (default generator plus all CUDA devices), in that order.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Seed all RNGs before any sampling or training happens.
set_seed(42)

# Report whether a CUDA device is visible and, if so, the name of device 0.
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
else:
    print("Running on CPU")

  from .autonotebook import tqdm as notebook_tqdm


CUDA available: True
GPU: NVIDIA GeForce RTX 3050


In [2]:
BASE_PATH = "../data/clean/LibriSpeech/train-clean-100"

data = []

for root, dirs, files in os.walk(BASE_PATH):
    for file in files:
        if file.endswith(".trans.txt"):
            trans_path = os.path.join(root, file)

            with open(trans_path, "r") as f:
                lines = f.readlines()

            for line in lines:
                parts = line.strip().split(" ", 1)
                file_id = parts[0]
                text = parts[1].lower()
                audio_path = os.path.join(root, file_id + ".flac")

                if os.path.exists(audio_path):
                    data.append({
                        "audio_path": audio_path,
                        "text": text
                    })

df = pd.DataFrame(data)
print("Total samples:", len(df))

Total samples: 28539


In [3]:
def normalize_text(text):
    """Lowercase ``text`` and drop every character except a-z, space and apostrophe."""
    lowered = text.lower()
    return re.sub(r"[^a-z ']", "", lowered)

# Derived column; the raw "text" column is left untouched.
df["clean_text"] = df["text"].apply(normalize_text)

# Work on a fixed-seed subset. The request is capped at len(df) because
# DataFrame.sample raises ValueError when asked for more rows than exist
# (e.g. when only part of the corpus is on disk).
N_TRAIN_SAMPLES = 5000
df_train = df.sample(
    min(N_TRAIN_SAMPLES, len(df)), random_state=42
).reset_index(drop=True)

print("Training samples:", len(df_train))

Training samples: 5000


In [4]:
TARGET_SR = 16000  # sampling rate every clip is converted to


def load_audio(path):
    """Read an audio file and return its samples as float32 at ``TARGET_SR``.

    The file is loaded with ``sr=None`` and resampled only when the rate
    reported by librosa differs from ``TARGET_SR``.
    """
    samples, native_sr = librosa.load(path, sr=None)

    needs_resampling = native_sr != TARGET_SR
    if needs_resampling:
        samples = librosa.resample(samples, orig_sr=native_sr, target_sr=TARGET_SR)

    return samples.astype("float32")

noise_files = []
noise_root = "../data/musan/noise"

# Collect every .wav file below noise_root.
for root, dirs, files in os.walk(noise_root):
    for file in files:
        if file.endswith(".wav"):
            noise_files.append(os.path.join(root, file))

# os.walk returns entries in filesystem order. Sorting makes a seeded
# random.choice(noise_files) pick the same clip on every machine.
noise_files.sort()

print("Noise files:", len(noise_files))

Noise files: 930


In [5]:
def add_noise(clean, noise, snr_db):
    """Mix ``noise`` into ``clean`` at the requested signal-to-noise ratio.

    Args:
        clean: 1-D NumPy array of speech samples.
        noise: 1-D NumPy array of noise samples. It is tiled when shorter
            than ``clean`` and then truncated to ``len(clean)``.
        snr_db: target SNR in decibels (clean power over scaled-noise power).

    Returns:
        A new array ``clean + scale * noise``. When no valid scale exists
        (``clean`` or ``noise`` is empty, or the selected noise segment has
        zero or NaN power) an unmodified copy of ``clean`` is returned rather
        than an array of NaN/inf values.
    """
    if len(clean) == 0 or len(noise) == 0:
        return clean.copy()

    if len(noise) < len(clean):
        repeat = int(np.ceil(len(clean) / len(noise)))
        noise = np.tile(noise, repeat)

    noise = noise[:len(clean)]

    clean_power = np.mean(clean ** 2)
    noise_power = np.mean(noise ** 2)

    # A silent noise segment would make the scale infinite and turn the whole
    # utterance into NaN; `not (x > 0)` also catches a NaN power.
    if not noise_power > 0:
        return clean.copy()

    snr = 10 ** (snr_db / 10)
    scale = np.sqrt(clean_power / (snr * noise_power))

    return clean + scale * noise

In [6]:
# Pretrained processor for the same checkpoint name the model is loaded from.
# Later cells call it on raw audio (reading .input_values) and on text
# (reading .input_ids), and use its tokenizer for padding and decoding.
processor = Wav2Vec2Processor.from_pretrained(
    "facebook/wav2vec2-base-960h"
)



In [None]:
TRAIN_MODE = "clean"   # first phase


def prepare_dataset(example):
    """Turn one ``{audio_path, clean_text}`` row into model inputs and labels.

    Reads the module-level ``TRAIN_MODE``: when it equals ``"noisy"`` a random
    clip from ``noise_files`` is mixed in at an SNR drawn uniformly from
    [0, 20) dB; for any other value the audio is used as loaded.

    Args:
        example: mapping with keys ``"audio_path"`` and ``"clean_text"``.

    Returns:
        The same mapping with ``"input_values"`` and ``"labels"`` added.
    """
    audio = load_audio(example["audio_path"])

    if TRAIN_MODE == "noisy":
        noise_audio = load_audio(random.choice(noise_files))
        snr_db = np.random.uniform(0, 20)
        audio = add_noise(audio, noise_audio, snr_db)

    # Pass the same constant load_audio resamples to, so the declared rate can
    # never drift from the actual rate of the samples.
    inputs = processor(audio, sampling_rate=TARGET_SR)

    # Text is upper-cased before tokenization, presumably because the
    # checkpoint's vocabulary is upper-case (TODO confirm).
    labels = processor(text=example["clean_text"].upper()).input_ids

    example["input_values"] = inputs.input_values[0]
    example["labels"] = labels

    return example

dataset = Dataset.from_pandas(df_train)

# Replace the path/text columns with input_values and labels.
dataset = dataset.map(
    prepare_dataset,
    remove_columns=dataset.column_names
)

# Pin the split seed. Without it the shuffle is derived from NumPy's global
# RNG state, so the held-out 10% depends on whatever random calls ran earlier.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

train_dataset = dataset["train"]
eval_dataset = dataset["test"]

Map:  29%|██▉       | 1467/5000 [00:15<00:28, 126.08 examples/s]

In [None]:
# Start from the pretrained CTC checkpoint.
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# Exclude the feature extractor's weights from gradient updates.
for param in model.wav2vec2.feature_extractor.parameters():
    param.requires_grad_(False)

# Last expression of the cell: moves the model and displays its repr.
model.to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class DataCollatorCTCWithPadding:
    """Collate variable-length examples into one padded batch.

    Each feature must provide ``"input_values"`` and ``"labels"``. Inputs are
    padded with ``processor.pad`` and labels with ``processor.tokenizer.pad``;
    label positions equal to the tokenizer's pad id are then set to -100.
    """

    def __init__(self, processor, padding=True):
        self.processor, self.padding = processor, padding

    def __call__(self, features):
        audio_items = []
        label_items = []
        for feature in features:
            audio_items.append({"input_values": feature["input_values"]})
            label_items.append({"input_ids": feature["labels"]})

        pad_kwargs = {"padding": self.padding, "return_tensors": "pt"}
        batch = self.processor.pad(audio_items, **pad_kwargs)
        padded_labels = self.processor.tokenizer.pad(label_items, **pad_kwargs)["input_ids"]

        # -100 is the sentinel compute_metrics maps back to the pad id;
        # presumably the loss ignores it as well (TODO confirm).
        is_padding = padded_labels == self.processor.tokenizer.pad_token_id
        padded_labels[is_padding] = -100

        batch["labels"] = padded_labels
        return batch

# Shared collator instance; train_model passes it to every Trainer it builds.
data_collator = DataCollatorCTCWithPadding(processor=processor)

In [None]:
# Metric objects used by compute_metrics, loaded by name ("wer" and "cer").
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

def compute_metrics(pred):
    """Compute WER and CER for one evaluation pass.

    ``pred.predictions`` is arg-maxed over its last axis and decoded with
    ``group_tokens=True``. ``pred.label_ids`` is decoded with
    ``group_tokens=False`` after its -100 entries are replaced by the
    tokenizer's pad id.

    Returns:
        dict with the keys ``"wer"`` and ``"cer"``.
    """
    hypothesis_ids = np.argmax(pred.predictions, axis=-1)
    hypotheses = processor.batch_decode(hypothesis_ids, group_tokens=True)

    reference_ids = pred.label_ids
    if isinstance(reference_ids, torch.Tensor):
        reference_ids = reference_ids.cpu().numpy()

    # Work on a copy so the caller's label array keeps its -100 markers.
    reference_ids = reference_ids.copy()
    reference_ids[reference_ids == -100] = processor.tokenizer.pad_token_id
    references = processor.batch_decode(reference_ids, group_tokens=False)

    return {
        name: metric.compute(predictions=hypotheses, references=references)
        for name, metric in (("wer", wer_metric), ("cer", cer_metric))
    }

In [None]:
# Baseline training configuration: evaluate and checkpoint once per epoch and
# keep the checkpoint with the lowest WER.
# NOTE(review): train_model builds its own TrainingArguments and hands those
# to the Trainer; nothing visible in this notebook reads `training_args`.
# NOTE(review): save_steps presumably has no effect while
# save_strategy="epoch" - confirm against the TrainingArguments docs.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    num_train_epochs=5,
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    save_steps=500,
    learning_rate=3e-5,
    warmup_steps=100,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    gradient_checkpointing=True,
    seed=42,
    data_seed=42,
)

In [None]:
# ==============================
# FIX PROJECT ROOT PATH
# ==============================
import os

# The project root is the parent of the current working directory.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Output locations for trained models and metric files.
MODELS_DIR, RESULTS_DIR = (
    os.path.join(PROJECT_ROOT, leaf) for leaf in ("models", "results")
)

# Create both folders up front; existing folders are left as they are.
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

print("Saving to:", PROJECT_ROOT)

In [None]:
import os
import json
from transformers import TrainingArguments

# The stray `from turtle import mode` that used to sit here was removed: it
# imports tkinter, which fails on headless machines, and the name was shadowed
# by the `mode` parameter below anyway.


def train_model(mode):
    """Fine-tune ``facebook/wav2vec2-base-960h`` on ``df_train`` and evaluate it.

    Args:
        mode: ``"noisy"`` makes ``prepare_dataset`` mix a random noise clip
            into every utterance; any other value uses the audio as loaded.
            The string also names the output folder and the metrics file.

    Returns:
        ``(results, trainer, eval_dataset)``: the dict returned by
        ``trainer.evaluate()``, the ``Trainer`` itself, and the held-out split.

    Side effects:
        Sets the module-level ``TRAIN_MODE``. Writes checkpoints and a
        ``final_model`` folder under ``MODELS_DIR/<mode>`` and
        ``<mode>_metrics.json`` under ``RESULTS_DIR``.
    """
    print(f"\n========== Training {mode.upper()} Model ==========\n")

    # prepare_dataset reads this global to decide whether to add noise.
    global TRAIN_MODE
    TRAIN_MODE = mode

    # -------- Dataset --------
    dataset = Dataset.from_pandas(df_train)

    dataset = dataset.map(
        prepare_dataset,
        remove_columns=dataset.column_names
    )

    # Fixed seed: every mode is evaluated on the same held-out utterances.
    # An unseeded split is derived from NumPy's global RNG state, which the
    # noisy mode advances while drawing SNR values.
    dataset = dataset.train_test_split(test_size=0.1, seed=42)

    train_dataset = dataset["train"]
    eval_dataset = dataset["test"]

    # -------- Model --------
    model = Wav2Vec2ForCTC.from_pretrained(
        "facebook/wav2vec2-base-960h"
    )

    # Same freezing as the standalone model cell: the feature extractor keeps
    # its pretrained weights during fine-tuning.
    for param in model.wav2vec2.feature_extractor.parameters():
        param.requires_grad = False

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # -------- Create Mode-Specific Folders --------
    model_dir = os.path.join(MODELS_DIR, mode)
    os.makedirs(model_dir, exist_ok=True)

    # -------- Training Arguments (MODE-SPECIFIC) --------
    mode_training_args = TrainingArguments(
        output_dir=model_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=5,
        # Mixed precision needs a GPU; a hard-coded True fails on CPU-only runs.
        fp16=torch.cuda.is_available(),
        save_total_limit=2,
        load_best_model_at_end=True,
        # Choose the "best" checkpoint by WER (lower is better), matching the
        # notebook's baseline training_args, instead of the default eval loss.
        metric_for_best_model="wer",
        greater_is_better=False,
    )

    # -------- Trainer --------
    trainer = Trainer(
        model=model,
        args=mode_training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    results = trainer.evaluate()

    # Save final trained model separately from the per-epoch checkpoints.
    final_model_path = os.path.join(model_dir, "final_model")
    trainer.save_model(final_model_path)

    print(f"\nResults for {mode.upper()} model:")
    print(results)

    # -------- Save Metrics Properly --------
    metrics_path = os.path.join(RESULTS_DIR, f"{mode}_metrics.json")

    with open(metrics_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)

    return results, trainer, eval_dataset

In [None]:
# Train and evaluate without added noise; keep the metrics, the Trainer and
# the held-out split for the inspection cells below.
clean_results, clean_trainer, clean_eval = train_model("clean")

In [None]:
import torch
import numpy as np

# Qualitative check on the first held-out utterance of the clean run.
sample = clean_eval[0]

model = clean_trainer.model
model.eval()

# Batch of one, placed on the same device as the model.
input_tensor = torch.tensor(sample["input_values"])[None].to(model.device)

# Forward pass without gradient tracking.
with torch.no_grad():
    logits = model(input_tensor).logits

# Highest-scoring token id per frame.
pred_ids = logits.argmax(dim=-1)

# Predictions are decoded with token grouping, the reference ids without.
prediction = processor.batch_decode(pred_ids, group_tokens=True)[0]
label = processor.batch_decode([sample["labels"]], group_tokens=False)[0]

print("Prediction:", prediction, "\nLabel:", label, sep="\n")

In [None]:
# Repeat the run in "noisy" mode, where prepare_dataset mixes a noise clip
# into each utterance before feature extraction.
noisy_results, noisy_trainer, noisy_eval = train_model("noisy")

In [None]:
import torch
import numpy as np

# Qualitative check on the first held-out utterance of the noisy run.
sample = noisy_eval[0]

model = noisy_trainer.model
model.eval()

# Batch of one, placed on the same device as the model.
input_tensor = torch.tensor(sample["input_values"])[None].to(model.device)

# Forward pass without gradient tracking.
with torch.no_grad():
    logits = model(input_tensor).logits

# Highest-scoring token id per frame.
pred_ids = logits.argmax(dim=-1)

# Predictions are decoded with token grouping, the reference ids without.
prediction = processor.batch_decode(pred_ids, group_tokens=True)[0]
label = processor.batch_decode([sample["labels"]], group_tokens=False)[0]

print("NOISY MODEL Prediction:", prediction, "\nActual Label:", label, sep="\n")

In [None]:
# Side-by-side view of the evaluation dicts returned by the two runs.
# NOTE(review): each run is evaluated on the split produced in its own mode,
# so the noisy figures come from noise-augmented audio.
print("Clean model:", clean_results)
print("Noisy model:", noisy_results)