In [1]:
import os
import random
import numpy as np
import pandas as pd
import torch
import librosa

from datasets import Dataset
from transformers import (
    Wav2Vec2Processor,
    Wav2Vec2ForCTC,
    TrainingArguments,
    Trainer
)

import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print("CUDA available:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0))

CUDA available: True
GPU: NVIDIA GeForce RTX 3050


In [3]:
BASE_PATH = "../data/clean/LibriSpeech/train-clean-100"

data = []

for root, dirs, files in os.walk(BASE_PATH):
    for file in files:
        if file.endswith(".trans.txt"):
            trans_path = os.path.join(root, file)
            with open(trans_path, "r") as f:
                lines = f.readlines()
            
            for line in lines:
                parts = line.strip().split(" ", 1)
                file_id = parts[0]
                text = parts[1].lower()
                audio_path = os.path.join(root, file_id + ".flac")
                
                if os.path.exists(audio_path):
                    data.append({
                        "audio_path": audio_path,
                        "text": text
                    })

df = pd.DataFrame(data)
print("Total samples:", len(df))

Total samples: 28539


In [4]:
import re

def normalize_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z ']", "", text)
    return text

df["clean_text"] = df["text"].apply(normalize_text)

In [5]:
df_train = df.sample(2000, random_state=42).reset_index(drop=True)

print("Training samples:", len(df_train))

Training samples: 2000


In [6]:
TARGET_SR = 16000

def load_audio(path):
    y, sr = librosa.load(path, sr=None)
    
    if sr != TARGET_SR:
        y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SR)
    
    return y.astype("float32")

In [7]:
noise_files = []

noise_root = "../data/musan/noise"

for root, dirs, files in os.walk(noise_root):
    for file in files:
        if file.endswith(".wav"):
            noise_files.append(os.path.join(root, file))

print("Noise files:", len(noise_files))

Noise files: 930


In [8]:
def add_noise(clean, noise, snr_db):
    
    if len(noise) < len(clean):
        repeat = int(np.ceil(len(clean)/len(noise)))
        noise = np.tile(noise, repeat)
    
    noise = noise[:len(clean)]
    
    clean_power = np.mean(clean**2)
    noise_power = np.mean(noise**2)
    
    snr = 10**(snr_db/10)
    scale = np.sqrt(clean_power/(snr*noise_power))
    
    noisy = clean + scale*noise
    
    return noisy

In [9]:
TRAIN_MODE = "clean"  

In [10]:
processor = Wav2Vec2Processor.from_pretrained(
    "facebook/wav2vec2-base-960h"
)



In [13]:
def prepare_dataset(example):
    
    # Load audio
    audio = load_audio(example["audio_path"])
    
    if TRAIN_MODE == "noisy":
        noise_audio = load_audio(random.choice(noise_files))
        audio = add_noise(audio, noise_audio, 0)
    
    # Process audio
    inputs = processor(audio, sampling_rate=16000)
    
    # Process text (NEW METHOD)
    labels = processor(text=example["clean_text"]).input_ids
    
    example["input_values"] = inputs.input_values[0]
    example["labels"] = labels
    
    return example

In [14]:
dataset = Dataset.from_pandas(df_train)

dataset = dataset.map(
    prepare_dataset,
    remove_columns=dataset.column_names
)

Map: 100%|██████████| 2000/2000 [01:00<00:00, 33.06 examples/s]


In [15]:
dataset = dataset.train_test_split(test_size=0.1)

train_dataset = dataset["train"]
eval_dataset = dataset["test"]

In [16]:
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base-960h"
)

model.to("cuda")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 212/212 [00:00<00:00, 569.62it/s, Materializing param=wav2vec2.feature_projection.projection.weight]                         
[1mWav2Vec2ForCTC LOAD REPORT[0m from: facebook/wav2vec2-base-960h
Key                        | Status  | 
---------------------------+---------+-
wav2vec2.masked_spec_embed | MISSING | 

[3mNotes:
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder)

In [17]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    num_train_epochs=5,
    fp16=True,
    logging_steps=50,
    save_steps=500,
    learning_rate=1e-4,
    warmup_steps=200,
)

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions.argmax(-1)
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}