# Accent Classification Model Development

In [1]:
import os
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, Audio
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
import numpy as np
import librosa

  from .autonotebook import tqdm as notebook_tqdm
2025-05-10 18:22:35.423121: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-10 18:22:35.431344: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-10 18:22:35.440406: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-10 18:22:35.443023: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-10 18:22:35.4

### Device Agnostic Code

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Load Base Model from Hugging Face

In [3]:
model_name = "facebook/wav2vec2-base-960h"

processor = Wav2Vec2Processor.from_pretrained(model_name)

model = Wav2Vec2ForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
model.to(device)

Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)


### Data Preparation

In [5]:
accent_dirs = dict(american=0, british=1, australian=2)
id_to_label = {v: k for k, v in accent_dirs.items()}

In [6]:
model.config.id2label = id_to_label
model.config.label2id = accent_dirs

In [7]:
data = []
for accent, label in accent_dirs.items():
    directory = f"./data/{accent}/"
    for filename in os.listdir(directory):
        if filename.endswith('.mp3'):
            filepath = os.path.join(directory, filename)
            data.append({"path": filepath, "label": label})

In [8]:
# Load dataset and convert path column to audio format
dataset = Dataset.from_list(data)
dataset = dataset.cast_column("path", Audio())

### Split Train and Test Data

In [9]:
train_dataset, test_dataset = dataset.train_test_split(test_size=0.2).values()

### Data Preprocessing

In [10]:
def preprocess(batch):
    audio = batch["path"]["array"]
    max_length = 176256
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt", padding="max_length", truncation=True, max_length=max_length)
    
    batch["input_values"] = inputs.input_values[0]
    
    batch["labels"] = torch.tensor([batch["label"]])
    
    if "attention_mask" in inputs:
        batch["attention_mask"] = inputs.attention_mask[0]

    return batch

In [11]:
train_dataset = train_dataset.map(preprocess, remove_columns=["path"])
test_dataset = test_dataset.map(preprocess, remove_columns=["path"])

Map: 100%|█████████████████████████████████████████████████████████████████████| 360/360 [00:04<00:00, 73.35 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████| 91/91 [00:01<00:00, 89.71 examples/s]


### Set TrainArgs and Trainer

In [12]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    logging_strategy="epoch",
    logging_steps=1,        
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    report_to="none",
)

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

### Train the Model

In [14]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.0926,1.081394
2,1.0504,1.057964
3,1.0044,1.017192
4,0.9636,0.970215
5,0.9322,0.916304
6,0.9036,0.904914
7,0.8705,0.913313
8,0.8322,0.901563
9,0.8321,0.892076
10,0.8256,0.925454


TrainOutput(global_step=230, training_loss=0.9307173936263374, metrics={'train_runtime': 2585.1993, 'train_samples_per_second': 1.393, 'train_steps_per_second': 0.089, 'total_flos': 3.600377602163712e+17, 'train_loss': 0.9307173936263374, 'epoch': 10.0})

### Evaluate Model Performance

In [15]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    report = classification_report(labels, predictions, target_names=accent_dirs.keys())
    print(report)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
    
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [16]:
metrics = trainer.evaluate()
print(metrics)

              precision    recall  f1-score   support

    american       0.33      0.09      0.15        32
     british       0.46      0.78      0.58        23
  australian       0.65      0.78      0.71        36

    accuracy                           0.54        91
   macro avg       0.48      0.55      0.48        91
weighted avg       0.49      0.54      0.48        91

{'eval_loss': 0.9254543781280518, 'eval_model_preparation_time': 0.0016, 'eval_accuracy': 0.5384615384615384, 'eval_f1': 0.4786456360518447, 'eval_precision': 0.491471557661182, 'eval_recall': 0.5384615384615384, 'eval_runtime': 10.1892, 'eval_samples_per_second': 8.931, 'eval_steps_per_second': 0.589}


### Save Model

In [18]:
model.save_pretrained("./final_model")
processor.save_pretrained("./final_model")

[]

### Load Model

In [19]:
model_path = "./final_model"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_path)
processor = Wav2Vec2Processor.from_pretrained(model_path)

### Inference

In [20]:
model.eval()

Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)


In [22]:
import torch.nn.functional as F

In [25]:
audio_file = "./data/british/common_voice_en_41917501.mp3" 
waveform, sample_rate = librosa.load(audio_file, sr=16000)
inputs = processor(waveform, sampling_rate=sample_rate, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

probs = F.softmax(logits, dim=-1).squeeze().tolist()

predicted_class = torch.argmax(logits, dim=-1).item()
predicted_prob = probs[predicted_class]

id_to_label = model.config.id2label
print(f"Predicted class: {id_to_label[predicted_class]} probability: {predicted_prob:.4f}")

Predicted class: british probability: 0.5170
