In [34]:
#Import data packages
import os
import numpy as np
import pandas as pd

#Import plotting packages
import matplotlib.pyplot as plt
from matplotlib.pyplot import specgram
import matplotlib.pyplot as plt
import seaborn as sns

#Import audio packages
import librosa
import librosa.display

from sklearn.metrics import confusion_matrix

from datasets import load_dataset
from transformers import pipeline, Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import torch, torchaudio

In [32]:
raw_data_path = r"C:\Users\Kin Tu\Documents\RealSER\Dataset"

dataset = load_dataset("audiofolder", data_dir=raw_data_path)
labels = dataset["train"].features["label"].names

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")



### Preprocess Data

In [23]:
def preprocess(batch):
    audio = batch["audio"]["array"]
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
    batch["input_values"] = inputs.input_values[0]
    batch["attention_mask"] = inputs.attention_mask[0]
    return batch

# Apply preprocessing
preprocessed_dataset = dataset.map(preprocess)

Map:   0%|          | 0/608 [00:00<?, ? examples/s]


ImportError: Unable to convert output to PyTorch tensors format, PyTorch is not installed.

In [27]:
train_test = preprocessed_dataset['train'].train_test_split(test_size=0.2)

train_dataset = train_test['train']
test_dataset = train_test['test']

train_dataset

train_dataset['audio'][:2]

[{'path': 'C:\\Users\\Kin Tu\\Documents\\RealSER\\Dataset\\Thân Thiện\\Thân Thiện_Câu 6_Quinh.wav',
  'array': array([-5.79833984e-04, -6.10351562e-05, -3.96728516e-04, ...,
         -2.92968750e-03, -2.50244141e-03, -2.86865234e-03]),
  'sampling_rate': 16000},
 {'path': 'C:\\Users\\Kin Tu\\Documents\\RealSER\\Dataset\\Cáu Giận\\Cáu Giận_Câu 8_USA.wav',
  'array': array([0.00158691, 0.00161743, 0.0017395 , ..., 0.00088501, 0.00061035,
         0.00076294]),
  'sampling_rate': 16000}]

In [None]:
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=5,  # Replace with the number of sentiment classes (e.g., positive, neutral, negative)
    problem_type="multi_label_classification"
)

training_args = TrainingArguments(
    output_dir="./saved_models",
    num_train_epochs=5,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    weight_decay=0.01, # avoid overfitting
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    logging_dir='./logs',
    logging_steps=10,
    fp16=True  # Enable for faster training on GPUs
)

data_collator = DataCollatorWithPadding(processor.tokenizer) # padding to uniform input size

In [None]:
# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=processor.tokenizer,
    data_collator=data_collator
)

# Train the model
trainer.train() 

In [58]:
metrics = trainer.evaluate()
print(metrics)

(608, 1)

In [None]:
model.save_pretrained("./saved_model")

In [None]:
def predict(audio_path):
    waveform, sample_rate = torchaudio.load(audio_path)
    inputs = processor(waveform[0].numpy(), sampling_rate=sample_rate, return_tensors="pt", padding=True)
    
    with torch.no_grad():
        logits = model(**inputs).logits
    
    probabilities = torch.sigmoid(logits)
    predictions = probabilities.int()
    
    return predictions, probabilities

# Example usage
predicts, probs = predict("/path/to/audio.wav")

for pre, prob in (predicts, probs):
    print(f"{labels[pre]}: {prob}")