In [4]:
import os
import pandas as pd
import torch
import torchaudio
from datasets import Dataset
from transformers import (
    AutoProcessor, 
    AutoModelForAudioClassification, 
    TrainingArguments, 
    Trainer
)


In [5]:
def extract_emotion(filename):
    code = int(filename.split("-")[2])
    emotion_map = {
        1: "neutral",
        2: "calm",
        3: "happy",
        4: "sad",
        5: "angry",
        6: "fearful",
        7: "disgust",
        8: "surprised"
    }
    return emotion_map[code]

audio_root = "/kaggle/input/ravdess-emotional-speech-audio"

data = []
for actor_dir in sorted(os.listdir(audio_root)):
    actor_path = os.path.join(audio_root, actor_dir)
    if os.path.isdir(actor_path):
        for fname in os.listdir(actor_path):
            if fname.endswith(".wav"):
                full_path = os.path.join(actor_path, fname)
                emotion = extract_emotion(fname)
                data.append({"path": full_path, "label": emotion})

df = pd.DataFrame(data)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head(10)


Unnamed: 0,path,label
0,/kaggle/input/ravdess-emotional-speech-audio/A...,fearful
1,/kaggle/input/ravdess-emotional-speech-audio/A...,surprised
2,/kaggle/input/ravdess-emotional-speech-audio/A...,sad
3,/kaggle/input/ravdess-emotional-speech-audio/A...,fearful
4,/kaggle/input/ravdess-emotional-speech-audio/A...,sad
5,/kaggle/input/ravdess-emotional-speech-audio/A...,disgust
6,/kaggle/input/ravdess-emotional-speech-audio/A...,neutral
7,/kaggle/input/ravdess-emotional-speech-audio/A...,calm
8,/kaggle/input/ravdess-emotional-speech-audio/A...,disgust
9,/kaggle/input/ravdess-emotional-speech-audio/A...,calm


In [6]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)
dataset = dataset.class_encode_column("label")
dataset = dataset.train_test_split(test_size=0.2, seed=42)

dataset


Casting to class labels:   0%|          | 0/1440 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['path', 'label'],
        num_rows: 1152
    })
    test: Dataset({
        features: ['path', 'label'],
        num_rows: 288
    })
})

In [7]:
import torchaudio
import torchaudio.transforms as T

def preprocess(example):
    try:
        waveform, sr = torchaudio.load(example["path"])
        waveform = waveform.mean(dim=0)  
        if sr != 16000:
            resampler = T.Resample(orig_freq=sr, new_freq=16000)
            waveform = resampler(waveform)
        
        input_values = waveform.tolist()
        
       
        if not isinstance(input_values, list):
            raise ValueError("Not a list")

        return {"input_values": input_values, "label": example["label"]}
    
    except Exception as e:
        
        return {"input_values": None, "label": None}
dataset = dataset.map(preprocess, remove_columns=dataset["train"].column_names)

dataset = dataset.filter(lambda example: example["input_values"] is not None)




Map:   0%|          | 0/1152 [00:00<?, ? examples/s]

Map:   0%|          | 0/288 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1152 [00:00<?, ? examples/s]

Filter:   0%|          | 0/288 [00:00<?, ? examples/s]

In [11]:
import torch
from torch.nn.utils.rnn import pad_sequence
from transformers import (
    AutoModelForAudioClassification,
    TrainingArguments,
    Trainer
)
import time
import numpy as np
import evaluate



class CustomAudioCollator:
    def __init__(self, padding_value=0.0):
        self.padding_value = padding_value

    def __call__(self, features):
        input_values = [torch.tensor(f["input_values"]) if not isinstance(f["input_values"], torch.Tensor) else f["input_values"] for f in features]
        labels = torch.tensor([f["label"] for f in features])
        padded_inputs = pad_sequence(input_values, batch_first=True, padding_value=self.padding_value)
        return {"input_values": padded_inputs, "labels": labels}


model = AutoModelForAudioClassification.from_pretrained(
    "superb/wav2vec2-base-superb-er",
    num_labels=dataset["train"].features["label"].num_classes,
    ignore_mismatched_sizes=True
)



training_args = TrainingArguments(
    output_dir="./wav2vec2_ravdess",
    do_train=True,
    do_eval=True,      
    logging_strategy="epoch",         
    logging_dir="./logs",
    disable_tqdm=False,              
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_total_limit=2,
    report_to="none"
)



accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    if isinstance(eval_pred, tuple):
        logits, labels = eval_pred
    else:
        logits, labels = eval_pred.predictions, eval_pred.label_ids

    if isinstance(logits, tuple):
        logits = logits[0]

    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)



collator = CustomAudioCollator()
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collator,
    compute_metrics=compute_metrics,
)


start_train = time.time()
train_result = trainer.train()
end_train = time.time()
train_time = end_train - start_train

print(f"\nToplam eğitim süresi: {train_time:.2f} saniye")
print(f"Eğitim doğruluğu: {train_result.metrics.get('train_accuracy', 'Eğitim loglarında')}")



start_test = time.time()
test_metrics = trainer.evaluate()
end_test = time.time()
test_time = end_test - start_test

print(f"Test doğruluğu: {test_metrics['eval_accuracy']:.4f}")
print(f"Toplam test süresi: {test_time:.2f} saniye")


model.save_pretrained("./wav2vec2_ravdess/final_model")


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at superb/wav2vec2-base-superb-er and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 256]) in the checkpoint and torch.Size([8, 256]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
72,1.9982
144,1.7238
216,1.5583
288,1.3762
360,1.2117
432,1.1153
504,1.0054
576,0.8497
648,0.7754
720,0.6422



Toplam eğitim süresi: 1776.22 saniye
Eğitim doğruluğu: Eğitim loglarında


Test doğruluğu: 0.8681
Toplam test süresi: 15.43 saniye
