In [1]:
import os
import pandas as pd
import torch
from datasets import Dataset, Audio
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


DATA_DIR = '../data/ravdess/train'  # Cambia a tu ruta

files = [f for f in os.listdir(DATA_DIR) if f.endswith('.wav')]

# Map de IDs a emociones (igual que tú definiste)
id2emotion = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprise'
}

# Construir dataframe con rutas y etiquetas
data = []
for f in files:
    parts = f.split('-')
    emotion_id = parts[2]
    emotion = id2emotion.get(emotion_id)
    if emotion:
        data.append({
            'path': os.path.join(DATA_DIR, f),
            'label': emotion
        })

df = pd.DataFrame(data)

# Convertir a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Cargar audio con Hugging Face Dataset
dataset = dataset.cast_column("path", Audio(sampling_rate=16000))  # Ajusta sample rate si es otro


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")

def preprocess(batch):
    audio = batch["path"]["array"]
    inputs = processor(audio, sampling_rate=16000, padding=True, return_tensors=None)
    batch["input_values"] = inputs["input_values"][0]  # es un np.array de floats
    return batch



dataset = dataset.map(preprocess)


Map: 100%|██████████| 792/792 [00:53<00:00, 14.85 examples/s] 


In [3]:
labels = sorted(list(set(df['label'])))
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for l, i in label2id.items()}

# Dataset label mapping
def label_to_id(batch):
    batch["labels"] = label2id[batch["label"]]
    return batch

dataset = dataset.map(label_to_id)
dataset = dataset.map(preprocess, remove_columns=["path", "label"])



Map: 100%|██████████| 792/792 [00:00<00:00, 4173.03 examples/s]
Map: 100%|██████████| 792/792 [00:00<00:00, 1013.21 examples/s]


In [4]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=len(labels),
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True
)


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    output_dir="./wav2vec2-ravdess-finetuned",
    per_device_train_batch_size=8,
    eval_strategy="steps",   # cambiar a 'eval_strategy' para evitar warning
    eval_steps=500,
    save_steps=500,
    logging_steps=100,
    learning_rate=1e-4,
    num_train_epochs=5,
    save_total_limit=2,
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

def data_collator(features):
    # features es una lista de dicts con 'input_values' y 'labels'
    input_values = [f["input_values"] for f in features]
    labels = [f["labels"] for f in features]

    # Usamos processor.pad para que haga padding y retorne tensor
    batch = processor.pad(
        {"input_values": input_values},
        padding=True,
        return_tensors="pt"
    )
    batch["labels"] = torch.tensor(labels, dtype=torch.long)
    return batch

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = Trainer(
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mtuesta-lx[0m ([33mtuesta-lx-moscow-institute-of-physics-and-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
