In [1]:
from google.colab import files
files.upload()
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d uldisvalainis/audio-emotions
!unzip audio-emotions.zip -d audio_emotions

Saving kaggle.json to kaggle (1).json
Dataset URL: https://www.kaggle.com/datasets/uldisvalainis/audio-emotions
License(s): unknown
audio-emotions.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  audio-emotions.zip
replace audio_emotions/Emotions/Angry/03-01-05-01-01-01-01.wav? [y]es, [n]o, [A]ll, [N]one, [r]ename: N
N
A


In [2]:
# Build an index of every audio file under `root`: one row per file, labelled
# with the name of the sub-directory it lives in. The train/val/test split
# itself happens in a later cell.

import os
import pandas as pd

root = "/content/audio_emotions/Emotions"  # change to your path

AUDIO_EXTENSIONS = (".wav", ".flac", ".mp3")

rows = []
for emotion in sorted(os.listdir(root)):
    emotion_dir = os.path.join(root, emotion)
    if not os.path.isdir(emotion_dir):
        continue
    # os.listdir returns entries in arbitrary order. Sorting keeps the row
    # order identical across machines, which a seeded split of this frame
    # needs in order to be reproducible.
    for fname in sorted(os.listdir(emotion_dir)):
        if not fname.lower().endswith(AUDIO_EXTENSIONS):
            continue
        rows.append({
            "filepath": os.path.join(emotion_dir, fname),
            "label": emotion
        })

# Explicit columns keep the schema stable even when no audio file was found.
df = pd.DataFrame(rows, columns=["filepath", "label"])
df.to_csv("all_audio_with_labels.csv", index=False)


In [3]:
from sklearn.model_selection import train_test_split

# Stratified 80/10/10 split: hold out 20% of the rows first, then cut that
# hold-out in half to obtain validation and test sets.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["label"],
    random_state=42,
)

# Write each split to its own CSV file (without the index column).
for split_frame, csv_name in (
    (train_df, "train.csv"),
    (val_df, "val.csv"),
    (test_df, "test.csv"),
):
    split_frame.to_csv(csv_name, index=False)


In [4]:

!pip install transformers datasets soundfile librosa



In [5]:
import torch
import librosa
import numpy as np
import pandas as pd

from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 1. Config
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
MODEL_NAME = "superb/hubert-large-superb-er"   # pretrained SER model
TARGET_SR = 16000                              # rate passed to librosa and the extractor

# 2. Load model + feature extractor (model in eval mode for inference)
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME)
model = model.to(DEVICE)
model.eval()

# 3. Load test split
test_df = pd.read_csv("test.csv")   # must contain columns: filepath, label

# Integer ids are assigned to the sorted distinct labels of the test split.
_sorted_labels = sorted(test_df["label"].unique())
label2idx = dict(zip(_sorted_labels, range(len(_sorted_labels))))
idx2label = dict(enumerate(_sorted_labels))
y_true = np.array(test_df["label"].map(label2idx).tolist())

# 4. Inference helper
def predict_file(path):
    """Classify one audio file and return the winning class index.

    The file at `path` is loaded by librosa at TARGET_SR, passed through the
    feature extractor and the model with gradient tracking disabled, and the
    argmax over the last logits dimension is returned as a plain Python int.
    """
    waveform, _ = librosa.load(path, sr=TARGET_SR)

    # Truncation is left at the extractor's default; do not pass
    # truncation=True without also giving max_length.
    encoded = feature_extractor(
        waveform,
        sampling_rate=TARGET_SR,
        return_tensors="pt",
        padding=True,
    )
    batch = encoded["input_values"].to(DEVICE)

    with torch.no_grad():
        scores = model(batch).logits

    return int(scores.argmax(dim=-1).cpu().item())

# 5. Run zero-shot evaluation
#
# predict_file() returns an index into the checkpoint's own label set
# (model.config.id2label), whereas y_true uses label2idx, which was built from
# the dataset's label names. The two index spaces are unrelated, so every
# prediction is translated into a dataset index before metrics are computed.
def _to_dataset_index(model_label):
    """Return the label2idx id whose name starts with `model_label`, or -1.

    The comparison is case-insensitive. -1 means the dataset has no class that
    corresponds to this model class.
    """
    prefix = str(model_label).lower()
    for name, idx in label2idx.items():
        if str(name).lower().startswith(prefix):
            return idx
    return -1


# NOTE(review): prefix matching assumes the checkpoint's label names are
# abbreviations of the dataset's folder names (e.g. "ang" for "Angry").
# Confirm against model.config.id2label and replace this with an explicit
# dict if that does not hold.
model_to_dataset = {
    int(i): _to_dataset_index(name) for i, name in model.config.id2label.items()
}
if all(idx == -1 for idx in model_to_dataset.values()):
    raise ValueError(
        "None of the model labels "
        f"{list(model.config.id2label.values())} match the dataset labels "
        f"{list(label2idx)}; provide an explicit mapping."
    )

y_pred = np.array(
    [model_to_dataset[predict_file(path)] for path in test_df["filepath"]]
)

# Dataset classes with no counterpart in the model's label set can never be
# predicted, so they count as errors in the scores below.
acc = accuracy_score(y_true, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, average="weighted", zero_division=0
)

print("Zero-shot evaluation (pretrained model, no fine-tuning)")
print(f"Accuracy : {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall   : {rec:.3f}")
print(f"F1-score : {f1:.3f}")


KeyboardInterrupt: 

In [5]:
# 2. Train / fine-tune the AM model

In [6]:
!pip install transformers soundfile librosa scikit-learn --quiet


In [2]:
import torch
import librosa
import numpy as np
import pandas as pd

from torch.utils.data import Dataset, DataLoader
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Run configuration shared by the fine-tuning cells.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # fall back to CPU when no GPU is visible
MODEL_NAME = "superb/hubert-large-superb-er"  # checkpoint that gets fine-tuned
TARGET_SR = 16000   # sampling rate audio is loaded at
BATCH_SIZE = 8
EPOCHS = 10
LR = 1e-5           # learning rate handed to the optimizer
SEED = 42

# Fine-tuning is stochastic (freshly initialised classifier head, shuffled
# training batches). Seeding the RNGs up front makes repeated runs start from
# the same state; GPU kernels may still add some nondeterminism.
torch.manual_seed(SEED)
np.random.seed(SEED)


In [3]:
# Reload the three splits from their CSV files.
train_df, val_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "val.csv", "test.csv")
)

# One shared, sorted label vocabulary across all splits, so a class gets the
# same integer id regardless of which split it appears in.
all_labels = sorted(
    pd.concat([train_df["label"], val_df["label"], test_df["label"]]).unique()
)
label2idx = dict(zip(all_labels, range(len(all_labels))))
idx2label = dict(enumerate(all_labels))

# Attach the integer target column "y" to every split.
for split_df in (train_df, val_df, test_df):
    split_df["y"] = split_df["label"].map(label2idx)


In [4]:
# Feature extractor and classifier both come from the same checkpoint.
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)

# The head is resized to one output per dataset label; ignore_mismatched_sizes
# lets loading proceed even though the checkpoint's classifier has a different
# shape, in which case that layer is newly initialised.
model = AutoModelForAudioClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(all_labels),
    ignore_mismatched_sizes=True,
)
model = model.to(DEVICE)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at superb/hubert-large-superb-er and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 256]) in the checkpoint and torch.Size([7, 256]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
class SERDataset(Dataset):
    """Dataset yielding (waveform, label) pairs decoded from disk on access.

    `df` must provide a "filepath" column and a "y" column convertible to int.
    Audio is loaded with librosa at sampling rate `sr` each time an item is
    requested.
    """

    def __init__(self, df, sr=TARGET_SR):
        self.sr = sr
        # Fresh 0..n-1 index so positional lookups line up with __len__.
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        target = int(record["y"])
        waveform, _ = librosa.load(record["filepath"], sr=self.sr)
        return waveform, target

def collate_fn(batch):
    """Turn a list of (waveform, label) pairs into (input_values, labels).

    Waveforms are padded to a common length by the feature extractor and
    returned as a tensor; labels become a 1-D long tensor.

    NOTE(review): only "input_values" is kept. If the extractor also returns
    an attention mask it is discarded here, so the model cannot tell padding
    from audio. Confirm this is intended.
    """
    waveforms, targets = [], []
    for waveform, target in batch:
        waveforms.append(waveform)
        targets.append(target)

    labels = torch.tensor(targets, dtype=torch.long)
    encoded = feature_extractor(
        waveforms,
        sampling_rate=TARGET_SR,
        return_tensors="pt",
        padding=True,
    )
    return encoded["input_values"], labels

# One dataset per split.
train_ds, val_ds, test_ds = (
    SERDataset(split) for split in (train_df, val_df, test_df)
)

# All loaders share batch size and collation; only the training loader shuffles.
_loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=collate_fn)
train_loader = DataLoader(train_ds, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_ds, shuffle=False, **_loader_kwargs)
test_loader = DataLoader(test_ds, shuffle=False, **_loader_kwargs)


In [None]:
import torch

# This cell insists on a GPU: abort early when CUDA is not available.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")  # or torch.device("cuda:0")
else:
    raise RuntimeError("CUDA is not available, cannot force GPU.")

model = model.to(DEVICE)

# Cross-entropy on the logits; AdamW over all model parameters.
criterion = torch.nn.CrossEntropyLoss().to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)


def run_epoch(loader, train=True):
    """Run one pass over `loader` and return (mean loss, accuracy).

    With a truthy `train` the model is switched to training mode and the
    optimizer steps after every batch. Otherwise the model is in eval mode and
    gradient tracking is disabled. Each batch loss is weighted by the batch
    size, so the returned loss is a per-sample mean.
    """
    (model.train if train else model.eval)()

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for batch_inputs, batch_labels in loader:
        # Move the batch to the training device.
        batch_inputs = batch_inputs.to(DEVICE, non_blocking=True)
        batch_labels = batch_labels.to(DEVICE, non_blocking=True)
        batch_size = batch_labels.size(0)

        if train:
            optimizer.zero_grad()

        with torch.set_grad_enabled(train):
            logits = model(batch_inputs).logits
            loss = criterion(logits, batch_labels)
            if train:
                loss.backward()
                optimizer.step()

        loss_sum += loss.item() * batch_size
        n_correct += (logits.argmax(dim=-1) == batch_labels).sum().item()
        n_seen += batch_size

    return loss_sum / n_seen, n_correct / n_seen


# Fine-tune for EPOCHS epochs and keep the weights of the epoch with the
# highest validation accuracy.
#
# The running best starts below any reachable accuracy so the first epoch
# always writes a checkpoint. Starting at 0.0 with a strict ">" would write
# nothing if validation accuracy never rose above zero, leaving a missing or
# stale "best_finetuned_ser.pth" on disk.
best_val_acc = float("-inf")
for epoch in range(1, EPOCHS + 1):
    train_loss, train_acc = run_epoch(train_loader, train=True)
    val_loss, val_acc = run_epoch(val_loader, train=False)
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_finetuned_ser.pth")
    print(
        f"Epoch {epoch:02d} | "
        f"train_loss={train_loss:.3f} acc={train_acc:.3f} | "
        f"val_loss={val_loss:.3f} acc={val_acc:.3f}"
    )


Epoch 01 | train_loss=1.469 acc=0.434 | val_loss=0.938 acc=0.640
Epoch 02 | train_loss=0.863 acc=0.685 | val_loss=0.733 acc=0.728
Epoch 03 | train_loss=0.712 acc=0.737 | val_loss=0.681 acc=0.745
Epoch 04 | train_loss=0.617 acc=0.774 | val_loss=0.606 acc=0.793
Epoch 05 | train_loss=0.543 acc=0.802 | val_loss=0.596 acc=0.791


In [None]:
# Restore the saved checkpoint and score it on the test loader.
state_dict = torch.load("best_finetuned_ser.pth", map_location=DEVICE)
model.load_state_dict(state_dict)
model.eval()

all_true, all_pred = [], []

with torch.no_grad():
    for batch_inputs, batch_labels in test_loader:
        batch_inputs = batch_inputs.to(DEVICE)
        batch_labels = batch_labels.to(DEVICE)
        batch_preds = model(batch_inputs).logits.argmax(dim=-1)
        # Collect per-sample values on the CPU for scikit-learn.
        all_true.extend(batch_labels.cpu().numpy())
        all_pred.extend(batch_preds.cpu().numpy())

all_true, all_pred = np.array(all_true), np.array(all_pred)

# Accuracy plus support-weighted precision / recall / F1 over all classes.
acc = accuracy_score(all_true, all_pred)
prec, rec, f1, _ = precision_recall_fscore_support(
    all_true,
    all_pred,
    average="weighted",
    zero_division=0,
)

print("Fine-tuned model evaluation on test set")
print(f"Accuracy : {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall   : {rec:.3f}")
print(f"F1-score : {f1:.3f}")
