In [None]:
%%capture

# Install the notebook's dependencies; %%capture hides pip's output.
# NOTE(review): datasets and transformers are installed from the git main branch
# and no package is version-pinned, so a fresh run may pick up breaking API
# changes -- consider pinning versions.
!pip install git+https://github.com/huggingface/datasets.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install jiwer
!pip install torchaudio
!pip install librosa
!pip install wandb

In [None]:
# Use a UTF-8 locale for the session.
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8
# Keep the Hugging Face model and dataset caches under /content/cache.
%env TRANSFORMERS_CACHE=/content/cache
%env HF_DATASETS_CACHE=/content/cache
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid and presumably
# slows training -- confirm it is still needed.
%env CUDA_LAUNCH_BLOCKING=1

env: LC_ALL=C.UTF-8
env: LANG=C.UTF-8
env: TRANSFORMERS_CACHE=/content/cache
env: HF_DATASETS_CACHE=/content/cache
env: CUDA_LAUNCH_BLOCKING=1


In [None]:
%env WANDB_WATCH=all
%env WANDB_LOG_MODEL=1
%env WANDB_PROJECT=emotion
!wandb login 935fa41b85cc2d549168b9ad8dc3e7c5b41f5aa2 --relogin

env: WANDB_WATCH=all
env: WANDB_LOG_MODEL=1
env: WANDB_PROJECT=emotion
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


The dataset is hosted on [Mega](https://mega.nz/#F!0ShVXY7C!-73kVoK05OjTPEA95UUvMw), which cannot be downloaded from directly in this notebook, so we uploaded a copy to Google Drive to keep this tutorial accessible.

In [None]:
# Download the AESDD archive from the Google Drive mirror, unpack it into
# /content/data, and rename the extracted folder to /content/data/aesdd.

!mkdir -p /content/data
!gdown https://drive.google.com/uc?id=1_IAWexEWpH-ly_JaA5EGfZDp-_3flkN1
!unzip -q aesdd.zip -d /content/data/
!mv "/content/data/Acted Emotional Speech Dynamic Database/" /content/data/aesdd/

Downloading...
From: https://drive.google.com/uc?id=1_IAWexEWpH-ly_JaA5EGfZDp-_3flkN1
To: /content/aesdd.zip
100% 410M/410M [00:01<00:00, 266MB/s]


In [None]:
import numpy as np
import pandas as pd

from pathlib import Path
from tqdm import tqdm

import torchaudio
from sklearn.model_selection import train_test_split

import os
import sys

In [None]:
# Scan the dataset folder and build one record per readable .wav file.
# The emotion label is the name of the directory that contains the file.
data = []
broken_files = []  # (path, error message) for every file that could not be decoded

# Sorting makes the scan order deterministic and lets tqdm display a total.
wav_paths = sorted(Path("/content/data/aesdd").glob("**/*.wav"))

for path in tqdm(wav_paths):
    try:
        # Some files in the archive are broken; decoding is the validity check.
        torchaudio.load(path)
    except Exception as e:
        # Best effort: record the failure instead of silently dropping the file.
        broken_files.append((str(path), str(e)))
        continue

    data.append({
        "name": path.stem,
        "path": str(path),
        "emotion": path.parent.name,
    })

print(f"Loaded {len(data)} files, skipped {len(broken_files)} unreadable files")

605it [00:01, 488.66it/s]


In [None]:
# Turn the collected records (name, path, emotion) into a DataFrame and preview it.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,name,path,emotion
0,d15 (6),/content/data/aesdd/disgust/d15 (6).wav,disgust
1,d12 (3),/content/data/aesdd/disgust/d12 (3).wav,disgust
2,d15 (2),/content/data/aesdd/disgust/d15 (2).wav,disgust
3,d15 (5),/content/data/aesdd/disgust/d15 (5).wav,disgust
4,d07 (6),/content/data/aesdd/disgust/d07 (6).wav,disgust


In [None]:
# Drop rows whose audio file does not exist on disk, then shuffle the rows.

print(f"Step 0: {len(df)}")

# Filter on the existence check itself; calling dropna on the "path" column
# could never remove a row because every path value is non-null.
path_exists = df["path"].map(os.path.exists).astype(bool)
df = df[path_exists]
print(f"Step 1: {len(df)}")

# A fixed seed keeps the shuffle, and therefore the later train/test split,
# reproducible across runs.
df = df.sample(frac=1, random_state=101).reset_index(drop=True)
df.head()

Step 0: 604
Step 1: 604


  import sys


Unnamed: 0,name,path,emotion
0,f17 (5),/content/data/aesdd/fear/f17 (5).wav,fear
1,s14 (1),/content/data/aesdd/sadness/s14 (1).wav,sadness
2,a20 (6),/content/data/aesdd/anger/a20 (6).wav,anger
3,a05 (2),/content/data/aesdd/anger/a05 (2).wav,anger
4,s12 (4),/content/data/aesdd/sadness/s12 (4).wav,sadness


Let's see which labels (emotions) the dataset contains and how the samples are distributed across them.

In [None]:
# Show the distinct emotion labels, then the number of clips per label.
emotion_labels = df["emotion"].unique()
print("Labels: ", emotion_labels)
print()
df.groupby("emotion")[["path"]].count()

Labels:  ['fear' 'sadness' 'anger' 'disgust' 'happiness']



Unnamed: 0_level_0,path
emotion,Unnamed: 1_level_1
anger,121
disgust,122
fear,120
happiness,119
sadness,122


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


Let's play a random sample from the dataset. Re-run the cell a few times to get a feel for the audio and its emotion label.

In [None]:
import torchaudio
import librosa
import IPython.display as ipd
import numpy as np

# Pick one random row; re-run the cell to hear a different clip.
idx = np.random.randint(0, len(df))
sample = df.iloc[idx]
path = sample["path"]
label = sample["emotion"]


print(f"ID Location: {idx}")
print(f"      Label: {label}")
print()

speech, sr = torchaudio.load(path)
# Keep only the first channel so the player receives a 1-D signal.
speech = speech[0].numpy().squeeze()
# The sample rates are passed by keyword: librosa >= 0.10 rejects them as
# positional arguments, and older releases accept the same keyword names.
speech = librosa.resample(np.asarray(speech), orig_sr=sr, target_sr=16_000)
ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000)

ID Location: 338
      Label: disgust



For training, we split the data into train and test sets, stratified by emotion; in this example, `20%` of the samples are held out for the test set.

In [None]:
save_path = "/content/data"

# Stratified 80/20 split with a fixed seed; each part gets a fresh 0..n-1 index.
splits = train_test_split(df, test_size=0.2, random_state=101, stratify=df["emotion"])
train_df = splits[0].reset_index(drop=True)
test_df = splits[1].reset_index(drop=True)

# Both splits are written as tab-separated UTF-8 files without the index column.
csv_options = dict(sep="\t", encoding="utf-8", index=False)
train_df.to_csv(f"{save_path}/train.csv", **csv_options)
test_df.to_csv(f"{save_path}/test.csv", **csv_options)

print(train_df.shape)
print(test_df.shape)

(483, 3)
(121, 3)


In [None]:
# Read the two tab-separated split files back in through the `datasets` library.
from datasets import load_dataset, load_metric

# The held-out test file doubles as the validation split during training.
data_files = {"train": "/content/data/train.csv", "validation": "/content/data/test.csv"}

dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
train_dataset, eval_dataset = dataset["train"], dataset["validation"]

print(train_dataset)
print(eval_dataset)

Using custom data configuration default-181f3a7434038dba


Downloading and preparing dataset csv/default to /content/cache/csv/default-181f3a7434038dba/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /content/cache/csv/default-181f3a7434038dba/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Dataset({
    features: ['name', 'path', 'emotion'],
    num_rows: 483
})
Dataset({
    features: ['name', 'path', 'emotion'],
    num_rows: 121
})


In [None]:
# Dataset columns used as model input (audio file path) and as target (emotion label).
input_column, output_column = "path", "emotion"

In [None]:
# Collect the distinct emotion labels of the training split; sorting keeps the
# label -> id assignment deterministic across runs.
label_list = sorted(train_dataset.unique(output_column))
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")

A classification problem with 5 classes: ['anger', 'disgust', 'fear', 'happiness', 'sadness']


In [None]:
from transformers import AutoConfig, Wav2Vec2Processor

In [None]:
# How frame-level hidden states are pooled, and which pretrained checkpoint to fine-tune.
pooling_mode = "mean"
model_name_or_path = "lighteternal/wav2vec2-large-xlsr-53-greek"

In [None]:
# Model configuration: the pretrained config plus the label mappings for the
# classification head.
label2id = {name: class_id for class_id, name in enumerate(label_list)}
id2label = dict(enumerate(label_list))

config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    finetuning_task="wav2vec2_clf",
)
# Custom attribute read by Wav2Vec2ForSpeechClassification to choose the pooling.
config.pooling_mode = pooling_mode

Downloading:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


In [None]:
# Load the processor of the pretrained checkpoint and read the sampling rate its
# feature extractor expects; every clip is resampled to this rate before training.
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path,)
target_sampling_rate = processor.feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

Downloading:   0%|          | 0.00/158 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/138 [00:00<?, ?B/s]

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


Downloading:   0%|          | 0.00/535 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


The target sampling rate: 16000


In [None]:
def speech_file_to_array_fn(path):
    """Load an audio file as a 1-D waveform at the model's sampling rate.

    Args:
        path: Location of the audio file, passed to ``torchaudio.load``.

    Returns:
        A 1-D NumPy array with the waveform resampled to the module-level
        ``target_sampling_rate``.
    """
    speech_array, sampling_rate = torchaudio.load(path)
    resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
    # torchaudio returns (channels, time). Averaging over the channel axis turns
    # multi-channel files into a 1-D signal as well; a plain squeeze() would
    # leave them 2-D. For mono files the values are unchanged.
    speech = resampler(speech_array).mean(dim=0).numpy()
    return speech

def label_to_id(label, label_list):
    """Map a label to its position in ``label_list``.

    Args:
        label: The label value to convert.
        label_list: Ordered collection of known labels.

    Returns:
        The index of ``label`` in ``label_list``, ``-1`` when the label is not
        in the list, or ``label`` unchanged when ``label_list`` is empty.
    """
    if len(label_list) == 0:
        # No known labels: hand the value back untouched.
        return label

    try:
        return label_list.index(label)
    except ValueError:
        return -1

def preprocess_function(examples):
    """Convert a batch of dataset rows into model inputs with integer labels.

    Args:
        examples: Batch of columns; the ``input_column`` entry holds audio
            paths and the ``output_column`` entry holds label names.

    Returns:
        The processor's output for the loaded waveforms with an added
        ``"labels"`` list of class ids.
    """
    waveforms = []
    for audio_path in examples[input_column]:
        waveforms.append(speech_file_to_array_fn(audio_path))

    class_ids = [label_to_id(name, label_list) for name in examples[output_column]]

    encoded = processor(waveforms, sampling_rate=target_sampling_rate)
    encoded["labels"] = class_ids
    return encoded

In [None]:
# Preprocess both splits with identical settings: batches of 100 rows, 4 workers.
map_kwargs = dict(batch_size=100, batched=True, num_proc=4)

train_dataset = train_dataset.map(preprocess_function, **map_kwargs)
eval_dataset = eval_dataset.map(preprocess_function, **map_kwargs)



      

#0:   0%|          | 0/2 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/2 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/2 [00:00<?, ?ba/s]

#3:   0%|          | 0/2 [00:00<?, ?ba/s]

  tensor = as_tensor(value)
  tensor = as_tensor(value)
  tensor = as_tensor(value)
  tensor = as_tensor(value)


      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

  tensor = as_tensor(value)
  tensor = as_tensor(value)
  tensor = as_tensor(value)
  tensor = as_tensor(value)


In [None]:
# Look at one preprocessed training row; it is fetched once and reused.
idx = 0
first_example = train_dataset[idx]
print(f"Training input_values: {first_example['input_values']}")
print(f"Training attention_mask: {first_example['attention_mask']}")
print(f"Training labels: {first_example['labels']} - {first_example['emotion']}")

Training input_values: [-0.001266996841877699, -0.00154140405356884, 0.003161875531077385, 0.014162782579660416, 0.016951069235801697, 0.015057043172419071, 0.008766135200858116, 0.002521006390452385, 0.0020501259714365005, -0.005390508566051722, -0.009255951270461082, -0.00812871940433979, -0.0034364708699285984, -6.974794814595953e-05, -0.0030981118325144053, 0.006851061247289181, 0.008430799469351768, 0.007833120413124561, 0.002445405349135399, 0.006440452765673399, 0.014543121680617332, 0.01566118374466896, 0.017992254346609116, 0.014073661528527737, 0.01481484156101942, 0.007049076724797487, 0.007393388543277979, 0.0019302578875795007, -0.002551536774262786, 0.0004153390182182193, 0.004165200982242823, 0.017583509907126427, 0.02067394182085991, 0.027732670307159424, 0.04205505549907684, 0.05190752074122429, 0.053663287311792374, 0.036304403096437454, 0.03588796779513359, 0.032690390944480896, 0.027473948895931244, 0.015016062185168266, 0.005197296384721994, 0.019222356379032135, 0

In [None]:
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput


@dataclass
class SpeechClassifierOutput(ModelOutput):
    """Result container returned by ``Wav2Vec2ForSpeechClassification.forward``.

    Attributes:
        loss: Loss value; ``None`` when no labels were passed to ``forward``.
        logits: Output of the classification head.
        hidden_states: ``hidden_states`` forwarded from the wav2vec2 encoder output.
        attentions: ``attentions`` forwarded from the wav2vec2 encoder output.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


In [None]:
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)


class Wav2Vec2ClassificationHead(nn.Module):
    """Two-layer classification head applied to pooled wav2vec2 features.

    The head computes dropout -> dense -> tanh -> dropout -> output projection.
    Sizes and the dropout rate come from ``config.hidden_size``,
    ``config.final_dropout`` and ``config.num_labels``.
    """

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        """Return logits for ``features``; extra keyword arguments are ignored."""
        hidden = self.dense(self.dropout(features))
        hidden = self.dropout(torch.tanh(hidden))
        return self.out_proj(hidden)


class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    """Wav2Vec2 encoder followed by pooling and a classification head.

    The encoder output is reduced over ``dim=1`` with the strategy named by
    ``config.pooling_mode`` ("mean", "sum" or "max"), and the pooled vector is
    passed to ``Wav2Vec2ClassificationHead``.

    NOTE(review): pooling runs over every frame the encoder returns; the
    attention mask is not applied at the pooling step, so frames that stem from
    padding presumably contribute to the pooled vector -- confirm whether that
    is intended.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        """Freeze the parameters of the encoder's feature extractor."""
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        """Pool ``hidden_states`` over ``dim=1``.

        Args:
            hidden_states: Encoder output; assumed to be (batch, frames, hidden).
            mode: One of "mean", "sum" or "max".

        Returns:
            The pooled tensor with ``dim=1`` removed.

        Raises:
            ValueError: If ``mode`` is not a supported pooling mode.
        """
        if mode == "mean":
            outputs = torch.mean(hidden_states, dim=1)
        elif mode == "sum":
            outputs = torch.sum(hidden_states, dim=1)
        elif mode == "max":
            # torch.max over a dim returns (values, indices); keep the values.
            outputs = torch.max(hidden_states, dim=1)[0]
        else:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError(
                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

        return outputs

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        """Encode the audio, pool it, classify it and optionally compute a loss.

        Args:
            input_values: Model input passed to the wav2vec2 encoder.
            attention_mask: Optional mask passed to the encoder.
            output_attentions: Forwarded to the encoder.
            output_hidden_states: Forwarded to the encoder.
            return_dict: Return a ``SpeechClassifierOutput`` when true and a
                tuple otherwise; defaults to ``config.use_return_dict``.
            labels: Optional targets. When given, a loss is computed according
                to ``config.problem_type``, which is inferred from
                ``num_labels`` and the label dtype if it is unset.

        Returns:
            ``SpeechClassifierOutput`` or a tuple ``(loss?, logits, ...)``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                # Infer the task once and cache it on the config.
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    # Flatten both sides: comparing (batch, 1) logits with
                    # (batch,) labels would broadcast to (batch, batch) and
                    # yield a wrong loss.
                    loss = loss_fct(logits.view(-1), labels.view(-1))
                else:
                    loss = loss_fct(logits.view(-1, self.num_labels), labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            # Tuple form: logits first, then the encoder outputs from index 2 on.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SpeechClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


In [None]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import torch

import transformers
from transformers import Wav2Vec2Processor


@dataclass
class DataCollatorCTCWithPadding:
    """Collate preprocessed rows into one padded batch with a label tensor.

    ``processor.pad`` pads the ``input_values`` of all rows using ``padding``,
    ``max_length`` and ``pad_to_multiple_of``. The labels become a single
    tensor: ``torch.long`` when the first label is an ``int``, ``torch.float``
    otherwise. ``max_length_labels`` and ``pad_to_multiple_of_labels`` are
    accepted but not used by ``__call__``.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Return the padded batch for ``features`` with a ``"labels"`` tensor."""
        audio_inputs = [{"input_values": row["input_values"]} for row in features]
        targets = [row["labels"] for row in features]

        # Integer labels mean class ids; anything else is treated as float targets.
        if isinstance(targets[0], int):
            label_dtype = torch.long
        else:
            label_dtype = torch.float

        batch = self.processor.pad(
            audio_inputs,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        batch["labels"] = torch.tensor(targets, dtype=label_dtype)
        return batch

In [None]:
# Collator that pads each batch with the processor (padding=True).
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
# Read by compute_metrics: False reports accuracy, True reports mean squared error.
is_regression = False

In [None]:
import numpy as np
from transformers import EvalPrediction


def compute_metrics(p: EvalPrediction):
    """Compute the evaluation metric for a set of predictions.

    Args:
        p: Prediction container; ``p.predictions`` holds the model outputs (the
            first element is used when it is a tuple) and ``p.label_ids`` the
            reference labels.

    Returns:
        ``{"mse": ...}`` when the module-level ``is_regression`` flag is set,
        otherwise ``{"accuracy": ...}`` based on the arg-max over axis 1.
    """
    raw = p.predictions
    if isinstance(raw, tuple):
        raw = raw[0]

    if is_regression:
        preds = np.squeeze(raw)
        return {"mse": ((preds - p.label_ids) ** 2).mean().item()}

    preds = np.argmax(raw, axis=1)
    return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

In [None]:
# Load the pretrained encoder weights into the classification model. The
# classifier head has no counterpart in the checkpoint and starts untrained
# (see the "newly initialized" message in the output).
model = Wav2Vec2ForSpeechClassification.from_pretrained(
    model_name_or_path,
    config=config,
)

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Some weights of the model checkpoint at lighteternal/wav2vec2-large-xlsr-53-greek were not used when initializing Wav2Vec2ForSpeechClassification: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSpeechClassification were not initialized from the model checkpoint at lighteternal/wav2vec2-large-xlsr-53-greek and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a d

In [None]:
# Freeze the encoder's feature extractor so fine-tuning does not update it.
model.freeze_feature_extractor()

In [None]:
# from google.colab import drive

# drive.mount('/gdrive')

In [None]:
from transformers import TrainingArguments

# Training configuration. Batch size 4 with 2 accumulation steps gives an
# effective batch of 8 per device; evaluation, checkpointing and logging all
# run every 10 steps.
training_args = TrainingArguments(
    output_dir="/content/wav2vec2-xlsr-greek-speech-emotion-recognition",
    # output_dir="/content/gdrive/MyDrive/wav2vec2-xlsr-greek-speech-emotion-recognition"
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    evaluation_strategy="steps",
    # A single epoch keeps this demo short.
    num_train_epochs=1.0,
    # Mixed-precision training.
    fp16=True,
    save_steps=10,
    eval_steps=10,
    logging_steps=10,
    learning_rate=1e-4,
    # Limits how many checkpoints are kept in output_dir.
    save_total_limit=2,
)

In [None]:
from typing import Any, Dict, Union

import torch
from packaging import version
from torch import nn

from transformers import (
    Trainer,
    is_apex_available,
)

if is_apex_available():
    from apex import amp

if version.parse(torch.__version__) >= version.parse("1.6"):
    _is_native_amp_available = True
    from torch.cuda.amp import autocast


class CTCTrainer(Trainer):
    """``Trainer`` subclass that overrides the single-batch training step.

    NOTE(review): nothing in this class is CTC-specific despite its name. The
    step relies on ``self.use_amp``, ``self.scaler``, ``self.use_apex`` and
    ``self.deepspeed`` being provided by the base ``Trainer`` -- confirm they
    exist in the installed transformers version.
    """

    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
        """Run forward and backward for one batch.

        Args:
            model: The model to train; it is switched to training mode.
            inputs: The batch, prepared through ``self._prepare_inputs``.

        Returns:
            The detached loss, already divided by the number of gradient
            accumulation steps when that number is greater than one.
        """
        model.train()
        inputs = self._prepare_inputs(inputs)

        # Forward pass, under autocast when native AMP is in use.
        if self.use_amp:
            with autocast():
                loss = self.compute_loss(model, inputs)
        else:
            loss = self.compute_loss(model, inputs)

        # Scale the loss so that accumulated gradients average over the steps.
        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        # Backward pass through whichever backend is active.
        if self.use_amp:
            self.scaler.scale(loss).backward()
        elif self.use_apex:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        elif self.deepspeed:
            self.deepspeed.backward(loss)
        else:
            loss.backward()

        return loss.detach()


In [None]:
# Assemble the trainer from the model, collator, metrics and both splits.
trainer = CTCTrainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # The feature extractor is passed in the tokenizer slot; the training log
    # shows it being saved with each checkpoint.
    tokenizer=processor.feature_extractor,
)

Using amp half precision backend


In [None]:
# Run fine-tuning; evaluation and checkpointing follow training_args.
trainer.train()

The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path, emotion, name. If path, emotion, name are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 483
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 60
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mmahsanghani[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy
10,1.6679,1.599206,0.239669
20,1.6137,1.568456,0.206612
30,1.5848,1.524051,0.322314
40,1.5363,1.442625,0.371901
50,1.5065,1.408252,0.413223
60,1.4229,1.382966,0.438017


The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path, emotion, name. If path, emotion, name are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 121
  Batch size = 4
Saving model checkpoint to /content/wav2vec2-xlsr-greek-speech-emotion-recognition/checkpoint-10
Configuration saved in /content/wav2vec2-xlsr-greek-speech-emotion-recognition/checkpoint-10/config.json
Model weights saved in /content/wav2vec2-xlsr-greek-speech-emotion-recognition/checkpoint-10/pytorch_model.bin
Feature extractor saved in /content/wav2vec2-xlsr-greek-speech-emotion-recognition/checkpoint-10/preprocessor_config.json
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path, emotion, name. If path, emotion, name are not expe

TrainOutput(global_step=60, training_loss=1.5553558349609375, metrics={'train_runtime': 279.1314, 'train_samples_per_second': 1.73, 'train_steps_per_second': 0.215, 'total_flos': 8.562477466108512e+16, 'train_loss': 1.5553558349609375, 'epoch': 0.99})

In [None]:
import librosa
from sklearn.metrics import classification_report

In [None]:
# Reload the held-out split from its tab-separated file as a fresh, unprocessed dataset.
test_dataset = load_dataset("csv", data_files={"test": "/content/data/test.csv"}, delimiter="\t")["test"]
test_dataset

Using custom data configuration default-f103bb2e0a78a266


Downloading and preparing dataset csv/default to /content/cache/csv/default-f103bb2e0a78a266/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /content/cache/csv/default-f103bb2e0a78a266/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['name', 'path', 'emotion'],
    num_rows: 121
})

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

Device: cuda


In [None]:
# From here on the notebook evaluates the published fine-tuned checkpoint, not
# the model trained above: model_name_or_path, config, processor and model are
# all rebound to it.
model_name_or_path = "m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition"
config = AutoConfig.from_pretrained(model_name_or_path)
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)

https://huggingface.co/m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition/resolve/main/config.json not found in cache or force_download set to True, downloading to /content/cache/tmpwvhkphoa


Downloading:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

storing https://huggingface.co/m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition/resolve/main/config.json in cache at /content/cache/b309dcbb5c59e76568ab508f0d64d9c7716c48154f59e90b594b9b4b10abd534.192f95a0e7d34acbd48bad817fc7c5a80a6f840a97064a395f6bfa3fc0a32761
creating metadata file for /content/cache/b309dcbb5c59e76568ab508f0d64d9c7716c48154f59e90b594b9b4b10abd534.192f95a0e7d34acbd48bad817fc7c5a80a6f840a97064a395f6bfa3fc0a32761
loading configuration file https://huggingface.co/m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition/resolve/main/config.json from cache at /content/cache/b309dcbb5c59e76568ab508f0d64d9c7716c48154f59e90b594b9b4b10abd534.192f95a0e7d34acbd48bad817fc7c5a80a6f840a97064a395f6bfa3fc0a32761
  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "
Model config Wav2Vec2Config {
  "_name_or_path": "m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition",
  "activation_dropout": 0.0,
  "adapter_kernel_size":

Downloading:   0%|          | 0.00/214 [00:00<?, ?B/s]

storing https://huggingface.co/m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition/resolve/main/preprocessor_config.json in cache at /content/cache/0aaa152c34c1e810936136ee1d71b3e21140db822b3a9e6d0d249aa3929b6d8f.bbc1eb890a39c82e710a893223b8452ac5b78e8b57083b2f893aa7dc59d4ed69
creating metadata file for /content/cache/0aaa152c34c1e810936136ee1d71b3e21140db822b3a9e6d0d249aa3929b6d8f.bbc1eb890a39c82e710a893223b8452ac5b78e8b57083b2f893aa7dc59d4ed69
loading feature extractor configuration file https://huggingface.co/m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition/resolve/main/preprocessor_config.json from cache at /content/cache/0aaa152c34c1e810936136ee1d71b3e21140db822b3a9e6d0d249aa3929b6d8f.bbc1eb890a39c82e710a893223b8452ac5b78e8b57083b2f893aa7dc59d4ed69
Feature extractor Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask":

Downloading:   0%|          | 0.00/398 [00:00<?, ?B/s]

storing https://huggingface.co/m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition/resolve/main/tokenizer_config.json in cache at /content/cache/dd6c65f34599dd04f3b305d86bac96ebf2892787f905cb19b992396ee74e5399.a380b504a42072d8506967ae80eb3abf2c105071298ac6b2e9825dc10c5f74d2
creating metadata file for /content/cache/dd6c65f34599dd04f3b305d86bac96ebf2892787f905cb19b992396ee74e5399.a380b504a42072d8506967ae80eb3abf2c105071298ac6b2e9825dc10c5f74d2
loading configuration file https://huggingface.co/m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition/resolve/main/config.json from cache at /content/cache/b309dcbb5c59e76568ab508f0d64d9c7716c48154f59e90b594b9b4b10abd534.192f95a0e7d34acbd48bad817fc7c5a80a6f840a97064a395f6bfa3fc0a32761
Model config Wav2Vec2Config {
  "_name_or_path": "m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectur

Downloading:   0%|          | 0.00/535 [00:00<?, ?B/s]

storing https://huggingface.co/m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition/resolve/main/vocab.json in cache at /content/cache/ce8f2b66c3b01cafd1d6593cfa9c934affa9b9ad361af2276fabf17be0f31a5c.a11c2e5b4108d6a907796d8564e25e383523d05f52496860a28f529bb9091900
creating metadata file for /content/cache/ce8f2b66c3b01cafd1d6593cfa9c934affa9b9ad361af2276fabf17be0f31a5c.a11c2e5b4108d6a907796d8564e25e383523d05f52496860a28f529bb9091900
https://huggingface.co/m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition/resolve/main/added_tokens.json not found in cache or force_download set to True, downloading to /content/cache/tmpwhphnd94


Downloading:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

storing https://huggingface.co/m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition/resolve/main/added_tokens.json in cache at /content/cache/6de03252f170cebe8610d9138a20cf53a023126e717da88a381fb3f4d07c5489.c7fdfe4dc505ce376bd5b04399f99c13aacad3c956ca61f8eab629afe0761ce8
creating metadata file for /content/cache/6de03252f170cebe8610d9138a20cf53a023126e717da88a381fb3f4d07c5489.c7fdfe4dc505ce376bd5b04399f99c13aacad3c956ca61f8eab629afe0761ce8
https://huggingface.co/m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition/resolve/main/special_tokens_map.json not found in cache or force_download set to True, downloading to /content/cache/tmp9sbq9d0h


Downloading:   0%|          | 0.00/309 [00:00<?, ?B/s]

storing https://huggingface.co/m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition/resolve/main/special_tokens_map.json in cache at /content/cache/2cdce78d6395e4c39f99a12f4cd476ab9e3c63b158cd09b36f8620da8186c359.52ff36a8492158dc5f514fcf8fa507ed5251bf82f2ab8108bbbcdc91ce4f4ac9
creating metadata file for /content/cache/2cdce78d6395e4c39f99a12f4cd476ab9e3c63b158cd09b36f8620da8186c359.52ff36a8492158dc5f514fcf8fa507ed5251bf82f2ab8108bbbcdc91ce4f4ac9
loading file https://huggingface.co/m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition/resolve/main/vocab.json from cache at /content/cache/ce8f2b66c3b01cafd1d6593cfa9c934affa9b9ad361af2276fabf17be0f31a5c.a11c2e5b4108d6a907796d8564e25e383523d05f52496860a28f529bb9091900
loading file https://huggingface.co/m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition/resolve/main/tokenizer_config.json from cache at /content/cache/dd6c65f34599dd04f3b305d86bac96ebf2892787f905cb19b992396ee74e5399.a380b504a42072d8506967ae80eb3abf2c105071298ac6b2

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

storing https://huggingface.co/m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition/resolve/main/pytorch_model.bin in cache at /content/cache/c5ff3a6c4d9dab5ed659bf3855d37d27b912dae1ccaf894ef6d56ad89fe80483.fd32085530924f4f216b93c89e9ddcf8eacf4abea77572bd9ef53453202cf95c
creating metadata file for /content/cache/c5ff3a6c4d9dab5ed659bf3855d37d27b912dae1ccaf894ef6d56ad89fe80483.fd32085530924f4f216b93c89e9ddcf8eacf4abea77572bd9ef53453202cf95c
loading weights file https://huggingface.co/m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition/resolve/main/pytorch_model.bin from cache at /content/cache/c5ff3a6c4d9dab5ed659bf3855d37d27b912dae1ccaf894ef6d56ad89fe80483.fd32085530924f4f216b93c89e9ddcf8eacf4abea77572bd9ef53453202cf95c
All model checkpoint weights were used when initializing Wav2Vec2ForSpeechClassification.

All the weights of Wav2Vec2ForSpeechClassification were initialized from the model checkpoint at m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition.
If your task is

In [None]:
def speech_file_to_array_fn(batch):
    """Decode the audio of one dataset row and store it as ``batch["speech"]``.

    Args:
        batch: A single dataset row whose ``"path"`` entry points to an audio file.

    Returns:
        The same row with a ``"speech"`` entry: a 1-D waveform resampled to the
        sampling rate of ``processor.feature_extractor``.
    """
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    # torchaudio returns (channels, time). Averaging over the channel axis also
    # reduces multi-channel files to 1-D; a plain squeeze() would leave them
    # 2-D. For mono files the values are unchanged.
    speech_array = speech_array.mean(dim=0).numpy()
    # The sample rates are passed by keyword: librosa >= 0.10 rejects them as
    # positional arguments, and older releases accept the same keyword names.
    speech_array = librosa.resample(
        np.asarray(speech_array),
        orig_sr=sampling_rate,
        target_sr=processor.feature_extractor.sampling_rate,
    )

    batch["speech"] = speech_array
    return batch


def predict(batch):
    """Classify a batch of waveforms and store the class ids in ``batch["predicted"]``.

    Args:
        batch: Batch of rows whose ``"speech"`` entry holds the waveforms.

    Returns:
        The same batch with a ``"predicted"`` NumPy array of arg-max class ids.
    """
    target_rate = processor.feature_extractor.sampling_rate
    features = processor(batch["speech"], sampling_rate=target_rate, return_tensors="pt", padding=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(
            features.input_values.to(device),
            attention_mask=features.attention_mask.to(device),
        ).logits

    batch["predicted"] = logits.argmax(dim=-1).detach().cpu().numpy()
    return batch

In [None]:
# Decode and resample every test clip, one row at a time.
test_dataset = test_dataset.map(speech_file_to_array_fn)

  0%|          | 0/121 [00:00<?, ?ex/s]

In [None]:
# Run the model over the test clips in batches of 8.
result = test_dataset.map(predict, batched=True, batch_size=8)

  0%|          | 0/16 [00:00<?, ?ba/s]

In [None]:
# Class names ordered by class id, taken from the loaded model config.
label_names = [config.id2label[i] for i in range(config.num_labels)]
label_names

['anger', 'disgust', 'fear', 'happiness', 'sadness']

In [None]:
# Reference class ids (label names mapped through config.label2id) and predicted ids.
y_true = [config.label2id[name] for name in result["emotion"]]
y_pred = result["predicted"]

# Spot-check the first five pairs.
print(y_true[:5])
print(y_pred[:5])

[1, 4, 0, 4, 3]
[1, 4, 0, 4, 3]


In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_true, y_pred, target_names=label_names))

              precision    recall  f1-score   support

       anger       1.00      1.00      1.00        24
     disgust       0.96      1.00      0.98        24
        fear       1.00      1.00      1.00        24
   happiness       1.00      0.96      0.98        24
     sadness       1.00      1.00      1.00        25

    accuracy                           0.99       121
   macro avg       0.99      0.99      0.99       121
weighted avg       0.99      0.99      0.99       121



# Prediction

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from transformers import AutoConfig, Wav2Vec2Processor

import librosa
import IPython.display as ipd
import numpy as np
import pandas as pd

In [None]:
# Checkpoint shared by the config, processor and model loaded below.
model_name_or_path = "m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition"

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

config = AutoConfig.from_pretrained(model_name_or_path)
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
# Sample rate the feature extractor expects; reused by the prediction helpers.
sampling_rate = processor.feature_extractor.sampling_rate

model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path)
model = model.to(device)

loading configuration file https://huggingface.co/m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition/resolve/main/config.json from cache at /content/cache/b309dcbb5c59e76568ab508f0d64d9c7716c48154f59e90b594b9b4b10abd534.192f95a0e7d34acbd48bad817fc7c5a80a6f840a97064a395f6bfa3fc0a32761
  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "
Model config Wav2Vec2Config {
  "_name_or_path": "m3hrdadfi/wav2vec2-xlsr-greek-speech-emotion-recognition",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForSpeechClassification"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,

In [None]:
def speech_file_to_array_fn(path, sampling_rate):
    """Load an audio file and resample it to ``sampling_rate``.

    Note: this redefines the batch-oriented helper of the same name used in
    the evaluation cells above; after this cell runs, that version is
    shadowed.

    Args:
        path: Location of an audio file readable by ``torchaudio.load``.
        sampling_rate: Target sample rate, in Hz, of the returned waveform.

    Returns:
        The resampled waveform as a NumPy array with singleton dimensions
        squeezed out.
    """
    speech_array, _sampling_rate = torchaudio.load(path)
    # Resample's target rate defaults to 16 kHz; pass it explicitly so the
    # caller's `sampling_rate` is honoured instead of silently ignored.
    resampler = torchaudio.transforms.Resample(
        orig_freq=_sampling_rate,
        new_freq=sampling_rate,
    )
    speech = resampler(speech_array).squeeze().numpy()
    return speech


def predict(path, sampling_rate):
    """Score a single audio file against every emotion class.

    Args:
        path: Audio file to classify.
        sampling_rate: Sample rate passed to both the loader helper and the
            processor.

    Returns:
        A list with one dict per class id, in id order, each holding the
        class name under "Emotion" and its softmax probability formatted as
        a one-decimal percentage string under "Score".
    """
    waveform = speech_file_to_array_fn(path, sampling_rate)
    encoded = processor(waveform, sampling_rate=sampling_rate, return_tensors="pt", padding=True)

    # Inference only: no gradient tracking needed.
    with torch.no_grad():
        logits = model(
            encoded.input_values.to(device),
            attention_mask=encoded.attention_mask.to(device),
        ).logits

    # Single clip in the batch, so keep only the first row of probabilities.
    probabilities = F.softmax(logits, dim=1).detach().cpu().numpy()[0]

    outputs = []
    for class_id, score in enumerate(probabilities):
        outputs.append({
            "Emotion": config.id2label[class_id],
            "Score": f"{round(score * 100, 3):.1f}%",
        })
    return outputs


# Inline CSS prepended to every HTML table that prediction() displays; the
# `table.xxx` rules match the 'xxx' class that prediction() passes to to_html().
STYLES = """
<style>
div.display_data {
    margin: 0 auto;
    max-width: 500px;
}
table.xxx {
    margin: 50px !important;
    float: right !important;
    clear: both !important;
}
table.xxx td {
    min-width: 300px !important;
    text-align: center !important;
}
</style>
""".strip()

def prediction(df_row):
    """Display a clip's gold label, an audio player, and the model's scores.

    Args:
        df_row: A row (mapping-like) with a "path" entry naming the audio
            file and an "emotion" entry holding its gold label.

    Returns:
        None. Everything is rendered through ``ipd.display``: first a table
        with the gold emotion, then an autoplaying audio widget, then a
        table of per-emotion scores from ``predict``.
    """
    path, emotion = df_row["path"], df_row["emotion"]
    df = pd.DataFrame([{"Emotion": emotion, "Sentence": "    "}])
    # Keyword arguments shared by both to_html() calls below.
    setup = {
        'border': 2,
        'show_dimensions': True,
        'justify': 'center',
        'classes': 'xxx',
        'escape': False,
    }
    ipd.display(ipd.HTML(STYLES + df.to_html(**setup) + "<br />"))

    speech, sr = torchaudio.load(path)
    # Keep only the first channel for playback.
    speech = speech[0].numpy().squeeze()
    # librosa >= 0.10 only accepts the sample rates as keyword arguments;
    # naming them also works on older releases.
    speech = librosa.resample(np.asarray(speech), orig_sr=sr, target_sr=sampling_rate)
    ipd.display(ipd.Audio(data=np.asarray(speech), autoplay=True, rate=sampling_rate))

    outputs = predict(path, sampling_rate)
    r = pd.DataFrame(outputs)
    ipd.display(ipd.HTML(STYLES + r.to_html(**setup) + "<br />"))

In [None]:
# Load the tab-separated test split; the demo cells below read each row's "path" and "emotion".
test = pd.read_csv("/content/data/test.csv", sep="\t")
test.head()

Unnamed: 0,name,path,emotion
0,d20 (4),/content/data/aesdd/disgust/d20 (4).wav,disgust
1,s13 (1),/content/data/aesdd/sadness/s13 (1).wav,sadness
2,a10 (2),/content/data/aesdd/anger/a10 (2).wav,anger
3,s09 (3),/content/data/aesdd/sadness/s09 (3).wav,sadness
4,h06 (6),/content/data/aesdd/happiness/h06 (6).wav,happiness


In [None]:
# Demo on the first test row: show its gold label, play the clip, and list the model's scores.
prediction(test.iloc[0])

Unnamed: 0,Emotion,Sentence
0,disgust,


Unnamed: 0,Emotion,Score
0,anger,0.0%
1,disgust,99.6%
2,fear,0.1%
3,happiness,0.1%
4,sadness,0.2%


In [None]:
# Demo on the second test row: show its gold label, play the clip, and list the model's scores.
prediction(test.iloc[1])

Unnamed: 0,Emotion,Sentence
0,sadness,


Unnamed: 0,Emotion,Score
0,anger,0.5%
1,disgust,0.2%
2,fear,0.1%
3,happiness,0.0%
4,sadness,99.2%


In [None]:
# Demo on the third test row: show its gold label, play the clip, and list the model's scores.
prediction(test.iloc[2])

Unnamed: 0,Emotion,Sentence
0,anger,


Unnamed: 0,Emotion,Score
0,anger,99.3%
1,disgust,0.0%
2,fear,0.1%
3,happiness,0.2%
4,sadness,0.3%
