https://github.com/CheyneyComputerScience/CREMA-D/tree/master/docs#crema-d-crowd-sourced-emotional-multimodal-actors-dataset

## Filename labeling conventions
The actor ID is a 4-digit number at the start of the filename. Each subsequent identifier is separated by an underscore (_).

Actors spoke from a selection of 12 sentences (in parentheses is the three letter acronym used in the second part of the filename):

* It's eleven o'clock (IEO).
* That is exactly what happened (TIE).
* I'm on my way to the meeting (IOM).
* I wonder what this is about (IWW).
* The airplane is almost full (TAI).
* Maybe tomorrow it will be cold (MTI).
* I would like a new alarm clock (IWL).
* I think I have a doctor's appointment (ITH).
* Don't forget a jacket (DFA).
* I think I've seen this before (ITS).
* The surface is slick (TSI).
* We'll stop in a couple of minutes (WSI).

The sentences were presented using different emotions (in parentheses is the three-letter code used in the third part of the filename):

* Anger (ANG)
* Disgust (DIS)
* Fear (FEA)
* Happy/Joy (HAP)
* Neutral (NEU)
* Sad (SAD)

and emotion level (in parentheses is the two letter code used in the fourth part of the filename):

* Low (LO)
* Medium (MD)
* High (HI)
* Unspecified (XX)

The filename suffix indicates the file type: `flv` (Flash video) is used for presentation of both the video-only and the audio-visual clips; `mp3` is used for the audio-only presentation of the clips; `wav` is used for computational audio processing.

In [None]:
import wandb

In [None]:
!pip install transformers datasets evaluate accelerate librosa
!pip install --upgrade gdown

In [None]:
!pip install datasets==2.14.6
!pip install pandas==1.5.3

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
from glob import glob

# from tqdm import tqdm
from tqdm.notebook import tqdm
import librosa
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    recall_score,
    precision_score,
    accuracy_score,
    ConfusionMatrixDisplay,
    f1_score
)
from scipy.stats import spearmanr
import torch
from datasets import load_dataset, load_metric
from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    TrainingArguments,
    Trainer
)
import matplotlib.pyplot as plt

# Random seed; passed as `random_state` to the train/dev/test splits below
# so the partition is reproducible.
SEED=3

import warnings
# Silences every warning raised through the `warnings` module for the rest
# of the session (this also hides deprecation notices from the libraries).
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input/crema-d/CREMA-D-master/AudioMP3'):
#     for filename in filenames:
#         print(filename)

# Root output directory: the split CSVs, the preprocessed dataset and the
# Trainer output directory are all written underneath it.
save_path = "/kaggle/working"
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Prepare Data

In [None]:
# Build one metadata record per WAV file.
# Filenames follow "<actor_id>_<sentence>_<emotion>_<level>.wav" (see the
# labeling conventions at the top of the notebook); a name that does not
# split into exactly four parts raises ValueError.
data = []

for path in tqdm(glob("/kaggle/input/d/return0root/crema-d/CREMA-D/AudioWAV/*.wav")):
    # Strip the directory and the extension with os.path instead of manual
    # string splitting.
    name = os.path.splitext(os.path.basename(path))[0]
    actor_id, sentence, emotion, level = name.split('_')

    # Decode the clip once purely as a readability check: the samples are
    # discarded, and any decoding error propagates and stops the loop
    # (the previous try/except only re-raised, so behaviour is unchanged).
    librosa.load(path, sr=16000)

    data.append({
        "file": path,
        "actor_id": actor_id,
        "sentence": sentence,
        "label": emotion,
        "level": level
    })
df = pd.DataFrame(data)

In [None]:
df = pd.DataFrame(data)

In [None]:
df.head(2)

In [None]:
# Load the crowd-sourced rating tables that ship with CREMA-D.
# SentenceFilenames.csv - list of movie files used in study
# finishedEmoResponses.csv - the first emotional response with timing.
# finishedResponses.csv - the final emotional Responses with emotion levels with repeated and practice responses removed, used to tabulate the votes
# NOTE(review): in the visible part of this notebook these frames are only
# inspected (value counts of `numTries`); they do not feed the training data.

df_sentence = pd.read_csv('/kaggle/input/d/return0root/crema-d/CREMA-D/SentenceFilenames.csv')
df_first_resp = pd.read_csv('/kaggle/input/d/return0root/crema-d/CREMA-D/finishedEmoResponses.csv')
df_final_resp = pd.read_csv('/kaggle/input/d/return0root/crema-d/CREMA-D/finishedResponses.csv', low_memory=False)

In [None]:
df_first_resp['numTries'].value_counts()

In [None]:
df_final_resp['numTries'].value_counts()

In [None]:
# Stratified 70 / 15 / 15 split on the emotion label: first hold out 30% of
# the rows, then cut that hold-out in half to get dev and test.
# NOTE(review): rows are split per utterance and stratified on `label` only,
# so the same actor_id can end up in more than one split.
train_df, dev_df = train_test_split(
    df, test_size=0.3, stratify=df["label"], random_state=SEED
)
dev_df, test_df = train_test_split(
    dev_df, test_size=0.5, stratify=dev_df["label"], random_state=SEED
)

# Renumber every split from zero so the old row indices are not carried over.
train_df, dev_df, test_df = (
    part.reset_index(drop=True) for part in (train_df, dev_df, test_df)
)

# remove unused features in training models
# train_df.drop(['actor_id','sentence', 'level'], axis=1, inplace=True)
# dev_df.drop(['actor_id','sentence', 'level'], axis=1, inplace=True)
# test_df.drop(['actor_id','sentence', 'level'], axis=1, inplace=True)

# Persist each split as CSV; the next cell reloads them with `load_dataset`.
for split_name, split_df in (("train", train_df), ("dev", dev_df), ("test", test_df)):
    split_df.to_csv(f"{save_path}/{split_name}.csv", encoding="utf-8", index=False)

print(train_df.shape, dev_df.shape, test_df.shape, sep="\n")

In [None]:
# Map each split name to the CSV written for it (the dev CSV is exposed under
# the "validation" key).
data_files = {
    split: f"{save_path}/{stem}.csv"
    for split, stem in (("train", "train"), ("validation", "dev"), ("test", "test"))
}

dataset = load_dataset("csv", data_files=data_files)
train_dataset, dev_dataset, test_dataset = (
    dataset[split] for split in ("train", "validation", "test")
)

print(dataset)

# Sorted class names as found in the training split; a label's position in
# this list becomes its integer id.
label_list = sorted(train_dataset.unique('label'))

In [None]:
# Checkpoint to fine-tune. Exactly one assignment below should be active;
# the others are alternatives kept for experimentation.
# Base = 90M parameters; Large = 300M parameters

#model_name_or_path = "facebook/wav2vec2-base-960h" # "baseline" model; pre-trained on 960 hours of English
model_name_or_path = "facebook/wav2vec2-large-960h-lv60"
# model_name_or_path = "facebook/wav2vec2-base-el-voxpopuli-v2" # pre-trained on Greek speech, no fine-tuning
# model_name_or_path = "facebook/wav2vec2-large-el-voxpopuli-v2" # pre-trained on Greek speech, no fine-tuning
# model_name_or_path = "facebook/wav2vec2-xls-r-300m" # pre-trained on 0.5 million hours in multiple languages, no fine-tuning
# model_name_or_path = "lighteternal/wav2vec2-large-xlsr-53-greek" # pre-trained on 50000 hours in multiple languages, Greek ASR fine-tuning

# Feel free to look for and experiment with other models at HuggingFace Hub https://huggingface.co/

In [None]:
# Feature extractor that matches the chosen checkpoint.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path)

# Classification model on top of the same checkpoint, sized to the number of
# emotion classes and carrying the name <-> id mapping in its config.
model = AutoModelForAudioClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(train_dataset.unique("label")),
    label2id={name: idx for idx, name in enumerate(label_list)},
    id2label=dict(enumerate(label_list)),
)

# Keep the feature encoder fixed during fine-tuning.
model.freeze_feature_encoder()

In [None]:
def label_to_id(label, label_list):
    """Translate a class label into its position in ``label_list``.

    Args:
        label: The label value to look up.
        label_list: Ordered sequence of known labels.

    Returns:
        The index of ``label`` in ``label_list``; ``-1`` when the label is
        not in a non-empty list; ``label`` itself, unchanged, when
        ``label_list`` is empty.
    """
    # With no known labels there is nothing to map against: pass through.
    if len(label_list) == 0:
        return label
    if label not in label_list:
        return -1
    return label_list.index(label)
def prepare_example(example):
    """Load one clip's waveform and encode its label as an integer id.

    Reads ``example["file"]`` with librosa at the feature extractor's
    sampling rate, then adds ``audio``, ``sampling_rate`` and
    ``duration_in_seconds`` and replaces ``label`` with its index in the
    module-level ``label_list``.

    Args:
        example: A single dataset row with at least ``file`` and ``label``.

    Returns:
        The same row, updated in place.
    """
    target_sr = feature_extractor.sampling_rate
    waveform, loaded_sr = librosa.load(example["file"], sr=target_sr)

    example["audio"] = waveform
    example["sampling_rate"] = loaded_sr
    example["duration_in_seconds"] = len(example["audio"]) / target_sr
    example["label"] = label_to_id(example["label"], label_list)
    return example
def preprocess_function(examples):
    """Run the feature extractor over a batch of waveforms.

    Args:
        examples: A batch whose ``audio`` entry holds the waveforms produced
            by ``prepare_example``.

    Returns:
        Whatever the feature extractor returns for that batch, using the
        extractor's own sampling rate.
    """
    return feature_extractor(
        examples["audio"],
        sampling_rate=feature_extractor.sampling_rate,
    )

In [None]:
# train_dataset = train_dataset.map(prepare_example, remove_columns=['file'])
# dev_dataset = dev_dataset.map(prepare_example, remove_columns=['file'])
# test_dataset = test_dataset.map(prepare_example, remove_columns=['file'])
# train_dataset = train_dataset.map(preprocess_function, batched=True, batch_size=1, remove_columns=['audio'])
# dev_dataset = dev_dataset.map(preprocess_function, batched=True, batch_size=1, remove_columns=['audio'])
# test_dataset = test_dataset.map(preprocess_function, batched=True, batch_size=1)

In [None]:
# Two passes over every split:
#   1. load the audio and encode the label (the `file` column is dropped);
#   2. compute model inputs, one example per batch -- presumably so each
#      clip keeps its own length (TODO confirm).
dataset = (
    dataset
    .map(prepare_example, remove_columns=['file'])
    .map(preprocess_function, batched=True, batch_size=1)
)

In [None]:
# delete processed data
# !rm -rf /kaggle/working/data/preprocessed

In [None]:
dataset.save_to_disk(f"{save_path}/data/preprocessed/")

## Train

In [None]:
from datasets import load_from_disk

# Reload the preprocessed splits saved by the data-preparation section.
dataset = load_from_disk(f"{save_path}/data/preprocessed/")
train_dataset, dev_dataset, test_dataset = (
    dataset[split] for split in ("train", "validation", "test")
)


print(dataset)

# NOTE(review): `prepare_example` has already replaced `label` with integer
# ids, so from here on `label_list` holds those ids rather than the original
# emotion codes.
label_list = sorted(train_dataset.unique('label'))
label_list

In [None]:
# Effective batch size = per_device_train_batch_size * gradient_accumulation_steps
# (64 * 2 = 128 per device with the values below).
# Parameters to tune: learning rate, epochs, (batch size)
# More details on hyperparameter tuning in https://github.com/google-research/tuning_playbook

training_args = TrainingArguments(
    # Checkpoints go under save_path; the model id contains a "/", so this
    # resolves to a nested directory.
    output_dir=f"{save_path}/{model_name_or_path}-speech-emotion-recognition",
    per_device_train_batch_size=64, # require more GPU memory, this set can exploit 16GB memory
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=64,
    num_train_epochs=15,
    warmup_ratio=0.1,
    learning_rate=1e-4,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    logging_steps=10,
    # Reload the checkpoint with the highest dev accuracy once training ends.
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True,
    push_to_hub=False,
    gradient_checkpointing=True,
    fp16=True,
    # The string "none" disables all logging integrations. Passing the Python
    # value None does NOT: it falls back to the library default and reports
    # to every installed integration. Use report_to="wandb" to log to
    # Weights & Biases instead.
    report_to="none"
)

In [None]:
def compute_metrics(pred):
    """Score predictions with accuracy and macro-averaged precision/recall/F1.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the argmax
            over axis 1 is taken as the predicted class) and ``label_ids``
            (the reference class ids).

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)

    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average='macro'),
        "recall": recall_score(y_true, y_pred, average='macro'),
        "f1": f1_score(y_true, y_pred, average='macro'),
    }

In [None]:
# Wire the model, hyper-parameters, metrics and the train/dev splits together.
# The test split is deliberately not passed here; it is only used after
# training.
# NOTE(review): the feature extractor is handed over through the `tokenizer`
# argument -- presumably so the Trainer can pad batches with it and save it
# next to the checkpoints; confirm against the installed transformers version.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=feature_extractor,
)

In [None]:
torch.cuda.empty_cache()

In [None]:
trainer.train()

In [None]:
# Score the held-out test split, but only when training was configured to
# reload the best checkpoint at the end.
if training_args.load_best_model_at_end:
    #trainer.evaluate(eval_dataset=test_dataset)
    predictions = trainer.predict(test_dataset)
    # Re-use the training-time metric function on the test predictions.
    print(compute_metrics(predictions))

In [None]:
def map_to_pred(batch):
    """Run the fine-tuned model on one example and attach its predicted id.

    Written for a non-batched ``Dataset.map`` call: ``batch`` is a single
    row whose ``audio`` entry holds the waveform.

    Args:
        batch: Dataset row containing an ``audio`` waveform.

    Returns:
        The same row with a ``predictions`` entry: a one-element list
        holding the argmax class id.
    """
    # Send inputs to wherever the model actually lives rather than guessing
    # from CUDA availability (the two differ when the model is still on CPU).
    device = model.device
    # Make sure dropout and similar layers are in inference mode.
    model.eval()

    input_values = feature_extractor(
        batch["audio"],
        # Same rate the audio was loaded with in `prepare_example`.
        sampling_rate=feature_extractor.sampling_rate,
        return_tensors="pt",
        padding="longest",
    ).input_values

    with torch.no_grad():
        logits = model(input_values.to(device)).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    # Store plain Python ints on the host instead of a (possibly GPU) tensor.
    batch["predictions"] = predicted_ids.cpu().tolist()
    return batch

In [None]:
# Class names in id order, taken from the model config.
num_classes = model.config.num_labels
label_names = [model.config.id2label[class_id] for class_id in range(num_classes)]

# Predict every test example with the model directly.
result = test_dataset.map(map_to_pred)
y_true, y_pred = result['label'], result['predictions']

print(classification_report(y_true, y_pred, target_names=label_names, digits=4))

# Confusion matrix with each row normalised over the true class.
cm = confusion_matrix(y_true, y_pred, normalize='true')
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)
disp.plot(xticks_rotation='vertical')

plt.title("Confusion Matrix")
plt.show()