### Model configuration

In [1]:
import torch
import torchaudio
import torch.nn.functional as F
from src.models import Wav2Vec2ForSpeechClassification, HubertForSpeechClassification
from transformers import AutoConfig, Wav2Vec2FeatureExtractor
from tqdm import tqdm
import numpy as np
import os


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name_or_path = "m3hrdadfi/hubert-base-persian-speech-emotion-recognition"
config = AutoConfig.from_pretrained(model_name_or_path)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
sampling_rate = feature_extractor.sampling_rate
model = HubertForSpeechClassification.from_pretrained(model_name_or_path).to(device)

Some weights of the model checkpoint at m3hrdadfi/hubert-base-persian-speech-emotion-recognition were not used when initializing HubertForSpeechClassification: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForSpeechClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSpeechClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForSpeechClassification were not initialized from the model checkpoint at m3hrdadfi/hubert-base-persian-speech-emotion-recognition and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0',

### Data Preparation

In [3]:
# Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).
sentiment_mapping = {
    "1" : "Neutral",
    "3" : "Happiness",
    "4" : "Sadness",
    "5" : "Anger",
    "6" : "Fear",
    "8" : "Surprise"
}

In [66]:
def create_dataset(folder_path):
    Sadness = []
    Anger = []
    Happiness = []
    Surprise = []
    Fear = []
    Neutral = []

    for path in folder_path:
        audio_files = [f for f in os.listdir(path) if f.endswith('.wav')]

        for file in audio_files:
            sentiment_code = file[7]
            if sentiment_code == "4":
                Sadness.append(os.path.join(path, file))
            elif sentiment_code == "5":
                Anger.append(os.path.join(path, file))
            elif sentiment_code == "3":
                Happiness.append(os.path.join(path, file))
            elif sentiment_code == "8":
                Surprise.append(os.path.join(path, file))
            elif sentiment_code == "6":
                Fear.append(os.path.join(path, file))
            elif sentiment_code == "1":
                # print(os.path.join(path, file))
                Neutral.append(os.path.join(path, file))
    return Sadness, Anger, Happiness, Surprise, Fear, Neutral

In [62]:
# create list of paths of each sentiment
def path_of_each_sentiment(folder_path):
    audio_files = [f for f in os.listdir(folder_path) if f.endswith('.wav')]

    Sadness = []
    Anger = []
    Happiness = []
    Surprise = []
    Fear = []
    Neutral = []
    
    for file in audio_files:
        sentiment_code = file[7]
        if sentiment_code == "4":
            Sadness.append(os.path.join(folder_path, file))
        elif sentiment_code == "5":
            Anger.append(os.path.join(folder_path, file))
        elif sentiment_code == "3":
            Happiness.append(os.path.join(folder_path, file))
        elif sentiment_code == "8":
            Surprise.append(os.path.join(folder_path, file))
        elif sentiment_code == "6":
            Fear.append(os.path.join(folder_path, file))
        elif sentiment_code == "1":
            Neutral.append(os.path.join(folder_path, file))

    return Sadness, Anger, Happiness, Surprise, Fear, Neutral

### Model Implementation

In [23]:
def speech_file_to_array_fn(path, sampling_rate):
    speech_array, _sampling_rate = torchaudio.load(path)
    resampler = torchaudio.transforms.Resample(_sampling_rate, sampling_rate)
    speech = resampler(speech_array).squeeze().numpy()
    return speech

In [24]:
def predict(path, sampling_rate):
    speech = speech_file_to_array_fn(path, sampling_rate)
    inputs = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    inputs = {key: inputs[key].to(device) for key in inputs}
    
    with torch.no_grad():
        logits = model(**inputs).logits
    
    # Get softmax scores
    scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
    
    # Get the index of the highest score
    max_index = scores.argmax()
    
    # Return the label with the highest score
    sentiment = config.id2label[max_index]
    
    return sentiment


In [25]:
def predict_sentiment(file_path, label):
    predicted = []
    for file in tqdm(file_path):
        output = predict(file, sampling_rate)
        if output == label:
            predicted.append(1)
        else:
            predicted.append(0)
    
    return np.array(predicted), np.ones(len(file_path))

In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluation(actual_np, predicted_value):
    predicted_np = np.array(predicted_value)

    accuracy = accuracy_score(actual_np, predicted_np)
    precision = precision_score(actual_np, predicted_np)
    recall = recall_score(actual_np, predicted_np)
    f1 = f1_score(actual_np, predicted_np)

    matrix = {
        "accuracy" : accuracy,
        "precision" : precision,
        "recall" : recall,
        "f1_score" : f1
    }

    return matrix


### run the code

In [54]:
main_folder = r"C:\Learning\Machine-Learning\Deep_Learning_WorkSpace\files\RAVDESS"

In [57]:
# Get a list of all folders (subdirectories) within the main folder
folders_path = [os.path.join(main_folder, folder) for folder in os.listdir(main_folder) if os.path.isdir(os.path.join(main_folder, folder))]

In [69]:
actual_Sadness_paths, actual_Anger_paths, actual_Happiness_paths, actual_Surprise_paths, actual_Fear_paths, actual_Neutral_paths = create_dataset(folders_path)

In [70]:
# list pf predicted values and actual values 

predict_Sadness, actual_Sadness = predict_sentiment(actual_Sadness_paths, "Sadness")
predict_Anger, actual_Anger_value = predict_sentiment(actual_Anger_paths, "Anger")
predict_Happiness, actual_Happiness = predict_sentiment(actual_Happiness_paths, "Happiness")
predict_Surprise, actual_Surprise = predict_sentiment(actual_Surprise_paths, "Surprise")
predict_Fear, actual_Fear = predict_sentiment(actual_Fear_paths, "Fear")
predict_Neutral, actual_Neutral = predict_sentiment(actual_Neutral_paths, "Neutral")

100%|██████████| 192/192 [01:30<00:00,  2.11it/s]
100%|██████████| 192/192 [01:48<00:00,  1.78it/s]
100%|██████████| 192/192 [01:34<00:00,  2.04it/s]
100%|██████████| 192/192 [01:35<00:00,  2.02it/s]
100%|██████████| 192/192 [01:35<00:00,  2.01it/s]
100%|██████████| 96/96 [00:47<00:00,  2.02it/s]


In [71]:
#evaluation

matrix_Sadness = evaluation(predict_Sadness, actual_Sadness)
matrix_Anger = evaluation(predict_Anger, actual_Anger_value)
matrix_Hapiness = evaluation(predict_Happiness, actual_Happiness)
matrix_Surprise = evaluation(predict_Surprise, actual_Surprise)
matrix_Fear = evaluation(predict_Fear, actual_Fear)
matrix_Neutral = evaluation(predict_Neutral, actual_Neutral)


In [72]:
# get the results
print("|    Emotions   | precision | recall | f1-score | accuracy     |")
print("|:-------------:|:---------:|:------:|:--------:|:------------:|")
print(f"|   Anger       |   {round(matrix_Anger['precision']    ,2)}    |   {matrix_Anger['recall']     }  |   {round(matrix_Anger['f1_score']   ,2) }   |     {round(matrix_Anger['accuracy']      ,2)}     |")
print(f"|   Fear        |   {round(matrix_Fear['precision']     ,2)}    |   {matrix_Fear['recall']      }  |   {round(matrix_Fear['f1_score']    ,2) }   |     {round(matrix_Fear['accuracy']       ,2)}     |")
print(f"|   Hapiness    |   {round(matrix_Hapiness['precision'] ,2)}    |   {matrix_Hapiness['recall']  }  |   {round(matrix_Hapiness['f1_score'],2) }   |     {round(matrix_Hapiness['accuracy']   ,2)}     |")
print(f"|   Neutral     |   {round(matrix_Neutral['precision']  ,2)}    |   {matrix_Neutral['recall']   }  |   {round(matrix_Neutral['f1_score'] ,2) }    |     {round(matrix_Neutral['accuracy']    ,2)}     |")
print(f"|   Sadness     |   {round(matrix_Sadness['precision']  ,2)}    |   {matrix_Sadness['recall']   }  |   {round(matrix_Sadness['f1_score'] ,2) }   |     {round(matrix_Sadness['accuracy']    ,2)}     |")
print(f"|   Surprise    |   {round(matrix_Surprise['precision'] ,2)}    |   {matrix_Surprise['recall']  }  |   {round(matrix_Surprise['f1_score'],2) }   |     {round(matrix_Surprise['accuracy']   ,2)}     |")

|    Emotions   | precision | recall | f1-score | accuracy     |
|:-------------:|:---------:|:------:|:--------:|:------------:|
|   Anger       |   0.92    |   1.0  |   0.96   |     0.92     |
|   Fear        |   0.02    |   1.0  |   0.04   |     0.02     |
|   Hapiness    |   0.15    |   1.0  |   0.26   |     0.15     |
|   Neutral     |   0.81    |   1.0  |   0.9    |     0.81     |
|   Sadness     |   0.64    |   1.0  |   0.78   |     0.64     |
|   Surprise    |   0.18    |   1.0  |   0.31   |     0.18     |
