### Model configuration

In [1]:
import torch
import torchaudio
import torch.nn.functional as F
from src.models import Wav2Vec2ForSpeechClassification, HubertForSpeechClassification
from transformers import AutoConfig, Wav2Vec2FeatureExtractor
from tqdm import tqdm
import numpy as np
import os


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name_or_path = "m3hrdadfi/hubert-base-persian-speech-emotion-recognition"
config = AutoConfig.from_pretrained(model_name_or_path)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
sampling_rate = feature_extractor.sampling_rate
model = HubertForSpeechClassification.from_pretrained(model_name_or_path).to(device)

Some weights of the model checkpoint at m3hrdadfi/hubert-base-persian-speech-emotion-recognition were not used when initializing HubertForSpeechClassification: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForSpeechClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSpeechClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForSpeechClassification were not initialized from the model checkpoint at m3hrdadfi/hubert-base-persian-speech-emotion-recognition and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0',

### Data preparation

In [None]:
# Define the sentiment mapping
sentiment_mapping = {
    "S": "Sadness",
    "A": "Anger",
    "H": "Happiness",
    "W": "Surprise",
    "F": "Fear",
    "N": "Neutral"
}

In [3]:
# Get a list of all .wav files in the folder
def folder_to_list(folder_path):
    audio_files = [f for f in os.listdir(folder_path) if f.endswith('.wav')]
    # get entire path
    audio_files = [os.path.join(folder_path, f) for f in audio_files]
    return audio_files

In [4]:
# extract sentiment from file name
def extract_sentiment(file_name):
    sentiment_code = file_name[-7]  # Assuming format always has sentiment at the third last character
    
    return sentiment_mapping.get(sentiment_code, "Unknown")

# create a list of sentiments from file names
def list_sentiments_from_files(folder_path):
    # Get a list of all .wav files in the folder
    audio_files = [f for f in os.listdir(folder_path) if f.endswith('.wav')]
    
    sentiments = []
    for file in audio_files:
        sentiment = extract_sentiment(file)
        sentiments.append(sentiment)
    
    return sentiments

In [5]:
# create list of paths of each sentiment
def path_of_each_sentiment(folder_path):
    audio_files = [f for f in os.listdir(folder_path) if f.endswith('.wav')]

    Sadness = []
    Anger = []
    Happiness = []
    Surprise = []
    Fear = []
    Neutral = []
    
    for file in audio_files:
        sentiment_code = file[-7]
        if sentiment_code == "S":
            Sadness.append(os.path.join(folder_path, file))
        elif sentiment_code == "A":
            Anger.append(os.path.join(folder_path, file))
        elif sentiment_code == "H":
            Happiness.append(os.path.join(folder_path, file))
        elif sentiment_code == "W":
            Surprise.append(os.path.join(folder_path, file))
        elif sentiment_code == "F":
            Fear.append(os.path.join(folder_path, file))
        elif sentiment_code == "N":
            Neutral.append(os.path.join(folder_path, file))

    return Sadness, Anger, Happiness, Surprise, Fear, Neutral

### Model Implementation

In [6]:
def speech_file_to_array_fn(path, sampling_rate):
    speech_array, _sampling_rate = torchaudio.load(path)
    resampler = torchaudio.transforms.Resample(_sampling_rate, sampling_rate)
    speech = resampler(speech_array).squeeze().numpy()
    return speech

In [7]:
def predict(path, sampling_rate):
    speech = speech_file_to_array_fn(path, sampling_rate)
    inputs = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    inputs = {key: inputs[key].to(device) for key in inputs}
    
    with torch.no_grad():
        logits = model(**inputs).logits
    
    # Get softmax scores
    scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
    
    # Get the index of the highest score
    max_index = scores.argmax()
    
    # Return the label with the highest score
    sentiment = config.id2label[max_index]
    
    return sentiment


In [None]:
def predict_sentiment(file_path, label):
    predicted = []
    for file in tqdm(file_path):
        output = predict(file, sampling_rate)
        if output == label:
            predicted.append(1)
        else:
            predicted.append(0)
    
    return np.array(predicted), np.ones(len(file_path))

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluation(actual_np, predicted_value):
    predicted_np = np.array(predicted_value)

    accuracy = accuracy_score(actual_np, predicted_np)
    precision = precision_score(actual_np, predicted_np)
    recall = recall_score(actual_np, predicted_np)
    f1 = f1_score(actual_np, predicted_np)

    matrix = {
        "accuracy" : accuracy,
        "precision" : precision,
        "recall" : recall,
        "f1_score" : f1
    }

    return matrix


### Run the code for female audio

In [23]:
# Folder containing all your audio files
folder_path = r"C:\Learning\Machine-Learning\Deep_Learning_WorkSpace\files\female"

In [26]:
# lists containg paths of sentiment 
actual_Sadness_paths, actual_Anger_paths, actual_Happiness_paths, actual_Surprise_paths, actual_Fear_paths, actual_Neutral_paths = path_of_each_sentiment(folder_path)

In [29]:
# list pf predicted values and actual values 

predict_Sadness, actual_Sadness = predict_sentiment(actual_Sadness_paths, "Sadness")
predict_Anger, actual_Anger_value = predict_sentiment(actual_Anger_paths, "Anger")
predict_Happiness, actual_Happiness = predict_sentiment(actual_Happiness_paths, "Happiness")
predict_Surprise, actual_Surprise = predict_sentiment(actual_Surprise_paths, "Surprise")
predict_Fear, actual_Fear = predict_sentiment(actual_Fear_paths, "Fear")
predict_Neutral, actual_Neutral = predict_sentiment(actual_Neutral_paths, "Neutral")

100%|██████████| 271/271 [02:54<00:00,  1.55it/s]
100%|██████████| 455/455 [03:38<00:00,  2.09it/s]
100%|██████████| 111/111 [00:55<00:00,  1.99it/s]
100%|██████████| 120/120 [00:33<00:00,  3.61it/s]
100%|██████████| 22/22 [00:09<00:00,  2.40it/s]
100%|██████████| 284/284 [03:55<00:00,  1.20it/s]


In [30]:
#evaluation

matrix_Sadness = evaluation(predict_Sadness, actual_Sadness)
matrix_Anger = evaluation(predict_Anger, actual_Anger_value)
matrix_Hapiness = evaluation(predict_Happiness, actual_Happiness)
matrix_Surprise = evaluation(predict_Surprise, actual_Surprise)
matrix_Fear = evaluation(predict_Fear, actual_Fear)
matrix_Neutral = evaluation(predict_Neutral, actual_Neutral)


In [46]:
# get the results
print("|    Emotions   | precision | recall | f1-score | accuracy     |")
print("|:-------------:|:---------:|:------:|:--------:|:------------:|")
print(f"|   Anger       |   {round(matrix_Anger['precision']    ,2)}    |   {matrix_Anger['recall']     }  |   {round(matrix_Anger['f1_score']   ,2) }   |     {round(matrix_Anger['accuracy']      ,2)}     |")
print(f"|   Fear        |   {round(matrix_Fear['precision']     ,2)}    |   {matrix_Fear['recall']      }  |   {round(matrix_Fear['f1_score']    ,2) }   |     {round(matrix_Fear['accuracy']       ,2)}     |")
print(f"|   Hapiness    |   {round(matrix_Hapiness['precision'] ,2)}    |   {matrix_Hapiness['recall']  }  |   {round(matrix_Hapiness['f1_score'],2) }   |     {round(matrix_Hapiness['accuracy']   ,2)}     |")
print(f"|   Neutral     |   {round(matrix_Neutral['precision']  ,2)}    |   {matrix_Neutral['recall']   }  |   {round(matrix_Neutral['f1_score'] ,2) }    |     {round(matrix_Neutral['accuracy']    ,2)}     |")
print(f"|   Sadness     |   {round(matrix_Sadness['precision']  ,2)}    |   {matrix_Sadness['recall']   }  |   {round(matrix_Sadness['f1_score'] ,2) }   |     {round(matrix_Sadness['accuracy']    ,2)}     |")
print(f"|   Surprise    |   {round(matrix_Surprise['precision'] ,2)}    |   {matrix_Surprise['recall']  }  |   {round(matrix_Surprise['f1_score'],2) }   |     {round(matrix_Surprise['accuracy']   ,2)}     |")

|    Emotions   | precision | recall | f1-score | accuracy     |
|:-------------:|:---------:|:------:|:--------:|:------------:|
|   Anger       |   0.98    |   1.0  |   0.99   |     0.98     |
|   Fear        |   0.59    |   1.0  |   0.74   |     0.59     |
|   Hapiness    |   0.94    |   1.0  |   0.97   |     0.94     |
|   Neutral     |   0.99    |   1.0  |   1.0    |     0.99     |
|   Sadness     |   0.85    |   1.0  |   0.92   |     0.85     |
|   Surprise    |   0.87    |   1.0  |   0.93   |     0.87     |


### run the code for male audio

In [49]:
# Folder containing all your audio files
folder_path_male = r"C:\Learning\Machine-Learning\Deep_Learning_WorkSpace\files\male"

In [50]:
# lists containg paths of sentiment 
actual_Sadness_paths, actual_Anger_paths, actual_Happiness_paths, actual_Surprise_paths, actual_Fear_paths, actual_Neutral_paths = path_of_each_sentiment(folder_path_male)

In [51]:
# list pf predicted values and actual values 

predict_Sadness, actual_Sadness = predict_sentiment(actual_Sadness_paths, "Sadness")
predict_Anger, actual_Anger_value = predict_sentiment(actual_Anger_paths, "Anger")
predict_Happiness, actual_Happiness = predict_sentiment(actual_Happiness_paths, "Happiness")
predict_Surprise, actual_Surprise = predict_sentiment(actual_Surprise_paths, "Surprise")
predict_Fear, actual_Fear = predict_sentiment(actual_Fear_paths, "Fear")
predict_Neutral, actual_Neutral = predict_sentiment(actual_Neutral_paths, "Neutral")

100%|██████████| 178/178 [02:05<00:00,  1.42it/s]
100%|██████████| 604/604 [05:18<00:00,  1.89it/s]
100%|██████████| 90/90 [00:47<00:00,  1.90it/s]
100%|██████████| 105/105 [00:29<00:00,  3.59it/s]
100%|██████████| 16/16 [00:07<00:00,  2.09it/s]
100%|██████████| 744/744 [07:23<00:00,  1.68it/s]


In [52]:
#evaluation

matrix_Sadness = evaluation(predict_Sadness, actual_Sadness)
matrix_Anger = evaluation(predict_Anger, actual_Anger_value)
matrix_Hapiness = evaluation(predict_Happiness, actual_Happiness)
matrix_Surprise = evaluation(predict_Surprise, actual_Surprise)
matrix_Fear = evaluation(predict_Fear, actual_Fear)
matrix_Neutral = evaluation(predict_Neutral, actual_Neutral)


In [54]:
# get the results
print("|    Emotions   | precision | recall | f1-score | accuracy     |")
print("|:-------------:|:---------:|:------:|:--------:|:------------:|")
print(f"|   Anger       |   {round(matrix_Anger['precision']    ,2)}    |   {matrix_Anger['recall']     }  |   {round(matrix_Anger['f1_score']   ,2) }   |     {round(matrix_Anger['accuracy']      ,2)}     |")
print(f"|   Fear        |   {round(matrix_Fear['precision']     ,2)}    |   {matrix_Fear['recall']      }  |   {round(matrix_Fear['f1_score']    ,2) }   |     {round(matrix_Fear['accuracy']       ,2)}     |")
print(f"|   Hapiness    |   {round(matrix_Hapiness['precision'] ,2)}    |   {matrix_Hapiness['recall']  }  |   {round(matrix_Hapiness['f1_score'],2) }   |     {round(matrix_Hapiness['accuracy']   ,2)}     |")
print(f"|   Neutral     |   {round(matrix_Neutral['precision']  ,2)}    |   {matrix_Neutral['recall']   }  |   {round(matrix_Neutral['f1_score'] ,2) }   |     {round(matrix_Neutral['accuracy']    ,2)}     |")
print(f"|   Sadness     |   {round(matrix_Sadness['precision']  ,2)}    |   {matrix_Sadness['recall']   }  |   {round(matrix_Sadness['f1_score'] ,2) }   |     {round(matrix_Sadness['accuracy']    ,2)}     |")
print(f"|   Surprise    |   {round(matrix_Surprise['precision'] ,2)}    |   {matrix_Surprise['recall']  }  |   {round(matrix_Surprise['f1_score'],2) }   |     {round(matrix_Surprise['accuracy']   ,2)}     |")

|    Emotions   | precision | recall | f1-score | accuracy     |
|:-------------:|:---------:|:------:|:--------:|:------------:|
|   Anger       |   0.97    |   1.0  |   0.99   |     0.97     |
|   Fear        |   0.69    |   1.0  |   0.81   |     0.69     |
|   Hapiness    |   0.96    |   1.0  |   0.98   |     0.96     |
|   Neutral     |   0.99    |   1.0  |   0.99   |     0.99     |
|   Sadness     |   0.78    |   1.0  |   0.88   |     0.78     |
|   Surprise    |   0.79    |   1.0  |   0.88   |     0.79     |
