In [34]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import librosa

import torch
import torchaudio
import torch.nn.functional as F

### Data Preparation

In [None]:
## custom dataset
from torch.utils.data import Dataset, DataLoader
class AudioDataset(Dataset):
    """Map-style dataset that lazily loads audio files from a list of paths.

    Each item is loaded from disk on access, so constructing the dataset
    does no I/O.

    Args:
        file_paths: Sequence of audio file paths readable by ``torchaudio.load``.
    """

    def __init__(self, file_paths):
        self.file_paths = file_paths

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.file_paths)

    # NOTE: this must be spelled ``__getitem__`` (with trailing underscores).
    # A name like ``__getitem`` is name-mangled to ``_AudioDataset__getitem``
    # and is never used for ``dataset[i]`` or by ``DataLoader``.
    def __getitem__(self, x):
        """Load and return the ``x``-th audio file.

        Args:
            x: Index into ``file_paths``.

        Returns:
            Tuple ``(waveform, sample_rate)`` exactly as returned by
            ``torchaudio.load``.
        """
        waveform, sample_rate = torchaudio.load(self.file_paths[x])
        return waveform, sample_rate

In [38]:
import os

# Folder containing all your audio files.
# NOTE(review): this is an absolute, machine-specific Windows path; it must be
# edited before the notebook can run on another machine.
folder_path = r"C:\Learning\Machine-Learning\Deep_Learning_WorkSpace\files\female"

# Get a list of all .wav files in the folder
def folder_to_list(folder_path):
    """Return the full paths of the ``.wav`` files directly inside a folder.

    Args:
        folder_path: Directory to scan (not searched recursively).

    Returns:
        List of ``os.path.join(folder_path, name)`` strings, sorted by file
        name, for every entry whose name ends with ``'.wav'``.
    """
    # os.listdir() makes no ordering guarantee; sort so that positional
    # lookups such as file_paths[100] pick the same file on every run.
    wav_names = sorted(f for f in os.listdir(folder_path) if f.endswith('.wav'))
    # Prepend the folder to get the entire path.
    return [os.path.join(folder_path, f) for f in wav_names]

In [20]:
def speech_file_to_array_fn(path, sampling_rate):
    """Load an audio file as a mono 1-D NumPy array at the requested rate.

    Args:
        path: Path to an audio file readable by ``torchaudio.load``.
        sampling_rate: Target sampling rate in Hz.

    Returns:
        1-D NumPy array holding the resampled, mono waveform.
    """
    speech_array, _sampling_rate = torchaudio.load(path)
    # The loaded tensor has a leading channel axis. Averaging over it gives
    # the same samples as squeeze() for mono files, and downmixes
    # multi-channel files to a single track instead of returning a 2-D array
    # that callers could mistake for a batch of separate clips.
    if speech_array.dim() > 1:
        speech_array = speech_array.mean(dim=0)
    resampler = torchaudio.transforms.Resample(_sampling_rate, sampling_rate)
    speech = resampler(speech_array).numpy()
    return speech

### Model Implementation

In [32]:
from src.models import Wav2Vec2ForSpeechClassification, HubertForSpeechClassification
from transformers import AutoConfig, Wav2Vec2FeatureExtractor

In [33]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Checkpoint identifier shared by the config, feature extractor and model below.
model_name_or_path = "m3hrdadfi/hubert-base-persian-speech-emotion-recognition"
# The config supplies the id2label mapping used to name the predicted classes.
config = AutoConfig.from_pretrained(model_name_or_path)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
# Audio is resampled to the rate the feature extractor was configured with.
sampling_rate = feature_extractor.sampling_rate
# Load the classification model and move its weights to the selected device.
model = HubertForSpeechClassification.from_pretrained(model_name_or_path).to(device)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at m3hrdadfi/hubert-base-persian-speech-emotion-recognition were not used when initializing HubertForSpeechClassification: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForSpeechClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSpeechClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassifica

In [35]:
def predict(path, sampling_rate):
    """Classify one audio file and report a score for every label.

    Uses the notebook-level ``feature_extractor``, ``model``, ``config`` and
    ``device``.

    Args:
        path: Path to the audio file to classify.
        sampling_rate: Sampling rate (Hz) the audio is resampled to and that
            is passed to the feature extractor.

    Returns:
        List of ``{"Label": <name>, "Score": "<percent>%"}`` dicts, one per
        class id in ascending order, with the percentage shown to one decimal.
    """
    waveform = speech_file_to_array_fn(path, sampling_rate)
    encoded = feature_extractor(
        waveform,
        sampling_rate=sampling_rate,
        return_tensors="pt",
        padding=True,
    )
    # Move every input tensor onto the same device as the model.
    batch = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**batch).logits

    # Softmax over the class axis, then take the first row of the batch.
    probabilities = F.softmax(logits, dim=1).detach().cpu().numpy()[0]

    outputs = []
    for class_id, score in enumerate(probabilities):
        outputs.append(
            {"Label": config.id2label[class_id], "Score": f"{round(score * 100, 3):.1f}%"}
        )
    return outputs

In [39]:
# Collect the full paths of the .wav files found in folder_path.
file_paths = folder_to_list(folder_path)

In [43]:
# Display one sample path; index 100 is an arbitrary pick and raises
# IndexError if the folder yielded 100 or fewer .wav files.
file_paths[100]

'C:\\Learning\\Machine-Learning\\Deep_Learning_WorkSpace\\files\\female\\F02S15.wav'

In [45]:
# Run the classifier on the sample file and display the per-label scores.
outputs = predict(file_paths[100], sampling_rate)
outputs

[{'Label': 'Anger', 'Score': '0.0%'},
 {'Label': 'Fear', 'Score': '0.0%'},
 {'Label': 'Happiness', 'Score': '0.0%'},
 {'Label': 'Neutral', 'Score': '3.1%'},
 {'Label': 'Sadness', 'Score': '96.7%'},
 {'Label': 'Surprise', 'Score': '0.2%'}]