### Data Preparation

In [33]:
# Loading the necessary libraries

import os
import pandas as pd
import uuid
import librosa
import numpy as np


In [34]:
# Extracting emotions and placing it in its column

def extract_emotion(file_name):
    parts = file_name.split('_')
    emotion = parts[-1].split('.')[0]
    return emotion

def create_emotional_dataframe(folder_path):
    data = []
    for file_name in os.listdir(folder_path):
        emotion = extract_emotion(file_name)
        unique_id = f"{emotion}_{str(uuid.uuid4())[:8]}"
        file_path = os.path.join(folder_path, file_name)
        data.append({'ID': unique_id, 'File Name': file_name, 'Emotion': emotion, 'File Path': file_path})
        
    return pd.DataFrame(data)

folder_path_wav = r'dataverse_files'

emotion_df_wav = create_emotional_dataframe(folder_path_wav)

# Display the DataFrame
emotion_df_wav.head(10)

Unnamed: 0,ID,File Name,Emotion,File Path
0,angry_bcbcf8a2,OAF_back_angry.wav,angry,dataverse_files\OAF_back_angry.wav
1,disgust_52cdd1b9,OAF_back_disgust.wav,disgust,dataverse_files\OAF_back_disgust.wav
2,fear_adacc67f,OAF_back_fear.wav,fear,dataverse_files\OAF_back_fear.wav
3,happy_5d47cbfb,OAF_back_happy.wav,happy,dataverse_files\OAF_back_happy.wav
4,neutral_13dc91d7,OAF_back_neutral.wav,neutral,dataverse_files\OAF_back_neutral.wav
5,ps_cb173731,OAF_back_ps.wav,ps,dataverse_files\OAF_back_ps.wav
6,sad_0585a8cc,OAF_back_sad.wav,sad,dataverse_files\OAF_back_sad.wav
7,angry_9271fd76,OAF_bar_angry.wav,angry,dataverse_files\OAF_bar_angry.wav
8,disgust_f8c0ebfb,OAF_bar_disgust.wav,disgust,dataverse_files\OAF_bar_disgust.wav
9,fear_42e1a9de,OAF_bar_fear.wav,fear,dataverse_files\OAF_bar_fear.wav


In [35]:
def extract_features(file_path):
    # Load audio file
    y, sr = librosa.load(file_path, sr=None)
    
    # Extract MFCCs (Mel-Frequency Cepstral Coefficients)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    
    # Calculate pitch
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    pitch = np.mean(pitches)
    
    # Calculate energy
    energy = np.mean(librosa.feature.rms(y=y))
    
    # Calculate spectral features
    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
    spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
    spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr))
    
    return mfccs, pitch, energy, spectral_centroid, spectral_bandwidth, spectral_contrast

In [36]:
audio_features = []
max_length = 0  # Track the maximum length of MFCC arrays

for file_path in emotion_df_wav['File Path']:
    mfccs, pitch, energy, spectral_centroid, spectral_bandwidth, spectral_contrast = extract_features(file_path)
    
    # Flatten MFCCs to ensure consistent shape
    mfccs_flat = mfccs.flatten()
    
    # Update max_length if necessary
    max_length = max(max_length, mfccs_flat.shape[0])
    
    # Combine all other features into a single 1D array
    features_combined = np.hstack((mfccs_flat, pitch, energy, spectral_centroid, spectral_bandwidth, spectral_contrast))
    
    audio_features.append(features_combined)

In [37]:
# Pad MFCC arrays to ensure consistent shapes
max_length = max(len(features) for features in audio_features)

padded_audio_features = []
for features in audio_features:
    # Calculate the amount of padding needed
    padding_width = max_length - len(features)
    # Pad the features array with zeros to match the maximum length
    padded_features = np.pad(features, ((0, padding_width)), mode='constant')
    padded_audio_features.append(padded_features)

# Convert the list of arrays to a 2D NumPy array
padded_audio_features_array = np.array(padded_audio_features)

print(padded_audio_features_array.shape)  # Display the shape of the padded array
print(padded_audio_features_array[:5])     # Display the first 5 elements of the padded array


(2800, 3697)
[[-616.8069458  -572.98529053 -487.13040161 ...    0.
     0.            0.        ]
 [-686.6395874  -612.41943359 -562.41546631 ...    0.
     0.            0.        ]
 [-675.70111084 -673.92712402 -634.75012207 ...    0.
     0.            0.        ]
 [-544.85992432 -491.78671265 -463.30151367 ...    0.
     0.            0.        ]
 [-765.62237549 -760.74121094 -764.26812744 ...    0.
     0.            0.        ]]


In [38]:
from sklearn.model_selection import train_test_split

# Split the data into training and temporary sets
train_data, temp_data = train_test_split(padded_audio_features_array, test_size=0.2, random_state=42)

# Further split the temporary data into validation and testing sets
validation_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Display the shapes of the datasets
print("Training data shape:", train_data.shape)
print("Validation data shape:", validation_data.shape)
print("Testing data shape:", test_data.shape)


Training data shape: (2240, 3697)
Validation data shape: (280, 3697)
Testing data shape: (280, 3697)
