In [1]:
import torch
import torch.nn as nn 
import torch.nn.functional as F 
from torch.utils.data import Dataset, DataLoader 

import os 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import cv2

import librosa
import librosa.display
import skimage.io

In [3]:
# Scratch check: os.path.split returns a (directory, file name) pair.
os.path.split(
    'data/speech-emotion-recognition-ravdess-data/Actor_16/03-01-05-01-02-01-16.wav'
)

('data/speech-emotion-recognition-ravdess-data/Actor_16',
 '03-01-05-01-02-01-16.wav')

In [4]:
def get_image_path(file_path, out_dir='data/images2/'):
    """Cut a fixed window from an audio clip and save it as a 256x256 ``.npy`` array.

    The clip is loaded with librosa at 22050 Hz, samples 20000..59999 are kept,
    and ``np.resize`` fills a ``(256, 256)`` array from them (repeating the
    samples as needed). The array is saved under ``out_dir`` using the source
    file's base name with a ``.npy`` extension.

    Parameters
    ----------
    file_path : str
        Path to the audio file to convert.
    out_dir : str, optional
        Directory that receives the ``.npy`` file; created if it does not exist.

    Returns
    -------
    str
        Path of the saved ``.npy`` file.
    """
    sample_rate = 22050                      # resampling rate passed to librosa
    window_start, window_stop = 20000, 60000  # sample range kept from each clip
    image_shape = (256, 256)                  # shape of the saved array

    # os.path copes with any platform separator and any extension length,
    # unlike splitting on '/' and chopping the last four characters.
    stem = os.path.splitext(os.path.basename(file_path))[0]

    y, _ = librosa.load(file_path, sr=sample_rate)
    y = np.resize(y[window_start:window_stop], image_shape)

    # np.save does not create missing directories.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    out = os.path.join(out_dir, stem + '.npy')

    # Save as numpy array
    np.save(out, y)

    return out

In [6]:
# Two-digit emotion code taken from the file name -> emotion label.
ravdess_emotions = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fear',
    '07': 'disgust',
    '08': 'surprise',
}

In [7]:
processed_data = []
# Folder path of the data
data_path = 'data/speech-emotion-recognition-ravdess-data/'

# Labels are decoded from the file name, which is seven dash-separated codes,
# e.g. '03-01-05-01-02-01-16.wav': index 2 is the emotion code, index 4 the
# statement and index 6 the actor.
for root, dirs, files in os.walk(data_path):
    for file in files:
        stem, ext = os.path.splitext(file)
        # Only audio clips carry the encoded name; this also skips .DS_Store
        # and any other stray file that would otherwise break the parsing below.
        if ext.lower() != '.wav':
            continue
        # Split the file name based on dataset
        file_breakdown = stem.split('-')
        if len(file_breakdown) != 7:
            continue
        file_path = os.path.join(root, file)
        # Keep only clips whose 4th and 6th codes are '01' (presumably normal
        # intensity and first repetition in the RAVDESS naming scheme; confirm
        # against the dataset documentation).
        if file_breakdown[3] == '01' and file_breakdown[5] == '01':
            emotion = ravdess_emotions[file_breakdown[2]]
            emotion_id = int(file_breakdown[2])
            statement = int(file_breakdown[4])
            actor = int(file_breakdown[6])

            # Writes the clip's .npy array to disk and returns its path.
            img_path = get_image_path(file_path)
            processed_data.append([file_path, actor, emotion, emotion_id, statement, img_path])
print("All Images Written")
# One row per kept clip.
speech_data_processed_df = pd.DataFrame(processed_data, columns=['file_name','actor','emotions','emotion_id','statement','image_path'])

All Images Written


In [8]:
# Order rows by actor, then statement, then emotion, and renumber the index.
speech_data_processed_df = (
    speech_data_processed_df
    .sort_values(by=['actor', 'statement', 'emotion_id'], ignore_index=True)
)
speech_data_processed_df.head(16)

Unnamed: 0,file_name,actor,emotions,emotion_id,statement,image_path
0,data/speech-emotion-recognition-ravdess-data/A...,1,neutral,1,1,data/images2/03-01-01-01-01-01-01.npy
1,data/speech-emotion-recognition-ravdess-data/A...,1,calm,2,1,data/images2/03-01-02-01-01-01-01.npy
2,data/speech-emotion-recognition-ravdess-data/A...,1,happy,3,1,data/images2/03-01-03-01-01-01-01.npy
3,data/speech-emotion-recognition-ravdess-data/A...,1,sad,4,1,data/images2/03-01-04-01-01-01-01.npy
4,data/speech-emotion-recognition-ravdess-data/A...,1,angry,5,1,data/images2/03-01-05-01-01-01-01.npy
5,data/speech-emotion-recognition-ravdess-data/A...,1,fear,6,1,data/images2/03-01-06-01-01-01-01.npy
6,data/speech-emotion-recognition-ravdess-data/A...,1,disgust,7,1,data/images2/03-01-07-01-01-01-01.npy
7,data/speech-emotion-recognition-ravdess-data/A...,1,surprise,8,1,data/images2/03-01-08-01-01-01-01.npy
8,data/speech-emotion-recognition-ravdess-data/A...,1,neutral,1,2,data/images2/03-01-01-01-02-01-01.npy
9,data/speech-emotion-recognition-ravdess-data/A...,1,calm,2,2,data/images2/03-01-02-01-02-01-01.npy


In [9]:
# Persist the table; the DataFrame index is written as the first CSV column.
speech_data_processed_df.to_csv(path_or_buf='data/RAVDESS_processed_dataset_2.csv')

In [11]:
# Scratch check: np.empty allocates without initialising, so the values shown are arbitrary.
np.empty(shape=(2, 3, 5))

array([[[0.00000000e+000, 3.31023983e-322, 0.00000000e+000,
         0.00000000e+000, 8.48798316e-314],
        [1.16095484e-028, 7.47870832e+247, 4.82410605e+228,
         1.01392767e-076, 9.80422498e+252],
        [9.98586082e-077, 1.25229219e+219, 1.46412291e-028,
         5.49497060e-096, 5.07507647e+246]],

       [[4.10067162e+223, 6.01347002e-154, 6.01347002e-154,
         1.46461594e-028, 5.49497762e-096],
        [1.23368276e+184, 9.98586082e-077, 2.16222309e+190,
         4.74694520e-038, 1.75471250e+243],
        [1.04919172e-153, 9.78750380e+199, 6.97843734e+252,
         2.65698126e-312, 0.00000000e+000]]])