In [24]:
# Import libraries 
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import specgram
import torch
import pandas as pd
import glob 
from sklearn.metrics import confusion_matrix
import IPython.display as ipd  # To play sound in the notebook
import os
import sys
import warnings
# Silence warning output so the notebook cells stay readable.
# The blanket "ignore" filter is installed only when the interpreter was started
# without any -W options (sys.warnoptions is empty).
if not sys.warnoptions:
    warnings.simplefilter("ignore")
# DeprecationWarning is suppressed unconditionally, even if -W options were given.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [25]:
# Root folder that holds every raw corpus, relative to this notebook.
DATA_RAW = '../data/raw'

# Per-corpus audio folders. Each value ends with '/' so that a file or
# sub-folder name can be appended with plain string concatenation.
TESS = DATA_RAW + '/toronto-emotional-speech-set-tess/tess toronto emotional speech set data/TESS Toronto emotional speech set data/'
RAV = DATA_RAW + '/ravdess-emotional-speech-audio/audio_speech_actors_01-24/'
SAVEE = DATA_RAW + '/surrey-audiovisual-expressed-emotion-savee/ALL/'
CREMA = DATA_RAW + '/cremad/AudioWAV/'

# Sanity check: peek at the first five file names in the SAVEE folder.
dir_list = os.listdir(SAVEE)
dir_list[:5]

['DC_d03.wav', 'KL_d15.wav', 'DC_sa12.wav', 'DC_n19.wav', 'DC_f07.wav']

In [34]:
# Build the SAVEE index: one row (labels, source, path) per recognised recording.
dir_list = os.listdir(SAVEE)

# The emotion code is read from the two characters that precede the two-digit
# take number, e.g. 'DC_d03.wav' -> '_d' and 'DC_sa12.wav' -> 'sa'.
# Every label is prefixed with 'male_'.
emo_dict = {'_a': 'male_angry', '_d': 'male_disgust', '_f': 'male_fear',
            '_h': 'male_happy', '_n': 'male_neutral', 'sa': 'male_sad'}

# Parse each filename. A file whose code is not in emo_dict is skipped
# entirely (label AND path), so the two lists always stay the same length and
# row k of `emotion` describes row k of `path`. Appending the path
# unconditionally would pair labels with the wrong files.
# NOTE(review): the skipped files are presumably the 'su' (surprise) takes - confirm.
emotion = []
path = []
for i in dir_list:
    code = i[-8:-6]
    if code not in emo_dict:
        continue
    emotion.append(emo_dict[code])
    path.append(SAVEE + i)

# Assemble the table in one step; the scalar 'SAVEE' is broadcast to every row.
SAVEE_df = pd.DataFrame({'labels': emotion, 'source': 'SAVEE', 'path': path})

# Now check out the label count distribution
SAVEE_df.labels.value_counts()

male_neutral    120
male_disgust     60
male_fear        60
male_happy       60
male_angry       60
male_sad         60
Name: labels, dtype: int64

In [35]:
# Build the RAVDESS index: one row (labels, source, path) per kept recording.
# RAV contains one sub-folder per entry of dir_list; file names are
# dash-separated numeric fields followed by an extension.
dir_list = sorted(os.listdir(RAV))

emotion, gender, path = [], [], []
for actor_dir in dir_list:
    for wav_name in os.listdir(RAV + actor_dir):
        fields = wav_name.split('.')[0].split('-')
        emotion_code = int(fields[2])
        # Emotion code 8 is excluded from the index.
        # NOTE(review): presumably 'surprised' in the RAVDESS naming scheme - confirm.
        if emotion_code == 8:
            continue
        emotion.append(emotion_code)
        # Seventh field: even number -> female, odd number -> male.
        gender.append('female' if int(fields[6]) % 2 == 0 else 'male')
        path.append(RAV + actor_dir + '/' + wav_name)

# Translate numeric emotion codes to names (codes 1 and 2 both become
# 'neutral'), then join gender and emotion into the final label.
RAV_df = pd.DataFrame({'gender': gender, 'emotion': emotion})
RAV_df['emotion'] = RAV_df['emotion'].replace(
    {1:'neutral', 2:'neutral', 3:'happy', 4:'sad', 5:'angry', 6:'fear', 7:'disgust'}
)
RAV_df['labels'] = RAV_df['gender'] + '_' + RAV_df['emotion']
RAV_df['source'] = 'RAVDESS'
RAV_df['path'] = path

# Keep only the columns shared with the other corpus tables.
RAV_df = RAV_df[['labels', 'source', 'path']]
RAV_df.labels.value_counts()

male_neutral      144
female_neutral    144
male_fear          96
female_angry       96
female_disgust     96
female_fear        96
male_angry         96
male_sad           96
female_happy       96
male_disgust       96
male_happy         96
female_sad         96
Name: labels, dtype: int64

In [36]:
# TESS stores each speaker/emotion combination in its own folder, so the
# folder name alone is enough to derive the label.
dir_list = sorted(os.listdir(TESS))

In [37]:
# Build the TESS index: one row (labels, source, path) per recognised recording.
# Relies on `dir_list` holding the sorted TESS folder names from the previous cell.
path = []
emotion = []

# Keys are characters 4-5 of the lower-cased folder name
# (e.g. 'oaf_fear' -> 'fe'); every label is prefixed with 'female_'.
emo_dict = {'an': 'female_angry', 'di': 'female_disgust', 'fe': 'female_fear', 'ha': 'female_happy',
            'ne': 'female_neutral', 'sa': 'female_sad'}
for i in dir_list:
    print(i.lower())
    # The label depends only on the folder, so resolve it once per folder.
    now_emotional = i.lower()[4:6]
    if now_emotional not in emo_dict:
        # Unrecognised folder (e.g. '..._pleasant_surprise'): skip its files
        # entirely. Adding their paths without labels would shift every later
        # label onto the wrong file.
        continue
    label = emo_dict[now_emotional]
    for f in os.listdir(TESS + i):
        emotion.append(label)
        path.append(TESS + i + "/" + f)

# Assemble the table in one step; the scalar 'TESS' is broadcast to every row.
TESS_df = pd.DataFrame({'labels': emotion, 'source': 'TESS', 'path': path})
TESS_df.labels.value_counts()

oaf_fear
oaf_pleasant_surprise
oaf_sad
oaf_angry
oaf_disgust
oaf_happy
oaf_neutral
yaf_angry
yaf_disgust
yaf_fear
yaf_happy
yaf_neutral
yaf_pleasant_surprised
yaf_sad


female_angry      400
female_disgust    400
female_happy      400
female_fear       400
female_neutral    400
female_sad        400
Name: labels, dtype: int64

In [38]:
# Build the CREMA index: one row (labels, source, path) per file in CREMA.
# File names are underscore-separated; the first field is a numeric speaker id
# and the third field is an emotion code.
dir_list = sorted(os.listdir(CREMA))

# Speaker ids treated as female; every other id is labelled male.
female = [1002,1003,1004,1006,1007,1008,1009,1010,1012,1013,1018,1020,1021,1024,1025,1028,1029,1030,1037,1043,1046,1047,1049,
          1052,1053,1054,1055,1056,1058,1060,1061,1063,1072,1073,1074,1075,1076,1078,1079,1082,1084,1089,1091]

# Emotion code -> label suffix, appended to the gender to form the label.
emo_dict = {'SAD': '_sad', 'ANG': '_angry', 'DIS': '_disgust', 'FEA': '_fear',
            'HAP': '_happy', 'NEU': '_neutral'}

gender = []
emotion = []
path = []
for wav_name in dir_list:
    fields = wav_name.split('_')
    speaker_gender = 'female' if int(fields[0]) in female else 'male'
    gender.append(speaker_gender)
    # Unrecognised emotion codes are kept as 'Unknown' so that labels and
    # paths stay aligned row for row.
    suffix = emo_dict.get(fields[2])
    emotion.append('Unknown' if suffix is None else speaker_gender + suffix)
    path.append(CREMA + wav_name)

CREMA_df = pd.DataFrame({'labels': emotion, 'source': 'CREMA', 'path': path})
CREMA_df.labels.value_counts()

male_sad          671
male_fear         671
male_disgust      671
male_happy        671
male_angry        671
female_angry      600
female_happy      600
female_sad        600
female_disgust    600
female_fear       600
male_neutral      575
female_neutral    512
Name: labels, dtype: int64

In [39]:
# Stack the four per-corpus tables into one. Each table keeps its own row
# numbers, so index values repeat across corpora.
frames = [SAVEE_df, RAV_df, TESS_df, CREMA_df]
df = pd.concat(frames)
print(df['labels'].value_counts())
df.head()
# Export of the combined table is currently disabled:
# df.to_csv("../data/processed/Data_path.csv",index=False)

female_angry      1096
female_happy      1096
female_sad        1096
female_fear       1096
female_disgust    1096
female_neutral    1056
male_neutral       839
male_angry         827
male_sad           827
male_fear          827
male_disgust       827
male_happy         827
Name: labels, dtype: int64


Unnamed: 0,labels,source,path
0,male_disgust,SAVEE,../data/raw/surrey-audiovisual-expressed-emoti...
1,male_disgust,SAVEE,../data/raw/surrey-audiovisual-expressed-emoti...
2,male_sad,SAVEE,../data/raw/surrey-audiovisual-expressed-emoti...
3,male_neutral,SAVEE,../data/raw/surrey-audiovisual-expressed-emoti...
4,male_fear,SAVEE,../data/raw/surrey-audiovisual-expressed-emoti...


In [32]:
# Persist each per-corpus table as <name>.csv under ../data/interim, without
# the index column.
INTERIM_DIR = "../data/interim"
# to_csv does not create missing parent folders, so make sure the target
# exists; this keeps the cell working on a fresh checkout.
os.makedirs(INTERIM_DIR, exist_ok=True)
for name, frame in (
    ("SAVEE_df", SAVEE_df),
    ("RAV_df", RAV_df),
    ("TESS_df", TESS_df),
    ("CREMA_df", CREMA_df),
):
    frame.to_csv(f"{INTERIM_DIR}/{name}.csv", index=False)