In [25]:
import os
import pandas as pd
import librosa
import math
from tqdm import tqdm
import json
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow.keras as keras
from pydub import AudioSegment

In [26]:
RAVDESS_SONG = 'Data/RAVDESS-SONG/'
RAVDESS_SPEECH = 'Data/RAVDESS-SPEECH/'
SAVEE = 'Data/SAVEE/'
CREMAD = 'Data/CREMAD/'
TESS = 'Data/TESS/'

In [27]:
dir_list = os.listdir(SAVEE)

emotion=[]
path = []

for i in dir_list:
    if i[-8:-6]=='_a':
        emotion.append('angry')
    elif i[-8:-6]=='_d':
        emotion.append('disgust')
    elif i[-8:-6]=='_f':
        emotion.append('fear')
    elif i[-8:-6]=='_h':
        emotion.append('happy')
    elif i[-8:-6]=='_n':
        emotion.append('neutral')
    elif i[-8:-6]=='sa':
        emotion.append('sad')
    elif i[-8:-6]=='su':
        emotion.append('surprise')
    
    path.append(SAVEE + i)
    
SAVEE_df = pd.DataFrame(emotion, columns = ['labels'])
SAVEE_df['source'] = 'SAVEE'
SAVEE_df['path'] = path
SAVEE_df.labels.value_counts()

neutral     120
angry        60
sad          60
fear         60
happy        60
surprise     60
disgust      60
Name: labels, dtype: int64

In [28]:
dir_list = os.listdir(RAVDESS_SPEECH)
dir_list.sort()
dir_list.pop(0)

emotion = []
path = []

for i in dir_list:
    
    fname = os.listdir(RAVDESS_SPEECH + i)
    
    for f in fname[:-1]:
        parsed = f.split('.')[0].split('-')
        emotion.append(int(parsed[2]))
        path.append(RAVDESS_SPEECH + i + '/' + f)
        
RAV_df_1 = pd.DataFrame(emotion, columns = ['labels'])
RAV_df_1 = RAV_df_1.replace({1:'neutral', 2:'neutral', 3:'happy', 4:'sad', 5:'angry', 6:'fear', 7:'disgust', 8:'surprise'})
RAV_df_1['source'] = 'RAVDESS_SPEECH'  
RAV_df_1['path'] = path
RAV_df_1.labels.value_counts()

neutral     282
angry       191
disgust     191
fear        191
sad         191
surprise    185
happy       185
Name: labels, dtype: int64

In [29]:
dir_list = os.listdir(RAVDESS_SONG)
dir_list.sort()
dir_list.pop(0)

emotion = []
path = []

for i in dir_list:
    
    fname = os.listdir(RAVDESS_SONG + i)
    
    for f in fname:
        
        parsed = f.split('.')[0].split('-')
        emotion.append(int(parsed[2]))
        path.append(RAVDESS_SONG + i + '/' + f)
        
RAV_df_2 = pd.DataFrame(emotion, columns = ['labels'])
RAV_df_2 = RAV_df_2.replace({1:'neutral', 2:'neutral', 3:'happy', 4:'sad', 5:'angry', 6:'fear', 7:'disgust', 8:'surprise'})
RAV_df_2['source'] = 'RAVDESS_SONG'  
RAV_df_2['path'] = path
RAV_df_2.labels.value_counts()

neutral    276
fear       184
happy      184
angry      184
sad        184
Name: labels, dtype: int64

In [30]:
dir_list = os.listdir(TESS)
dir_list.sort()
dir_list.pop(0)

path = []
emotion = []

for i in dir_list:
    fname = os.listdir(TESS + i)
    for f in fname:
        if i == 'OAF_angry' or i == 'YAF_angry':
            emotion.append('angry')
        elif i == 'OAF_disgust' or i == 'YAF_disgust':
            emotion.append('disgust')
        elif i == 'OAF_Fear' or i == 'YAF_fear':
            emotion.append('fear')
        elif i == 'OAF_happy' or i == 'YAF_happy':
            emotion.append('happy')
        elif i == 'OAF_neutral' or i == 'YAF_neutral':
            emotion.append('neutral')                                
        elif i == 'OAF_Pleasant_surprise' or i == 'YAF_pleasant_surprised':
            emotion.append('surprise')               
        elif i == 'OAF_Sad' or i == 'YAF_sad':
            emotion.append('sad')
        
        path.append(TESS + i + '/' + f)

print(len(path))
print(len(emotion))
TESS_df = pd.DataFrame(emotion, columns = ['labels'])
TESS_df['source'] = 'TESS'
TESS_df['path'] = path
TESS_df.labels.value_counts()

2800
2800


neutral     400
disgust     400
fear        400
surprise    400
angry       400
happy       400
sad         400
Name: labels, dtype: int64

In [31]:
dir_list = os.listdir(CREMAD)

emotion = []
path = []

for i in dir_list: 
    part = i.split('_')
    if part[2] == 'SAD':
        emotion.append('sad')
    elif part[2] == 'ANG':
        emotion.append('angry')
    elif part[2] == 'DIS':
        emotion.append('disgust')
    elif part[2] == 'FEA':
        emotion.append('fear')
    elif part[2] == 'HAP':
        emotion.append('happy')
    elif part[2] == 'NEU':
        emotion.append('neutral')
    path.append(CREMAD + i)
    
CREMA_df = pd.DataFrame(emotion, columns = ['labels'])
CREMA_df['source'] = 'CREMA'
CREMA_df['path'] = path
CREMA_df.labels.value_counts()

disgust    1271
fear       1271
angry      1271
happy      1271
sad        1271
neutral    1087
Name: labels, dtype: int64

In [32]:
df = pd.concat([SAVEE_df, RAV_df_1, RAV_df_2, TESS_df, CREMA_df], axis = 0)
df = df.reset_index(drop=True)
df['int_label'] = df['labels'].rank(method='dense', ascending=False).astype(int)
df['int_label'] -= 1
df['int_label'].value_counts()

2    2165
6    2106
4    2106
1    2106
3    2100
5    1922
0     645
Name: int_label, dtype: int64

In [33]:
df.source.unique()

array(['SAVEE', 'RAVDESS_SPEECH', 'RAVDESS_SONG', 'TESS', 'CREMA'],
      dtype=object)

In [34]:
def check_duration(df):
    max_dur = 0
    
    for index, row in tqdm(df.iterrows()):
        
        SAMPLE_RATE = 22050
        y, sr = librosa.load(row.path)

        TRACK_DURATION = librosa.get_duration(y=y, sr=SAMPLE_RATE)
        if TRACK_DURATION > max_dur:
            max_dur = TRACK_DURATION
    
    return max_dur

In [15]:
print(check_duration(df))

13150it [11:33, 18.96it/s]

7.138730158730159





In [41]:
def pad_audio(df):
    
    for index, row in tqdm(df.iterrows()):
        
        SAMPLE_RATE = 22050
        y, sr = librosa.load(row.path, sr=SAMPLE_RATE)
        TRACK_DURATION = librosa.get_duration(y=y, sr=SAMPLE_RATE)

        pad_ms = (7.5 * 1000) - (TRACK_DURATION * 1000) # milliseconds of silence needed
        silence = AudioSegment.silent(duration=pad_ms)
        audio = AudioSegment.from_file(row.path)

        padded = audio + silence  # Adding silence after the audio
        padded.export(row.path, format='wav')

In [42]:
pad_audio(df)

0it [00:00, ?it/s]


EOFError: 

In [10]:
def extract_mfcc(df):
    
    data = {
            "source": [],
            "labels": [],
            "int_labels":[],
            "mfcc": []
        }

    for index, row in tqdm(df.iterrows()):

        SAMPLE_RATE = 22050
        y, sr = librosa.load(row.path)

        TRACK_DURATION = librosa.get_duration(y=y, sr=SAMPLE_RATE)
        SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION


        samples_per_segment = SAMPLE_RATE
        num_mfcc_vectors_per_segment = math.ceil(samples_per_segment / 512)
        num_segments = int(SAMPLES_PER_TRACK/samples_per_segment)

        for d in range(num_segments):

            # calculate start and finish sample for current segment
            start = samples_per_segment * d
            finish = start + samples_per_segment

            # extract mfcc
            mfcc = librosa.feature.mfcc(y[start:finish], SAMPLE_RATE, n_mfcc=40, n_fft=2048, hop_length=512)
            mfcc = mfcc.T

            # store only mfcc feature with expected number of vectors
            if len(mfcc) == num_mfcc_vectors_per_segment:
                data['source'].append(row.source)
                data['labels'].append(row.labels)
                data['int_labels'].append(row.int_label)
                data["mfcc"].append(mfcc.tolist())
    return data


13150it [14:35, 15.02it/s]


In [None]:
def scale_minmax(X, min=0.0, max=1.0):
    X_std = (X - X.min()) / (X.max() - X.min())
    X_scaled = X_std * (max - min) + min
    return X_scaled

def extract_melspec(df):
    
    for emotion in tqdm(df['emotion'].unique()):
        
        temp_df = df.loc[df['emotion']==emotion]
    
        for index, row in tqdm(temp_df.iterrows()):

            SAMPLE_RATE = 22050
            y, sr = librosa.load(row.path)

            mels = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)


    

In [11]:
data = extract_mfcc(df)

with open('Data/MFCC/mfcc.json', "w") as fp:
    json.dump(data, fp, indent=4)

In [12]:
X = np.array(data["mfcc"])
y = np.array(data["int_labels"])

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]

In [14]:
model = keras.Sequential()

# 1st conv layer
model.add(keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(X_train.shape[1], X_train.shape[2], 1)))
model.add(keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
model.add(keras.layers.BatchNormalization())

# 2nd conv layer
model.add(keras.layers.Conv2D(32, (3, 3), activation='relu'))
model.add(keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
model.add(keras.layers.BatchNormalization())

# 3rd conv layer
model.add(keras.layers.Conv2D(32, (2, 2), activation='relu'))
model.add(keras.layers.MaxPooling2D((2, 2), strides=(2, 2), padding='same'))
model.add(keras.layers.BatchNormalization())

# 4th conv layer
model.add(keras.layers.Conv2D(32, (1, 1), activation='relu'))
model.add(keras.layers.MaxPooling2D((2, 2), strides=(2, 2), padding='same'))
model.add(keras.layers.BatchNormalization())

# flatten output and feed it into dense layer
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(64, activation='relu'))
model.add(keras.layers.Dropout(0.3))

# output layer
model.add(keras.layers.Dense(7, activation='softmax'))

optimiser = keras.optimizers.Adam(learning_rate=1e-4)
model.compile(optimizer=optimiser,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 42, 38, 32)        320       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 21, 19, 32)        0         
_________________________________________________________________
batch_normalization (BatchNo (None, 21, 19, 32)        128       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 19, 17, 32)        9248      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 10, 9, 32)         0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 10, 9, 32)         128       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 9, 8, 32)          4

In [15]:
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=32, epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

KeyboardInterrupt: 