In [7]:
import os
import time
import wave
from csv import reader

import librosa
import numpy as np
import tensorflow as tf
from scipy.stats import zscore
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input, Dense, Dropout, Activation, TimeDistributed
from tensorflow.keras.layers import Conv2D, MaxPooling2D, BatchNormalization, Flatten
from tensorflow.keras.layers import LSTM
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Model

# Class index -> display label.
# Kept in a private name so that rebinding the notebook-level name `emotions`
# elsewhere cannot change what get_emotion_label returns.
_EMOTION_LABELS = {
    0: 'Angry',
    1: 'Disgust',
    2: 'Fear',
    3: 'Happy',
    4: 'Neutral',
    5: 'Sad',
    6: 'Surprise'
}

# Public alias kept so existing references to `emotions` keep working.
emotions = dict(_EMOTION_LABELS)


def get_emotion_label(key):
    """Return the display label for a class index.

    Args:
        key: class index (0-6).

    Returns:
        The label string, or 'Emoção não encontrada' when the index is unknown.
    """
    return _EMOTION_LABELS.get(key, 'Emoção não encontrada')


In [8]:
def mel_spectrogram(y, sr=16000, n_fft=512, win_length=256, hop_length=128, window='hamming', n_mels=128,
                    fmax=4000):
    """Compute a log-scaled mel spectrogram of an audio signal.

    Steps: STFT -> squared magnitude -> mel filter bank -> decibels relative
    to the largest value in the spectrogram (`ref=np.max`).

    Args:
        y: audio samples handed to `librosa.stft`.
        sr: sampling rate used to build the mel filter bank.
        n_fft: FFT size for the STFT.
        win_length: analysis window length for the STFT.
        hop_length: hop between successive STFT frames.
        window: window function name for the STFT.
        n_mels: number of mel bands.
        fmax: upper frequency limit of the mel filter bank.

    Returns:
        numpy array holding the log-mel spectrogram.
    """
    # Power spectrogram: squared magnitude of the short-time Fourier transform
    stft_matrix = librosa.stft(y, n_fft=n_fft, window=window, win_length=win_length, hop_length=hop_length)
    power = np.abs(stft_matrix) ** 2

    # Project the power spectrogram onto the mel scale
    mel_power = librosa.feature.melspectrogram(S=power, sr=sr, n_mels=n_mels, fmax=fmax)

    # Convert to dB, with the maximum value as the 0 dB reference
    return np.asarray(librosa.power_to_db(mel_power, ref=np.max))

In [9]:
 def frame(y, win_step=64, win_size=128):
     """Cut the last axis of a 3-D array into overlapping fixed-size windows.

     Args:
         y: array indexed as y[i, j, k]; windows are taken along the last axis.
         win_step: offset between the starts of consecutive windows.
         win_size: number of elements in each window.

     Returns:
         float16 array of shape (y.shape[0], nb_frames, y.shape[1], win_size),
         where nb_frames = 1 + int((y.shape[2] - win_size) / win_step).
     """
     n_items, n_rows, n_samples = y.shape[0], y.shape[1], y.shape[2]

     # Number of windows
     nb_frames = 1 + int((n_samples - win_size) / win_step)

     # Framing: window t covers [t * win_step, t * win_step + win_size)
     frames = np.zeros((n_items, nb_frames, n_rows, win_size), dtype=np.float16)
     for t in range(nb_frames):
         start = t * win_step
         stop = start + win_size
         frames[:, t, :, :] = y[:, :, start:stop].astype(np.float16)

     return frames

In [10]:
def build_model():
    """Build the time-distributed CNN + LSTM classifier.

    Input shape is (5, 128, 128, 1). Four local feature learning blocks
    (LFLB: Conv2D -> BatchNormalization -> ELU -> MaxPooling2D -> Dropout(0.2)),
    each applied per time step through TimeDistributed, are followed by a
    256-unit LSTM and a 7-way softmax layer.

    Returns:
        The Keras Model; it is not compiled here.
    """
    # Clear Keras session
    K.clear_session()

    net_input = Input(shape=(5, 128, 128, 1))

    # (convolution filters, pooling size/stride) for the four LFLBs, in order
    lflb_specs = ((64, 2), (64, 4), (128, 4), (128, 4))

    features = net_input
    for n_filters, pool in lflb_specs:
        features = TimeDistributed(
            Conv2D(n_filters, kernel_size=(3, 3), strides=(1, 1), padding='same'))(features)
        features = TimeDistributed(BatchNormalization())(features)
        features = TimeDistributed(Activation('elu'))(features)
        features = TimeDistributed(
            MaxPooling2D(pool_size=(pool, pool), strides=(pool, pool), padding='same'))(features)
        features = TimeDistributed(Dropout(0.2))(features)

    # Flatten each time step's feature maps into a vector
    features = TimeDistributed(Flatten())(features)

    # LSTM over the time steps; only the last output is kept
    sequence_summary = LSTM(256, return_sequences=False, dropout=0.2)(features)

    # Fully connected softmax over the 7 classes
    class_probabilities = Dense(7, activation='softmax')(sequence_summary)

    return Model(inputs=net_input, outputs=class_probabilities)

In [25]:
# Class index -> label used when predict_proba is False. Defined next to the function
# (instead of reading a notebook-level dict) so rebinding other names cannot break it.
_CLASS_INDEX_TO_LABEL = {
    0: 'Angry',
    1: 'Disgust',
    2: 'Fear',
    3: 'Happy',
    4: 'Neutral',
    5: 'Sad',
    6: 'Surprise'
}


def predict_emotion_from_file(filename, chunk_step=16000, chunk_size=49100, predict_proba=False,
                              sample_rate=16000):
    """Predict emotions for successive chunks of one audio file.

    Relies on the notebook-level `model` (it must be loaded before calling)
    and on the `frame` and `mel_spectrogram` helpers.

    Args:
        filename: path of the audio file; the first 0.5 s is skipped on load.
        chunk_step: offset between consecutive chunks, in samples.
        chunk_size: chunk length in samples; shorter signals are zero-padded
            to this length.
        predict_proba: if truthy, return the raw model output; otherwise
            return one label string per chunk.
        sample_rate: rate the audio is resampled to on load; also used for
            the mel filter bank.

    Returns:
        (predict, timestamp): `predict` is the model output array or a list of
        labels (one entry per chunk). `timestamp` holds, per chunk, the chunk's
        end position in whole seconds: chunk_size for the first chunk plus
        chunk_step for each following one, divided by sample_rate and rounded.
    """
    # Read audio file
    y, _ = librosa.load(filename, sr=sample_rate, offset=0.5)

    # Zero-pad short signals so that at least one full chunk exists
    if y.shape[0] < chunk_size:
        y = np.pad(y, (0, chunk_size - y.shape[0]), mode='constant')

    # Split audio signal into chunks: (1, n_chunks, 1, chunk_size) -> (n_chunks, chunk_size)
    chunks = frame(y.reshape(1, 1, -1), chunk_step, chunk_size)
    chunks = chunks.reshape(chunks.shape[1], chunks.shape[-1])

    # Z-normalization, chunk by chunk
    # NOTE(review): a constant chunk (e.g. all padding) has zero variance and yields NaN here.
    normalized = np.asarray([zscore(chunk) for chunk in chunks])

    # Compute mel spectrograms with the same rate the audio was loaded at
    mel_spect = np.asarray([mel_spectrogram(chunk, sr=sample_rate) for chunk in normalized])

    # Time distributed framing
    mel_spect_ts = frame(mel_spect)

    # Build X for the time distributed CNN: add a trailing channel axis
    X = mel_spect_ts[..., np.newaxis]

    # Predict emotion
    probabilities = model.predict(X)
    if predict_proba:
        predict = probabilities
    else:
        predict = [
            _CLASS_INDEX_TO_LABEL.get(int(class_index), 'Emoção não encontrada')
            for class_index in np.argmax(probabilities, axis=1)
        ]

    # Clear Keras session
    # NOTE(review): `model` stays in use by the caller after this call — confirm that
    # resetting Keras state here is intended.
    K.clear_session()

    # Predict timestamp
    timestamp = np.concatenate([[chunk_size], np.ones((len(predict) - 1)) * chunk_step]).cumsum()
    timestamp = np.round(timestamp / sample_rate)

    return predict, timestamp

In [23]:
def prediction_to_csv(predictions, filename, mode='w'):
    """Write one prediction per line to a text file.

    Args:
        predictions: iterable of values; each is written as str(value).
        filename: path of the output file.
        mode: mode passed to open(). The "EMOTIONS" header line is written
            only when mode is exactly 'w'.
    """
    with open(filename, mode) as out:
        # Header only for a fresh file, not when appending
        if mode == 'w':
            out.write("EMOTIONS\n")
        out.writelines(str(item) + '\n' for item in predictions)

In [14]:
# Lookup from two-character string codes to emotion labels.
# Codes '01' and '02' both map to 'Neutral'.
# NOTE(review): no visible cell reads this dict, and the spellings 'Fearful' /
# 'Surprised' differ from the 'Fear' / 'Surprise' labels in `emotions` — confirm intent.
ravdess_emotions = {
    '01': 'Neutral',
    '02': 'Neutral',
    '03': 'Happy',
    '04': 'Sad',
    '05': 'Angry',
    '06': 'Fearful',
    '07': 'Disgust',
    '08': 'Surprised'
}


In [30]:
# Hold-out inference: run the model on every selected audio file and log the
# class probabilities of its first chunk, the file name and the emotion code to a CSV.

# All paths live under one base directory, so it is spelled out once.
ser_dir = '/content/drive/My Drive/UEPG/eng-comp/5-ano/tcc-v2/SER'
audios_path = ser_dir + '/audios-from-videos-1-folder'
hold_out_dir = ser_dir + '/hold-out'
results_csv_path = hold_out_dir + '/results-hold-out.csv'

test_with_actors = ["19", "20", "21", "22", "23", "24"]

# Character positions of the fields read from each audio file name.
# NOTE(review): presumably dash-separated two-digit fields with the actor id
# last — confirm against the files in audios_path.
modality_field = slice(0, 2)
emotion_field = slice(6, 8)
actor_field = slice(18, 20)

# One CSV row: 7 class probabilities, file name, emotion code.
row_format = "{:.5f},{:.5f},{:.5f},{:.5f},{:.5f},{:.5f},{:.5f},{},{}"

print("Starting")

print("Loading model...")
model = load_model(hold_out_dir + '/best_model.hdf5')
print("Loading model weights...")
# NOTE(review): these weights replace the ones stored in best_model.hdf5 — confirm intended.
model.load_weights(hold_out_dir + '/model-weights.h5')
print("Model loaded!")

audios = sorted(os.listdir(audios_path))

# Keep only files of the test actors whose first field is '01'
process_actors = [
    name for name in audios
    if name != ".gitkeep" and name != "README.md"
    and name[actor_field] in test_with_actors
    and name[modality_field] == '01'
]

total = len(process_actors)
done = 0

# NOTE(review): append mode means re-running this cell adds duplicate rows, which the
# accuracy cell would then count twice — remove the CSV before a fresh run.
with open(results_csv_path, mode='a+') as results_file:
    for done, audio in enumerate(process_actors, start=1):
        audio_path = os.path.join(audios_path, audio)
        correct_class = audio[emotion_field]

        # Named `probabilities` (not `emotions`) so the label dict is not overwritten
        probabilities, _timestamp = predict_emotion_from_file(audio_path, predict_proba=True, chunk_step=16000)

        # Only the first chunk (row 0) of the model output is logged
        results_file.write(row_format.format(*probabilities[0][:7], audio, correct_class) + '\n')

        print("Audio {} of {} - ({}%)".format(
            done,
            total,
            round((done / total) * 100, 2)
        ))

Starting
Loading model...
Loading model weights...
Model loaded!
Audio 1 of 360 - (0.28%)
Audio 2 of 360 - (0.56%)
Audio 3 of 360 - (0.83%)
Audio 4 of 360 - (1.11%)
Audio 5 of 360 - (1.39%)
Audio 6 of 360 - (1.67%)
Audio 7 of 360 - (1.94%)
Audio 8 of 360 - (2.22%)
Audio 9 of 360 - (2.5%)
Audio 10 of 360 - (2.78%)
Audio 11 of 360 - (3.06%)
Audio 12 of 360 - (3.33%)
Audio 13 of 360 - (3.61%)
Audio 14 of 360 - (3.89%)
Audio 15 of 360 - (4.17%)
Audio 16 of 360 - (4.44%)
Audio 17 of 360 - (4.72%)
Audio 18 of 360 - (5.0%)
Audio 19 of 360 - (5.28%)
Audio 20 of 360 - (5.56%)
Audio 21 of 360 - (5.83%)
Audio 22 of 360 - (6.11%)
Audio 23 of 360 - (6.39%)
Audio 24 of 360 - (6.67%)
Audio 25 of 360 - (6.94%)
Audio 26 of 360 - (7.22%)
Audio 27 of 360 - (7.5%)
Audio 28 of 360 - (7.78%)
Audio 29 of 360 - (8.06%)
Audio 30 of 360 - (8.33%)
Audio 31 of 360 - (8.61%)
Audio 32 of 360 - (8.89%)
Audio 33 of 360 - (9.17%)
Audio 34 of 360 - (9.44%)
Audio 35 of 360 - (9.72%)
Audio 36 of 360 - (10.0%)
Audio 37 of

In [34]:
# Hold-out accuracy: compare the arg-max of the logged probabilities with the
# emotion code stored in each CSV row.
# (`reader` comes from the notebook's import cell.)

# Emotion code from the file name -> class index; codes '01' and '02' share index 4.
# Deliberately not named `emotions`, so the index -> label dict is left untouched.
ravdess_code_to_class = {
    '05': 0,
    '07': 1,
    '06': 2,
    '03': 3,
    '01': 4,
    '04': 5,
    '08': 6,
    '02': 4
}

correct_predictions = 0
incorrect_predictions = 0
total_predictions = 0

with open('/content/drive/My Drive/UEPG/eng-comp/5-ano/tcc-v2/SER/hold-out/results-hold-out.csv',
          mode='r', newline='') as results_file:
    for row in reader(results_file):
        # Tolerate blank lines in the CSV
        if not row:
            continue

        # Column 8 holds the emotion code; an unknown code raises KeyError naming it
        expected_class = ravdess_code_to_class[row[8]]

        # Compare as numbers: max() over the raw strings would compare lexicographically
        probabilities = [float(value) for value in row[:7]]
        predicted_class = probabilities.index(max(probabilities))

        if predicted_class == expected_class:
            correct_predictions += 1
        else:
            incorrect_predictions += 1

        total_predictions += 1

# Avoid ZeroDivisionError when the results file has no rows
if total_predictions:
    correct_pct = round((correct_predictions / total_predictions) * 100, 2)
    incorrect_pct = round((incorrect_predictions / total_predictions) * 100, 2)
else:
    correct_pct = incorrect_pct = 0.0

print("Total predictions = {}".format(total_predictions))
print("Correct predictions = {} - ({}%)".format(correct_predictions, correct_pct))
print("Incorrect predictions = {} - ({}%)".format(incorrect_predictions, incorrect_pct))


Total predictions = 360
Correct predictions = 224 - (62.22%)
Incorrect predictions = 136 - (37.78%)
