 Cargamos las librerías necesarias para el desarrollo de redes neuronales:

In [None]:
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning) 
from keras.applications import Xception, InceptionResNetV2
from keras.layers import Input, Dense, Flatten, Conv2D,Conv1D, MaxPooling2D, Dropout, AveragePooling2D,MaxPooling1D,AveragePooling1D,BatchNormalization
from keras.models import Sequential, Model
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import Adam
import zipfile
import shutil
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Audio
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import librosa
import librosa.display
import seaborn as sns

Comprobamos que se está utilizando la GPU para reducir el tiempo de inferencia de las redes neuronales

In [None]:
device_type = "GPU"


def check_physical_devices(device_type):
    """Raise ``RuntimeError`` when no device of ``device_type`` is listed.

    NOTE(review): ``tf`` is not imported in the visible import cell and this
    function is not called in the visible cells -- confirm both.
    """
    if len(tf.config.list_physical_devices(device_type)) == 0:
        raise RuntimeError(f"No {device_type} devices are used in the host.")

# Detección de emociones mediante imágenes

## Carga de datos


Dividimos el conjunto de datos en entrenamiento, validación y prueba usando un muestreo estratificado.

In [None]:
seed = 27965  # fixed seed: Keras on its own does not guarantee reproducible experiments
# share of the whole dataset held out for test, and share of the remainder used for validation
test_ratio = 0.2
validation_ratio = 0.2

# Seed is established
np.random.seed(seed)

original_test_root = "../input/fer2013/test"
original_train_root = "../input/fer2013/train"
split_names = ("validation", "train", "test")


def _list_images(folder):
    """Return the sorted names of the regular files stored directly in ``folder``."""
    return sorted(f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f)))


def _copy_split(split_name, quota_per_class):
    """Copy the next ``quota_per_class[emotion]`` images of every emotion into ``split_name``.

    The copied names are removed from ``images_per_class_dict`` so that the
    splits never share an image.
    """
    for emotion in emotions:
        quota = quota_per_class[emotion]
        pending = images_per_class_dict[emotion]
        selected, images_per_class_dict[emotion] = pending[:quota], pending[quota:]
        for image in selected:
            # Same lookup order as before: the original test folder first, then train.
            # NOTE(review): a file name present in both folders always resolves to the test copy.
            origin = os.path.join(original_test_root, emotion, image)
            if not os.path.isfile(origin):
                origin = os.path.join(original_train_root, emotion, image)
            shutil.copy(origin, os.path.join(split_name, emotion))


# Start from empty split folders: leftovers of a previous run are deleted first.
for split_name in split_names:
    if os.path.isdir(split_name):
        shutil.rmtree(split_name)
    os.mkdir(split_name)

# os.listdir returns entries in arbitrary order; sorting keeps the seeded
# shuffles below reproducible across machines and runs.
emotions = sorted(os.listdir(original_train_root))  # names of the emotion folders

# remove 2 emotions to make training easier for the neural network
emotions.remove("surprise")
emotions.remove("disgust")

# For each emotion: create its folder in every split and gather + shuffle all its images
images_per_class_dict = {}
for emotion in emotions:
    print(emotion)
    for split_name in split_names:
        os.mkdir(os.path.join(split_name, emotion))
    images_of_emotion = (
        _list_images(os.path.join(original_test_root, emotion))
        + _list_images(os.path.join(original_train_root, emotion))
    )
    np.random.shuffle(images_of_emotion)
    images_per_class_dict[emotion] = images_of_emotion

total_dataset_images = sum(len(names) for names in images_per_class_dict.values())
print("Total amount of images in the dataset: " + str(total_dataset_images))

# Share of every class in the whole dataset, used to keep the sampling stratified
percentages_per_class_dict = {
    emotion: len(images_per_class_dict[emotion]) / total_dataset_images for emotion in emotions
}

images_for_test = int(total_dataset_images * test_ratio)
images_for_train_total = total_dataset_images - images_for_test
images_for_validation = int(images_for_train_total * validation_ratio)
images_for_train = images_for_train_total - images_for_validation

# Per-class quotas (truncated to int, so a few images may stay unassigned)
images_per_class_test_proportional_dict = {
    emotion: int(images_for_test * percentages_per_class_dict[emotion]) for emotion in emotions
}
images_per_class_train_proportional_dict = {
    emotion: int(images_for_train * percentages_per_class_dict[emotion]) for emotion in emotions
}
images_per_class_validation_proportional_dict = {
    emotion: int(images_for_validation * percentages_per_class_dict[emotion]) for emotion in emotions
}

# Test images are copied to the test folder
_copy_split("test", images_per_class_test_proportional_dict)
print("Images moved to test folder")

# Train images are copied to the train folder
_copy_split("train", images_per_class_train_proportional_dict)
print("Images moved to train folder")

# Validation images are copied to the validation folder
_copy_split("validation", images_per_class_validation_proportional_dict)
print("Images moved to validation folder")

Definimos el generador de imágenes para el training, validation y test, mediante ImageDataGenerator.

En el training aplicaremos data augmentation para evitar el sobreajuste ya que así añadimos algo de ruido a las imágenes:

* Normalizamos los valores de los píxeles entre 0 y 1
* Aplicamos un 0.2 de zoom
* Añadimos un desplazamiento de la imagen tanto vertical como horizontal de 0.2
* Finalmente establecemos un rango para el brillo de la imagen de entre [0.1,0.7]


In [None]:

# Training pipeline: pixel rescaling plus zoom, shift and brightness augmentation.
training_imagen_generator = ImageDataGenerator(
    rescale=1/255,
    zoom_range=0.2,
    brightness_range=[0.1,0.7],
    width_shift_range=0.2,
    height_shift_range=0.2,
)
# Validation and test pipelines only rescale the pixels.
validation_imagen_generator = ImageDataGenerator(rescale=1/ 255)
test_imagen_generator = ImageDataGenerator(rescale=1/ 255)

Ahora generamos los iteradores para cada generador creado previamente. 

Definimos un valor de batch de 64, lo que significa que se propagarán 64 imágenes en paralelo a través de la red neuronal, y establecemos un tamaño de imagen final.


In [None]:
# Batch size and image size shared by the three iterators.
batch= 64
target_size = (48,48)

directory="train"
training_iterator = training_imagen_generator.flow_from_directory(
    directory=directory,
    target_size=target_size,
    color_mode = "grayscale",
    batch_size=batch,
    seed=seed,
)



In [None]:
# Iterator over the validation folder, same batch size and image size as training.
directory="validation"
validation_iterator = validation_imagen_generator.flow_from_directory(
    directory=directory,
    target_size=target_size,
    color_mode = "grayscale",
    batch_size=batch,
    seed=seed,
)

In [None]:
# Iterator over the test folder.
# shuffle=False is required here: the predictions are later compared with
# test_iterator.classes, which is in directory order. With the default
# shuffle=True the batches come out in random order and the comparison
# (confusion matrix, label table) is meaningless.
directory="test"
test_iterator = test_imagen_generator.flow_from_directory(seed=seed,
                                                     directory=directory,
                                                     batch_size=batch,
                                                     target_size=target_size,color_mode = "grayscale",
                                                     shuffle=False)

## Arquitectura de la red neuronal

A continuación se han ido creando las diversas arquitecturas de redes convolucionales para hacer pruebas

In [None]:
activation = "relu"
input_shape=(*target_size,1)  # grayscale images: a single channel

# Pool of candidate layers for the experiments; only some of them are added to the model below.
convolutional_layer_1 = Conv2D(32, 12, activation=activation, input_shape=input_shape)
convolutional_layer_2 = Conv2D(64, 9, activation=activation, padding='same' )
convolutional_layer_3 = Conv2D(128,3, activation=activation, padding='same')
convolutional_layer_4 = Conv2D(128, 3, activation=activation, padding='same')
convolutional_layer_5 = Conv2D(128, 3, activation=activation, padding='same')
convolutional_layer_6 = Conv2D(128, 3, activation=activation, padding='same')

avg_pooling_layer_1 = AveragePooling2D(3)
avg_pooling_layer_2 = AveragePooling2D(2)
avg_pooling_layer_3 = AveragePooling2D(2)

max_pooling_layer_1 = MaxPooling2D(3)
max_pooling_layer_2 = MaxPooling2D(2)
max_pooling_layer_3 = MaxPooling2D(2)
max_pooling_layer_4 = MaxPooling2D(2)

flatten_layer = Flatten(data_format="channels_last")

dropout_layer_1 = Dropout(0.25, seed=seed)
dropout_layer_2 = Dropout(0.25, seed=seed)
dropout_layer_3 = Dropout(0.25, seed=seed)

hidden_layer_1 = Dense(1024, activation=activation)
hidden_layer_2 = Dense(512, activation=activation)
hidden_layer_3 = Dense(512, activation=activation)

# The output layer needs one unit per class folder found by the training iterator.
# A hard-coded 3 does not match the one-hot labels when the split keeps a
# different number of emotions, and training then fails with a shape mismatch.
number_of_classes = len(training_iterator.class_indices)
activation = "softmax"
output_layer = Dense(number_of_classes, activation=activation)


model = Sequential()

# Convolutional feature extractor
model.add(convolutional_layer_1)
model.add(max_pooling_layer_1)

model.add(convolutional_layer_2)
model.add(convolutional_layer_3)
model.add(max_pooling_layer_2)
model.add(dropout_layer_1)
model.add(convolutional_layer_4)
model.add(max_pooling_layer_3)
model.add(convolutional_layer_5)
model.add(max_pooling_layer_4)
model.add(dropout_layer_2)

# Classifier head
model.add(flatten_layer)

model.add(dropout_layer_3)


model.add(hidden_layer_2)

model.add(output_layer)


# Training hyper-parameters, consumed by the compile cell
lr = 1e-3
optimizer = Adam(lr=lr)
metrics = "accuracy"
loss = "categorical_crossentropy"

#show the model
from keras.utils.vis_utils import plot_model
# model.summary()
plot_model(model, to_file='model_plot_50.png', show_shapes=True, show_layer_names=True)

Compilamos el modelo creado y comienza el entrenamiento de la red. Para ello, se selecciona el conjunto de entrenamiento, y se realiza una validación con el conjunto de validación.

In [None]:
model.compile(loss=loss,
              metrics=metrics,
              optimizer=optimizer)
# Keeps in best.h5 the weights of the epoch with the highest validation accuracy.
checkpoint = ModelCheckpoint("best.h5", monitor='val_accuracy', verbose=1, save_best_only=True, mode='max', save_weights_only=True)
epochs = 50
generator = training_iterator
validation_data = validation_iterator
# Model.fit consumes the directory iterators directly, as predict/evaluate do
# further down; fit_generator is only the deprecated alias.
# NOTE: despite its name, `probabilities` holds the History object returned by fit.
probabilities = model.fit(generator, epochs=epochs, validation_data=validation_data, callbacks=[checkpoint])
# NOTE(review): this saves the model as it is after the LAST epoch; the
# best-epoch weights are the ones stored in best.h5 by the checkpoint.
model.save("best_image_model_3_2.h5")

Ahora procedemos a la predicción utilizando el conjunto de prueba, una vez la red ha sido entrenada

In [None]:
# Class probabilities of the trained network for every batch of the test iterator.
generator = test_iterator
convolutional_neural_network_predictions = model.predict(x=generator)

Mostramos los resultados en una matriz de confusión

In [None]:

# Predicted class = index of the highest probability in each prediction row.
y_pred = list(map(np.argmax, convolutional_neural_network_predictions))
# True class indices and class names as reported by the test iterator.
class_names = test_iterator.class_indices.keys()
y_test = test_iterator.classes
from sklearn.metrics import confusion_matrix
import itertools

def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a row-normalised confusion matrix as an annotated heat map.

    Args:
        cm: square matrix of counts; rows are true classes, columns predicted classes.
        classes: class names used as tick labels, in matrix order.
        title: title of the figure.
        cmap: matplotlib colour map used for the cells.

    A new 10x10 inch figure is opened; showing it is left to the caller.
    """
    counts = np.asarray(cm, dtype=float)
    row_totals = counts.sum(axis=1, keepdims=True)
    # Every row is divided by its number of true samples. A class without any
    # sample keeps a row of zeros instead of turning into NaN (0/0).
    cm = np.divide(counts, row_totals, out=np.zeros_like(counts), where=row_totals != 0)
    classes = list(classes)  # callers pass a dict key view; ticks need a sequence

    plt.figure(figsize=(10,10))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f'
    # Cells darker than half of the maximum get white text so it stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    
# compute confusion matrix
# `labels` pins one row/column per class index, so the matrix always lines up
# with the tick labels even if a class is missing from y_test and y_pred.
cnf_matrix = confusion_matrix(y_test, y_pred, labels=np.arange(len(class_names)))
np.set_printoptions(precision=2)

# plot normalized confusion matrix
# plot_confusion_matrix opens its own figure, so no plt.figure() call is made
# here (it only left an extra, empty figure in the output).
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Normalized confusion matrix')
plt.show()



## Ensemble de imágenes

Creamos un ensemble para utilizar varios modelos con la intención de mejorar los resultados obtenidos.

In [None]:
def load_models(models):
    """Load every saved model listed in ``models``.

    Each loaded model is renamed ``model_<position>`` so that several of them
    can be combined without name clashes.
    """
    def _load_and_rename(position, location):
        network = load_model(location)
        network._name = f"model_{position}"
        return network

    return [_load_and_rename(position, location) for position, location in enumerate(models)]



def ensemble(models,input_layer,final_layers):
    """Build one model that feeds ``input_layer`` to every saved model and merges the outputs.

    Args:
        models: paths of the saved models to load.
        input_layer: input tensor shared by all members.
        final_layers: a single merging layer, or a list of layers applied in order.
    """
    merged = [member(input_layer) for member in load_models(models)]
    stages = final_layers if isinstance(final_layers,list) else [final_layers]
    for stage in stages:
        merged = stage(merged)
    return Model(input_layer, merged)

In [None]:
from keras.models import load_model
from keras.layers import Average


# Saved image models trained with 5 emotions
model1_5 = "../input/image-models/image_5_emotions_new.h5"
model2_5 = "../input/image-models/image_5_emotions_new_2.h5"
model3_5 = "../input/image-models/image_5_emotions_new_3.h5"
models=[model1_5,model2_5, model3_5]

# All members receive the same 48x48 single-channel input; their outputs are averaged.
output_layer = Average()
input_layer = Input(shape=(48,48,1))

# NOTE: from here on `ensemble` is the built model, no longer the builder function.
ensemble = ensemble(models=models, input_layer=input_layer, final_layers=output_layer)

In [None]:
# Compiled so that evaluate() can report loss and accuracy.
ensemble.compile(optimizer = 'adam' , loss = 'categorical_crossentropy' , metrics = ['accuracy'])
# The file name had a stray double quote before the extension.
ensemble.save('ensemble_image_5_emotions.h5')

Evaluamos el ensemble

In [None]:
# Loss and accuracy of the averaged ensemble on the test iterator.
ensemble.evaluate(test_iterator)

In [None]:
# Averaged class probabilities of the ensemble for the test iterator.
ensemble_predictions = ensemble.predict(test_iterator)

In [None]:

# Predicted class = index of the highest averaged probability in each row.
y_pred = list(map(np.argmax, ensemble_predictions))
# True class indices and class names as reported by the test iterator.
class_names = test_iterator.class_indices.keys()
y_test = test_iterator.classes
from sklearn.metrics import confusion_matrix
import itertools

def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a row-normalised confusion matrix as an annotated heat map.

    Args:
        cm: square matrix of counts; rows are true classes, columns predicted classes.
        classes: class names used as tick labels, in matrix order.
        title: title of the figure.
        cmap: matplotlib colour map used for the cells.

    A new 10x10 inch figure is opened; showing it is left to the caller.
    """
    counts = np.asarray(cm, dtype=float)
    row_totals = counts.sum(axis=1, keepdims=True)
    # Every row is divided by its number of true samples. A class without any
    # sample keeps a row of zeros instead of turning into NaN (0/0).
    cm = np.divide(counts, row_totals, out=np.zeros_like(counts), where=row_totals != 0)
    classes = list(classes)  # callers pass a dict key view; ticks need a sequence

    plt.figure(figsize=(10,10))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f'
    # Cells darker than half of the maximum get white text so it stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    
# compute confusion matrix
# `labels` pins one row/column per class index, so the matrix always lines up
# with the tick labels even if a class is missing from y_test and y_pred.
cnf_matrix = confusion_matrix(y_test, y_pred, labels=np.arange(len(class_names)))
np.set_printoptions(precision=2)

# plot normalized confusion matrix
# plot_confusion_matrix opens its own figure, so no plt.figure() call is made
# here (it only left an extra, empty figure in the output).
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Normalized confusion matrix')
plt.show()



In [None]:
# Predicted and true class indices side by side.
df = pd.DataFrame({'Predicted Labels': y_pred, 'Actual Labels': y_test})

df.head(10)

# Detección mediante audio

Creamos un dataframe con la ruta de cada archivo WAV junto con la emoción que expresa cada uno, para poder trabajar con los datos

In [None]:
data = "../input/cremad/AudioWAV/"
# Sorted because os.listdir returns entries in arbitrary order, which would
# make the seeded train/test split depend on the file system.
crema_directory_list = sorted(os.listdir(data))

# Third "_"-separated field of the file name -> emotion label kept for training.
# Files carrying any other code are left out of the dataframe.
emotion_by_code = {
    'SAD': 'sad',
    'ANG': 'angry',
    'HAP': 'happy',
    'NEU': 'neutral',
    'FEA': 'fear',
}

file_emotion = []
file_path = []

for file in crema_directory_list:
    part=file.split('_')
    # File names without a third field are skipped instead of raising IndexError.
    label = emotion_by_code.get(part[2]) if len(part) > 2 else None
    if label is None:
        continue
    file_path.append(data + file)
    file_emotion.append(label)

# Per-class counters. The neutral clips used to be accumulated in `dis` and
# printed as "disg"; they are now counted and reported as neutral.
angry = file_emotion.count('angry')
fear = file_emotion.count('fear')
neu = file_emotion.count('neutral')
dis = 0  # kept for compatibility: no disgust clip is loaded

print("angry : "+ str(angry))
print("neutral: " +str(neu))
print("fear: "+ str(fear))
# dataframe for emotion of files
emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])

# dataframe for path of files.
path_df = pd.DataFrame(file_path, columns=['Path'])
Crema_df = pd.concat([emotion_df, path_df], axis=1)
Crema_df.head()



Obtenemos conjuntos de train y test

In [None]:
X = Crema_df["Path"]
Y = Crema_df["Emotions"]

# One-hot encode the emotion labels.
encoder = OneHotEncoder()
emotion_column = np.array(Y).reshape(-1,1)
Y = encoder.fit_transform(emotion_column).toarray()

# 80% train; the remaining 20% is halved into test and validation.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    X, Y, train_size = 0.8, test_size = 0.2, shuffle=True, random_state=1)
x_test, x_valid, y_test, y_valid = train_test_split(
    x_holdout, y_holdout, train_size = 0.5, test_size=0.5, shuffle=True, random_state=1)

x_train.shape, y_train.shape, x_test.shape, y_test.shape, x_valid.shape, y_valid.shape


Realizamos data augmentation al conjunto de entrenamiento para evitar sobreajuste. 
Añadimos a los audios ruido, cambiamos la velocidad, el tono de la voz y el tiempo

In [None]:
def noise(data):
    """Return ``data`` plus Gaussian noise.

    The noise amplitude is a random fraction (at most 3.5%) of the maximum
    value found in ``data``.
    """
    noise_amp = 0.035*np.random.uniform()*np.amax(data)
    gaussian = np.random.normal(size=len(data))
    return data + noise_amp*gaussian


def stretch(data, rate=0.8):
    """Time-stretch ``data`` by ``rate`` with librosa (no pitch change intended).

    ``rate`` is passed by keyword: recent librosa releases accept it only as a
    keyword argument, and older ones accept the keyword as well.
    """
    return librosa.effects.time_stretch(data, rate=rate)

def shift(data):
    """Rotate ``data`` circularly by a random offset between -5000 and 5000 positions."""
    offset = np.random.uniform(low=-5, high = 5)*1000
    return np.roll(data, int(offset))

def pitch(data, sampling_rate, pitch_factor=0.7):
    """Shift the pitch of ``data`` by ``pitch_factor`` steps with librosa.

    ``sr`` and ``n_steps`` are passed by keyword: recent librosa releases
    accept them only as keyword arguments, and older ones accept the keywords
    as well.
    """
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)

# Take the first element of the training set as the example clip.
# NOTE: `data` and `sample_rate` stay defined at notebook level after this cell.
path = np.asarray(x_train)[0]
data, sample_rate = librosa.load(path)
sample_rate

Representamos un ejemplo

In [None]:
def create_waveplot(data, sr, e):
    """Show the waveform of ``data`` (sampled at ``sr``) titled with the emotion ``e``."""
    heading = f'Waveplot for audio with {e} emotion'
    plt.figure(figsize=(10, 3))
    plt.title(heading, size=15)
    librosa.display.waveplot(data, sr=sr)
    plt.show()

def create_spectrogram(data, sr, e):
    """Draw the spectrogram (in dB) of ``data`` titled with the emotion ``e``."""
    # short-term Fourier transform, then magnitude converted to decibels
    decibels = librosa.amplitude_to_db(abs(librosa.stft(data)))
    heading = f'Spectrogram for audio with {e} emotion'
    plt.figure(figsize=(12, 3))
    plt.title(heading, size=15)
    librosa.display.specshow(decibels, sr=sr, x_axis='time', y_axis='hz')
    plt.colorbar()

In [None]:
# Waveform of the unmodified example clip, followed by an audio player for it.
plt.figure(figsize=(14,4))
librosa.display.waveplot(y=data, sr=sample_rate)
Audio(path)


In [None]:
# Example clip with random noise added: waveform and audio player.
x = noise(data)
plt.figure(figsize=(14,4))
librosa.display.waveplot(y=x, sr=sample_rate)
Audio(x, rate=sample_rate)

In [None]:
# Example clip after time stretching (default rate): waveform and audio player.
x = stretch(data)
plt.figure(figsize=(14,4))
librosa.display.waveplot(y=x, sr=sample_rate)
Audio(x, rate=sample_rate)

In [None]:
# Example clip after pitch shifting (default factor): waveform and audio player.
x = pitch(data, sample_rate)
plt.figure(figsize=(14,4))
librosa.display.waveplot(y=x, sr=sample_rate)
Audio(x, rate=sample_rate)

Seleccionamos las siguientes transformaciones para obtener características y transformar el audio en un formato válido, usando sample_rate y sample_data :


In [None]:
def extract_features(data, sr=None):
    """Summarise an audio signal as one flat feature vector.

    The vector concatenates, in this order, the time-averaged zero crossing
    rate, chroma (from the STFT magnitude), MFCC, RMS energy and mel
    spectrogram of ``data``.

    Args:
        data: audio samples as returned by ``librosa.load``.
        sr: sampling rate of ``data``. When omitted, the notebook-level
            ``sample_rate`` is used, which keeps old calls working but
            requires the example-clip cell to have been run.

    Returns:
        1-D float64 array.
        NOTE(review): presumably 162 values with librosa's default settings
        (the audio ensemble uses a (162, 1) input) -- confirm.
    """
    if sr is None:
        sr = sample_rate

    stft = np.abs(librosa.stft(data))
    feature_matrices = [
        # ZCR: rate of sign changes of the signal
        librosa.feature.zero_crossing_rate(y=data),
        # Chroma_stft: chromagram computed from the spectrum
        librosa.feature.chroma_stft(S=stft, sr=sr),
        # MFCC: represent speech according to human perception
        librosa.feature.mfcc(y=data, sr=sr),
        # Root mean square value of the amplitude per frame
        librosa.feature.rms(y=data),
        # Mel spectrogram
        librosa.feature.melspectrogram(y=data, sr=sr),
    ]
    # Average every feature over time, then lay them side by side. The cast
    # keeps the float64 result that stacking onto an empty array produced.
    averaged = [np.mean(matrix.T, axis=0) for matrix in feature_matrices]
    return np.hstack(averaged).astype(np.float64)


def get_features(path):
    """Load a clip and return the features of the clip and of two augmented copies.

    Rows of the returned 2-D array: original signal, signal with noise,
    signal stretched and then pitch-shifted. Only the window starting at
    0.6 s and lasting 2.5 s of the file is loaded.
    """
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)

    variants = (
        data,                                   # without augmentation
        noise(data),                            # data with noise
        pitch(stretch(data), sample_rate),      # data with stretching and pitching
    )
    # The rate returned by the loader is passed on explicitly, so the features
    # never depend on a value left behind by another cell.
    return np.vstack([extract_features(variant, sample_rate) for variant in variants])


def get_features_no_augmentation(path):
    """Load a clip (same 2.5 s window as ``get_features``) and return only its own features."""
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)
    return np.array(extract_features(data, sample_rate))

Aplicamos las características con el data augmentation al conjunto de entrenamiento

In [None]:
X1, Y1 = [], []

cont = 0  # number of feature vectors collected
for path, emotion in zip(x_train, y_train):
    feature = get_features(path)
    # get_features returns one row per variant of the clip (original plus
    # augmented copies); the label is repeated once per row.
    X1.extend(feature)
    Y1.extend(emotion for _ in feature)
    cont += len(feature)



Aplicamos las características al conjunto de test

In [None]:
# One feature vector per test clip (no augmentation), with its label.
X2 = [get_features_no_augmentation(path) for path in x_test]
Y2 = list(y_test)

Aplicamos las características al conjunto de validación


In [None]:
# One feature vector per validation clip (no augmentation), with its label.
X3 = [get_features_no_augmentation(path) for path in x_valid]
Y3 = list(y_valid)

## Entrenamiento de la red

Escalamos y preparamos los datos para hacerlos compatibles con la red.

In [None]:
# Feature and label lists gathered above, as arrays.
y_train =np.array(Y1)
y_test = np.array(Y2)
x_train = np.array(X1)
x_test  =  np.array(X2)
x_validation = np.array(X3)
y_validation = np.array(Y3)

# The scaler is fitted on the training features only. Validation and test are
# transformed with those same statistics. Re-fitting on the validation set
# (as was done before) also changed the scaling later applied to the test set.
scaler = StandardScaler()
x_train_model = scaler.fit_transform(x_train)
x_valid_model = scaler.transform(x_validation)
x_test_model = scaler.transform(x_test)
x_train.shape, y_train.shape, x_test.shape, y_test.shape



In [None]:
# Conv1D expects a trailing channel axis: (samples, features) -> (samples, features, 1)
x_train, x_validation, x_test = (
    np.expand_dims(scaled, axis=2)
    for scaled in (x_train_model, x_valid_model, x_test_model)
)
x_train.shape, y_train.shape, x_validation.shape, y_validation.shape, x_test.shape, y_test.shape

Creamos las diversas arquitecturas de red para hacer pruebas

In [None]:
from keras.layers import LSTM,BatchNormalization, Activation, Bidirectional, GRU,AveragePooling1D, MaxPooling1D

# 1-D convolutional network over the feature vector, ending in a 5-way softmax.
model = Sequential([
    Conv1D(64, kernel_size=9, strides=1, padding='same', activation='relu', input_shape=(x_train.shape[1], 1)),
    Conv1D(128, kernel_size=9, strides=1, padding='same', activation='relu'),
    AveragePooling1D(pool_size=2, strides = 2, padding = 'same'),

    Conv1D(128, kernel_size=9, strides=1, padding='same', activation='relu'),
    AveragePooling1D(pool_size=2, strides = 2, padding = 'same'),
    Dropout(0.25),

    Conv1D(128, kernel_size=9, strides=1, padding='same', activation='relu'),
    Conv1D(256, kernel_size=6, strides=1, padding='same', activation='relu'),
    AveragePooling1D(pool_size=3, strides = 2, padding = 'same'),

    Conv1D(256, kernel_size=3, strides=1, padding='same', activation='relu'),
    Conv1D(256, kernel_size=3, strides=1, padding='same', activation='relu'),
    AveragePooling1D(pool_size=3, strides = 2, padding = 'same'),
    Dropout(0.25),

    Conv1D(256, kernel_size=3, strides=2, padding='same', activation='relu'),
    Conv1D(256, kernel_size=3, strides=2, padding='same', activation='relu'),
    AveragePooling1D(pool_size=3, strides = 2, padding = 'same'),

    # Classifier head
    Flatten(),
    Dense(units=512, activation='relu'),
    Dropout(0.25),
    Dense(units=128, activation='relu'),
    Dense(5, activation="softmax"),
])
model.summary()
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy',  metrics = ['accuracy'])

hist = model.fit(
    x_train,
    y_train,
    validation_data= (x_validation, y_validation),
    epochs = 30,
    batch_size = 32,
)

Guardamos y evaluamos los resultados con el conjunto de test

In [None]:
# Persist the trained audio network (architecture and weights) to an HDF5 file.
model.save('audio_6_model_5_emotions_new.h5')

In [None]:
# Test accuracy as a percentage (index 1 of evaluate() is the accuracy metric).
# Expects y_test to still hold the one-hot labels, i.e. run before the decoding cell.
model.evaluate(x_test,y_test)[1]*100

In [None]:
# Predict on the test set and turn the probability rows back into emotion names.
pred_test = model.predict(x_test)
y_pred = encoder.inverse_transform(pred_test)

# The true labels are decoded from Y2 (the one-hot test labels gathered during
# feature extraction) instead of from y_test itself, so this cell can be run
# again without applying inverse_transform to already decoded labels.
y_test = encoder.inverse_transform(np.array(Y2))


In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test clips whose decoded prediction equals the decoded true label.
accuracy_score( y_test,y_pred)

Mostramos los resultados

In [None]:
# Predicted and true emotion names side by side.
df = pd.DataFrame({
    'Predicted Labels': y_pred.flatten(),
    'Actual Labels': y_test.flatten(),
})

df.head(10)

In [None]:
# Confusion matrix of the audio model, labelled with the encoder categories.
category_labels = list(encoder.categories_)
cm = pd.DataFrame(confusion_matrix(y_test, y_pred), index = category_labels, columns = category_labels)
plt.figure(figsize = (12, 10))
sns.heatmap(cm, linecolor='white', cmap='Blues', linewidth=1, annot=True, fmt='')
plt.title('Confusion Matrix', size=20)
plt.xlabel('Predicted Labels', size=14)
plt.ylabel('Actual Labels', size=14)
plt.show()

[](http://)

## Ensemble de audio

Creamos un ensemble para unir varios modelos del audio con intención de mejorar los resultados obtenidos

In [None]:
def load_models(models):
    """Load every saved model listed in ``models``.

    Each loaded model is renamed ``model_<position>`` so that several of them
    can be combined without name clashes.
    """
    def _load_and_rename(position, location):
        network = load_model(location)
        network._name = f"model_{position}"
        return network

    return [_load_and_rename(position, location) for position, location in enumerate(models)]

def ensemble(models,input_layer,final_layers):
    """Build one model that feeds ``input_layer`` to every saved model and merges the outputs.

    Args:
        models: paths of the saved models to load.
        input_layer: input tensor shared by all members.
        final_layers: a single merging layer, or a list of layers applied in order.
    """
    merged = [member(input_layer) for member in load_models(models)]
    stages = final_layers if isinstance(final_layers,list) else [final_layers]
    for stage in stages:
        merged = stage(merged)
    return Model(input_layer, merged)

In [None]:
from keras.models import load_model
from keras.layers import Average

# Saved audio models trained with 5 emotions
model1="../input/audio-models/audio_1_model_5_emotions.h5"
model2= "../input/audio-models/audio_2_model_5_emotions.h5"
model3= "../input/audio-models/audio_3_model_5_emotions.h5"
model4= "../input/audio-models/audio_4_model_5_emotions.h5"
model5= "../input/audio-models/audio_5_model_5_emotions.h5"
model6= "../input/audio-models/audio_6_model_5_emotions.h5"
models =[model1, model2, model3, model4, model5, model6]

# All members receive the same (162, 1) input; their outputs are averaged.
output_layer= Average()
input_shape = (162, 1)
model_input = Input(shape=input_shape)

# NOTE: from here on `ensemble` is the built model, no longer the builder function.
ensemble = ensemble(models=models, input_layer=model_input, final_layers=output_layer)



Compilamos y evaluamos los resultados

In [None]:
# Compiled so that evaluate() can report loss and accuracy, then saved to an HDF5 file.
ensemble.compile(optimizer = 'adam' , loss = 'categorical_crossentropy' , metrics = ['accuracy'])
ensemble.save('ensemble_audio_5_emotions.h5')

In [None]:
# Test accuracy of the ensemble as a percentage.
# y_test holds decoded emotion names by this point (an earlier cell replaced
# the one-hot rows), so the one-hot targets are rebuilt from Y2.
ensemble.evaluate(x_test, np.array(Y2))[1]*100

In [None]:
# Ensemble predictions decoded back to emotion names.
pred_test = ensemble.predict(x_test)
y_pred = encoder.inverse_transform(pred_test)
# True labels are decoded from the one-hot rows in Y2. Decoding y_test again
# fails because an earlier cell already replaced it with emotion names.
y_test = encoder.inverse_transform(np.array(Y2))

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test clips whose decoded ensemble prediction equals the decoded true label.
accuracy_score(y_test,y_pred)

Mostramos los resultados

In [None]:
# Predicted and true emotion names of the ensemble side by side.
df = pd.DataFrame({
    'Predicted Labels': y_pred.flatten(),
    'Actual Labels': y_test.flatten(),
})

df.head(15)

In [None]:
# Confusion matrix of the audio ensemble, labelled with the encoder categories.
category_labels = list(encoder.categories_)
cm = pd.DataFrame(confusion_matrix(y_test, y_pred), index = category_labels, columns = category_labels)
plt.figure(figsize = (12, 10))
sns.heatmap(cm, linecolor='white', cmap='Blues', linewidth=1, annot=True, fmt='')
plt.title('Confusion Matrix', size=20)
plt.xlabel('Predicted Labels', size=14)
plt.ylabel('Actual Labels', size=14)
plt.show()
