In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import pickle
import re
import librosa
import librosa.display
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from tqdm import tqdm
import soundfile
from keras.layers import Dense, Conv2D, MaxPool2D , Flatten, BatchNormalization, Dropout, ELU, LSTM, Reshape
from keras.models import Model, Sequential
from keras.applications import VGG16, Xception
from tensorflow.keras.optimizers.legacy import Adam
from sklearn.preprocessing import OneHotEncoder
import keras
from scipy.io import wavfile
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Global Variables

In [2]:
sr = 16000
seg_size = 8

emotion_dict = {'ang': 0,
                'hap': 1,
                'exc': 2,
                'sad': 3,
                'fru': 4,
                'fea': 5,
                'sur': 6,
                'neu': 7,
                'dis': 8,
                'xxx': 9,
                'oth': 9}

### Reading Labels

In [3]:
def read_labels(datapath):
    info_line = re.compile(r'\[.+\]\n', re.IGNORECASE)

    start_times, end_times, wav_file_names, emotions, vals, acts, doms = [], [], [], [], [], [], []

    evaluation_files = [l for l in os.listdir(datapath) if 'Ses' in l]
    for file in evaluation_files:
        with open(datapath + file) as f:
            content = f.read()

        info_lines = re.findall(info_line, content)

        for line in info_lines[1:]:  # the first line is a header
            start_end_time, wav_file_name, emotion, val_act_dom = line.strip().split('\t')
            start_time, end_time = start_end_time[1:-1].split('-')

            val, act, dom = val_act_dom[1:-1].split(',')
            val, act, dom = float(val), float(act), float(dom)

            start_time, end_time = float(start_time), float(end_time)

            start_times.append(start_time)
            end_times.append(end_time)
            wav_file_names.append(wav_file_name)
            emotions.append(emotion)
            vals.append(val)
            acts.append(act)
            doms.append(dom)

    df_iemocap = pd.DataFrame(columns=['start_time', 'end_time', 'wav_file', 'emotion', 'val', 'act', 'dom'])

    df_iemocap['start_time'] = start_times
    df_iemocap['end_time'] = end_times
    df_iemocap['wav_file'] = wav_file_names
    df_iemocap['emotion'] = emotions
    df_iemocap['val'] = vals
    df_iemocap['act'] = acts
    df_iemocap['dom'] = doms

    df_iemocap.to_csv('df_iemocap.csv', index=False)

    return df_iemocap

### Reading audio files

In [4]:
def read_audio(wav_file_path, labels_df):

    audio_vectors = {}
    orig_wav_files = os.listdir(wav_file_path)
    
    for orig_wav_file in tqdm(orig_wav_files):

        #samplerate, orig_wav_vector = wavfile.read(wav_file_path + orig_wav_file)
        orig_wav_vector, samplerate = librosa.load(wav_file_path + orig_wav_file, sr=sr, mono=False)

        left = orig_wav_vector[0]
        right = orig_wav_vector[1]

        orig_wav_file, file_format = orig_wav_file.split('.')

        for index, row in labels_df[labels_df['wav_file'].str.contains(orig_wav_file)].iterrows():

            start, end, truncated_wav_file_name, emotion, val, act, dom = row['start_time'], row[
                'end_time'], row['wav_file'], row['emotion'], row['val'], row['act'], row['dom']

            frames = []

            fright = right[int(start * samplerate):int(end * samplerate)]
            fleft = left[int(start * samplerate):int(end * samplerate)]

            length = int(fright.shape[0] / samplerate)

            if length > seg_size:
                for i in range(math.ceil(length/seg_size)):

                    start_f = i * seg_size
                    end_f = start_f + seg_size

                    if i == math.ceil(length/seg_size) - 1 and int(length/seg_size) == i:
                        end_f = length
                        start_f = length - seg_size
                    
                    frames.append(fright[int(start_f * samplerate):int(end_f * samplerate)])
                    frames.append(fleft[int(start_f * samplerate):int(end_f * samplerate)])
                    

            elif length < seg_size:
                padded_fl = np.pad(fleft, ((length%2) * samplerate, math.floor((seg_size-length)/2)* 2 * samplerate), 'mean')
                padded_fr = np.pad(fright, ((length%2) * samplerate, math.floor((seg_size-length)/2)* 2 * samplerate), 'mean')

                frames.append(padded_fl[:seg_size * samplerate])
                frames.append(padded_fr[:seg_size * samplerate])

            else:
                frames.append(fright[:seg_size * samplerate])
                frames.append(fleft[:seg_size * samplerate])
            
            
            audio_vectors[truncated_wav_file_name] = frames
                

    with open('audio_vectors1.pkl', 'wb') as f:
        pickle.dump(audio_vectors, f)

    return audio_vectors


### Extracting features

In [5]:
def extract_audio_features(audio_vectors, labels_df, emotion_dict):
    file_name, x, y = [], [], []
    i=0
    for index, row in tqdm(labels_df.iterrows()):
        i+=1
        wav_file_name = row['wav_file']
        label = emotion_dict[row['emotion']]
        audio = audio_vectors[wav_file_name]
    
        for frame in audio:
            mfcc = np.array(librosa.feature.mfcc(y=frame, sr=sr))
            mfcc_mean = mfcc.mean(axis=1)
            mfcc_min = mfcc.min(axis=1)
            mfcc_max = mfcc.max(axis=1)
            mfcc_feature = np.concatenate( (mfcc_mean, mfcc_min, mfcc_max) )
     
            file_name.append(wav_file_name)
            x.append(mfcc_feature.reshape(1, -1).tolist()[0])
            y.append(label)
        
        if i == 267:
            break
    
    df = pd.concat([pd.DataFrame(file_name, columns=["file_name"]), pd.DataFrame(x), pd.DataFrame(y, columns=["label"])], axis=1)
    #df.to_csv('audios_features.csv', index=False)
    
    return df

In [6]:
def preprocess(audio_vectors, labels_df, emotion_dict):
    file_name, x, y = [], [], []
    i=0
    for index, row in tqdm(labels_df.iterrows()):
        i+=1
        wav_file_name = row['wav_file']
        label = emotion_dict[row['emotion']]
        audio = audio_vectors[wav_file_name]
    
        for frame in audio:
            file_name.append(wav_file_name)
            x.append(frame.reshape(1, -1).tolist()[0])
            y.append(label)
        if i == 1000:
            break
    df = pd.concat([pd.DataFrame(file_name, columns=["file_name"]), pd.DataFrame(x), pd.DataFrame(y, columns=["label"])], axis=1)
    df.to_csv('audios_features.csv', index=False)
    
    return df

In [7]:
def features_2d(audio_vectors, labels_df, emotion_dict):
    file_name, x, y = [], [], []
    i=0
    for index, row in tqdm(labels_df.iterrows()):
        i+=1
        wav_file_name = row['wav_file']
        label = emotion_dict[row['emotion']]
        audio = audio_vectors[wav_file_name]
    
        for frame in audio:
            mfcc = np.array(librosa.feature.mfcc(y=frame, sr=sr, n_mfcc=128))
            chroma = np.array(librosa.feature.chroma_stft(y=frame, sr=sr, n_chroma=128))
            mel = np.array(librosa.feature.melspectrogram(y=frame, sr=sr, n_mels=128))
            
            mel = librosa.power_to_db(mel)
            mel = mel.astype(np.float32)
            
            chroma = librosa.power_to_db(chroma)
            chroma = chroma.astype(np.float32)
            
            mfcc = librosa.power_to_db(mfcc)
            mfcc = mfcc.astype(np.float32)
            
            file_name.append(wav_file_name)
            feature = [mfcc, chroma, mel]
            x.append(feature)
            y.append(label)
        
#         if i == 16:
#             break
    
    x = np.array(x).reshape(-1, mel.shape[0], mel.shape[1], 3)
    print(x.shape)
    # df = pd.concat([pd.DataFrame(file_name, columns=["file_name"]), pd.DataFrame(x), pd.DataFrame(y, columns=["label"])], axis=1)
    # df.to_csv('audios_features.csv', index=False)
    
    with open('x1.pkl', 'wb') as f:
        pickle.dump(x, f)
        
    with open('y1.pkl', 'wb') as f:
        pickle.dump(y, f)
        
    print('done')
    
    return x, y, mel.shape

### Keras Model

In [8]:
def create_model(input_shape, n_units):
    model = Sequential()
    model.add(Input(shape=input_shape))
    
    model.add(Dense(128, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(64, activation='relu'))
    
    model.add(Dropout(0.5))
 
    model.add(Dense(units=n_units, activation="softmax"))
    
    opt = Adam(learning_rate=0.1)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [9]:
def create_model2(input_shape, n_units):
    model = Sequential()
    model.add(Input(shape=input_shape))
    
    model.add(Flatten())

    model.add(Dense(1024, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    
    model.add(Dense(1024, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    
    model.add(Dense(1024, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    
    model.add(Dense(1024, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    
    model.add(Dense(1024, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(units=n_units, activation="softmax"))
    
    opt = Adam(learning_rate=0.01)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [10]:
def convNet(shape, n_units):
    model = Sequential()

    model.add(Conv2D(input_shape=shape, filters=16, kernel_size=(5,5), strides=(2,2), activation='relu'))
    model.add(BatchNormalization())

    model.add(Conv2D(filters=32, kernel_size=(3,3), strides=(2,2), activation='relu'))
    model.add(BatchNormalization())
    
    model.add(Flatten())

    model.add(Dense(units=716,activation="relu"))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(units=716,activation="relu"))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(units=n_units, activation="softmax"))
    adam = Adam(lr=0.001)

    model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'], run_eagerly=True)

    return model


In [11]:
def vgg16(shape, n_units):
    base_model = VGG16(weights = "imagenet", include_top=False, input_shape = shape)
    base_model.trainable = False
    inputs = keras.Input(shape=shape)
    
    model = Sequential()
    
    model.add(base_model)
    
    model.add(Flatten())

    model.add(Dense(units=32,activation="relu"))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(units=16,activation="relu"))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    
    model.add(Dense(units=n_units, activation="softmax"))
    adam = Adam(lr=0.1)

    model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'], run_eagerly=True)

    return model
    

In [12]:
def cnn_lstm(shape, n_units):
    model = Sequential()
    
    # 1
    model.add(Conv2D(input_shape=shape, filters=64, kernel_size=(3,3), strides=(1,1), padding='same'))
    model.add(BatchNormalization())
    model.add(ELU())
    model.add(MaxPool2D(pool_size=(2, 2), strides=(2, 2)))
    
    #2
    model.add(Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding='same'))
    model.add(BatchNormalization())
    model.add(ELU())
    model.add(MaxPool2D(pool_size=(4, 4), strides=(4, 4)))
    
    #3
    model.add(Conv2D(filters=128, kernel_size=(3,3), strides=(1,1), padding='same'))
    model.add(BatchNormalization())
    model.add(ELU())
    model.add(MaxPool2D(pool_size=(4, 4), strides=(4, 4)))
    
    #4
    model.add(Conv2D(filters=128, kernel_size=(3,3), strides=(1,1), padding='same'))
    model.add(BatchNormalization())
    model.add(ELU())
    model.add(MaxPool2D(pool_size=(4, 4), strides=(4, 4)))
    
    #Reshape output for lstm
    model.add(Reshape((-1,128)))
    
    #LSTM
    model.add(LSTM(units=256, return_sequences=True))
    
    #Softmax
    model.add(Flatten())
    model.add(Dense(units=n_units, activation="softmax"))
    adam = Adam(lr=0.01)

    model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'], run_eagerly=True)

    return model


### Code and trainig

In [13]:
labels_path = '/kaggle/input/iemocapfullrelease/IEMOCAP_full_release/Session1/dialog/EmoEvaluation/'
wav_path = '/kaggle/input/iemocapfullrelease/IEMOCAP_full_release/Session1/dialog/wav/'
# read the pickle and the csv

labels_df = pd.read_csv("/kaggle/working/df_iemocap.csv")
#audio_vectors = pickle.load(open('/kaggle/working/audio_vectors1.pkl', 'rb'))
#labeled_features_df = pd.read_csv("/kaggle/working/audios_features.csv")

# labels_df = read_labels(labels_path)
audio_vectors = read_audio(wav_path, labels_df)

#labeled_features_df = extract_audio_features(audio_vectors, labels_df, emotion_dict)
#labeled_features_df = preprocess(audio_vectors, labels_df, emotion_dict)

# x = pickle.load(open('/kaggle/working/x1.pkl', 'rb'))
# y = pickle.load(open('/kaggle/working/y1.pkl', 'rb'))

x, y, shape = features_2d(audio_vectors, labels_df, emotion_dict)

#x = labeled_features_df.drop(columns=['label', 'file_name'])

enc = OneHotEncoder()
y = enc.fit_transform(np.asarray(y).reshape(-1,1)).toarray()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

#### MLP MODEL
#model = MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500)
# # print(x_train[0].shape)
# x_train = np.array(x_train).reshape(-1,1)
# y_train = np.array(y_train).reshape(-1,1)
#print('shape is ',x_train.shape,' ',y_train.shape)

# model.fit(x_train, y_train)
# y_pred = model.predict(x_test)
# accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
# print("Accuracy: {:.2f}%".format(accuracy * 100))

print('create model')
model = cnn_lstm((shape[0], shape[1], 3), len(y[0]))
model.build()
print(model.summary())
print('fit model')

model.fit(x_train, y_train, epochs = 100, verbose=1, validation_data=(x_test, y_test), steps_per_epoch = len(x_train) // 32)

100%|██████████| 28/28 [00:10<00:00,  2.76it/s]
1819it [07:20,  4.13it/s]


(4000, 128, 251, 3)
done
create model
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 128, 251, 64)      1792      
                                                                 
 batch_normalization (BatchN  (None, 128, 251, 64)     256       
 ormalization)                                                   
                                                                 
 elu (ELU)                   (None, 128, 251, 64)      0         
                                                                 
 max_pooling2d (MaxPooling2D  (None, 64, 125, 64)      0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 64, 125, 64)       36928     
                                                                 
 batch_normalizati

  super(Adam, self).__init__(name, **kwargs)


None
fit model
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 

<keras.callbacks.History at 0x7fc7736a53d0>