In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import pickle
import re
import librosa
import librosa.display
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from tqdm import tqdm
import soundfile
from keras.layers import Input, Dense, Dropout
from keras.models import Model, Sequential
from tensorflow.keras.optimizers.legacy import Adam
from sklearn.preprocessing import OneHotEncoder

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Global Variables

In [2]:
sr = 44100

emotion_dict = {'ang': 0,
                'hap': 1,
                'exc': 2,
                'sad': 3,
                'fru': 4,
                'fea': 5,
                'sur': 6,
                'neu': 7,
                'dis': 8,
                'xxx': 9,
                'oth': 9}

### Reading Labels

In [3]:
def read_labels(datapath):
    min_frame = 999999
    info_line = re.compile(r'\[.+\]\n', re.IGNORECASE)

    start_times, end_times, wav_file_names, emotions, vals, acts, doms = [], [], [], [], [], [], []

    evaluation_files = [l for l in os.listdir(datapath) if 'Ses' in l]
    for file in evaluation_files:
        with open(datapath + file) as f:
            content = f.read()

        info_lines = re.findall(info_line, content)

        for line in info_lines[1:]:  # the first line is a header
            start_end_time, wav_file_name, emotion, val_act_dom = line.strip().split('\t')
            start_time, end_time = start_end_time[1:-1].split('-')

            val, act, dom = val_act_dom[1:-1].split(',')
            val, act, dom = float(val), float(act), float(dom)

            start_time, end_time = float(start_time), float(end_time)
            min_frame = min(min_frame, end_time - start_time)

            start_times.append(start_time)
            end_times.append(end_time)
            wav_file_names.append(wav_file_name)
            emotions.append(emotion)
            vals.append(val)
            acts.append(act)
            doms.append(dom)

    df_iemocap = pd.DataFrame(columns=['start_time', 'end_time', 'wav_file', 'emotion', 'val', 'act', 'dom'])

    df_iemocap['start_time'] = start_times
    df_iemocap['end_time'] = end_times
    df_iemocap['wav_file'] = wav_file_names
    df_iemocap['emotion'] = emotions
    df_iemocap['val'] = vals
    df_iemocap['act'] = acts
    df_iemocap['dom'] = doms

    df_iemocap.to_csv('df_iemocap.csv', index=False)

    return df_iemocap, min_frame

### Reading audio files

In [4]:
def read_audio(wav_file_path, labels_df, min_frame):

    audio_vectors = {}
    orig_wav_files = os.listdir(wav_file_path)
    
    for orig_wav_file in tqdm(orig_wav_files):
        try:
            with soundfile.SoundFile(wav_file_path + orig_wav_file) as sound_file:
                
                orig_wav_vector = sound_file.read(dtype="float32")
                sr=sound_file.samplerate
                
                #orig_wav_vector, _sr = librosa.load(wav_file_path + orig_wav_file, sr=sr)
                orig_wav_vector = orig_wav_vector.reshape(-1, 1)

                orig_wav_file, file_format = orig_wav_file.split('.')

                for index, row in labels_df[labels_df['wav_file'].str.contains(orig_wav_file)].iterrows():
                    frames = []
                    start_time, end_time, truncated_wav_file_name, emotion, val, act, dom = row['start_time'], row[
                        'end_time'], row['wav_file'], row['emotion'], row['val'], row['act'], row['dom']

                    start_frame = math.floor(start_time * sr)
                    end_frame = math.floor(end_time * sr)
                    
                    truncated_wav_vector = orig_wav_vector[start_frame:end_frame + 1].reshape(-1, 1)
                    
                    for i in range(math.floor((end_time-start_time)/min_frame)):
                        startf_time = i * min_frame
                        endf_time = startf_time + min_frame
                        
                        start_frame = math.floor(startf_time * sr)
                        end_frame = math.floor(endf_time * sr)
                        
                        frame = truncated_wav_vector[start_frame:end_frame + 1].reshape(-1, 1)
                        frames.append(frame)
                    
                    audio_vectors[truncated_wav_file_name] = frames
        except:
            print('')

    with open('audio_vectors1.pkl', 'wb') as f:
        pickle.dump(audio_vectors, f)


    return audio_vectors


### Extracting features

In [5]:
def extract_audio_features(audio_vectors, labels_df, emotion_dict):
    file_name, x, y = [], [], []
    i=0
    for index, row in tqdm(labels_df.iterrows()):
        i+=1
        wav_file_name = row['wav_file']
        label = emotion_dict[row['emotion']]
        audio = audio_vectors[wav_file_name]
    
        for frame in audio:
            mfcc = np.array(librosa.feature.mfcc(y=frame, sr=sr))
            mfcc_mean = mfcc.mean(axis=1)
            mfcc_min = mfcc.min(axis=1)
            mfcc_max = mfcc.max(axis=1)
            mfcc_feature = np.concatenate( (mfcc_mean, mfcc_min, mfcc_max) )
     
            file_name.append(wav_file_name)
            x.append(mfcc_feature.reshape(1, -1).tolist()[0])
            y.append(label)
        
        if i == 267:
            break
    
    df = pd.concat([pd.DataFrame(file_name, columns=["file_name"]), pd.DataFrame(x), pd.DataFrame(y, columns=["label"])], axis=1)
    #df.to_csv('audios_features.csv', index=False)
    
    return df

### Keras Model

In [6]:
def create_model(input_shape, n_units):
    model = Sequential()
    model.add(Input(shape=input_shape))
    
    model.add(Dense(128, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(64, activation='relu'))
    
    model.add(Dropout(0.5))
 
    model.add(Dense(units=n_units, activation="softmax"))
    
    opt = Adam(learning_rate=0.1)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

### Code and model

In [7]:
labels_path = '/kaggle/input/iemocapfullrelease/IEMOCAP_full_release/Session1/dialog/EmoEvaluation/'
wav_path = '/kaggle/input/iemocapfullrelease/IEMOCAP_full_release/Session1/dialog/wav/'
# read the pickle and the csv

# labels_df = pd.read_csv("/kaggle/working/df_iemocap.csv")
# audio_vectors = pickle.load(open('/kaggle/working/audio_vectors1.pkl', 'rb'))
#labeled_features_df = pd.read_csv("/kaggle/working/audios_features.csv")

labels_df, min_frame = read_labels(labels_path)
audio_vectors = read_audio(wav_path, labels_df, min_frame)
labeled_features_df = extract_audio_features(audio_vectors, labels_df, emotion_dict)

x = labeled_features_df.drop(columns=['label', 'file_name'])

enc = OneHotEncoder()
y = enc.fit_transform(np.asarray(labeled_features_df['label']).reshape(-1,1)).toarray()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

#### MLP MODEL
#model = MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500)
# # print(x_train[0].shape)
# x_train = np.array(x_train).reshape(-1,1)
# y_train = np.array(y_train).reshape(-1,1)
#print('shape is ',x_train.shape,' ',y_train.shape)

#model.fit(x_train, y_train)
#y_pred = model.predict(x_test)
#accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
#print("Accuracy: {:.2f}%".format(accuracy * 100))

print('create model')
model = create_model((None, x_train.shape[1]), len(y[0]))
print('fit model')

model.fit(x_train, y_train, epochs = 1000, verbose=2, validation_data=(x_test, y_test))

100%|██████████| 28/28 [00:00<00:00, 33.40it/s]
  return f(*args, **kwargs)
266it [13:45,  3.10s/it]


create model
fit model
Epoch 1/100
36/36 - 2s - loss: 4971.0356 - accuracy: 0.2085 - val_loss: 1.9970 - val_accuracy: 0.1979 - 2s/epoch - 53ms/step
Epoch 2/100
36/36 - 0s - loss: 1.9428 - accuracy: 0.2233 - val_loss: 1.8848 - val_accuracy: 0.3264 - 356ms/epoch - 10ms/step
Epoch 3/100
36/36 - 0s - loss: 1.8597 - accuracy: 0.2815 - val_loss: 1.8189 - val_accuracy: 0.3264 - 429ms/epoch - 12ms/step
Epoch 4/100
36/36 - 0s - loss: 1.8163 - accuracy: 0.2815 - val_loss: 1.7869 - val_accuracy: 0.3264 - 351ms/epoch - 10ms/step
Epoch 5/100
36/36 - 0s - loss: 1.7955 - accuracy: 0.2815 - val_loss: 1.7703 - val_accuracy: 0.3264 - 357ms/epoch - 10ms/step
Epoch 6/100
36/36 - 0s - loss: 1.7858 - accuracy: 0.2815 - val_loss: 1.7621 - val_accuracy: 0.3264 - 356ms/epoch - 10ms/step
Epoch 7/100
36/36 - 0s - loss: 1.7811 - accuracy: 0.2815 - val_loss: 1.7580 - val_accuracy: 0.3264 - 356ms/epoch - 10ms/step
Epoch 8/100
36/36 - 0s - loss: 1.7778 - accuracy: 0.2815 - val_loss: 1.7561 - val_accuracy: 0.3264 - 3

<keras.callbacks.History at 0x7fecdc1d87d0>