In [1]:
import pandas as pd
import numpy as np

import sys
import os

from pathlib import Path
# librosa is a Python library for analyzing audio and music; it is used below to load the audio files and extract features from them.
import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# to play the audio files
from IPython.display import Audio

import warnings
# Silence every warning category unless the interpreter was started with
# explicit -W options (sys.warnoptions is non-empty in that case).
# NOTE(review): this blanket filter also hides library FutureWarnings such as
# API-deprecation notices, which can mask breakage after a dependency upgrade.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
# DeprecationWarning is suppressed unconditionally (this call is outside the `if`).
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# Load the index CSV of each speech-emotion corpus into its own DataFrame.
# NOTE(review): all paths are absolute and machine-specific (D:/ drive), so the
# notebook only runs unchanged on the machine it was written on.
# emofilm_info_df = pd.read_csv('D:/Documents/emofilm/data/complete_info.csv')
emofilm_uk_df = pd.read_csv('D:/Documents/emofilm/data/emofilm_uk_df.csv')
emofilm_df = pd.read_csv('D:/Documents/emofilm_df.csv')
ravdess_df = pd.read_csv('D:/Documents/ravdess_df.csv')
savee_df = pd.read_csv('D:/Documents/savee_df.csv')
tess_df = pd.read_csv('D:/Documents/tess_df.csv')
cremad_df = pd.read_csv('D:/Documents/cremad_df.csv')

In [3]:
# Stack every corpus into a single table; ignore_index renumbers the rows 0..N-1.
corpora = [emofilm_df, emofilm_uk_df, ravdess_df, savee_df, tess_df, cremad_df]
data_df = pd.concat(corpora, axis=0, ignore_index=True)
data_df

Unnamed: 0,emotion,path
0,fear,D:/Documents/emofilm/data/wav_corpus/f_ans001a...
1,fear,D:/Documents/emofilm/data/wav_corpus/f_ans001a...
2,fear,D:/Documents/emofilm/data/wav_corpus/f_ans001a...
3,fear,D:/Documents/emofilm/data/wav_corpus/f_ans002a...
4,fear,D:/Documents/emofilm/data/wav_corpus/f_ans002a...
...,...,...
13298,contempt,D:/Documents/cremad/AudioWAV/1091_WSI_DIS_XX.wav
13299,fear,D:/Documents/cremad/AudioWAV/1091_WSI_FEA_XX.wav
13300,happiness,D:/Documents/cremad/AudioWAV/1091_WSI_HAP_XX.wav
13301,neutral,D:/Documents/cremad/AudioWAV/1091_WSI_NEU_XX.wav


In [4]:
# Print the row count, column names, dtypes and non-null counts of the combined table.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13303 entries, 0 to 13302
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   emotion  13303 non-null  object
 1   path     13303 non-null  object
dtypes: object(2)
memory usage: 208.0+ KB


In [5]:
# Count the distinct emotion labels present across all corpora.
emotions_number = len(data_df['emotion'].unique())
emotions_number

8

In [7]:
def noise(data):
    """
    Return a copy of ``data`` with Gaussian noise added.

    The noise amplitude is a random fraction (at most 3.5%) of the signal's
    maximum value, drawn from numpy's global random state.

    :param data: 1-D numpy array holding the audio signal
    :return: new array of the same length; ``data`` itself is not modified
    """
    # Draw order matters for reproducibility under a fixed seed:
    # first the uniform amplitude factor, then the normal samples.
    scale = 0.035 * np.random.uniform() * np.amax(data)
    jitter = np.random.normal(size=data.shape[0])
    return data + scale * jitter


def invert_polarity(data):
    """
    Flip the sign of every sample.

    :param data: audio signal (numpy array or scalar)
    :return: ``data`` multiplied by -1
    """
    flipped = -1 * data
    return flipped


def stretch(data, rate=0.8):
    """
    Time-stretch an audio signal without changing its pitch.

    :param data: 1-D audio signal
    :param rate: stretch factor forwarded to ``librosa.effects.time_stretch``
                 (values below 1 slow the audio down, above 1 speed it up)
    :return: the stretched signal as returned by librosa
    """
    # `rate` must be passed by keyword: librosa >= 0.10 made it keyword-only,
    # and the keyword form is also accepted by older releases.
    return librosa.effects.time_stretch(data, rate=rate)


def shift(data):
    """
    Circularly rotate the signal by a random number of samples.

    The offset is drawn uniformly from [-5, 5), scaled by 1000 and truncated to
    an integer, so samples rolled off one end reappear at the other.

    :param data: audio signal (numpy array)
    :return: rolled copy of ``data``
    """
    offset = np.random.uniform(low=-5, high=5) * 1000
    return np.roll(data, int(offset))


def pitch(data, sampling_rate, pitch_factor=0.7):
    """
    Shift the pitch of an audio signal.

    :param data: 1-D audio signal
    :param sampling_rate: sampling rate of ``data``, forwarded as ``sr``
    :param pitch_factor: forwarded as librosa's ``n_steps`` (number of steps to
                         shift; may be fractional or negative)
    :return: the pitch-shifted signal as returned by librosa
    """
    # `sr` and `n_steps` must be passed by keyword: librosa >= 0.10 made them
    # keyword-only, and the keyword form is also accepted by older releases.
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)


# Pick one arbitrary clip (second row) to try the augmentation helpers on.
path = data_df['path'].iloc[1]
data, sample_rate = librosa.load(path)

In [8]:
def extract_zcr(data):
    """
    Compute the zero-crossing rate of an audio signal.

    :param data: audio signal, forwarded to librosa as ``y``
    :return: whatever ``librosa.feature.zero_crossing_rate`` returns
    """
    zcr = librosa.feature.zero_crossing_rate(y=data)
    return zcr


def extract_chroma_stft(data, sr):
    """
    Compute the chromagram of an audio signal.

    :param data: audio signal, forwarded to librosa as ``y``
    :param sr: sampling rate of ``data``
    :return: whatever ``librosa.feature.chroma_stft`` returns
    """
    chroma = librosa.feature.chroma_stft(y=data, sr=sr)
    return chroma


def extract_mfcc(data, sr):
    """
    Compute the MFCCs of an audio signal using librosa's default settings.

    :param data: audio signal, forwarded to librosa as ``y``
    :param sr: sampling rate of ``data``
    :return: whatever ``librosa.feature.mfcc`` returns
    """
    coefficients = librosa.feature.mfcc(y=data, sr=sr)
    return coefficients


def extract_rms(data):
    """
    Compute the root-mean-square energy of an audio signal.

    :param data: audio signal, forwarded to librosa as ``y``
    :return: whatever ``librosa.feature.rms`` returns
    """
    energy = librosa.feature.rms(y=data)
    return energy


def extract_melspectrogram(data, sr):
    """
    Compute the mel spectrogram of an audio signal with a 512-sample FFT window.

    :param data: audio signal, forwarded to librosa as ``y``
    :param sr: sampling rate of ``data``
    :return: whatever ``librosa.feature.melspectrogram`` returns
    """
    mel = librosa.feature.melspectrogram(y=data, sr=sr, n_fft=512)
    return mel

In [9]:
# This code was adapted from Nicolas Gervais on https://stackoverflow.com/questions/59241216/padding-numpy-arrays-to-a-specific-size on 1/10/2021
import math
from numpy import ndarray


def padding(array, xx, yy):
    """
    Zero-pad a 2-D array, roughly centred, up to a requested size.

    When the missing amount along an axis is odd, the extra row/column goes at
    the end. An axis that is already at least as long as requested is left
    untouched (nothing is cropped).

    :param array: 2-D numpy array
    :param xx: desired height (number of rows)
    :param yy: desired width (number of columns)
    :return: padded array
    """
    def split(target, current):
        # Amount of padding to put before and after along one axis.
        before = max((target - current) // 2, 0)
        after = max(0, target - before - current)
        return before, after

    rows, cols = array.shape[0], array.shape[1]
    vertical = split(xx, rows)
    horizontal = split(yy, cols)
    return np.pad(array, pad_width=(vertical, horizontal), mode='constant')


def fix_length(data: ndarray, length: int):
    """
    Zero-pad a 1-D signal, roughly centred, so it has at least ``length`` samples.

    When the missing amount is odd, the extra zero goes at the end. A signal
    that is already ``length`` samples or longer is returned unshortened.

    :param data: 1-D audio signal
    :param length: desired number of samples
    :return: new 1-D array
    """
    samples = data.reshape(len(data))
    lead = max((length - len(samples)) // 2, 0)
    trail = max(0, length - lead - len(samples))
    return np.pad(samples, pad_width=(lead, trail), mode='constant')


def repeat_audio(data: ndarray, sr: int, duration_millis: int):
    """
    Loop a signal until it fills the requested duration, then cut it to size.

    :param data: 1-D audio signal (must be non-empty)
    :param sr: sampling rate of ``data`` in Hz
    :param duration_millis: target duration in milliseconds
    :return: array of ``int(duration_millis/1000*sr)`` samples
    """
    target = duration_millis/1000*sr
    # Enough whole copies to cover the target, then trim the surplus.
    copies = math.ceil(target/len(data))
    return np.tile(data, copies)[:int(target)]

In [10]:
# Notebook-wide settings for fixed-length clips.
# 110250 samples = 5 s at 22050 Hz; the same value is recomputed as `duration` below.
max_length = 110250
# Row index of a reference clip; the same literal 14 is hard-coded in the
# get_duration call below. NOTE(review): presumably the longest clip — confirm.
max_i = 14
# Also read as a module-level global by extract_features.
sample_rate = 22050
max_duration_s = librosa.get_duration(path=data_df['path'][14])
required_duration_millis = 5000

# NOTE(review): presumably spectrogram frames at librosa's default hop length — confirm.
width = 216  # 5 seconds of audio is 216
height = 128  # melspectrogram height is 128

# Number of samples in the required duration at `sample_rate`.
duration = int(required_duration_millis/1000*sample_rate)

In [11]:
# Load a sample clip (row 2) and report its length in samples and its sampling rate.
sample_path = data_df['path'][2]
a, sr = librosa.load(sample_path)
print(a.shape, sr)

(54860,) 22050


In [12]:
# Number of samples in the required duration at the sample clip's rate.
seconds_required = required_duration_millis/1000
width2 = int(seconds_required*sr)
width2

110250

In [13]:
# Loop the sample clip until it fills the required duration, then compare sizes.
# NOTE(review): the printed labels say "padding", but the clip is repeated, not zero-padded.
repeated_audio = repeat_audio(a, sr, required_duration_millis)

shape_before = np.shape(a)
shape_after = np.shape(repeated_audio)

print('Array length before padding', shape_before)
print('Audio length before padding in seconds', (shape_before[0]/sr))
print('Array length after padding', shape_after)
print('Audio length after padding in seconds', (shape_after[0]/sr))
Audio(data=repeated_audio, rate=sr)

Array length before padding (54860,)
Audio length before padding in seconds 2.487981859410431
Array length after padding (110250,)
Audio length after padding in seconds 5.0


In [14]:
def extract_features(data, sr=None):
    """
    Summarise a clip as one flat vector of time-averaged features.

    The vector concatenates, in order: zero-crossing rate, chroma (computed
    from the magnitude STFT), MFCCs, RMS energy and mel-spectrogram bands.

    :param data: 1-D audio signal
    :param sr: sampling rate of ``data``; when omitted, the module-level
               ``sample_rate`` is used (the original behaviour)
    :return: 1-D float64 numpy array
    """
    rate = sample_rate if sr is None else sr

    def time_mean(feature):
        # Same reduction as before: transpose, then average over the first axis.
        return np.mean(feature.T, axis=0)

    magnitude = np.abs(librosa.stft(data))
    parts = [
        time_mean(librosa.feature.zero_crossing_rate(y=data)),
        time_mean(librosa.feature.chroma_stft(S=magnitude, sr=rate)),
        time_mean(librosa.feature.mfcc(y=data, sr=rate)),
        time_mean(librosa.feature.rms(y=data)),
        time_mean(librosa.feature.melspectrogram(y=data, sr=rate)),
    ]
    # The original seeded the stack with an empty float64 array, which promoted
    # the result to float64; keep that dtype explicitly.
    return np.hstack(parts).astype(np.float64)


def get_features(path):
    """
    Load one audio file and return features for it and two augmented copies.

    :param path: audio file path accepted by ``librosa.load``
    :return: array of shape (3, n_features); rows are the original clip, the
             clip with additive noise, and the clip with inverted polarity
    """
    # librosa.load is called with its defaults: the whole file is read, no
    # offset/duration trimming is applied.
    data, sr = librosa.load(path)

    variants = (
        data,
        # NOTE(review): noise() draws from numpy's global random state, so this
        # row is only reproducible if a seed is set beforehand.
        noise(data),
        # NOTE(review): in the recorded output the first and third rows of each
        # clip are identical, i.e. inverting polarity does not change any of
        # the features used here. This row duplicates the original; consider a
        # different augmentation to avoid exact duplicates in the dataset.
        invert_polarity(data),
    )
    # Pass the rate that belongs to this clip instead of relying on a global.
    return np.vstack([extract_features(clip, sr=sr) for clip in variants])

In [15]:
# Build the feature table: every clip contributes one row per variant returned
# by get_features, each labelled with the clip's emotion.
X, y = [], []
for i, row in data_df.iterrows():
    augmented = get_features(row['path'])
    X.extend(augmented)
    y.extend([row['emotion']] * len(augmented))


features_df = pd.DataFrame(X)
features_df['labels'] = y

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,153,154,155,156,157,158,159,160,161,labels
0,0.174738,0.648226,0.677165,0.69226,0.716191,0.665501,0.607874,0.632416,0.642,0.712695,...,0.008625,0.008645,0.00909,0.010604,0.011772,0.008924,0.003265,0.001107,8.4e-05,fear
1,0.257191,0.736847,0.74942,0.779179,0.760325,0.729013,0.669982,0.671527,0.696851,0.762176,...,0.012094,0.011879,0.012337,0.014288,0.015109,0.012017,0.006716,0.004248,0.002999,fear
2,0.174738,0.648226,0.677165,0.69226,0.716191,0.665501,0.607874,0.632416,0.642,0.712695,...,0.008625,0.008645,0.00909,0.010604,0.011772,0.008924,0.003265,0.001107,8.4e-05,fear
3,0.181625,0.64361,0.581965,0.47743,0.558267,0.605189,0.522657,0.561186,0.61469,0.624254,...,0.183382,0.144932,0.060404,0.048087,0.05638,0.072826,0.076891,0.034622,0.004218,fear
4,0.290568,0.726541,0.717606,0.625811,0.651445,0.689244,0.616574,0.595777,0.650725,0.673668,...,0.203479,0.162828,0.0776,0.064426,0.074485,0.091503,0.093595,0.049453,0.020464,fear


In [17]:
# Write the feature table to a CSV in the current working directory (without
# the row index), presumably so the slow extraction need not be repeated.
features_df.to_csv('features-all-datasets-v2.csv', index=False)
# Preview the first rows of what was saved.
features_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,153,154,155,156,157,158,159,160,161,labels
0,0.174738,0.648226,0.677165,0.69226,0.716191,0.665501,0.607874,0.632416,0.642,0.712695,...,0.008625,0.008645,0.00909,0.010604,0.011772,0.008924,0.003265,0.001107,8.4e-05,fear
1,0.257191,0.736847,0.74942,0.779179,0.760325,0.729013,0.669982,0.671527,0.696851,0.762176,...,0.012094,0.011879,0.012337,0.014288,0.015109,0.012017,0.006716,0.004248,0.002999,fear
2,0.174738,0.648226,0.677165,0.69226,0.716191,0.665501,0.607874,0.632416,0.642,0.712695,...,0.008625,0.008645,0.00909,0.010604,0.011772,0.008924,0.003265,0.001107,8.4e-05,fear
3,0.181625,0.64361,0.581965,0.47743,0.558267,0.605189,0.522657,0.561186,0.61469,0.624254,...,0.183382,0.144932,0.060404,0.048087,0.05638,0.072826,0.076891,0.034622,0.004218,fear
4,0.290568,0.726541,0.717606,0.625811,0.651445,0.689244,0.616574,0.595777,0.650725,0.673668,...,0.203479,0.162828,0.0776,0.064426,0.074485,0.091503,0.093595,0.049453,0.020464,fear
