In [1]:
import numpy as np
import pandas as pd
import os
import librosa
from tqdm import tqdm

In [2]:
# Project directory = current working directory, with Windows backslashes
# normalised to forward slashes so it can be joined with "/" below.
cwd = os.getcwd()
path = "/".join(cwd.split("\\"))

In [3]:
# Display the normalised project directory computed in the previous cell.
path

'C:/Users/ml2ai/Desktop/Manish/Manish code files/Metagogy/vber_reavidas'

In [4]:
# Clip metadata: one row per audio file (emotion class + file location).
df_final = pd.read_csv("vber_data.csv")

In [5]:
# Set up the names used by the feature-extraction cells below:
# the dataset folder under the project directory and the clip metadata table.
audio_dataset_path = f"{path}/dataset"
metadata = df_final
metadata.head()

Unnamed: 0.1,Unnamed: 0,classes,locations
0,0,neutral,C:\Users\ml2ai\Desktop\Manish\Manish code file...
1,1,neutral,C:\Users\ml2ai\Desktop\Manish\Manish code file...
2,2,neutral,C:\Users\ml2ai\Desktop\Manish\Manish code file...
3,3,neutral,C:\Users\ml2ai\Desktop\Manish\Manish code file...
4,4,calm,C:\Users\ml2ai\Desktop\Manish\Manish code file...


In [6]:
def noise(data):
    """Return a copy of ``data`` with additive Gaussian noise.

    The noise amplitude is ``0.035 * u * max(data)`` where ``u`` is a single
    draw from ``np.random.uniform()``; one standard-normal sample is drawn
    per element along the first axis. The input array is not modified.
    """
    amplitude = 0.035 * np.random.uniform() * np.amax(data)
    gaussian = np.random.normal(size=data.shape[0])
    return data + amplitude * gaussian

def stretch(data, rate=0.8):
    """Time-stretch an audio signal with ``librosa.effects.time_stretch``.

    Parameters
    ----------
    data : np.ndarray
        Audio samples.
    rate : float, optional
        Stretch factor forwarded to librosa (default 0.8).

    Returns
    -------
    np.ndarray
        The stretched signal as returned by librosa.
    """
    # Pass `rate` by keyword: newer librosa releases made it keyword-only,
    # and the keyword form is also accepted by older releases.
    return librosa.effects.time_stretch(data, rate=rate)

def shift(data):
    """Circularly shift ``data`` by a random whole number of positions.

    The offset is ``int(u * 1000)`` with ``u`` drawn uniformly from
    [-5, 5); elements rolled off one end re-enter at the other (``np.roll``).
    """
    offset = np.random.uniform(low=-5, high=5) * 1000
    return np.roll(data, int(offset))

def pitch(data, sampling_rate, pitch_factor=0.7):
    """Pitch-shift an audio signal with ``librosa.effects.pitch_shift``.

    Parameters
    ----------
    data : np.ndarray
        Audio samples.
    sampling_rate : int
        Sampling rate of ``data``; forwarded to librosa as ``sr``.
    pitch_factor : float, optional
        Shift amount forwarded to librosa as ``n_steps`` (default 0.7).

    Returns
    -------
    np.ndarray
        The pitch-shifted signal as returned by librosa.
    """
    # Keyword arguments are required by newer librosa releases (sr and
    # n_steps are keyword-only there) and are accepted by older ones too.
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)

# Load one example clip to try the augmentation helpers on.
# A dedicated name is used for the file location so the global `path`
# (the project directory set at the top of the notebook) is not overwritten;
# otherwise re-running the earlier cells would build paths from an audio file.
sample_path = metadata.locations.iloc[1]
data, sample_rate = librosa.load(sample_path)

In [7]:
# Display the sampling rate returned by librosa.load for the example clip.
sample_rate

22050

In [8]:
def extract_features(data, sr=22050):
    """Summarise an audio signal as one flat feature vector.

    Five librosa descriptors are computed, each averaged over its frame
    axis, and concatenated in this order: zero-crossing rate, chroma
    (computed from the STFT magnitude), MFCCs, RMS and mel spectrogram.

    Parameters
    ----------
    data : np.ndarray
        Audio samples.
    sr : int, optional
        Sampling rate of ``data``. Defaults to 22050, the value previously
        picked up implicitly from the notebook-global ``sample_rate``; taking
        it as an argument removes that hidden dependency on cell order.

    Returns
    -------
    np.ndarray
        1-D float64 array holding all features side by side.
    """
    # Zero-crossing rate
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)

    # Chroma from the magnitude STFT
    stft = np.abs(librosa.stft(data))
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0)

    # MFCC
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sr).T, axis=0)

    # Root-mean-square value
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)

    # Mel spectrogram
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sr).T, axis=0)

    # Single concatenation; the explicit cast keeps the float64 dtype the
    # original incremental stacking (seeded with an empty float64 array) had.
    return np.hstack((zcr, chroma_stft, mfcc, rms, mel)).astype(np.float64)


def get_features(path):
    """Load one clip and extract features for it and two augmented copies.

    Parameters
    ----------
    path : str
        Location of the audio file.

    Returns
    -------
    np.ndarray
        2-D array with three rows: the unmodified clip, the clip with added
        noise, and the clip after time-stretching followed by pitch-shifting.
    """
    # Start 0.6 s into the file and read at most 2.5 s, to skip the
    # silence at the start and end of each recording.
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)

    # Augmented variants of the loaded signal.
    noise_data = noise(data)
    data_stretch_pitch = pitch(stretch(data), sample_rate)

    # One feature row per variant, using the rate the clip was loaded at.
    variants = (data, noise_data, data_stretch_pitch)
    return np.vstack([extract_features(signal, sample_rate) for signal in variants])

In [9]:
X, Y = [], []
# `audio_path` (not `path`) is used as the loop variable so the global
# project directory set at the top of the notebook is not overwritten.
clips = zip(metadata.locations, metadata.classes)
for audio_path, emotion in tqdm(clips, total=len(metadata)):
    rows = get_features(audio_path)
    X.extend(rows)
    # get_features returns one row per variant (the original clip plus its
    # augmented copies), so the label is repeated once per row.
    Y.extend([emotion] * len(rows))

In [10]:
# Sanity check: the number of feature rows and the number of labels should match.
len(X), len(Y)

(4320, 4320)

In [11]:
# Assemble the feature table (one column per feature, labels last),
# persist it for the modelling steps, and preview the first rows.
Features = pd.DataFrame(X).assign(labels=Y)
Features.to_csv('vber_features.csv', index=False)
Features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,153,154,155,156,157,158,159,160,161,labels
0,0.300781,0.706492,0.756853,0.735217,0.731014,0.715156,0.667529,0.673754,0.722173,0.748969,...,4.319263e-06,3e-06,2e-06,2.286677e-06,5.131694e-06,8.065748e-06,5e-06,2.270858e-06,1.642365e-07,neutral
1,0.333008,0.805021,0.840313,0.821007,0.822327,0.829499,0.720681,0.677058,0.727558,0.760922,...,0.000113675,0.000112,0.000114,0.0001147467,0.0001090023,0.0001177636,0.00012,0.0001187017,0.0001099673,neutral
2,0.176693,0.640605,0.712837,0.743736,0.714797,0.701402,0.694569,0.663853,0.688527,0.742508,...,7.999367e-07,1e-06,1e-06,5.350822e-07,3.480662e-07,8.398366e-07,1e-06,4.287777e-07,2.878995e-08,neutral
3,0.271272,0.674966,0.723259,0.724594,0.681302,0.670643,0.674574,0.630036,0.680146,0.708276,...,6.998011e-06,7e-06,7e-06,6.999257e-06,1.21788e-05,9.449916e-06,8e-06,2.638513e-06,1.788902e-07,neutral
4,0.303828,0.751402,0.778547,0.80619,0.777157,0.766285,0.760808,0.658329,0.684237,0.730115,...,4.45399e-05,4.4e-05,4.3e-05,4.428105e-05,4.915007e-05,4.657156e-05,4.4e-05,3.983445e-05,3.691425e-05,neutral
