In [None]:
!pip install pydub



In [None]:
import librosa
import soundfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Dataset(s) folder path
# Mount Google Drive (Colab only) so the dataset folder below is reachable.
from google.colab import drive
drive.mount('/content/drive')
# Root folder that load_data globs with "/Actor_*/*.wav".
data_folder = '/content/drive/Shareddrives/DATA255/audio_speech_actors_01-24' # RAVDESS dataset

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Note:**
- Combine two or more audio datasets (possibly multilingual) to make the training data more general, which can also improve model performance.
- Available datasets are SAVEE, RAVDESS, TESS, and CREMA-D


In [None]:
  # Data Preparation
  # Lookup from the two-digit emotion code (third dash-separated field of a
  # file name) to its human-readable label.
  emotions = dict((
    ('01', 'neutral'),
    ('02', 'calm'),
    ('03', 'happy'),
    ('04', 'sad'),
    ('05', 'angry'),
    ('06', 'fearful'),
    ('07', 'disgust'),
    ('08', 'surprised'),
  ))

In [None]:
# Audio data augmentation 
def noise(data, rng=None):
  """Return a float64 copy of ``data`` with additive Gaussian noise.

  The noise amplitude is ``0.05 * u * max(data)`` where ``u`` is drawn
  uniformly from [0, 1), so each call uses a different noise level.

  Args:
    data: Array-like of audio samples. May be 1-D (mono) or multi-dimensional,
      e.g. ``(frames, channels)``; noise is drawn independently per element.
    rng: Optional ``numpy.random.Generator`` for reproducible augmentation.
      When omitted, the global ``np.random`` state is used (so
      ``np.random.seed`` still controls the result).

  Returns:
    A float64 array with the same shape as ``data``.
  """
  data = np.asarray(data)
  if data.size == 0:
    # np.amax raises on an empty array; there is nothing to perturb.
    return data.astype('float64')

  # Draw the scale first and the noise second, in the same order as before,
  # so seeded runs on mono input keep producing the same values.
  if rng is None:
    scale = np.random.uniform()
    gaussian = np.random.normal(size=data.shape)
  else:
    scale = rng.uniform()
    gaussian = rng.normal(size=data.shape)

  noise_amp = 0.05 * scale * np.amax(data)
  # size=data.shape (not shape[0]) so multi-channel input broadcasts correctly.
  return data.astype('float64') + (noise_amp * gaussian)

**Note:** 
- Augmentation can be applied either to the raw audio before producing the spectrogram, or to the generated spectrogram.
  - Raw data
    - Add Noise
    - Time Shift 
    - Pitch Shift
    - Time Stretch
  - Spectrogram Augmentation
    - Frequency mask
    - Time mask
- Augmenting the spectrogram usually produces better results

In [None]:
# file_name = '/content/drive/MyDrive/Data 255/Project/audio_speech_actors_01-24/Actor_01/03-01-01-01-01-01-01.wav'
# sf = soundfile.SoundFile(file_name)
# x = sf.read(dtype = "float32") 
# x

In [None]:
# sf

In [None]:
# x1 = x.astype('float64') 
# x1

In [None]:
# noise_amp = 0.05*np.random.uniform()*np.amax(x1)
# noise_amp

In [None]:
# np.random.normal(size=x1.shape[0])

In [None]:
# (noise_amp * np.random.normal(size=x1.shape[0])).shape

In [None]:
# x1 + noise_amp * np.random.normal(size=x1.shape[0])

In [None]:
# x1.shape[0]

In [None]:
## Feature extraction (MFCC and, optionally, mel-spectrogram)
def extract_feature(file_name, mfcc, mel, augment=True, n_mfcc=30):
  """Extract a fixed-length feature vector from one audio file.

  Each selected feature is averaged over time, so the result length does not
  depend on the clip duration.

  Args:
    file_name: Path to an audio file readable by ``soundfile``.
    mfcc: If true, append the time-averaged MFCCs (``n_mfcc`` values).
    mel: If true, append the time-averaged mel-spectrogram bands.
    augment: If true (the default, which matches the previous behaviour), add
      random noise with ``noise`` before computing features. Pass ``False``
      for evaluation data so test clips are not perturbed.
    n_mfcc: Number of MFCC coefficients to compute.

  Returns:
    1-D NumPy array holding the selected features concatenated in the order
    MFCC, mel. The array is empty when both flags are false.
  """
  with soundfile.SoundFile(file_name) as sound_file:
    X = sound_file.read(dtype = "float32")
    sample_rate = sound_file.samplerate

  if X.ndim > 1:
    # soundfile returns multi-channel audio as (frames, channels); average the
    # channels so the downstream steps always see a mono signal.
    X = X.mean(axis = 1)

  if augment:
    X = noise(X) # data augmentation

  result = np.array([])
  if mfcc:
    mfccs = np.mean(librosa.feature.mfcc(y = X, sr = sample_rate, n_mfcc = n_mfcc).T, axis = 0)
    result = np.hstack((result, mfccs))

  if mel:
    # Audio must be passed as y=...; positional audio is rejected by librosa >= 0.10.
    # A separate name keeps the boolean `mel` flag from being overwritten.
    mel_bands = np.mean(librosa.feature.melspectrogram(y = X, sr = sample_rate).T, axis = 0)
    result = np.hstack((result, mel_bands))
  return result

**Note:**
- There are many feature extraction techniques mentioned in the blog: https://towardsdatascience.com/how-i-understood-what-features-to-consider-while-training-audio-files-eedfb6e9002b 
- A few popular and well-known ones are:
  - PLP 
  - MFCC
  - LPCC
  - LSF or LSP
  - DWT
  - Zero Crossing rate
  - Chroma


In [None]:
# Function to load and split data after feature extraction
def load_data(test_size = 0.2):
  """Extract features for every clip under ``data_folder`` and split them.

  Files are matched with ``data_folder + "/Actor_*/*.wav"``. The label of each
  clip is looked up in ``emotions`` using the third dash-separated field of
  its file name.

  Side effect: any clip that is not mono is down-mixed and overwritten in
  place. Clips that are already mono are left untouched.

  Args:
    test_size: Fraction of samples held out for testing, forwarded to
      ``train_test_split``.

  Returns:
    ``(x_train, x_test, y_train, y_test)`` as returned by
    ``train_test_split``, using ``random_state = 9``.

  Raises:
    ValueError: If no ``.wav`` files are found under ``data_folder``.
    KeyError: If a file name carries an emotion code missing from ``emotions``.
  """
  # Imported once per call instead of once per file.
  from pydub import AudioSegment

  x,y = [],[]
  # glob order depends on the filesystem; sorting makes the seeded split
  # reproducible across runs and machines.
  for file in sorted(glob.glob(data_folder+"/Actor_*/*.wav")):
      file_name = os.path.basename(file)
      # Convert stereo audio to mono, but only rewrite files that need it.
      sound = AudioSegment.from_wav(file)
      if sound.channels != 1:
          sound.set_channels(1).export(file, format="wav")

      emotion = emotions[file_name.split("-")[2]]
      feature = extract_feature(file, mfcc = True, mel= False)
      x.append(feature)
      y.append(emotion)

  if not x:
      # Fail with a clear message instead of an obscure error from train_test_split.
      raise ValueError("No .wav files found under " + data_folder + "/Actor_*/")
  return train_test_split(np.array(x), y, test_size = test_size, random_state = 9)

In [None]:
# Run feature extraction over the whole dataset and hold out 20% for testing.
# Slow: takes a few minutes to run (approx 6-8 mins).
x_train, x_test, y_train, y_test = load_data(test_size=0.2)

In [None]:
# Show (train, test) sample counts, then the length of one feature vector.
print(tuple(len(part) for part in (x_train, x_test)))
print('Features extracted: {}'.format(x_train.shape[1]))

(1159, 290)
Features extracted: 30


In [None]:
# MLP model
from sklearn.neural_network import MLPClassifier
# random_state fixes weight initialisation and batch shuffling so the reported
# accuracy is reproducible (same seed value as the train/test split).
# NOTE(review): learning_rate='adaptive' only takes effect with solver='sgd';
# the default solver ignores it. Confirm which behaviour is intended.
MLP = MLPClassifier(alpha = 0.01, batch_size = 32, hidden_layer_sizes = (64,32,32,16,8), learning_rate = 'adaptive', max_iter = 215, random_state = 9)

# Train model with data
MLP.fit(x_train, y_train)

# Predict on test set
y_pred = MLP.predict(x_test)

# Check accuracy of predictions
accuracy = accuracy_score(y_test, y_pred=y_pred)
# accuracy_score returns a fraction in [0, 1]; scale it to match the '%' sign.
print("accuracy: %.2f%%" % (accuracy * 100))



accuracy: 0.43%


Pending tasks:
- EDA
- Combine datasets
- Spectrogram Augmentation
- Check Model performance for different feature extractors
- Propose a few more models besides MLP, such as a CNN
- Hyperparameter tuning
- Demo with our own speech audio files (add-on)

Reference: https://github.com/fatihkykc/EmotionRecognitionFromAudio/blob/master/speech_emotion_recognition.ipynb 