In [73]:
import os
import sys

import joblib
import librosa
import librosa.display
import numpy as np
import pandas as pd
import tensorflow
from google.colab import files
from IPython.display import Audio, display
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.models import load_model


import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [30]:
from google.colab import drive

# Mount Google Drive; the model, scaler and encoder paths used later all live
# under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Ravdess = "/content/drive/MyDrive/Colab Notebooks/speech-emotion/dataset/ravdess/"
# ravdess_directory_list = os.listdir(Ravdess)
# for dir in ravdess_directory_list:
#   print(dir)

In [62]:
# Locations of the persisted artefacts on the mounted Drive.
Model_path = "/content/drive/MyDrive/Colab Notebooks/speech-emotion/codes/ser-model.keras"
Scalar_path = "/content/drive/MyDrive/Colab Notebooks/speech-emotion/codes/scaler.pkl"
Encoder_path = "/content/drive/MyDrive/Colab Notebooks/speech-emotion/codes/encoder.pkl"

# Integer emotion code -> label. Codes are consecutive and start at 1, so the
# mapping is built by enumerating the labels in code order.
emotion_map = dict(
    enumerate(
        ['neutral', 'calm', 'happy', 'sad', 'angry', 'fear', 'disgust', 'surprise'],
        start=1,
    )
)

In [63]:
# Ask the user to upload an audio file through google.colab's `files` helper.
# The following cell takes the first key of `uploaded` as the file name to analyse.
uploaded = files.upload()


Saving 03-01-04-02-02-01-21.wav to 03-01-04-02-02-01-21.wav


In [65]:
# A cancelled or empty upload would otherwise surface as a bare IndexError below.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and select a .wav file.")

# Only the first uploaded file is analysed.
original_filename = next(iter(uploaded))
print("Original filename:", original_filename)

# The reference label is read from the file name: the third dash-separated
# field of the stem (e.g. "04" in "03-01-04-02-02-01-21.wav") is the key into
# emotion_map. splitext keeps the whole stem even if the name has extra dots.
name_part = os.path.splitext(os.path.basename(original_filename))[0].split('-')
try:
    emotion_val = int(name_part[2])
except (IndexError, ValueError) as err:
    raise ValueError(
        f"Cannot read an emotion code from {original_filename!r}: expected a "
        "dash-separated name such as '03-01-04-02-02-01-21.wav'."
    ) from err

# An unknown code leaves `emotion` as None; prediction can still run without a
# reference label.
emotion = emotion_map.get(emotion_val)
print(emotion, emotion_val)

# NOTE(review): offset/duration presumably mirror the clipping applied to the
# training audio -- confirm before changing them.
data, sr = librosa.load(original_filename, duration=2.8, offset=0.6)
display(Audio(data=data, rate=sr))

Original filename: 03-01-04-02-02-01-21.wav
sad 4


In [66]:
def get_features(data, sample_rate):
    """Build one flat feature vector for an audio signal.

    Five librosa features are computed, each is reduced with a mean over
    axis 0 of its transposed output (the time-frame axis by librosa's
    convention), and the results are concatenated in this fixed order:
    zero-crossing rate, chroma (from the STFT magnitude), MFCC, RMS,
    mel spectrogram.

    Args:
        data: audio samples, handed to librosa as ``y``.
        sample_rate: sampling rate, forwarded as ``sr`` to the chroma, MFCC
            and mel-spectrogram extractors.

    Returns:
        NumPy array containing the concatenated per-feature means.
    """

    def frame_mean(feature_matrix):
        # Same reduction for every feature: mean over axis 0 of the transpose.
        return np.mean(feature_matrix.T, axis=0)

    # Zero crossing rate: how often the signal changes sign.
    zcr_mean = frame_mean(librosa.feature.zero_crossing_rate(y=data))

    # Chroma computed from the magnitude of the short-time Fourier transform.
    spectrum_magnitude = np.abs(librosa.stft(data))
    chroma_mean = frame_mean(
        librosa.feature.chroma_stft(S=spectrum_magnitude, sr=sample_rate)
    )

    # Mel-frequency cepstral coefficients.
    mfcc_mean = frame_mean(librosa.feature.mfcc(y=data, sr=sample_rate))

    # Root-mean-square value of the signal.
    rms_mean = frame_mean(librosa.feature.rms(y=data))

    # Mel-scaled spectrogram.
    mel_mean = frame_mean(librosa.feature.melspectrogram(y=data, sr=sample_rate))

    # The leading empty float64 array keeps the result dtype the same as when
    # the vector is grown step by step from np.array([]).
    return np.hstack(
        [np.array([]), zcr_mean, chroma_mean, mfcc_mean, rms_mean, mel_mean]
    )

In [67]:
# Load the persisted label encoder and feature scaler (in that order), then the
# Keras model, from the paths configured earlier.
# NOTE(review): joblib.load unpickles the files -- only point these paths at trusted artefacts.
encoder, scaler = (joblib.load(path) for path in (Encoder_path, Scalar_path))
model = load_model(Model_path)

# Single-sample batch: one feature row plus the label parsed from the file name.
X = [get_features(data, sr)]
Y = [emotion]

# Apply the loaded scaler, then insert a new axis at position 2.
X = np.expand_dims(scaler.transform(X), axis=2)



In [71]:
# Run the model on the prepared sample and map its output back to a label
# with the loaded encoder.
pred_test = model.predict(X)
y_pred = encoder.inverse_transform(pred_test)

# Only one sample was submitted, so the first flattened entry is its label.
predicted_emotion = y_pred.flatten()[0]
print('Actual:', emotion, ', Predicted:', predicted_emotion)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Actual: sad , predicted: sad
