In [3]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pickle
import random

# Function to extract audio features
def extract_features(audio, sr=22050):
    # Extracting features
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
    mel = librosa.feature.melspectrogram(y=audio, sr=sr)
    contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)

    print(mfccs.shape)

    print(chroma.shape)
    print(mel.shape)
    print(contrast.shape)

    # Concatenating features
    features = np.concatenate([
                                mfccs, 
                               chroma, 
                               mel, 
                               contrast
                              ], axis=0)
    return features

# Path to the directory having dataset
ravdess_dir = r'C:\ML\archive'

# List of emotions in the RAVDESS dataset
emotions = {
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '08': 'surprised'
}

# Loading and preprocessing data
X = []
y = []
for actor_name in os.listdir(ravdess_dir):
    actor_path = os.path.join(ravdess_dir, actor_name)
    print(actor_name)
    audio_files = os.listdir(actor_path)
    # Loop through each audio file in the actor subfolder
    selected_files = random.sample(audio_files, min(50, len(audio_files)))
    for file_name in selected_files:
        file_path = os.path.join(actor_path, file_name)
        if(file_name.split('-')[2] in emotions):
            print("In training: " + file_name)
            emotion = emotions[file_name.split('-')[2]]
            audio, sr = librosa.load(file_path, sr=22050)
            features = extract_features(audio, sr)
            X.append(features)
            y.append(emotion)

# Determine the maximum length of the feature matrices
max_len = max(x.shape[1] for x in X)
print(max_len)

# truncating each feature matrix to the maximum length/making a list
X_padded = []
for x in X:
    if x.shape[1] < max_len:
        padded_x = np.pad(x, ((0, 0), (0, max_len - x.shape[1])), mode='constant')
    else:
        padded_x = x[:, :max_len]
    X_padded.append(padded_x)

# Convert lists to numpy arrays
X = np.array(X_padded)
y = np.array(y)

# Flatten the feature matrices
X_flatten = np.reshape(X, (X.shape[0], -1))

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_flatten, y, test_size=0.2, random_state=42)

# Initialize and train a Support Vector Machine (SVM) classifier
svm_clf = SVC(kernel='linear')
svm_clf.fit(X_train, y_train)

# Evaluate the trained model
y_pred = svm_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

with open('emotion_svm.pkl', 'wb') as file:
    pickle.dump(svm_clf, file)

Actor_01
In training: 03-01-08-02-01-01-01.wav
(13, 149)
(12, 149)
(128, 149)
(7, 149)
In training: 03-01-04-02-02-01-01.wav
(13, 161)
(12, 161)
(128, 161)
(7, 161)
In training: 03-01-04-02-01-01-01.wav
(13, 164)
(12, 164)
(128, 164)
(7, 164)
In training: 03-01-04-02-01-02-01.wav
(13, 160)
(12, 160)
(128, 160)
(7, 160)
In training: 03-01-05-01-01-02-01.wav
(13, 170)
(12, 170)
(128, 170)
(7, 170)
In training: 03-01-08-01-01-02-01.wav
(13, 141)
(12, 141)
(128, 141)
(7, 141)
In training: 03-01-08-01-02-02-01.wav
(13, 141)
(12, 141)
(128, 141)
(7, 141)
In training: 03-01-03-02-02-02-01.wav
(13, 170)
(12, 170)
(128, 170)
(7, 170)
In training: 03-01-08-02-02-02-01.wav
(13, 141)
(12, 141)
(128, 141)
(7, 141)
In training: 03-01-03-02-02-01-01.wav
(13, 170)
(12, 170)
(128, 170)
(7, 170)
In training: 03-01-04-02-02-02-01.wav
(13, 161)
(12, 161)
(128, 161)
(7, 161)
In training: 03-01-03-01-02-01-01.wav
(13, 153)
(12, 153)
(128, 153)
(7, 153)
In training: 03-01-03-02-01-01-01.wav
(13, 159)
(12, 15

In [21]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[14,  1,  5,  1,  3],
       [ 0, 23,  2,  5,  0],
       [ 0,  0, 13,  3,  3],
       [ 1,  9,  3,  9,  1],
       [ 4,  1,  8,  2, 17]], dtype=int64)

In [22]:
with open('speech_emotion_svm.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

audio, sr = librosa.load('C:/ML/archive/Actor_02/03-01-02-01-01-01-02.wav', sr=None)
input_features = extract_features(audio, sr)

# Pad or truncate the input features
if input_features.shape[1] < max_len:
    padded_input = np.pad(input_features, ((0, 0), (0, max_len - input_features.shape[1])), mode='constant')
else:
    padded_input = input_features[:, :max_len]

predicted_emotion = loaded_model.predict(padded_input.reshape(1, -1))
print("Predicted Emotion:", predicted_emotion[0])

(13, 360)
(12, 360)
(128, 360)
(7, 360)
Predicted Emotion: calm


In [10]:

import sounddevice as sd
from scipy.io.wavfile import write
import pickle
import numpy as np
import librosa

with open('speech_emotion_svm.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

def extract_features(audio, sr=22050):
    # Extracting features
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
    mel = librosa.feature.melspectrogram(y=audio, sr=sr)
    contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)

    # Concatenating features
    features = np.concatenate([mfccs, chroma, mel, contrast], axis=0)
    return features

def record_audio(output_path, fs, duration):
    print("Recording...")
    recording = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype=np.int16)
    sd.wait() # Wait until recording is finished
    print("Recording finished.")
    write(output_path, fs, recording) # Save the audio to a WAV file

# Specify the path to save the audio file
output_path = r"C:\audioinput\60.wav"

# Record audio and save it to the specified path
record_audio(output_path, 22050, 5)

audio, sr = librosa.load(output_path, sr=22050)
input_features = extract_features(audio, sr)

print(input_features)

max_len = 220

# Pad or truncate the input features
if input_features.shape[1] < max_len:
    padded_input = np.pad(input_features, ((0, 0), (0, max_len - input_features.shape[1])), mode='constant')
else:
    padded_input = input_features[:, :max_len]

predicted_emotion = loaded_model.predict(padded_input.reshape(1, -1))
print("Predicted Emotion:", predicted_emotion)

Recording...
Recording finished.
[[-6.52150696e+02 -6.51432068e+02 -5.57377625e+02 ... -5.62415039e+02
  -5.59629456e+02 -5.68933960e+02]
 [ 0.00000000e+00 -4.27893877e-01  3.86316528e+01 ...  1.80731354e+01
   2.17339706e+01  2.95022030e+01]
 [ 0.00000000e+00 -3.96255344e-01 -1.95403445e+00 ...  7.35742927e-01
   1.58827877e+00  1.10802326e+01]
 ...
 [ 1.18225640e+01  1.57188823e+01  1.48780397e+01 ...  1.27560374e+01
   1.39547001e+01  1.42089362e+01]
 [ 1.32154338e+01  1.45311452e+01  1.69283173e+01 ...  1.26504454e+01
   1.18929825e+01  1.31777307e+01]
 [ 1.16465700e+01  1.40998848e+01  1.59285969e+01 ...  1.57227560e+01
   1.64975573e+01  1.52822595e+01]]
Predicted Emotion: ['sad']
