In [1]:
import librosa
import numpy as np
import os
import glob
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [2]:
# 1. Feature Extraction Function
def extract_features(file_name):
    y, sr = librosa.load(file_name, duration=2.5, offset=0.6)
    mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(y=y, sr=sr).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr).T, axis=0)
    return np.hstack([mfccs, chroma, mel, contrast, tonnetz])


In [3]:
# 2. Load Data and Extract Features
dataset_path = 'input/'
features = []
labels = []

# Example with RAVDESS dataset structure: audio_speech_actors_01-24/Actor_*/*.wav
for file in glob.glob(os.path.join(dataset_path, "Actor_*/*.wav")):
    # Extract emotion label from the filename (3rd part of the name convention)
    emotion = int(os.path.basename(file).split('-')[2]) - 1  # RAVDESS labels are 1-8, adjust to 0-7
    features.append(extract_features(file))
    labels.append(emotion)



In [4]:
# 3. Convert to DataFrame and Encode Labels
X = np.array(features)
y = np.array(labels)

In [5]:
# 4. Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [6]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf']
}

# Initialize GridSearchCV
grid = GridSearchCV(SVC(probability=True), param_grid, refit=True, verbose=2, cv=5)

# Fit the model
grid.fit(X_train, y_train)

# Best parameters and model
best_params = grid.best_params_
best_model = grid.best_estimator_

print(f"Best Parameters: {best_params}")


Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   2.8s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   2.8s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   3.1s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   3.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   3.1s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.2s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.2s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.2s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.2s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.2s
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time=   3.0s
[CV] END ...................C=0.1, gamma=auto, k

In [7]:
import pickle

# Save the best model
with open('best_svm_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# Save the best parameters
with open('best_params.pkl', 'wb') as f:
    pickle.dump(best_params, f)


In [8]:
from joblib import dump

# Save the best model
dump(best_model, 'best_svm_model.joblib')

# Save the best parameters
dump(best_params, 'best_params.joblib')


['best_params.joblib']

In [None]:
def augment_data(y, sr):
    # Pitch shifting
    y_pitch = librosa.effects.pitch_shift(y, sr, n_steps=2)
    # Time stretching
    y_stretch = librosa.effects.time_stretch(y, rate=0.8)
    return [y, y_pitch, y_stretch]


for file in glob.glob(os.path.join(dataset_path, "Actor_*/*.wav")):
    emotion = int(os.path.basename(file).split('-')[2]) - 1
    y, sr = librosa.load(file, duration=2.5, offset=0.6)
    
    for aug_y in augment_data(y, sr):
        features.append(extract_features(aug_y))
        labels.append(emotion)


In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf, target_names=emotions))

# Gradient Boosting
gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
print("Gradient Boosting Classification Report:")
print(classification_report(y_test, y_pred_gb, target_names=emotions))


In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(best_model, X, y, cv=10)
print(f"Cross-Validation Accuracy: {np.mean(scores)}")


In [6]:
# 5. Train a Support Vector Machine (SVM) Model
model = SVC(kernel='linear', probability=True)
model.fit(X_train, y_train)

In [10]:
# 6. Evaluate the Model
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']))

              precision    recall  f1-score   support

     neutral       0.29      0.36      0.32        22
        calm       0.56      0.64      0.60        56
       happy       0.32      0.43      0.36        42
         sad       0.38      0.32      0.35        50
       angry       0.59      0.52      0.55        50
     fearful       0.47      0.41      0.44        39
     disgust       0.43      0.41      0.42        46
   surprised       0.60      0.51      0.55        55

    accuracy                           0.46       360
   macro avg       0.45      0.45      0.45       360
weighted avg       0.47      0.46      0.47       360

