In [None]:
import librosa
import numpy as np
import os
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import random
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns

In [None]:
# Root directory of the TESS recordings. It is handed to load_data, which
# walks its sub-directories and uses each directory name as a class label.
Main_WAV_Path = Path(
    "../input/toronto-emotional-speech-set-tess/TESS Toronto emotional speech set data"
)

In [None]:
# Extract MFCC features function
def extract_mfcc_features(audio_path_or_waveform, sr=None, max_pad_len=500):
    """Compute a fixed-length MFCC matrix for one audio clip.

    Args:
        audio_path_or_waveform: Path to an audio file (``str`` or any
            ``os.PathLike`` such as ``pathlib.Path``), or an already-decoded
            waveform array.
        sr: Sample rate. For a file path it is forwarded to ``librosa.load``
            (``None`` keeps the file's native rate). For a raw waveform it is
            the rate of the samples and must be given.
        max_pad_len: Number of time frames in the result. Shorter clips are
            zero-padded at the end; longer clips are cut after this many
            frames so every clip yields the same shape.

    Returns:
        Array of shape ``(max_pad_len, 40)``: one row per frame, one column
        per MFCC coefficient.

    Raises:
        ValueError: If a raw waveform is passed without ``sr``.
    """
    if isinstance(audio_path_or_waveform, (str, os.PathLike)):  # file path
        y, sr = librosa.load(audio_path_or_waveform, sr=sr)
    else:  # raw waveform
        y = audio_path_or_waveform
        if sr is None:
            # Without a rate the mel filterbank cannot be built; fail early
            # with a clear message instead of an error deep inside librosa.
            raise ValueError("sr must be provided when passing a raw waveform")

    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)  # shape: (40, n_frames)

    # Force exactly max_pad_len frames: pad short clips, truncate long ones.
    n_frames = mfccs.shape[1]
    if n_frames < max_pad_len:
        mfccs = np.pad(
            mfccs,
            pad_width=((0, 0), (0, max_pad_len - n_frames)),
            mode='constant',
        )
    elif n_frames > max_pad_len:
        mfccs = mfccs[:, :max_pad_len]

    # Transpose so that time is the first axis: (max_pad_len, 40)
    return mfccs.T

In [None]:
# Load and preprocess data
def load_data(dataset_path, augment=False):
    """Build MFCC features and encoded labels from a folder-per-class dataset.

    Every sub-directory of ``dataset_path`` is treated as one class and its
    name is used verbatim as the label; non-directory entries are skipped.
    Each ``.wav`` file (extension matched case-insensitively) is loaded at
    its native sample rate and converted with ``extract_mfcc_features``.

    Args:
        dataset_path: Root directory containing one sub-directory per class.
        augment: When true, a noisy copy (``add_noise``) and a time-shifted
            copy (``time_shift``) of every recording are appended right after
            the original, with the same label.

    Returns:
        Tuple ``(features, labels, label_encoder)``: a 3-D feature array
        (samples, frames, coefficients), a 1-D array of integer labels, and
        the fitted ``LabelEncoder`` that maps them back to directory names.

    Raises:
        ValueError: If no ``.wav`` file is found under ``dataset_path``.

    NOTE(review): with ``augment=True`` the variants of one recording sit
    next to each other in the output; a random split applied afterwards can
    put them on both sides of the train/test boundary.
    """
    features = []
    labels = []

    # os.listdir returns entries in arbitrary, filesystem-dependent order;
    # sorting keeps the sample order (and any seeded split) reproducible.
    for label in sorted(os.listdir(dataset_path)):
        label_path = os.path.join(dataset_path, label)
        if not os.path.isdir(label_path):
            continue

        for file in sorted(os.listdir(label_path)):
            if not file.lower().endswith(".wav"):
                continue

            file_path = os.path.join(label_path, file)
            y, sr = librosa.load(file_path, sr=None)  # keep native sample rate
            features.append(extract_mfcc_features(y, sr=sr))
            labels.append(label)

            if augment:
                # Two extra samples per recording, same label as the original
                features.append(extract_mfcc_features(add_noise(y), sr=sr))
                features.append(extract_mfcc_features(time_shift(y), sr=sr))
                labels.extend([label] * 2)

    if not features:
        # Fail with a clear message instead of an opaque max() error below.
        raise ValueError(f"No .wav files found under {dataset_path!r}")

    # Convert labels to numerical values
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(labels)

    # Zero-pad along the time axis so all samples share the longest length
    max_length = max(x.shape[0] for x in features)
    features = [
        np.pad(x, ((0, max_length - x.shape[0]), (0, 0)), mode='constant')
        for x in features
    ]

    return np.array(features), np.array(labels), label_encoder

# Augmentation methods
def add_noise(y, noise_factor=0.005):
    """Return ``y`` with scaled standard-normal noise added, clipped to [-1, 1].

    One noise value is drawn per sample from NumPy's global random state and
    multiplied by ``noise_factor`` before being added to the signal.
    """
    gaussian = np.random.randn(len(y))
    return np.clip(y + noise_factor * gaussian, -1.0, 1.0)

def time_shift(y, shift_max=2):
    """Circularly shift ``y`` by a random whole number of positions.

    The offset is drawn uniformly from ``[-shift_max, shift_max]`` (both ends
    included) using the ``random`` module; elements pushed past one end wrap
    around to the other. The offset counts array elements, not seconds.
    """
    offset = random.randint(-shift_max, shift_max)
    rolled = np.roll(y, offset)
    return rolled

In [None]:
# Names of the entries in the dataset root (load_data expects one folder per class)
emotions = os.listdir(Main_WAV_Path)

# Build features and labels; augment=True adds a noisy and a time-shifted
# copy of every recording alongside the original.
X, y, label_encoder = load_data(dataset_path=Main_WAV_Path, augment=True)

In [None]:
# Hold out 30% of the samples for testing; stratify=y keeps the class
# proportions the same on both sides and random_state fixes the shuffle.
# NOTE(review): X was built with augment=True, so the noisy/shifted copies of
# one recording can end up in both the training and the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# The SVM needs 2-D input: collapse each sample's feature matrix into one row
X_train_flattened = X_train.reshape(len(X_train), -1)
X_test_flattened = X_test.reshape(len(X_test), -1)

In [None]:
# Two-step pipeline: standardise every feature, then classify with an SVM.
svm_model = make_pipeline(
    StandardScaler(),
    SVC(
        kernel='rbf',        # RBF kernel for a non-linear decision boundary
        probability=True,    # also fit probability estimates
        random_state=42,
    ),
)

In [None]:
# Fit the scaler and the classifier on the flattened training features
svm_model.fit(X=X_train_flattened, y=y_train)

In [None]:
# Evaluate on the held-out test set
y_pred = svm_model.predict(X_test_flattened)
accuracy = accuracy_score(y_test, y_pred)
# average='weighted': per-class scores averaged with each class weighted by
# its number of true samples in y_test.
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Report the scores; without this the cell computes them but shows nothing.
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-score : {f1:.4f}")

In [None]:
# Confusion matrix of test-set predictions, drawn as an annotated heatmap
conf_matrix = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()

In [None]:
# Predict emotions for custom audio files
def predict_emotion_from_file(file_path):
    """Predict, print and return the emotion label for one audio file.

    Relies on the notebook-level ``svm_model`` (fitted pipeline) and
    ``label_encoder`` (fitted on the training labels).

    Args:
        file_path: Path of the audio file to classify.

    Returns:
        The predicted label, decoded back to its original string form.

    NOTE(review): the clip is loaded at its native sample rate; if that
    differs from the rate of the training recordings the MFCCs are presumably
    not directly comparable — confirm whether resampling is wanted.
    """
    audio, sr = librosa.load(file_path, sr=None)
    mfcc_features = extract_mfcc_features(audio, sr=sr)
    mfcc_features_flattened = mfcc_features.reshape(1, -1)

    # The model only accepts the feature count it was fitted with. Training
    # samples were zero-padded at the end of the time axis, so do the same
    # here (or drop trailing frames) instead of letting predict() raise.
    expected = getattr(svm_model, "n_features_in_", None)
    actual = mfcc_features_flattened.shape[1]
    if expected is not None and actual != expected:
        if actual < expected:
            mfcc_features_flattened = np.pad(
                mfcc_features_flattened,
                ((0, 0), (0, expected - actual)),
                mode='constant',
            )
        else:
            mfcc_features_flattened = mfcc_features_flattened[:, :expected]

    prediction = svm_model.predict(mfcc_features_flattened)
    predicted_emotion = label_encoder.inverse_transform(prediction)[0]
    print(f"Predicted Emotion for {file_path}: {predicted_emotion}")
    return predicted_emotion

In [None]:
# Sample clips to classify. These live under a different dataset folder
# ("cremad") than the training recordings — presumably an out-of-dataset check.
files_to_predict = [
    "/kaggle/input/cremad/AudioWAV/1001_DFA_DIS_XX.wav",
    "/kaggle/input/cremad/AudioWAV/1001_DFA_HAP_XX.wav",
    "/kaggle/input/cremad/AudioWAV/1001_IEO_DIS_HI.wav",
]

# Print the predicted emotion for each clip in turn
for audio_file in files_to_predict:
    predict_emotion_from_file(audio_file)