Étape 1 : Charger les fichiers audio (.wav), extraire les MFCC et entraîner le modèle CNN

In [None]:
import os
import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

# Path to the RAVDESS dataset root (adjust to your environment).
dataset_path = '../../objects/datasets/ravdess'

# Maps the emotion code embedded in each file name to its emotion name.
emotion_map = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}

# Per-file MFCC matrices and their emotion names, kept index-aligned.
features = []
labels = []

# Walk every sub-directory of the dataset root.
# Listings are sorted because os.listdir() returns entries in arbitrary
# order; a stable sample order is required for any later seeded shuffle or
# split to be reproducible across machines.
for actor_folder in sorted(os.listdir(dataset_path)):
    actor_path = os.path.join(dataset_path, actor_folder)
    if not os.path.isdir(actor_path):
        continue

    for filename in sorted(os.listdir(actor_path)):
        if not filename.endswith('.wav'):
            continue

        file_path = os.path.join(actor_path, filename)

        # The emotion code is the third dash-separated field of the name.
        # Resolve it before decoding the audio so that stray files (too few
        # fields, unknown code) are skipped instead of appending a None
        # label that would only fail later during label encoding.
        name_fields = filename.split('-')
        emotion = emotion_map.get(name_fields[2]) if len(name_fields) > 2 else None
        if emotion is None:
            print(f"⚠️ Fichier ignoré (code d'émotion inconnu) : {file_path}")
            continue

        # Load the audio; sr=None keeps the file's native sampling rate.
        # The buffer is named `signal` (not `y`) because `y` is reused
        # further down for the one-hot label matrix.
        signal, sr = librosa.load(file_path, sr=None)

        # 13 MFCCs per frame, transposed so that time is the first axis.
        mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13)
        features.append(mfcc.T)
        labels.append(emotion)

# Standardize each MFCC matrix on its own: the scaler is refit on every file,
# so each coefficient is centered and scaled per recording.
scaler = StandardScaler()
features = list(map(scaler.fit_transform, features))

# Give every emotion name an integer index, following emotion_map's order,
# then encode the collected label names with it.
emotion_index = dict(zip(emotion_map.values(), range(len(emotion_map))))
labels = np.array([emotion_index[name] for name in labels])

# Zero-pad along the time axis so all sequences match the longest one.
max_length = max(map(len, features))
features = [
    np.pad(seq, ((0, max_length - seq.shape[0]), (0, 0)), mode='constant')
    for seq in features
]

# Stack the padded sequences into one array and one-hot encode the labels.
X = np.array(features)
y = tf.keras.utils.to_categorical(labels, num_classes=len(emotion_map))

# Hold out 20% of the samples for testing. The split is stratified on the
# integer labels so that the training and test sets keep the same emotion
# proportions (this requires at least two samples per class).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=labels
)

# Balanced class weights to compensate for class imbalance. They are derived
# from the training labels only, because they weight the training loss and
# should not depend on the held-out samples. The result is an array ordered
# like np.unique(train_labels).
train_labels = np.argmax(y_train, axis=1)
class_weights = class_weight.compute_class_weight(
    class_weight='balanced', classes=np.unique(train_labels), y=train_labels
)

# 1D CNN over the MFCC sequence: three Conv1D + max-pooling stages, then a
# dense classifier with dropout and a softmax over the emotion classes.
model = models.Sequential([
    # layers.Input(shape=...) replaces InputLayer(input_shape=...), whose
    # `input_shape` argument is deprecated in Keras 3; this form is accepted
    # by both tf.keras 2.x and Keras 3. Shape is (time steps, MFCC count).
    layers.Input(shape=(X_train.shape[1], X_train.shape[2])),
    layers.Conv1D(64, 5, activation='relu', padding='same'),
    layers.MaxPooling1D(pool_size=2),
    layers.Conv1D(128, 3, activation='relu', padding='same'),
    layers.MaxPooling1D(pool_size=2),
    layers.Conv1D(256, 3, activation='relu', padding='same'),
    layers.MaxPooling1D(pool_size=2),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(len(emotion_map), activation='softmax')
])

# Categorical cross-entropy matches the one-hot encoded targets.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Build the Keras class-weight mapping from the class ids actually present in
# the data rather than from positional indices: enumerate() would silently
# attach weights to the wrong classes if an emotion were missing. Keys and
# values are converted to plain Python int/float.
class_ids = np.unique(labels)
if len(class_ids) != len(class_weights):
    raise ValueError(
        f"class_weights has {len(class_weights)} entries "
        f"but {len(class_ids)} classes are present in labels"
    )
class_weight_by_id = {
    int(class_id): float(weight)
    for class_id, weight in zip(class_ids, class_weights)
}

# Train the model.
# NOTE: the test set doubles as validation data here, so the val_* metrics
# shown during training and the final evaluation use the same samples.
history = model.fit(
    X_train,
    y_train,
    epochs=15,
    batch_size=32,
    validation_data=(X_test, y_test),
    class_weight=class_weight_by_id,
)

# Persist the trained model (HDF5 file).
model.save('model_audio.h5')

# Final evaluation on the held-out test set.
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"✅ Précision sur les données de test : {test_accuracy:.4f}")




Epoch 1/15
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 154ms/step - accuracy: 0.1281 - loss: 2.2766 - val_accuracy: 0.2569 - val_loss: 1.8587
Epoch 2/15
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 144ms/step - accuracy: 0.2620 - loss: 1.8811 - val_accuracy: 0.4688 - val_loss: 1.6461
Epoch 3/15
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 133ms/step - accuracy: 0.4307 - loss: 1.5920 - val_accuracy: 0.5069 - val_loss: 1.4717
Epoch 4/15
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 128ms/step - accuracy: 0.5239 - loss: 1.3193 - val_accuracy: 0.5382 - val_loss: 1.3444
Epoch 5/15
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 137ms/step - accuracy: 0.6137 - loss: 1.0905 - val_accuracy: 0.5451 - val_loss: 1.2098
Epoch 6/15
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 135ms/step - accuracy: 0.6540 - loss: 0.9364 - val_accuracy: 0.5556 - val_loss: 1.2427
Epoch 7/15
[1m36/36[0m [



[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.6823 - loss: 1.0740
✅ Précision sur les données de test : 0.6493
