In [10]:
!pip install librosa soundfile tensorflow scikit-learn pydub
!pip install librosa soundfile tensorflow scikit-learn pydub





In [8]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [26]:
!pip install pydub




In [11]:
import os
import numpy as np
import librosa
import soundfile as sf
from pydub import AudioSegment
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout


In [12]:
input_folder = "/content/drive/MyDrive/dataset_audio_"
output_folder = "/content/drive/MyDrive/dataset_f"


In [13]:
import os
from pydub import AudioSegment

os.makedirs(output_folder, exist_ok=True)

for subfolder in os.listdir(input_folder):
    input_sub = os.path.join(input_folder, subfolder)
    output_sub = os.path.join(output_folder, subfolder)

    if not os.path.isdir(input_sub):
        continue

    os.makedirs(output_sub, exist_ok=True)

    for file in os.listdir(input_sub):
        if file.lower().endswith((".mp3", ".m4a", ".wav", ".ogg", ".3gp", ".flac")):

            input_path = os.path.join(input_sub, file)
            audio = AudioSegment.from_file(input_path)

            # Standardize: 16kHz, mono, 16bit
            audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)

            new_filename = os.path.splitext(file)[0] + ".wav"
            output_path = os.path.join(output_sub, new_filename)

            audio.export(output_path, format="wav")
            print(f"Converted: {file} → {new_filename}")

print("\n✔️ Conversion complete! WAV files are in:", output_folder)


Converted: bc.ogg → bc.wav
Converted: bzarba4.ogg → bzarba4.wav
Converted: bzarba41.ogg → bzarba41.wav
Converted: WhatsApp Ptt 2025-11-13 at 8.02.48 PM (1).ogg → WhatsApp Ptt 2025-11-13 at 8.02.48 PM (1).wav
Converted: WhatsApp Ptt 2025-11-13 at 8.03.03 PM.ogg → WhatsApp Ptt 2025-11-13 at 8.03.03 PM.wav
Converted: WhatsApp Ptt 2025-11-13 at 8.03.07 PM.ogg → WhatsApp Ptt 2025-11-13 at 8.03.07 PM.wav
Converted: WhatsApp Ptt 2025-11-13 at 8.03.10 PM.ogg → WhatsApp Ptt 2025-11-13 at 8.03.10 PM.wav
Converted: WhatsApp Ptt 2025-11-13 at 8.03.16 PM.ogg → WhatsApp Ptt 2025-11-13 at 8.03.16 PM.wav
Converted: WhatsApp Ptt 2025-11-13 at 8.03.16 PM (1).ogg → WhatsApp Ptt 2025-11-13 at 8.03.16 PM (1).wav
Converted: WhatsApp Ptt 2025-11-13 at 8.03.19 PM.ogg → WhatsApp Ptt 2025-11-13 at 8.03.19 PM.wav
Converted: WhatsApp Ptt 2025-11-13 at 8.03.23 PM.ogg → WhatsApp Ptt 2025-11-13 at 8.03.23 PM.wav
Converted: WhatsApp Ptt 2025-11-12 at 7.35.25 PM.ogg → WhatsApp Ptt 2025-11-12 at 7.35.25 PM.wav
Converte

In [14]:
import os
import numpy as np
import librosa
import soundfile as sf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical


In [16]:
# Noise injection
def add_noise(y, noise_factor=0.005):
    noise = np.random.randn(len(y))
    y_noisy = y + noise_factor * noise
    return y_noisy

# Pitch + speed change
def change_pitch_speed(y, sr, pitch_factor=2.0, speed_factor=1.2):
    y_pitch = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=pitch_factor)
    y_speed = librosa.effects.time_stretch(y_pitch, rate=speed_factor)
    return y_speed

# MFCC + delta + delta-delta
def extract_features(file_path):
    audio, sr = librosa.load(file_path, sr=16000)
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
    delta = librosa.feature.delta(mfcc)
    delta2 = librosa.feature.delta(mfcc, order=2)
    combined = np.concatenate((mfcc, delta, delta2), axis=0)
    combined = np.mean(combined.T, axis=0)
    return combined


In [17]:
DATASET_PATH = "/content/drive/MyDrive/dataset_f"
X = []
y = []
labels = []

for label in os.listdir(DATASET_PATH):
    folder = os.path.join(DATASET_PATH, label)
    if not os.path.isdir(folder):
        continue
    labels.append(label)

    for file in os.listdir(folder):
        if file.endswith(".wav"):
            file_path = os.path.join(folder, file)
            audio, sr = librosa.load(file_path, sr=16000)

            # 1️⃣ Original
            X.append(extract_features(file_path))
            y.append(label)

            # 2️⃣ Noise augmentation
            audio_noise = add_noise(audio)
            temp_path = file_path.replace(".wav", "_noise.wav")
            sf.write(temp_path, audio_noise, sr)
            X.append(extract_features(temp_path))
            y.append(label)

            # 3️⃣ Pitch + Speed augmentation
            audio_aug = change_pitch_speed(audio, sr, pitch_factor=2.0, speed_factor=1.1)
            temp_path2 = file_path.replace(".wav", "_aug.wav")
            sf.write(temp_path2, audio_aug, sr)
            X.append(extract_features(temp_path2))
            y.append(label)

X = np.array(X)
y = np.array(y)
print("✅ Data loaded and augmented:", X.shape, y.shape)


✅ Data loaded and augmented: (1305, 120) (1305,)


In [18]:
from sklearn.model_selection import train_test_split

encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
y_onehot = to_categorical(y_encoded)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_onehot, test_size=0.2, random_state=42
)

# reshape for CNN
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1, 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1, 1)

print("✅ Train/Test split done:", X_train.shape, X_test.shape)


✅ Train/Test split done: (1044, 120, 1, 1) (261, 120, 1, 1)


In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

model = Sequential([
    Conv2D(32, (3,1), activation='relu', input_shape=(X_train.shape[1],1,1)),
    MaxPooling2D((2,1)),
    Conv2D(64, (3,1), activation='relu'),
    MaxPooling2D((2,1)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(len(labels), activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [20]:
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=16,
    verbose=1
)


Epoch 1/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.2771 - loss: 3.3796 - val_accuracy: 0.4306 - val_loss: 1.2107
Epoch 2/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.3784 - loss: 1.2654 - val_accuracy: 0.5407 - val_loss: 1.0828
Epoch 3/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.4878 - loss: 1.1342 - val_accuracy: 0.5024 - val_loss: 1.0683
Epoch 4/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.5199 - loss: 1.0244 - val_accuracy: 0.5215 - val_loss: 0.9976
Epoch 5/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.5899 - loss: 0.9696 - val_accuracy: 0.5502 - val_loss: 0.9790
Epoch 6/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.6242 - loss: 0.9281 - val_accuracy: 0.5933 - val_loss: 0.8995
Epoch 7/50
[1m53/53[0m [32m━━━━

In [21]:
loss, acc = model.evaluate(X_test, y_test)
print("✅ Test Accuracy:", acc)


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - accuracy: 0.7610 - loss: 1.0317
✅ Test Accuracy: 0.7471264600753784


In [29]:
import os

save_folder = "/content/drive/MyDrive/models"
os.makedirs(save_folder, exist_ok=True)

# Chemin complet avec nom fichier et extension
model_path = os.path.join(save_folder, "voice_model_final.h5")  # ou .keras

# Sauvegarder le modèle
model.save(model_path)
print("✅ Model saved at:", model_path)




✅ Model saved at: /content/drive/MyDrive/models/voice_model_final.h5
