In [1]:
import os
import numpy as np
import librosa

# Chemin du dossier principal
data_path = r"C:\Users\lenovo\Desktop\digits_sound"

# Paramètres
hop_length = 512
n_fft = 2048
sr = 22050
fixed_length = 45100  # Longueur fixe des vecteurs 1D

# Stockage des données
X = []
y = []

counter = 0 
# Parcourir les sous-dossiers (mots)
for word in os.listdir(data_path):
    word_path = os.path.join(data_path, word)
    
    if os.path.isdir(word_path):  
        print(f"Traitement du mot : {word}")
        for file in os.listdir(word_path):
            if file.endswith(".wav"):
                file_path = os.path.join(word_path, file)

           
                signal, _ = librosa.load(file_path, sr=sr)
                
                
                stft = librosa.stft(signal, n_fft=n_fft, hop_length=hop_length)
                spectrogram = np.abs(stft)
                spectrogram = librosa.amplitude_to_db(spectrogram)
                spectrogram_flat = spectrogram.flatten()
                
                if len(spectrogram_flat) > fixed_length:
                    spectrogram_flat = spectrogram_flat[:fixed_length] 
                else:
                    spectrogram_flat = np.pad(spectrogram_flat, (0, fixed_length - len(spectrogram_flat)))  

                X.append(spectrogram_flat)
                y.append(word)
                counter = counter + 1 

X = np.array(X)
y = np.array(y)

print("DONE")

Traitement du mot : 0_zero
Traitement du mot : 1_one
Traitement du mot : 2_two
Traitement du mot : 3_three
Traitement du mot : 4_four
Traitement du mot : 5_five
Traitement du mot : 6_six
Traitement du mot : 7_seven
Traitement du mot : 8_eight
Traitement du mot : 9_nine
DONE


In [2]:
import os
import numpy as np
import librosa

X = np.array([])
y = np.array([])

with np.load("data/digits_data.npz") as data :
    X = data["first"]
    y = data["second"]

In [7]:
np.unique(y)

array(['eight', 'five', 'four', 'nine', 'one', 'seven', 'six', 'three',
       'two', 'zero'], dtype='<U5')

In [8]:
np.savez("data/words.npz" , y = np.unique(y) )

In [2]:
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Encodage des labels en nombres
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  

# Séparation en entraînement et test
X_train , X_test , y_train , y_test = train_test_split(X ,
                                                       y_encoded,
                                                       test_size = 0.2 ,
                                                       random_state = 42,
                                                       shuffle = True,
                                                       stratify = y_encoded
                                                       )

In [3]:
np.savez("db_digit.npz" , X_train = X_train , X_test = X_test , y_train = y_train , y_test = y_test ) 

In [6]:

# Construction du modèle MLP
model = keras.Sequential([
    keras.layers.Input(shape=(X.shape[1],)),  # Entrée avec shape (fixed_length,)
    
    keras.layers.Dense(512, activation='relu'),
    keras.layers.Dropout(0.3),
    
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dropout(0.3),
    
    keras.layers.Dense(128, activation='relu'),
    
    keras.layers.Dense(len(set(y)), activation='softmax')  # Sortie avec softmax pour classification
])

# Compilation du modèle
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Affichage du résumé
model.summary()






In [None]:
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))


In [None]:
import matplotlib.pyplot as plt
def plot_history(history):

    fig, axs = plt.subplots(2)

    # create accuracy sublpot
    axs[0].plot(history.history["accuracy"], label="train accuracy")
    axs[0].plot(history.history["val_accuracy"], label="test accuracy")
    axs[0].set_ylabel("Accuracy")
    axs[0].legend(loc="lower right")
    axs[0].set_title("Accuracy eval")

    # create error sublpot
    axs[1].plot(history.history["loss"], label="train error")
    axs[1].plot(history.history["val_loss"], label="test error")
    axs[1].set_ylabel("Error")
    axs[1].set_xlabel("Epoch")
    axs[1].legend(loc="upper right")
    axs[1].set_title("Error eval")

    plt.show()



plot_history(model.history)


array(['eight', 'five', 'four', 'nine', 'one', 'seven', 'six', 'three',
       'two', 'zero'], dtype='<U5')

In [3]:
import sounddevice as sd
import numpy as np
import librosa
import tensorflow as tf
import os

with np.load("data/words.npz") as data :
    words = data["y"]

model = tf.keras.models.load_model("./models_last/last_model_e20_acc78_.keras")


duration = 1  
sr = 22050  
n_fft = 2048
hop_length = 512
fixed_length = 45100  


def record_audio(duration, sr):
    print("🎤 Enregistrement en cours...")
    audio = sd.rec(int(duration * sr), samplerate=sr, channels=1, dtype=np.float32)
    sd.wait()
    print("✅ Enregistrement terminé.")
    return audio.flatten()

def extract_features(signal, sr):
    stft = librosa.stft(signal, n_fft=n_fft, hop_length=hop_length)
    spectrogram = np.abs(stft)
    spectrogram_flat = spectrogram.flatten()

    if len(spectrogram_flat) > fixed_length:
        spectrogram_flat = spectrogram_flat[:fixed_length]
    else:
        spectrogram_flat = np.pad(spectrogram_flat, (0, fixed_length - len(spectrogram_flat)))

    return np.array([spectrogram_flat]) 

def report(prediction):
    test = []
    values = prediction[0]
    for i in range ( 10 ) : 
        value = np.round ( values[i] * 100 , 2 )
        value = round(float(value) , 2 )
        word = str(words[i])
        test.append(( value , word ))
    values_sorted = sorted(test , key = lambda item : item[0] * -1 )

    print ("prediction from high to low probability")
    print()
    for i in range ( 10 ) : 
        print (f"{values_sorted[i][1]} is {values_sorted[i][0]} % ")


    print ( "so the prediction word is " , values_sorted[0][1] )
        
    


In [6]:
audio_signal = record_audio(duration, sr)
X_test = extract_features(audio_signal, sr)
prediction = model.predict(X_test)
predicted_word = words[np.argmax(prediction)] 

print(f"🗣️ Mot prédit : {predicted_word}")

🎤 Enregistrement en cours...
✅ Enregistrement terminé.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
🗣️ Mot prédit : five


In [7]:
report(prediction)

prediction from high to low probability

five is 25.06 % 
nine is 17.47 % 
one is 13.1 % 
four is 11.97 % 
seven is 10.99 % 
zero is 5.64 % 
six is 4.53 % 
three is 4.33 % 
two is 4.0 % 
eight is 2.9 % 
so the prediction word is  five


np.float32(14.55)