In [1]:
import ipywidgets as widgets
from IPython.display import display
import sounddevice as sd
import numpy as np
import librosa
from tensorflow.keras.models import load_model
from scipy.io import wavfile as wav
import IPython.display as ipd
import os

In [2]:
def trim_silence(input_file, output_file, threshold=40, sr=22050):
    """Strip leading and trailing silence from an audio file and save it as WAV.

    Args:
        input_file: Path of the audio file to read.
        output_file: Path of the WAV file to write.
        threshold: Forwarded to ``librosa.effects.trim`` as ``top_db``.
        sr: Sampling rate forwarded to ``librosa.load``. The default (22050)
            is librosa's own default, so existing callers keep getting audio
            resampled to 22050 Hz whatever rate the file was recorded at.
            Pass ``None`` to keep the file's native sampling rate instead.

    Returns:
        None. The trimmed samples are written to ``output_file`` at the
        sampling rate reported by ``librosa.load``.
    """
    # Make the (re)sampling rate explicit: without ``sr`` librosa silently
    # resamples every file to 22050 Hz.
    audio_data, sample_rate = librosa.load(input_file, sr=sr)

    # Keep only the non-silent span; the returned index pair is not needed.
    trimmed_audio_data, _ = librosa.effects.trim(audio_data, top_db=threshold)

    # Persist the trimmed signal at the rate it was loaded with.
    wav.write(output_file, sample_rate, trimmed_audio_data)

In [3]:
from librosa.feature import melspectrogram, mfcc
def full_mfcc(input, rate = 8000, frames=50):
    """Compute MFCCs for a signal and force the result to exactly ``frames`` frames.

    Args:
        input: Audio samples; multiplied by 1.0 before being handed to ``mfcc``
            so integer samples become floating point.
        rate: Sampling rate passed to ``mfcc`` as ``sr``.
        frames: Number of frames (columns of the MFCC matrix) to keep.

    Returns:
        The MFCC matrix transposed, i.e. with ``frames`` rows. Shorter inputs
        are zero-padded at the end, longer ones are cut after ``frames`` frames.
    """
    coeffs = mfcc(y=input * 1.0, sr=rate)

    # Positive: frames to append as zeros; negative: frames to drop.
    missing = frames - coeffs.shape[1]
    if missing > 0:
        coeffs = np.pad(coeffs, pad_width=((0, 0), (0, missing)))
    elif missing < 0:
        coeffs = coeffs[:, :frames]

    # One row per frame.
    return coeffs.T

In [4]:
# Normalization statistics consumed by standardize(). Both paths are relative,
# so the files must be in the current working directory.
# NOTE(review): presumably computed on the training set -- confirm.
mean, std = (np.load(stats_file) for stats_file in ('mfcc_mean.npy', 'mfcc_std.npy'))

def standardize(input):
    """Normalize features with the module-level ``mean`` and ``std``.

    Computes ``(input - mean + eps) / (std + eps)`` with ``eps = 0.001``.
    Note that the epsilon is added to the numerator as well as the
    denominator.

    Args:
        input: Feature values compatible with ``mean`` and ``std``.

    Returns:
        The normalized features.
    """
    epsilon = 0.001
    shifted = input - mean + epsilon
    scale = std + epsilon
    return shifted / scale

In [5]:
class AudioRecorderApp:
    """Notebook widget that records a short clip and prints the model's prediction.

    Clicking the button records from the default input device, trims silence,
    computes standardized MFCC features and feeds them to the loaded model.
    The model is expected to return two outputs: speaker scores first, digit
    scores second (inferred from how ``predictions[0]`` and ``predictions[1]``
    are used below).
    """

    # Files written by record_and_trim_audio and read back by record_and_predict.
    RAW_WAV_PATH = "test_raw.wav"
    TRIMMED_WAV_PATH = "test.wav"

    # Speaker label for each class index of the first model output.
    SPEAKER_NAMES = {0: "Unknown", 1: "Salvatore", 2: "Marco"}

    def __init__(self, model_path):
        """Create the record button, load the model and display the button.

        Args:
            model_path: Path of the saved Keras model, passed to ``load_model``.
        """
        self.record_button = widgets.Button(description="Record Audio")
        self.record_button.on_click(self.record_and_predict)

        # Pre-trained model (make sure the path is correct).
        self.model = load_model(model_path)

        display(self.record_button)

    def record_and_predict(self, button):
        """Button callback: record, play back, and print speaker and digit.

        Args:
            button: The widget that fired the click (unused).
        """
        # Disable the button while recording and predicting.
        self.record_button.disabled = True
        try:
            # Record and trim the audio.
            self.record_and_trim_audio()

            sr, trimmed_audio = wav.read(self.TRIMMED_WAV_PATH)
            ipd.display(ipd.Audio(data=trimmed_audio, rate=sr))

            # NOTE(review): full_mfcc runs with its default rate (8000), not
            # the `sr` read from the file. trim_silence loads through
            # librosa.load, which resamples to 22050 Hz by default, so the
            # two rates may disagree -- confirm against the training setup.
            mfcc_features = standardize(full_mfcc(trimmed_audio))

            # Add a leading batch axis so the shape matches the model input.
            input_data = np.expand_dims(mfcc_features, axis=0)

            predictions = self.model.predict(input_data)

            # argmax(axis=1) yields one index per batch row; the batch holds a
            # single clip, so take element 0 as a plain int. This also makes
            # the digit print as "7" rather than "[7]".
            speaker_index = int(np.argmax(predictions[0], axis=1)[0])
            digit = int(np.argmax(predictions[1], axis=1)[0])

            # Fall back to the raw index for classes without a label.
            speaker = self.SPEAKER_NAMES.get(speaker_index, speaker_index)

            print(f"Lo speaker è {speaker} e ha pronunciato il numero {digit}")
        finally:
            # Re-enable the button even if recording or prediction failed,
            # otherwise the widget would stay unusable.
            self.record_button.disabled = False

    def record_and_trim_audio(self, sample_rate=8000, duration=3):
        """Record a mono clip and write both the raw and the trimmed WAV files.

        Args:
            sample_rate: Recording rate in Hz passed to ``sd.rec``.
            duration: Recording length in seconds.
        """
        print("Recording.....\n")
        audio_data = sd.rec(int(sample_rate * duration), samplerate=sample_rate, channels=1, dtype='int16')
        sd.wait()

        # Save the raw recording, then its silence-trimmed version.
        wav.write(self.RAW_WAV_PATH, sample_rate, audio_data)
        trim_silence(self.RAW_WAV_PATH, self.TRIMMED_WAV_PATH)

    def run(self):
        """Show the record button again.

        Button callbacks are registered with ``on_click`` in ``__init__``, and
        this class has no ``root`` attribute or main loop to start, so the
        only useful thing left to do here is re-display the widget (for
        example after the cell output was cleared).
        """
        display(self.record_button)
        

if __name__ == "__main__":
    # Build the path with os.path.join so the separator matches the host OS;
    # hard-coded backslashes only resolve on Windows.
    app = AudioRecorderApp(model_path=os.path.join(os.getcwd(), "models", "RNN3.h5"))

Button(description='Record Audio', style=ButtonStyle())