## Imports

In [1]:
import os

from IPython import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub
import scipy
import tensorflow_io as tfio
import csv


2024-08-22 12:20:36.526722: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-22 12:20:36.530581: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-22 12:20:36.544333: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-22 12:20:36.564731: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-22 12:20:36.570905: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-22 12:20:36.588662: I tensorflow/core/platform/cpu_feature_gu

In [2]:
yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
yamnet_model = hub.load(yamnet_model_handle)

In [3]:
# Utility functions for loading audio files and making sure the sample rate is correct.

@tf.function
def load_wav_16k_mono(filename):
    """ Load a WAV file, convert it to a float tensor, resample to 16 kHz single-channel audio. """
    file_contents = tf.io.read_file(filename)
    wav, sample_rate = tf.audio.decode_wav(
          file_contents,
          desired_channels=1)
    wav = tf.squeeze(wav, axis=-1)
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)
    wav = tfio.audio.resample(wav, rate_in=sample_rate, rate_out=16000)
    
    return wav


In [4]:
class_map_path = yamnet_model.class_map_path().numpy().decode('utf-8')
class_names =list(pd.read_csv(class_map_path)['display_name'])

for name in class_names[:20]:
  print(name)
print('...')


Speech
Child speech, kid speaking
Conversation
Narration, monologue
Babbling
Speech synthesizer
Shout
Bellow
Whoop
Yell
Children shouting
Screaming
Whispering
Laughter
Baby laughter
Giggle
Snicker
Belly laugh
Chuckle, chortle
Crying, sobbing
...


## Data set

### Record your own data

In [59]:
import pyaudio
import wave
import os
import time

# Constants
CHUNK = 4096
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
DURATION = 10  # Duration of each recording in seconds
OUTPUT_FOLDER = "/home/focus/Bureau/Audio_Node/src/resources/recordings"  # Folder where recordings will be saved

# Create the output folder if it does not exist
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)

def record_audio(file_path):
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

    frames = []

    print("Recording...")
    for _ in range(int(RATE / CHUNK * DURATION)):
        data = stream.read(CHUNK, exception_on_overflow=False)
        frames.append(data)

    print("Recording finished.")
    stream.stop_stream()
    stream.close()
    p.terminate()

    with wave.open(file_path, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))

def main():
    index = 1
    while True:
            file_path = os.path.join(OUTPUT_FOLDER, f"{index}.wav")
            record_audio(file_path)
            index += 1
            time.sleep(1)  # Optional delay before starting the next recording
        

if __name__ == "__main__":
    main()


### Download from audio Set


    root_path: the path to the directory where the dataset will be downloaded.
    labels: a list of labels to download. If None, all labels will be downloaded.
    n_jobs: the number of parallel downloads. Default is 1.
    download_type: the type of download. It can be one of the following:
        balanced_train: balanced train set.
        unbalanced_train: unbalanced train set. This is the default
        eval: evaluation set.
    copy_and_replicate: if True if a file is associated to multiple labels, it will be copied and replicated for each label. If False, it will be associated to the first label in the list. Default is True.

The methods of the class are:

    download(format='vorbis', quality=5): downloads the dataset.
    The format can be one of the following (supported by yt-dlp --audio-format parameter):
        vorbis: downloads the dataset in Ogg Vorbis format. This is the default.
        wav: downloads the dataset in WAV format.
        mp3: downloads the dataset in MP3 format.
        m4a: downloads the dataset in M4A format.
        flac: downloads the dataset in FLAC format.
        opus: downloads the dataset in Opus format.
        webm: downloads the dataset in WebM format.
        ... and many more.
        The quality can be an integer between 0 and 10. Default is 5.
    read_class_mapping(): reads the class mapping file. It is not used externally.
    download_file(...): downloads a single file. It is not used externally.


### Clean downloaded Data

In [None]:
import os
import shutil

# Set your source and destination directories
source_dir = "audioset-download/audioset"
destination_dir = "/home/focus/Bureau/Audio_Node/src/resources/ray_audio/data/test"

# Create destination directory if it doesn't exist
os.makedirs(destination_dir, exist_ok=True)

# Set the size threshold (in bytes)
size_threshold = 150 * 1024  # 150 KB

# Loop through each folder and file in the source directory
for root, dirs, files in os.walk(source_dir):
    for file in files:
        if file.endswith('.wav'):
            file_path = os.path.join(root, file)
            # Check the file size
            if os.path.getsize(file_path) >= size_threshold:
                # Move the file to the destination directory
                shutil.move(file_path, os.path.join(destination_dir, file))
            else:
                # Delete the file if it's smaller than the threshold
                os.remove(file_path)

print("Operation completed.")


In [6]:
import os
import pandas as pd

def create_dataframe_from_directories(base_path, class_folders):
    # List to hold information about each file
    data = []
    
    # Loop through each class folder
    for class_name in class_folders:
        class_path = os.path.join(base_path, class_name)
        if os.path.isdir(class_path):
            # List all .wav files in the class folder
            for filename in os.listdir(class_path):
                if filename.endswith('.wav'):
                    file_path = os.path.join(class_path, filename)
                    data.append({'filename': file_path, 'category': class_name})
    
    # Create DataFrame
    df = pd.DataFrame(data)
    return df

# Example usage
base_data_path = '/home/focus/Bureau/Audio_Node/src/resources/ray_audio/fdata/train/' #Base path for your training data
class_folders = ['Speech', 'Silence', 'Music', 'Beep', 'Robot_moving'] #your classes, each class is in a folder
df = create_dataframe_from_directories(base_data_path, class_folders)


In [7]:
# Define your classes and map them to IDs
my_classes = ['Speech', 'Silence', 'Music', 'Beep', 'Robot_moving']
map_class_to_id = {cls: idx for idx, cls in enumerate(my_classes)}

# Filter DataFrame for specified classes (if needed)
filtered_df = df[df['category'].isin(my_classes)]

# Map class names to IDs
filtered_df['class_id'] = filtered_df['category'].apply(lambda name: map_class_to_id[name])

# Optionally, add a full path column if needed
#filtered_df['full_path'] = filtered_df['filename'].apply(lambda row: os.path.abspath(row))

# Display the DataFrame
print(filtered_df.head(550))


                                              filename      category  class_id
0    /home/focus/Bureau/Audio_Node/src/resources/ra...        Speech         0
1    /home/focus/Bureau/Audio_Node/src/resources/ra...        Speech         0
2    /home/focus/Bureau/Audio_Node/src/resources/ra...        Speech         0
3    /home/focus/Bureau/Audio_Node/src/resources/ra...        Speech         0
4    /home/focus/Bureau/Audio_Node/src/resources/ra...        Speech         0
..                                                 ...           ...       ...
479  /home/focus/Bureau/Audio_Node/src/resources/ra...  Robot_moving         4
480  /home/focus/Bureau/Audio_Node/src/resources/ra...  Robot_moving         4
481  /home/focus/Bureau/Audio_Node/src/resources/ra...  Robot_moving         4
482  /home/focus/Bureau/Audio_Node/src/resources/ra...  Robot_moving         4
483  /home/focus/Bureau/Audio_Node/src/resources/ra...  Robot_moving         4

[484 rows x 3 columns]


In [8]:

filenames = filtered_df['filename'].tolist()
targets = filtered_df['class_id'].tolist()

main_ds = tf.data.Dataset.from_tensor_slices((filenames, targets))
main_ds=main_ds.shuffle(buffer_size=1000)

# Function to print dataset labels
def print_dataset_labels(dataset, num_samples=500):
    for i, (filename, target) in enumerate(dataset.take(num_samples)):

            print(f"Sample {i+1}: Filename = {filename.numpy()}, Class ID = {target.numpy()}")
        
print_dataset_labels(main_ds)

Sample 1: Filename = b'/home/focus/Bureau/Audio_Node/src/resources/ray_audio/fdata/train/Speech/-QcXMPUNMeM_0.0-10.0.wav', Class ID = 0
Sample 2: Filename = b'/home/focus/Bureau/Audio_Node/src/resources/ray_audio/fdata/train/Beep/CBFq-G-Tigg_0.0-10.0.wav', Class ID = 3
Sample 3: Filename = b'/home/focus/Bureau/Audio_Node/src/resources/ray_audio/fdata/train/Beep/Aw2twjnBhBg_0.0-10.0.wav', Class ID = 3
Sample 4: Filename = b'/home/focus/Bureau/Audio_Node/src/resources/ray_audio/fdata/train/Silence/-GB9OL7pf-E_320.0-330.0.wav', Class ID = 1
Sample 5: Filename = b'/home/focus/Bureau/Audio_Node/src/resources/ray_audio/fdata/train/Speech/10mtAupS0I4_0.0-10.0.wav', Class ID = 0
Sample 6: Filename = b'/home/focus/Bureau/Audio_Node/src/resources/ray_audio/fdata/train/Speech/-QivqOnaPIc_100.0-110.0.wav', Class ID = 0
Sample 7: Filename = b'/home/focus/Bureau/Audio_Node/src/resources/ray_audio/fdata/train/Beep/j2bpPTTUdVw_0.0-2.0.wav', Class ID = 3
Sample 8: Filename = b'/home/focus/Bureau/Audio_

2024-08-19 16:06:02.709463: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [9]:
def load_wav_for_map(filename, label):
  return load_wav_16k_mono(filename), label

main_ds = main_ds.map(load_wav_for_map)
main_ds.element_spec


2024-08-19 16:06:10.281253: I tensorflow_io/core/kernels/cpu_check.cc:128] Your CPU supports instructions that this TensorFlow IO binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA






(TensorSpec(shape=<unknown>, dtype=tf.float32, name=None),
 TensorSpec(shape=(), dtype=tf.int32, name=None))

In [10]:
# applies the embedding extraction model to a wav data
def extract_embedding(wav_data, label):
  ''' run YAMNet to extract embedding from the wav data '''
  scores, embeddings, spectrogram = yamnet_model(wav_data)
  num_embeddings = tf.shape(embeddings)[0]
  return (embeddings,
            tf.repeat(label, num_embeddings))
            

# extract embedding
main_ds = main_ds.map(extract_embedding).unbatch()
main_ds.element_spec


(TensorSpec(shape=(1024,), dtype=tf.float32, name=None),
 TensorSpec(shape=(), dtype=tf.int32, name=None))

In [11]:


# Shuffle the dataset
shuffled_ds = main_ds.shuffle(buffer_size=len(filenames), reshuffle_each_iteration=False)

# Define the split proportions
train_size = int(0.9 * len(filenames))
val_size = int(0.1 * len(filenames))


# Split the dataset
train_ds = shuffled_ds.take(train_size)
remaining_ds = shuffled_ds.skip(train_size)
val_ds = remaining_ds.take(val_size)


# Print dataset specs for verification
print("Train dataset:", train_ds.element_spec)
print("Validation dataset:", val_ds.element_spec)



Train dataset: (TensorSpec(shape=(1024,), dtype=tf.float32, name=None), TensorSpec(shape=(), dtype=tf.int32, name=None))
Validation dataset: (TensorSpec(shape=(1024,), dtype=tf.float32, name=None), TensorSpec(shape=(), dtype=tf.int32, name=None))


In [12]:
train_ds = train_ds.cache().shuffle(1000).batch(32).prefetch(tf.data.AUTOTUNE).repeat()
val_ds = val_ds.cache().batch(32).prefetch(tf.data.AUTOTUNE)




tf.io.read_file

<function tensorflow.python.ops.io_ops.read_file(filename, name=None)>

## Model Training

In [13]:
my_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(1024,), dtype=tf.float32,
                          name='input_embedding'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(len(my_classes))
], name='my_model')

my_model.summary()


In [14]:
my_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                 optimizer="adam",
                 metrics=['accuracy'])

callback = tf.keras.callbacks.EarlyStopping(monitor='loss',
                                            patience=3,
                                            restore_best_weights=True)


In [15]:
history = my_model.fit(train_ds,
                       epochs=20,
                       steps_per_epoch=300,
                       validation_data=val_ds,
                       callbacks=callback)


Epoch 1/20
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 34ms/step - accuracy: 0.8155 - loss: 0.5940 - val_accuracy: 0.5208 - val_loss: 3.1140
Epoch 2/20
[1m 11/300[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4s[0m 16ms/step - accuracy: 0.9768 - loss: 0.0586

2024-08-19 16:08:14.897593: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
  self.gen.throw(value)


[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.9735 - loss: 0.1093 - val_accuracy: 0.5417 - val_loss: 3.4315
Epoch 3/20
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.9813 - loss: 0.0401 - val_accuracy: 0.5417 - val_loss: 3.8751
Epoch 4/20
[1m 12/300[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4s[0m 15ms/step - accuracy: 0.9866 - loss: 0.0307

2024-08-19 16:08:23.701801: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.9826 - loss: 0.0402 - val_accuracy: 0.5417 - val_loss: 4.0949
Epoch 5/20
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.9843 - loss: 0.0376 - val_accuracy: 0.5417 - val_loss: 4.2414
Epoch 6/20
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.9858 - loss: 0.0331 - val_accuracy: 0.5417 - val_loss: 4.4233
Epoch 7/20
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.9844 - loss: 0.0336 - val_accuracy: 0.5417 - val_loss: 4.6178
Epoch 8/20
[1m  9/300[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4s[0m 15ms/step - accuracy: 0.9856 - loss: 0.0260

2024-08-19 16:08:41.243898: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.9853 - loss: 0.0327 - val_accuracy: 0.5417 - val_loss: 4.7574
Epoch 9/20
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.9889 - loss: 0.0291 - val_accuracy: 0.5417 - val_loss: 4.7961
Epoch 10/20
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.9855 - loss: 0.0321 - val_accuracy: 0.5417 - val_loss: 5.0159
Epoch 11/20
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.9871 - loss: 0.0393 - val_accuracy: 0.5625 - val_loss: 5.0311
Epoch 12/20
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.9880 - loss: 0.0298 - val_accuracy: 0.5625 - val_loss: 5.1151
Epoch 13/20
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.9841 - loss: 0.0335 - val_accuracy: 0.5625 - val_loss: 5.2319


## Testing model

In [16]:
wav=load_wav_16k_mono('/home/focus/Bureau/Audio_Node/src/resources/ray_audio/fdata/test/Silence/-v5tNN6YADM_0.0-10.0.wav')





In [35]:
scores, embeddings, spectrogram = yamnet_model(wav)
result = my_model(embeddings).numpy()

inferred_class = my_classes[result.mean(axis=0).argmax()]
print(f'My model :The main sound is: {inferred_class}')

scores, embeddings, spectrogram = yamnet_model(wav)
class_scores = tf.reduce_mean(scores, axis=0)
top_class = tf.argmax(class_scores)
inferred_class = class_names[top_class]

print(f'Yamnet : The main sound is: {inferred_class}')


My model :The main sound is: Speech
Yamnet : The main sound is: Silence


## Save model

In [37]:
my_model.save('saving_costum_yamnett/my_model.keras') #Path to save your model


## Load model

In [3]:
yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
yamnet_model = hub.load(yamnet_model_handle)

In [5]:
loaded_model=tf.keras.models.load_model('saving_costum_yamnet/my_model.keras')

## Testing model

In [6]:
@tf.function
def load_wav_16k_mono(filename):
    """ Load a WAV file, convert it to a float tensor, resample to 16 kHz single-channel audio. """
    file_contents = tf.io.read_file(filename)
    wav, sample_rate = tf.audio.decode_wav(
          file_contents,
          desired_channels=1)
    wav = tf.squeeze(wav, axis=-1)
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)
    wav = tfio.audio.resample(wav, rate_in=sample_rate, rate_out=16000)
    
    return wav

In [7]:
my_classes = ['Speech', 'Silence', 'Music', 'Beep', 'Robot_moving']
class_map_path = yamnet_model.class_map_path().numpy().decode('utf-8')
class_names =list(pd.read_csv(class_map_path)['display_name'])

In [56]:
wav=load_wav_16k_mono('/home/focus/Bureau/Audio_Node/src/resources/Transfer_learning_yamnet/test_wav/beep_for_mymodel.wav')





In [57]:
display.Audio(wav,rate=16000)

In [58]:
embeddings= yamnet_model(wav)[1]
result = loaded_model(embeddings).numpy()

inferred_class = my_classes[result.mean(axis=0).argmax()]
print(f'My model :The main sound is: {inferred_class}')

scores, embeddings, spectrogram = yamnet_model(wav)
class_scores = tf.reduce_mean(scores, axis=0)
top_class = tf.argmax(class_scores)
inferred_class = class_names[top_class]

print(f'Yamnet : The main sound is: {inferred_class}')


My model :The main sound is: Beep
Yamnet : The main sound is: Silence


As you can see, my model surpasses yamnet in some fields , and gets surpassed in some other, this is dew to the limited data used for fine tuning, to acheive better results we can simply provide more data for the model

 ## Real time detection

In [18]:
import pyaudio

In [19]:

# Audio stream parameters
import collections
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000  # YAMNet expects 16kHz audio
CHUNK = 6000  # Number of audio samples per frame
#model = hub.load('https://tfhub.dev/google/yamnet/1')
audio_buffer = collections.deque(maxlen=RATE)

def process_audio(audio_data):
    # Convert audio data to float32
    audio_data = audio_data.astype(np.float32)
    # Normalize audio data
    audio_data = audio_data / 32768.0

    # Make predictions (highlight this if you dont want to see yamnet)
    scores, embeddings, spectrogram = yamnet_model(audio_data)
    scores_np = scores.numpy()
    spectrogram_np = spectrogram.numpy()
    infered_class = class_names[scores_np.mean(axis=0).argmax()]
    print(f'Yamnet: {infered_class}')
    # Post-process the scores
    # predictions = np.mean(scores, axis=0)
    # top_class = np.argmax(predictions)
    # top_score = predictions[top_class]
    # make predictions with my model
    result = loaded_model(embeddings).numpy()
    my_classes = ['Speech', 'Silence', 'Music', 'Beep', 'Robot_moving']
    mymodel_class = my_classes[result.mean(axis=0).argmax()]
    print(f'My model : {mymodel_class}')

    #print(f"Predicted class: {top_class}, Score: {top_score}")

def audio_callback(in_data, frame_count, time_info, status):
    # Process the audio data here
    audio_data = np.frombuffer(in_data, dtype=np.int16)
    # You can pass this data to your model for prediction
    process_audio(audio_data)
    return (in_data, pyaudio.paContinue)

p = pyaudio.PyAudio()
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK,
                stream_callback=audio_callback)

stream.start_stream()

try:
    while stream.is_active():
        pass  # Keep the stream active
except KeyboardInterrupt:
    pass

stream.stop_stream()
stream.close()
p.terminate()

# Load the YAMNet model



ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'


Yamnet: Silence
My model : Silence
Yamnet: Silence
My model : Silence
Yamnet: Silence
My model : Silence
Yamnet: Silence
My model : Silence
Yamnet: Silence
My model : Silence
Yamnet: Silence
My model : Silence
Yamnet: Silence
My model : Silence
Yamnet: Silence
My model : Silence
Yamnet: Silence
My model : Silence
Yamnet: Silence
My model : Silence
Yamnet: Silence
My model : Silence
Yamnet: Silence
My model : Silence
Yamnet: Silence
My model : Silence
Yamnet: Silence
My model : Silence
Yamnet: Silence
My model : Silence
Yamnet: Silence
My model : Silence
Yamnet: Stomach rumble
My model : Silence
Yamnet: Silence
My model : Silence
Yamnet: Inside, small room
My model : Silence
Yamnet: Hiccup
My model : Silence
Yamnet: Silence
My model : Silence
Yamnet: Silence
My model : Silence
Yamnet: Silence
My model : Silence
Yamnet: Stomach rumble
My model : Silence
Yamnet: Silence
My model : Silence
Yamnet: Silence
My model : Silence
Yamnet: Silence
My model : Silence
Yamnet: Speech
My model : Silen

## Time stamp

In [87]:
from collections import defaultdict
from scipy.io import wavfile

In [46]:
def ensure_sample_rate(original_sample_rate, waveform,
                       desired_sample_rate=16000):
  """Resample waveform if required."""
  if original_sample_rate != desired_sample_rate:
    desired_length = int(round(float(len(waveform)) /
                               original_sample_rate * desired_sample_rate))
    waveform = scipy.signal.resample(waveform, desired_length)
  return desired_sample_rate, waveform


In [47]:
def class_names_from_csv(class_map_csv_text):
  """Returns list of class names corresponding to score vector."""
  class_names = []
  with tf.io.gfile.GFile(class_map_csv_text) as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
      class_names.append(row['display_name'])

  return class_names

In [54]:
from collections import defaultdict
from scipy.io import wavfile

In [60]:

wav_file_name = '/home/focus/Bureau/Audio_Node/src/resources/recordings/beep1'
wav_data=load_wav_16k_mono(wav_file_name)
sample_rate, wav_data = ensure_sample_rate(16000, wav_data)
waveform = wav_data /  np.iinfo(np.int16).max  # Assuming int16 waveform

# Get the model predictions
scores, embeddings, spectrogram = yamnet_model(waveform)  # Assuming 'model' is already loaded

# Load class names
class_map_path = yamnet_model.class_map_path().numpy()
class_names = class_names_from_csv(class_map_path)

# Process the scores to find presence intervals
top_class_threshold = 0.1
window_duration = 0.96  # Each window is 0.96 seconds
presence_intervals = defaultdict(list)

# Debug: Track top scores per window
top_scores_debug = []

sample_rate, wav_data = wavfile.read(wav_file_name)
sample_rate, wav_data = ensure_sample_rate(sample_rate, wav_data)
waveform = wav_data / np.iinfo(np.int16).max  # Corrected the normalization



track_duration = len(waveform) / sample_rate
result = loaded_model(embeddings).numpy()
my_classes = ['Speech', 'Silence', 'Music', 'Beep', 'Robot_moving']
mymodel_class = my_classes[result.mean(axis=0).argmax()]
top_class_threshold = 0.5
window_duration = 1  # Each window is 0.96 seconds
print(round(track_duration)/window_duration)
for i in range((round(track_duration)//window_duration)+1):
    window_start = i-window_duration-1 * window_duration
    window_end = min(window_start + window_duration, track_duration)  # Ensure window_end does not exceed track duration
    top_class = np.argmax(scores[i])
    top_score = scores[i][top_class]

    if top_score >= top_class_threshold:
        class_name = class_names[top_class]
        print(f"Time Stamp: [{window_start:.2f}s, {window_end:.2f}s], Class: {class_name}, Score: {top_score:.2f}")
        print('my model:',mymodel_class)

20.0
Time Stamp: [-2.00s, -1.00s], Class: Silence, Score: 1.00
my model: Silence
Time Stamp: [-1.00s, 0.00s], Class: Silence, Score: 1.00
my model: Silence
Time Stamp: [0.00s, 1.00s], Class: Silence, Score: 1.00
my model: Silence
Time Stamp: [1.00s, 2.00s], Class: Silence, Score: 1.00
my model: Silence
Time Stamp: [2.00s, 3.00s], Class: Silence, Score: 1.00
my model: Silence
Time Stamp: [3.00s, 4.00s], Class: Silence, Score: 1.00
my model: Silence
Time Stamp: [4.00s, 5.00s], Class: Silence, Score: 1.00
my model: Silence
Time Stamp: [5.00s, 6.00s], Class: Silence, Score: 1.00
my model: Silence
Time Stamp: [6.00s, 7.00s], Class: Silence, Score: 1.00
my model: Silence
Time Stamp: [7.00s, 8.00s], Class: Silence, Score: 1.00
my model: Silence
Time Stamp: [8.00s, 9.00s], Class: Silence, Score: 1.00
my model: Silence
Time Stamp: [9.00s, 10.00s], Class: Silence, Score: 1.00
my model: Silence
Time Stamp: [10.00s, 11.00s], Class: Silence, Score: 1.00
my model: Silence
Time Stamp: [11.00s, 12.00s