In [11]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv
import pandas as pd
import librosa as lb

import matplotlib.pyplot as plt
from IPython.display import Audio
from scipy.io import wavfile
import scipy

In [2]:
# Load the model.
model = hub.load('https://tfhub.dev/google/yamnet/1')

2024-11-17 15:24:10.078761: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-11-17 15:24:10.079145: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-11-17 15:24:10.079229: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2024-11-17 15:24:10.079297: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2024-11-17 15:24:10.079361: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Co

In [4]:
# Find the name of the class with the top score when mean-aggregated across frames.
def class_names_from_csv(class_map_csv_text):
  """Returns list of class names corresponding to score vector."""
  class_names = []
  with tf.io.gfile.GFile(class_map_csv_text) as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
      class_names.append(row['display_name'])

  return class_names

class_map_path = model.class_map_path().numpy()
class_names = class_names_from_csv(class_map_path)

In [5]:
def ensure_sample_rate(original_sample_rate, waveform,
                       desired_sample_rate=16000):
  """Resample waveform if required."""
  if original_sample_rate != desired_sample_rate:
    desired_length = int(round(float(len(waveform)) /
                               original_sample_rate * desired_sample_rate))
    waveform = scipy.signal.resample(waveform, desired_length)
  return desired_sample_rate, waveform

In [50]:
file_path = 'audio/block_length=8+c_method=kmeans+dataset=B+emb=pann+greedy_batch=1+n_clusters=30+n_iter=30+num_block=12+s_method=greedy+s_type=greedy_summary+scen=1+seed_clusters=0+step=summary_summary.csv'
audio_path = 'audio/block_length=8+c_method=kmeans+dataset=B+emb=pann+greedy_batch=1+n_clusters=30+n_iter=30+num_block=12+s_method=greedy+s_type=greedy_summary+scen=1+seed_clusters=0+step=summary_summary.wav'
audio_metadata = pd.read_csv(file_path, sep=',')

In [51]:
audio_metadata['summary_start'] = pd.to_datetime(audio_metadata['summary_start'], infer_datetime_format=True)
audio_metadata['summary_end'] = pd.to_datetime(audio_metadata['summary_end'], infer_datetime_format=True)

In [52]:
audio_chunks = []
desired_sample_rate = 16000 # the data should be sampled at 16000 
for i in range(1, len(audio_metadata) - 1):
    # The subtraction of the datetimes return a deltatime, that we can convert to float
    # by dividing it by timedelta64(1, 's')
    offset = (audio_metadata['summary_start'][i].to_numpy() - np.datetime64('today', 's')) / np.timedelta64(1, 's')    
    duration = ((audio_metadata['summary_end'][i] - audio_metadata['summary_start'][i]).to_numpy() / np.timedelta64(1, 's'))
    chunk, sr = lb.load(audio_path, sr=desired_sample_rate, offset=offset, duration=duration)
    audio_chunks.append((chunk, sr))

Testing for one chunk

In [20]:
# Show some basic information about the audio.
wav_data, sample_rate = audio_chunks[0]
duration = len(wav_data)/sample_rate
print(f'Sample rate: {sample_rate} Hz')
print(f'Total duration: {duration:.2f}s')
print(f'Size of the input: {len(wav_data)}')

# Listening to the wav file.
Audio(wav_data, rate=sample_rate)

Sample rate: 16000 Hz
Total duration: 5.00s
Size of the input: 80000


The wav_data needs to be normalized to values in [-1.0, 1.0] (as stated in the model's documentation).

In [21]:
waveform = wav_data / tf.int16.max

Running the model

In [22]:
# Run the model, check the output.
scores, embeddings, spectrogram = model(waveform)

In [23]:
scores_np = scores.numpy()
spectrogram_np = spectrogram.numpy()
infered_class = class_names[scores_np.mean(axis=0).argmax()]
print(f'The main sound is: {infered_class}')

The main sound is: Silence


In [47]:
np.array(class_names)[np.argsort(scores_np.mean(axis=0))[::-1]][:4]

array(['Silence', 'Speech', 'Music', 'Inside, small room'], dtype='<U40')

In [54]:
for i, [chunk, sr] in enumerate(audio_chunks):
    model = hub.load('https://tfhub.dev/google/yamnet/1')
    duration = len(chunk)/sr
    print(f'Chunk {i}')
    print(f'Total duration: {duration:.2f}s')

    # The wav_data needs to be normalized to values in [-1.0, 1.0]
    norm_chunk = chunk / tf.int16.max

    scores, embeddings, spectrogram = model(norm_chunk)
    scores_np = scores.numpy()

    preds = np.array(class_names)[np.argsort(scores_np.mean(axis=0))[::-1]][:4]

    for pred in preds:
        print(pred)

    # Listening to the wav file.
    Audio(chunk, rate=sr)

Chunk 0
Total duration: 5.00s
Silence
Speech
Music
Inside, small room
Chunk 1
Total duration: 8.00s
Silence
Speech
Music
Inside, small room
Chunk 2
Total duration: 8.00s
Silence
Speech
Music
Inside, small room
Chunk 3
Total duration: 8.00s
Silence
Speech
Music
Inside, small room
Chunk 4
Total duration: 8.00s
Silence
Speech
Music
Inside, small room
Chunk 5
Total duration: 8.00s
Silence
Speech
Music
Inside, small room
Chunk 6
Total duration: 8.00s
Silence
Speech
Music
Inside, small room
Chunk 7
Total duration: 8.00s
Silence
Speech
Music
Inside, small room
Chunk 8
Total duration: 8.00s
Silence
Speech
Music
Inside, small room
Chunk 9
Total duration: 8.00s
Silence
Speech
Music
Inside, small room
Chunk 10
Total duration: 8.00s
Silence
Speech
Music
Inside, small room
Chunk 11
Total duration: 8.00s
Silence
Speech
Music
Inside, small room
