In [81]:
# data manipulation
import numpy as np
import pandas as pd

# progress bar
from tqdm.notebook import tqdm_notebook

# audio manipulation
import librosa

# display things
from IPython import display
import matplotlib.pyplot as plt

# tensorflow stuff
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_io as tfio

# convert label to integer
from sklearn.preprocessing import LabelEncoder

# split in train/test data
from sklearn.model_selection import train_test_split

from tensorflow.keras.metrics import AUC

# Load Yamnet and audio samples

In [26]:
yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
yamnet_model = hub.load(yamnet_model_handle)

In [44]:
metadata = pd.read_csv("processed_metadata.csv")
np.shape(metadata)

(6944, 7)

# Test

In [7]:
sample, sr = librosa.load(metadata.iloc[0]["path"], sr=None, mono=True)
# Play the audio file.
display.Audio(sample,rate=16000)

In [8]:
class_map_path = yamnet_model.class_map_path().numpy().decode('utf-8')
class_names =list(pd.read_csv(class_map_path)['display_name'])

In [9]:
scores, embeddings, spectrogram = yamnet_model(sample)
class_scores = tf.reduce_mean(scores, axis=0)
top_class = tf.argmax(class_scores)
inferred_class = class_names[top_class]

print(f'The main sound is: {inferred_class}')
print(f'The embeddings shape: {embeddings.shape}')

The main sound is: Speech
The embeddings shape: (20, 1024)


# Pre-processing dataset

Extract the label and convert into numerical features

In [28]:
# Transforming non numerical labels into numerical labels
labels = metadata["label"]
encoder = LabelEncoder()

# encoding train labels
y = encoder.fit_transform(labels)
y

array([0, 0, 0, ..., 2, 2, 2])

In [45]:
metadata["y"] = y
metadata = metadata.sample(frac=1, random_state=100)
metadata.head()

Unnamed: 0,path,segment,mod,label,samplingrate,ms,samples,y
6725,./Labelled/Others/evt_000_003_063391_210513_08...,1,2,Others,16000,3000.0,processed_samples/6725.wav,2
2140,./Labelled/ContactCalls/evt_002_003_079489_210...,4,0,ContactCalls,16000,3000.0,processed_samples/2140.wav,1
5789,./Labelled/ContactCalls/evt_002_001_074446_210...,4,1,ContactCalls,16000,3000.0,processed_samples/5789.wav,1
5135,./Labelled/ContactCalls/evt_001_003_018953_210...,1,2,ContactCalls,16000,3000.0,processed_samples/5135.wav,1
726,./Labelled/Bleatings/evt_000_003_009620_210419...,0,0,Bleatings,16000,3000.0,processed_samples/726.wav,0


Randomly split data in train/validation/test with a 60%/20%/20% split

In [52]:
fold = np.zeros(len(metadata))

# first split to obtain train + test
train, test = train_test_split(metadata, test_size=0.2, stratify=y)
test_idx = test.index.tolist()
for i in test_idx:
    fold[i] = 2

In [56]:
# second split to obtain  train + validation
train, validation = train_test_split(train, test_size=0.25, stratify=train["y"])
validation_idx = validation.index.tolist()
for i in validation_idx:
    fold[i] = 1

In [57]:
pd.DataFrame(fold).value_counts()

0.0    4166
1.0    1389
2.0    1389
dtype: int64

In [58]:
metadata["fold"] = fold
metadata.head()

Unnamed: 0,path,segment,mod,label,samplingrate,ms,samples,y,fold
6725,./Labelled/Others/evt_000_003_063391_210513_08...,1,2,Others,16000,3000.0,processed_samples/6725.wav,2,0.0
2140,./Labelled/ContactCalls/evt_002_003_079489_210...,4,0,ContactCalls,16000,3000.0,processed_samples/2140.wav,1,0.0
5789,./Labelled/ContactCalls/evt_002_001_074446_210...,4,1,ContactCalls,16000,3000.0,processed_samples/5789.wav,1,0.0
5135,./Labelled/ContactCalls/evt_001_003_018953_210...,1,2,ContactCalls,16000,3000.0,processed_samples/5135.wav,1,2.0
726,./Labelled/Bleatings/evt_000_003_009620_210419...,0,0,Bleatings,16000,3000.0,processed_samples/726.wav,0,0.0


##  Prepare data for YAMNnet

Let's define two functions. The first is used to load the audio samples from their path, the latter to extract the mid-term features from YAMNnet

In [64]:
def load_wav_for_map(filename, label, fold):
    """
    Load a WAV file, convert it to a float tensor, resample to 16 kHz single-channel audio."""
    file_contents = tf.io.read_file(filename)
    wav, sample_rate = tf.audio.decode_wav(
          file_contents,
          desired_channels=1)
    wav = tf.squeeze(wav, axis=-1)
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)
    wav = tfio.audio.resample(wav, rate_in=sample_rate, rate_out=16000)
    
    return wav, label, fold

def extract_embedding(wav_data, label, fold):
    ''' applies YAMNet to extract embedding from the wav data '''
    scores, embeddings, spectrogram = yamnet_model(wav_data)
    num_embeddings = tf.shape(embeddings)[0]
    return (embeddings,
            tf.repeat(label, num_embeddings),
            tf.repeat(fold, num_embeddings))


Make a tensor extracting only the usefull column from metadata

In [65]:
filenames = metadata['samples']
targets = metadata['y']
folds = metadata['fold']

main_ds = tf.data.Dataset.from_tensor_slices((filenames, targets, folds))
main_ds.element_spec

(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(), dtype=tf.int32, name=None),
 TensorSpec(shape=(), dtype=tf.float64, name=None))

In [66]:
list(main_ds.as_numpy_iterator())[:5]

[(b'processed_samples/6725.wav', 2, 0.0),
 (b'processed_samples/2140.wav', 1, 0.0),
 (b'processed_samples/5789.wav', 1, 0.0),
 (b'processed_samples/5135.wav', 1, 2.0),
 (b'processed_samples/726.wav', 0, 0.0)]

Load audio data in the tensor from their path

In [67]:
main_ds = main_ds.map(load_wav_for_map)
main_ds.element_spec





(TensorSpec(shape=<unknown>, dtype=tf.float32, name=None),
 TensorSpec(shape=(), dtype=tf.int32, name=None),
 TensorSpec(shape=(), dtype=tf.float64, name=None))

Feed each audio sample to YAMNnet and export the embedding features in the tensor

In [68]:
# extract embedding
main_ds = main_ds.map(extract_embedding).unbatch()
main_ds.element_spec

(TensorSpec(shape=(1024,), dtype=tf.float32, name=None),
 TensorSpec(shape=(), dtype=tf.int32, name=None),
 TensorSpec(shape=(), dtype=tf.float64, name=None))

Finally divide the data in train/validation/test tensors.

In [69]:
cached_ds = main_ds.cache()
train_ds = cached_ds.filter(lambda embedding, label, fold: fold == 0)
val_ds = cached_ds.filter(lambda embedding, label, fold: fold == 1)
test_ds = cached_ds.filter(lambda embedding, label, fold: fold == 2)

# remove the folds column now that it's not needed anymore
remove_fold_column = lambda embedding, label, fold: (embedding, label)

train_ds = train_ds.map(remove_fold_column)
val_ds = val_ds.map(remove_fold_column)
test_ds = test_ds.map(remove_fold_column)

train_ds = train_ds.cache().shuffle(1000).batch(32).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.cache().batch(32).prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.cache().batch(32).prefetch(tf.data.AUTOTUNE)

# Feed Forward Model

The next step is define the FF model to train on top of the embeddings

In [93]:
my_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(1024), dtype=tf.float32,
                          name='input_embedding'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
], name='my_model')

my_model.summary()

Model: "my_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 512)               524800    
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 1539      
Total params: 526,339
Trainable params: 526,339
Non-trainable params: 0
_________________________________________________________________


Compile and fit the model

In [94]:
my_model.compile(loss="categorical_crossentropy",
                 optimizer="adam",
                 metrics=['accuracy'])

callback = tf.keras.callbacks.EarlyStopping(monitor="loss", mode="min", patience=5, restore_best_weights=True)

In [95]:
history = my_model.fit(train_ds,
                       epochs=50,
                       validation_data=val_ds,
                       callbacks=callback)

Epoch 1/50


ValueError: in user code:

    C:\Users\Nicolas Facchinetti\.conda\envs\progapr\lib\site-packages\keras\engine\training.py:853 train_function  *
        return step_function(self, iterator)
    C:\Users\Nicolas Facchinetti\.conda\envs\progapr\lib\site-packages\keras\engine\training.py:842 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\Nicolas Facchinetti\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\distribute\distribute_lib.py:1286 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\Nicolas Facchinetti\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\distribute\distribute_lib.py:2849 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\Nicolas Facchinetti\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\distribute\distribute_lib.py:3632 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\Nicolas Facchinetti\.conda\envs\progapr\lib\site-packages\keras\engine\training.py:835 run_step  **
        outputs = model.train_step(data)
    C:\Users\Nicolas Facchinetti\.conda\envs\progapr\lib\site-packages\keras\engine\training.py:788 train_step
        loss = self.compiled_loss(
    C:\Users\Nicolas Facchinetti\.conda\envs\progapr\lib\site-packages\keras\engine\compile_utils.py:201 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    C:\Users\Nicolas Facchinetti\.conda\envs\progapr\lib\site-packages\keras\losses.py:141 __call__
        losses = call_fn(y_true, y_pred)
    C:\Users\Nicolas Facchinetti\.conda\envs\progapr\lib\site-packages\keras\losses.py:245 call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    C:\Users\Nicolas Facchinetti\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\util\dispatch.py:206 wrapper
        return target(*args, **kwargs)
    C:\Users\Nicolas Facchinetti\.conda\envs\progapr\lib\site-packages\keras\losses.py:1665 categorical_crossentropy
        return backend.categorical_crossentropy(
    C:\Users\Nicolas Facchinetti\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\util\dispatch.py:206 wrapper
        return target(*args, **kwargs)
    C:\Users\Nicolas Facchinetti\.conda\envs\progapr\lib\site-packages\keras\backend.py:4839 categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)
    C:\Users\Nicolas Facchinetti\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\framework\tensor_shape.py:1161 assert_is_compatible_with
        raise ValueError("Shapes %s and %s are incompatible" % (self, other))

    ValueError: Shapes (None, 1) and (None, 3) are incompatible


In [36]:
loss, accuracy = my_model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  4.693575382232666
Accuracy:  0.46406251192092896


In [39]:
scores, embeddings, spectrogram = yamnet_model(sample)
result = my_model(embeddings).numpy()

result

array([[-3.8989975, -2.4609382,  5.22075  ],
       [-3.550753 , -2.6697001,  5.231956 ],
       [-3.8163903, -3.8123784,  3.9993355],
       [-3.6392324, -3.2858505,  4.9577637],
       [-3.9828198, -2.9644022,  5.331527 ],
       [-7.8536735, -6.148269 ,  7.651672 ],
       [-4.4192176, -3.1238225,  6.059081 ],
       [-4.216107 , -2.717183 ,  6.1340322],
       [-3.596152 , -1.8110838,  4.993352 ],
       [-4.655001 , -2.6294389,  5.6115794],
       [-4.7516475, -3.747163 ,  6.361213 ],
       [-5.4048896, -4.169555 ,  6.37632  ],
       [-5.500675 , -4.025247 ,  6.5098023],
       [-3.3068006, -1.9992609,  3.027211 ],
       [-2.0638216, -3.8398607,  0.661803 ],
       [-2.8353307, -1.802026 ,  4.0528092],
       [-4.353722 , -2.8579116,  6.0163465],
       [-3.2781126, -4.197918 ,  2.4091923],
       [-3.3510053, -5.6749897,  3.2377594],
       [-2.565227 , -2.9833746,  2.6066797]], dtype=float32)

In [43]:
result.mean(axis=0)

array([-4.0519786, -3.3460186,  4.8225107], dtype=float32)

In [42]:
result.mean(axis=0).argmax()

2

In [41]:
encoder.inverse_transform(result.mean(axis=0).argmax())

ValueError: y should be a 1d array, got an array of shape () instead.

In [None]:
inferred_class = my_classes[result.mean(axis=0).argmax()]
print(f'The main sound is: {inferred_class}')