In [1]:
%load_ext autoreload
%autoreload 2


import tensorflow as tf
from tensorflow  import keras
import tensorflow_hub as hub
import numpy as np

import librosa 
from util import WavDataset
import matplotlib.pyplot as plt

In [28]:
sr = 16_000

def compute_frame_labels(label_tensor, frame_length=int(sr*0.96), step_size=int(sr*0.48), threshold=0.15):
    n_labels, total_samples = label_tensor.shape
    n_frames = total_samples // step_size 
    
    frame_labels = np.zeros((n_labels, n_frames), dtype=int)
    
    for i in range(n_frames):
        start = i * step_size
        end = start + frame_length
        frame = label_tensor[:, start:end]
        
        # is there >15% annotations in the frame
        frame_label = (np.mean(frame, axis=1) >= threshold).astype(int)
        frame_labels[:, i] = frame_label
    
    return frame_labels

Y = np.zeros((4, 16_000 * 5))
Y[0, 32_000:] = 1
Y[2, :32_000] = 1
Y[3, -8_000:] = 1
compute_frame_labels(Y)

array([[0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]])

In [3]:
yamnet_url = 'https://tfhub.dev/google/yamnet/1'
yamnet_layer = hub.KerasLayer(yamnet_url, input_shape=(None,), dtype=tf.float32, trainable=False)

21:14:31 INFO Using /tmp/tfhub_modules to cache modules.
2024-09-08 21:14:31.781994: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [35]:
import h5py
from config import *
from pathlib import Path

np.random.seed(0)

def get_chunk_data(dataset, chunk):
    rec, s, e = chunk
    Y = dataset[rec]['Y']  
    samples = dataset[rec]['X']

    # error is here
    s_slice = np.array(samples[s:e])
    y_slice = np.array(Y[:, s:e])
    
    if y_slice.shape != (4, 80000):
        raise ValueError(y_slice.shape, s_slice.shape, rec, s, e)
    
    y_frames = compute_frame_labels(y_slice)
    return s_slice.T, y_frames.T

def chunk_generator():
    chunk_len = 5
    overlap = 1
    sr = 16_000
    hdf5_file = INTERMEDIATE / 'train.hdf5'
    chunk_len *= sr
    overlap *= sr
    
    with h5py.File(hdf5_file, 'r') as hdf5_dataset:

        # get chunk info 
        chunks = []
        for rec in list(hdf5_dataset):
            Y = hdf5_dataset[rec]['Y']     
            n_samples = Y.shape[1]
            chunks += [
                (rec, start, start + chunk_len) 
                for start in range(0, n_samples - chunk_len, chunk_len - overlap)
            ]

        # shuffle all chunks
        np.random.shuffle(chunks)
    
        # generate chunks
        for random_chunk in chunks:
            rec, s, e = random_chunk
            yield get_chunk_data(hdf5_dataset, random_chunk)
            

raw_dataset = tf.data.Dataset.from_generator(
    chunk_generator,
    output_signature=(
        tf.TensorSpec(shape=(None), dtype=tf.float32),
        tf.TensorSpec(shape=(10, 4), dtype=tf.bool)))

def get_embeddings(sample_chunk, Y_chunk): # ! most computation is spend here
    _, emb, _ = yamnet_layer(sample_chunk)
    return emb, Y_chunk 

full_dataset = raw_dataset.map(lambda x, y: get_embeddings(x, y))

for s in full_dataset.take(32):
    X, Y = s
    if (X.shape, Y.shape) != ((10, 1024), (10, 4)):
        print("!! -> ", X.shape, Y.shape)
        break

133921


In [32]:
# inp = keras.Input(shape=(16_000 * 5))
# _, embeddings, _ = YAMnet(inp)
# output = keras.layers.Dense(4, activation='sigmoid')(embeddings)
# model = keras.Sequential(inputs=inp, outputs=output)

# model = tf.keras.Sequential([
#     yamnet_base,
#     # tf.keras.layers.GlobalAveragePooling1D(),  # Reduce the dimensionality
#     tf.keras.layers.Dense(4, activation='softmax')  # Output layer for classification
# ])

from keras.layers import Input, Dense

# inputs = layers.Input(shape=(9, 1024), dtype=tf.float32)
# x = layers.Dense(256, activation='relu')(inputs)
# x = layers.Dense(128, activation='relu')(x)
# x = layers.Dense(64, activation='relu')(x)
# outputs = layers.Dense(4, activation='sigmoid')(x)
# model = tf.keras.Model(inputs=inputs, outputs=outputs)

model = keras.Sequential(
    [
        Input(shape=(10, 1024)),
        Dense(256, activation='relu'),
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(4, activation='sigmoid')
    ]
)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',  
    metrics=['accuracy']
)

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 10, 256)           262400    
                                                                 
 dense_5 (Dense)             (None, 10, 128)           32896     
                                                                 
 dense_6 (Dense)             (None, 10, 64)            8256      
                                                                 
 dense_7 (Dense)             (None, 10, 4)             260       
                                                                 
Total params: 303,812
Trainable params: 303,812
Non-trainable params: 0
_________________________________________________________________


In [None]:
# 133921 chunks
# 4185 batches

batched = full_dataset.batch(32)
history = model.fit(batched, epochs=1)

133921
     15/Unknown - 5s 5s/step - loss: 8.2241e-05 - accuracy: 0.0000e+ - 10s 5s/step - loss: 6.1324e-05 - accuracy: 0.0000e+0 - 15s 5s/step - loss: 5.2027e-05 - accuracy: 0.0000e+0 - 19s 5s/step - loss: 0.0071 - accuracy: 0.0000e+00    - 24s 5s/step - loss: 0.0057 - accuracy: 0.0000e+0 - 28s 5s/step - loss: 0.0048 - accuracy: 0.0000e+0 - 33s 5s/step - loss: 0.0041 - accuracy: 0.0000e+0 - 38s 5s/step - loss: 0.0036 - accuracy: 0.0000e+0 - 42s 5s/step - loss: 0.0090 - accuracy: 3.4722e-0 - 47s 5s/step - loss: 0.0081 - accuracy: 0.0050    - 53s 5s/step - loss: 0.0074 - accuracy: 0.008 - 58s 5s/step - loss: 0.0109 - accuracy: 0.010 - 63s 5s/step - loss: 0.0129 - accuracy: 0.010 - 68s 5s/step - loss: 0.0120 - accuracy: 0.010 - 74s 5s/step - loss: 0.0112 - accuracy: 0.0094