In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import librosa
from config import *
from preprocessing import SpectrogramSequence
from util import Dataset

ds = Dataset(DATA_ROOT)
annotations_df: pd.DataFrame = pd.read_csv(ANNOTATIONS / 'initial_dataset_7depl_metadata.csv')
has_annotations = "1_20230316_063000.wav"
has_annotations_path = ds.get_data_path(1, 1) / has_annotations

DEBUG:config:Debug logging active


In [15]:

# NOTE paper has only 40 freq bins for chunk (i have >500 - too high sr?)

def CRNN(input_shape, num_classes, n_filters):
    freq_len, time_len, _ = input_shape
    
    model = keras.Sequential()
    model.add(layers.Input(shape=input_shape))
    
    # CNN layers
    for filters in n_filters:
        model.add(layers.Conv2D(filters, (3, 3), activation='relu', padding='same', strides=(2, 1)))
        model.add(layers.MaxPooling2D((2, 1))) # frequency pooling only

    model.add(layers.Flatten())
    model.add(layers.Reshape((time_len, -1))) # input matrix for RNN shape=(new_features, time_frames)

    # RNN layers
    model.add(layers.LSTM(128, return_sequences=True))
    model.add(layers.Dense(num_classes, activation='sigmoid'))  
    
    return model


model = CRNN(input_shape=(512, 428, 1), num_classes=4, n_filters=[32, 64, 128, 256])
model.compile(
    optimizer='adam', 
    loss='binary_crossentropy',
    metrics=['accuracy']
)

keras.utils.plot_model(model, to_file = FIGURES_DIR / 'model.png', 
                       show_shapes=True,  expand_nested=True, rankdir='TR')

print(model.summary())

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_52 (Conv2D)          (None, 256, 428, 32)      320       
                                                                 
 max_pooling2d_52 (MaxPoolin  (None, 128, 428, 32)     0         
 g2D)                                                            
                                                                 
 conv2d_53 (Conv2D)          (None, 64, 428, 64)       18496     
                                                                 
 max_pooling2d_53 (MaxPoolin  (None, 32, 428, 64)      0         
 g2D)                                                            
                                                                 
 conv2d_54 (Conv2D)          (None, 16, 428, 128)      73856     
                                                                 
 max_pooling2d_54 (MaxPoolin  (None, 8, 428, 128)    

shuffle annotations but keep recordings clumped
in order to maintain class balance between train/test/valid (deployments are inbalanced)


In [3]:
grouped = annotations_df.groupby('recording')

shuffled_groups = list(grouped)  
np.random.shuffle(shuffled_groups)  

shuffled_df = pd.concat([df for _, df in shuffled_groups], ignore_index=True)

shuffled_df.reset_index(drop=True, inplace=True)

In [4]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(shuffled_df, test_size=0.2, shuffle=False)
train_df, validation_df = train_test_split(train_df, test_size=0.2, shuffle=False)

for i in (validation_df, test_df, train_df):
    print(i.shape)

(351, 13)
(438, 13)
(1401, 13)


In [5]:
sr = 22_000
batch = 32

train_sequence = SpectrogramSequence(annotations_df=train_df, sr=sr, batch_size=batch)
test_sequence = SpectrogramSequence(annotations_df=test_df, sr=sr, batch_size=batch)
validation_sequence = SpectrogramSequence(annotations_df=validation_df, sr=sr, batch_size=batch)

for s in (train_sequence, test_sequence, validation_sequence):
    print(len(s.chunk_info))

n# frames in chunk:  428


preparing data: 100%|██████████| 188/188 [00:01<00:00, 98.46it/s] 


n# frames in chunk:  428


preparing data: 100%|██████████| 69/69 [00:00<00:00, 95.22it/s] 


n# frames in chunk:  428


preparing data: 100%|██████████| 46/46 [00:00<00:00, 120.11it/s]

11280
4140
2757





In [6]:
batch_X, batch_Y = train_sequence.__getitem__(0)
print(batch_X[0].shape, batch_Y[0].shape)

loading new recording: 1_20230317_063000.wav 

  samplerate, s = wavfile.read(recording_path)


 -> Time Taken: 1.996114 seconds
resampling 1_20230317_063000.wav  -> Time Taken: 2.076659 seconds
stft 1_20230317_063000.wav  -> Time Taken: 0.741147 seconds
converting to db 1_20230317_063000.wav 

  y = 10*log10(x)   # take log


 -> Time Taken: 1.491110 seconds


DEBUG:root:batch average spectrogram chunk shape: [512. 428.]
DEBUG:root:batch average Y shape: [428.   4.]


Total: 6.32 seconds
(512, 428) (428, 4)


In [7]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir logs/fit

Reusing TensorBoard on port 6006 (pid 3396), started 0:19:07 ago. (Use '!kill 3396' to kill it.)

: 

In [9]:
epochs = 30

# early_stopping_cb = tf.keras.callbacks.EarlyStopping(
#     monitor='val_prc',
#     verbose=1,
#     patience=30,
#     mode='max',
#     restore_best_weights=True
# )

checkpoint_path = MODEL_DIR / 'training_1' / 'crnn.ckpt'
Path(checkpoint_path).parent.mkdir(exist_ok=True)

cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)

import datetime
now_str =  datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + now_str
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

history = model.fit(
    train_sequence, 
    epochs=epochs, 
    validation_data=validation_sequence, 
    callbacks=[tensorboard_callback, cp_callback])

df = pd.DataFrame(history.history)
df.to_csv(log_dir + "hist.csv")
df[['prc', 'val_prc', 'recall', 'val_recall']].plot()

loading new recording: 1_20230317_063000.wav  -> Time Taken: 2.149228 seconds
resampling 1_20230317_063000.wav  -> Time Taken: 2.710153 seconds
stft 1_20230317_063000.wav  -> Time Taken: 0.779940 seconds
converting to db 1_20230317_063000.wav  -> Time Taken: 1.637075 seconds


DEBUG:root:batch average spectrogram chunk shape: [512. 428.]
DEBUG:root:batch average Y shape: [428.   4.]


Total: 7.29 seconds
Epoch 1/30
loading new recording: 1_20230520_170000.wav  -> Time Taken: 2.173433 seconds
resampling 1_20230520_170000.wav  -> Time Taken: 2.172781 seconds
stft 1_20230520_170000.wav  -> Time Taken: 0.836231 seconds
converting to db 1_20230520_170000.wav  -> Time Taken: 1.515275 seconds


DEBUG:root:batch average spectrogram chunk shape: [512. 428.]
DEBUG:root:batch average Y shape: [428.   4.]


Total: 6.71 seconds
loading new recording: 1_20230520_073000.wav 

2024-08-17 06:40:02.541162: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 448790528 exceeds 10% of free system memory.
2024-08-17 06:40:03.418320: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 224395264 exceeds 10% of free system memory.
2024-08-17 06:40:03.778693: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 224395264 exceeds 10% of free system memory.


 -> Time Taken: 8.279658 seconds
resampling 1_20230520_073000.wav  -> Time Taken: 8.708125 seconds
stft 1_20230520_073000.wav  -> Time Taken: 1.725249 seconds
converting to db 1_20230520_073000.wav  -> Time Taken: 1.943007 seconds
Total: 20.67 seconds
loading new recording: 1_20230520_170000.wav  -> Time Taken: 8.945539 seconds
resampling 1_20230520_170000.wav 

2024-08-17 06:40:39.782159: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 224395264 exceeds 10% of free system memory.
2024-08-17 06:40:39.791227: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 224395264 exceeds 10% of free system memory.


 -> Time Taken: 14.844863 seconds
stft 1_20230520_170000.wav  -> Time Taken: 8.368051 seconds
converting to db 1_20230520_170000.wav  -> Time Taken: 2.758359 seconds
Total: 35.43 seconds


DEBUG:root:batch average spectrogram chunk shape: [512. 428.]
DEBUG:root:batch average Y shape: [428.   4.]


  1/352 [..............................] - ETA: 7:35:37 - loss: 0.7979 - accuracy: 0.4736loading new recording: 1_20230417_070000.wav  -> Time Taken: 8.837759 seconds
resampling 1_20230417_070000.wav  -> Time Taken: 16.551825 seconds
  2/352 [..............................] - ETA: 3:06:06 - loss: 0.5818 - accuracy: 0.2380 -> Time Taken: 4.687324 seconds
converting to db 1_20230417_070000.wav  -> Time Taken: 1.854192 seconds


DEBUG:root:batch average spectrogram chunk shape: [512. 428.]
DEBUG:root:batch average Y shape: [428.   4.]


Total: 32.03 seconds
loading new recording: 1_20230520_070000.wav  -> Time Taken: 5.158581 seconds
resampling 1_20230520_070000.wav  -> Time Taken: 4.374818 seconds
stft 1_20230520_070000.wav  -> Time Taken: 1.641840 seconds
converting to db 1_20230520_070000.wav  -> Time Taken: 2.792213 seconds
Total: 14.02 seconds
loading new recording: 1_20230520_073000.wav  -> Time Taken: 3.681121 seconds
resampling 1_20230520_073000.wav  -> Time Taken: 4.372903 seconds
stft 1_20230520_073000.wav  -> Time Taken: 1.993736 seconds
converting to db 1_20230520_073000.wav  -> Time Taken: 5.652489 seconds


DEBUG:root:batch average spectrogram chunk shape: [512. 428.]
DEBUG:root:batch average Y shape: [428.   4.]


Total: 15.75 seconds
  3/352 [..............................] - ETA: 3:20:53 - loss: 0.4517 - accuracy: 0.4920loading new recording: 1_20230317_073000.wav  -> Time Taken: 3.999090 seconds
resampling 1_20230317_073000.wav  -> Time Taken: 5.491243 seconds
stft 1_20230317_073000.wav  -> Time Taken: 2.657402 seconds
converting to db 1_20230317_073000.wav  -> Time Taken: 3.216522 seconds
Total: 15.40 seconds
loading new recording: 1_20230317_163000.wav  -> Time Taken: 3.962677 seconds
resampling 1_20230317_163000.wav  -> Time Taken: 5.412547 seconds
stft 1_20230317_163000.wav 