Welcome to the training notebook for the Urban Sound Classification project (NAML-2024 project by Lorenzo Gentile). 

To begin, make sure you have the following modules installed (the versions listed are the ones that were tested; other versions may also work):
- tensorflow 2.13
- numpy 1.23.5
- pandas 2.0.3
- librosa 0.10.0
- matplotlib 3.7.1
- scikit-learn 1.2.2

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

import random
import pathlib
import datetime
import os

import keras.callbacks as callbacks
from IPython.display import Audio

# Report the TensorFlow version in use and the physical devices TensorFlow can see
print("TensorFlow version: ", tf.__version__)
print(tf.config.list_physical_devices())

We will be using two datasets for this project:
- The UrbanSound8K dataset, which contains 8732 labeled sound samples from 10 different classes.
- The ESC-50 dataset, which contains 2000 labeled sound samples from 50 different classes. Only 10 of these classes are urban sounds (this subset is called ESC-10), but we will also test the model on the full ESC-50 dataset to see how it performs on a larger dataset.

In the next block of code, we will define the functions to load the datasets.
Since the 2 datasets have a different folder structure, we will define 2 separate functions. 

In [None]:
# Sample rate (Hz) that every clip is resampled to when it is loaded
target_sr = 16000

def load_UrbanSound8K(path: pathlib.Path) -> (pd.DataFrame, tf.data.Dataset):
    """Build a lazily-loaded waveform dataset from an UrbanSound8K directory.

    Args:
        path: Dataset root. The metadata is read from
            ``metadata/UrbanSound8K.csv`` and the clips from
            ``audio/fold<fold>/<slice_file_name>`` below it.

    Returns:
        A ``(metadata, dataset)`` pair. ``metadata`` is the CSV content in its
        original row order. ``dataset`` yields ``(audio, label)`` pairs in a
        fixed shuffled order (``random_state=123``): ``audio`` is a mono
        float32 waveform resampled to ``target_sr`` and ``label`` is the
        int64 one-hot encoding of ``classID``.
    """
    # Load the metadata
    metadata = pd.read_csv(path / "metadata/UrbanSound8K.csv")
    audio_path = path / "audio"

    # Get the number of classes (depth of the one-hot labels)
    num_classes = metadata['class'].nunique()

    # Shuffle the metadata entries to load samples in a random order
    shuffled_metadata = metadata.sample(frac=1, random_state=123).reset_index(drop=True)

    # Build file paths and labels column-wise instead of iterating row by row
    file_paths = [
        str(audio_path / f"fold{fold}" / file_name)
        for fold, file_name in zip(shuffled_metadata['fold'], shuffled_metadata['slice_file_name'])
    ]
    labels = shuffled_metadata['classID'].to_numpy()

    # Convert to tensors
    file_paths_tensor = tf.convert_to_tensor(file_paths, dtype=tf.string)
    labels_tensor = tf.convert_to_tensor(labels, dtype=tf.int64)

    # One-hot encode the labels
    one_hot_labels_tensor = tf.one_hot(labels_tensor, num_classes)

    # Create a dataset from tensors
    dataset = tf.data.Dataset.from_tensor_slices((file_paths_tensor, one_hot_labels_tensor))

    def load_audio_librosa(file_path, label):
        # Runs eagerly inside tf.py_function: file_path and label are eager tensors.
        # The string tensor holds bytes, so decode it to a regular str path first.
        audio, _ = librosa.load(file_path.numpy().decode("utf-8"), sr=target_sr, mono=True)
        return audio.astype(np.float32), np.array(label).astype(np.int64)

    # Wrap the load_audio_librosa function so it can be used in Dataset.map
    def load_audio_wrapped(file_path, label):
        audio, label = tf.py_function(load_audio_librosa, [file_path, label], [tf.float32, tf.int64])
        # tf.py_function drops static shape information; restore what is known:
        # a mono waveform of variable length and a one-hot vector of fixed depth.
        audio.set_shape([None])
        label.set_shape([num_classes])
        return audio, label

    # Map the wrapped load_audio function to the dataset
    dataset = dataset.map(load_audio_wrapped, num_parallel_calls=tf.data.AUTOTUNE)

    return metadata, dataset

In [None]:
DATASET_NAME = 'UrbanSound8K' # Choose between 'UrbanSound8K' and 'ESC-10'

path_dict = {
    'UrbanSound8K': pathlib.Path("UrbanSound8K"),
    'ESC-10': pathlib.Path("ESC-50-master")
}

path = path_dict[DATASET_NAME]

# This cell loads the data with load_UrbanSound8K, which reads
# "metadata/UrbanSound8K.csv" below `path`. Fail early with a clear message
# instead of a missing-file error when another dataset is selected.
if DATASET_NAME != 'UrbanSound8K':
    raise NotImplementedError(
        f"Dataset '{DATASET_NAME}' cannot be loaded with load_UrbanSound8K; "
        "a dedicated loader is required."
    )

# Load the UrbanSound8K dataset
metadata, audio_dataset = load_UrbanSound8K(path)

# Construct class names array corresponding to the one-hot labels
# (one row per classID, sorted so that position i holds the name of class i)
unique_classes = metadata[['classID', 'class']].drop_duplicates().sort_values('classID')
class_names = unique_classes['class'].to_numpy()

# Display the class names
print(f"Class names: {class_names}")

Now we will pad the audio samples with zeros to the length of the longest clip in the whole dataset. This ensures that all samples have the same length, as required by the neural network, which only accepts fixed-size inputs.


In [None]:
def pad_with_zeros(audio, label):
    """Bring a waveform to the length of the longest clip listed in the metadata.

    The target length is the largest ``end - start`` duration in ``metadata``
    converted to samples at ``target_sr``. Longer waveforms are truncated,
    shorter ones are right-padded with zeros. The label is passed through.
    """
    longest_clip_seconds = (metadata['end'] - metadata['start']).max()
    target_length = int(target_sr * longest_clip_seconds)

    # Slicing cuts clips that are too long and leaves shorter ones untouched
    clipped = audio[:target_length]

    # Whatever is still missing (zero for truncated clips) is appended as zeros
    missing = target_length - tf.shape(clipped)[0]
    fixed_length_audio = tf.pad(clipped, paddings=[[0, missing]], mode='CONSTANT', constant_values=0)

    return fixed_length_audio, label

# Apply the padding/truncation to every element of the dataset
audio_dataset = audio_dataset.map(pad_with_zeros, num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# Listen to and plot the first 6 clips of the padded audio dataset

for audio, label in audio_dataset.take(6):
    class_index = np.argmax(label)  # position of the 1 in the one-hot label
    print(f"Audio shape: {audio.shape}")
    print(f"Label: {label}")
    print(f"Class: {class_names[class_index]}")

    # Audio player widget followed by the raw waveform
    display(Audio(audio, rate=target_sr))
    plt.figure()
    plt.plot(audio)
    plt.show()

In [None]:
# #Function for adding noises

# #Load noise files
# pink_noise_file = './_background_noise_/pink_noise.wav'
# white_noise_file = './_background_noise_/white_noise.wav'

# pink_noise_audio, _ = librosa.load(pink_noise_file, sr=16000)
# white_noise_audio, _ = librosa.load(white_noise_file, sr=16000)

# # Convert to tensors
# white_noise_tensor = tf.convert_to_tensor(white_noise_audio, dtype=tf.float32)
# pink_noise_tensor = tf.convert_to_tensor(pink_noise_audio, dtype=tf.float32)

# # Add noise to audio
# def add_noises(audio_tensor, noise_types=['pink'], noise_probs=[1], noise_levels=[0.01]):
#     for noise_type, noise_prob, noise_level in zip(noise_types, noise_probs, noise_levels):
#         if random.random() < noise_prob:
#             if noise_type == 'white':
#                 noise_tensor = white_noise_tensor
#             elif noise_type == 'pink':
#                 noise_tensor = pink_noise_tensor

#             batch_size, audio_len = tf.shape(audio_tensor)[0], tf.shape(audio_tensor)[1]
#             noise_len = int(tf.shape(noise_tensor)[0])
#             start = tf.random.uniform((batch_size,), 0, noise_len - audio_len, dtype=tf.int32)
#             indices = tf.expand_dims(tf.range(batch_size), axis=1)
#             starts = tf.concat([indices, tf.expand_dims(start, axis=1)], axis=1)
#             noise = tf.map_fn(lambda x: tf.slice(noise_tensor, [x[1]], [audio_len]), starts, dtype=tf.float32)
#             audio_tensor = audio_tensor + noise_level * noise
#     return audio_tensor

In [None]:
# #Function for time shifting

# def time_shift(audio_tensor, shift_range=0.1):
#     batch_size, audio_len = tf.shape(audio_tensor)[0], tf.shape(audio_tensor)[1]
#     shift_amount = tf.cast(tf.cast(audio_len, tf.float32) * shift_range, tf.int32)
#     shift = tf.random.uniform((batch_size,), minval=-shift_amount, maxval=shift_amount, dtype=tf.int32)

#     def shift_audio(x):
#         audio, delta = x
#         paddings = tf.cond(
#             delta < 0,
#             lambda: ((-delta, 0),),
#             lambda: ((0, delta),)
#         )
#         return tf.pad(tf.slice(audio, [tf.math.maximum(0, delta)], [audio_len - tf.math.abs(delta)]), paddings)

#     shifted_audio = tf.map_fn(shift_audio, (audio_tensor, shift), dtype=tf.float32)
#     return shifted_audio

In [None]:
# #Function for pitch shifting

# def pitch_shift(audio_tensor, pitch_range=(0.3, 0.3), sample_rate=16000):
#     def shift_pitch(x):
#         audio = x.numpy()
#         factor = np.random.uniform(pitch_range[0], pitch_range[1])
#         n_bins = 12  # You can adjust this value based on your requirements
#         shifted_audio = librosa.effects.pitch_shift(audio, sr=sample_rate, n_steps=factor, bins_per_octave=n_bins)
#         return tf.convert_to_tensor(shifted_audio, dtype=tf.float32)

#     return tf.py_function(shift_pitch, [audio_tensor], tf.float32)


In [None]:
# #Function for applying all the preprocessing steps
# def preprocess_audio(audio, label, noise_colors=['pink'], noise_probs=[1], noise_levels=[0.01],
#                      shift_range=0.1,
#                      pitch_range=(-1, 1)):
#     audio = time_shift(audio, shift_range)
#     audio = add_noises(audio, noise_colors, noise_probs, noise_levels)
#     audio = pitch_shift(audio, pitch_range)
#     return audio, label

In [None]:
# NOISE_COLORS = ['pink', 'white']
# NOISE_PROBS = [0.5, 0.5]
# NOISE_LEVELS = [0.01, 0.01]
# SHIFT_RANGE = 0.1
# PITCH_RANGE = (-1, 1)

# # Apply the preprocessing function to your train_ds and val_ds

# train_ds = train_ds.map(lambda audio, label: preprocess_audio(audio, label, NOISE_COLORS, NOISE_PROBS, NOISE_LEVELS, SHIFT_RANGE, PITCH_RANGE), tf.data.AUTOTUNE, deterministic=True)
# val_ds = val_ds.map(lambda audio, label: preprocess_audio(audio, label, NOISE_COLORS, NOISE_PROBS, NOISE_LEVELS, SHIFT_RANGE, PITCH_RANGE), tf.data.AUTOTUNE, deterministic=True)

In [None]:
# Spectrogram parameters (passed to tf.signal.stft)
frame_length = 512  # analysis window length
frame_step = 160  # hop between consecutive windows
fft_length = 512  # FFT size

# MFCC parameters
num_mel_bins = 40  # number of mel bands in the mel weight matrix
num_mfccs = 13  # number of leading coefficients kept from the MFCC output

In [None]:
def compute_spectrogram(waveform):
    """Return the STFT magnitude of `waveform` with a trailing channel axis.

    The STFT uses the module-level `frame_length`, `frame_step` and
    `fft_length` settings.
    """
    stft = tf.signal.stft(
        waveform,
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=fft_length,
    )

    # Keep only the magnitude of the complex STFT
    magnitude = tf.abs(stft)

    # Conv2D expects (batch_size, height, width, channels), so append a channel axis
    return tf.expand_dims(magnitude, axis=-1)

def compute_mfcc(waveform):
    """Return the first `num_mfccs` MFCCs of `waveform` with a trailing channel axis.

    Pipeline: STFT magnitude -> mel filterbank (`num_mel_bins` bands between
    20 Hz and `target_sr / 2`) -> log -> MFCC -> keep the leading coefficients.
    """
    # Magnitude spectrogram of the input waveform
    magnitude = tf.abs(
        tf.signal.stft(waveform, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length)
    )

    # Matrix that maps linear-frequency bins onto mel bands
    mel_weights = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=num_mel_bins,
        num_spectrogram_bins=tf.shape(magnitude)[-1],
        sample_rate=target_sr,
        lower_edge_hertz=20.0,  # lower edge of the filterbank
        upper_edge_hertz=target_sr / 2)  # Nyquist frequency

    # Project the spectrogram onto the mel bands and re-attach the leading static dims
    projected = tf.tensordot(magnitude, mel_weights, 1)
    projected.set_shape(magnitude.shape[:-1].concatenate(projected.shape[-1:]))

    # Small offset keeps the logarithm finite for silent frames
    log_mel = tf.math.log(projected + 1e-6)

    # MFCCs from the log-mel spectrogram, truncated to the leading coefficients
    coefficients = tf.signal.mfccs_from_log_mel_spectrograms(log_mel)[..., :num_mfccs]

    # Append the channel axis
    return coefficients[..., tf.newaxis]

In [None]:
# Define 4 types of normalization

def mean_std_normalize(image, label):
    """Standardize `image` with its global mean and standard deviation.

    A small constant (1e-6) is added to the standard deviation before
    dividing. The label is returned unchanged.
    """
    centered = image - tf.reduce_mean(image)
    scale = tf.math.reduce_std(image) + 1e-6
    return centered / scale, label

def min_max_normalize(image, label):
    """Rescale `image` linearly so its minimum maps to 0 and its maximum to 1.

    A constant image (max == min, e.g. an all-zero clip) maps to all zeros
    instead of NaN. The label is returned unchanged.
    """
    lowest = tf.reduce_min(image)
    value_range = tf.reduce_max(image) - lowest
    # divide_no_nan returns 0 where the denominator is 0 and the plain quotient otherwise
    image = tf.math.divide_no_nan(image - lowest, value_range)
    return image, label

def z_score_normalize(image, label):
    """Subtract the global mean of `image` and divide by its standard deviation.

    A constant image (standard deviation 0) maps to all zeros instead of NaN.
    The label is returned unchanged.
    """
    centered = image - tf.reduce_mean(image)
    # divide_no_nan returns 0 where the denominator is 0 and the plain quotient otherwise
    image = tf.math.divide_no_nan(centered, tf.math.reduce_std(image))
    return image, label

def per_channel_mean_std_normalize(image, label):
    """Standardize `image` with statistics computed over its first two axes.

    Mean and standard deviation are reduced over axes 0 and 1, leaving one
    value per entry of the remaining (channel) axis. Channels with zero
    standard deviation map to zeros instead of NaN. The label is returned
    unchanged.
    """
    mean = tf.math.reduce_mean(image, axis=[0, 1])
    std = tf.math.reduce_std(image, axis=[0, 1])
    # divide_no_nan returns 0 where the denominator is 0 and the plain quotient otherwise
    image = tf.math.divide_no_nan(image - mean, std)
    return image, label

In [None]:
# Turn waveforms into 2D representations and normalize them
def transform_normalize_dataset(dataset, transform, normalization):
    """Map a waveform dataset to normalized 2D inputs.

    Args:
        dataset: Dataset of ``(audio, label)`` pairs.
        transform: ``'spectrogram'`` or ``'mfcc'``.
        normalization: ``'mean_std'``, ``'min_max'``, ``'z_score'``,
            ``'per_channel_mean_std'`` or ``'none'``.

    Returns:
        The dataset with the transform and, unless ``'none'``, the
        normalization mapped over it.

    Raises:
        ValueError: If ``transform`` or ``normalization`` is not a listed value.
    """
    # Step 1: pick the 2D representation
    if transform == 'spectrogram':
        to_image = compute_spectrogram
    elif transform == 'mfcc':
        to_image = compute_mfcc
    else:
        raise ValueError("Invalid transform value. Choose either 'spectrogram' or 'mfcc'.")
    dataset = dataset.map(lambda audio, label: (to_image(audio), label), num_parallel_calls=tf.data.AUTOTUNE)

    # Step 2: optional normalization
    if normalization == 'none':
        return dataset

    normalizers = (
        ('mean_std', mean_std_normalize),
        ('min_max', min_max_normalize),
        ('z_score', z_score_normalize),
        ('per_channel_mean_std', per_channel_mean_std_normalize),
    )
    for name, normalizer in normalizers:
        if normalization == name:
            return dataset.map(normalizer, num_parallel_calls=tf.data.AUTOTUNE)

    raise ValueError("Invalid normalization value. Choose either 'mean_std', 'min_max', 'z_score', 'per_channel_mean_std', or 'none'.")

In [None]:
# Pipeline configuration: which 2D representation to compute and how to normalize it
AUDIO_TRANSFORM = 'mfcc' # Choose 'spectrogram' or 'mfcc'
NORMALIZATION = 'per_channel_mean_std' # Choose 'mean_std', 'min_max', 'z_score', 'per_channel_mean_std', or 'none'

# Build the transformed dataset from the padded waveforms
dataset = transform_normalize_dataset(audio_dataset, AUDIO_TRANSFORM, NORMALIZATION)

In [None]:
# Plot the first 6 elements of the transformed dataset

plt.figure(figsize=(18, 15))
for i, (image, label) in enumerate(dataset.take(6)):

    ax = plt.subplot(3, 2, i + 1)
    # Drop the channel axis and transpose so the first image axis runs along x
    data = tf.squeeze(image).numpy().T
    img = plt.imshow(data, aspect='auto', cmap='inferno', origin='lower')

    # Add title and labels
    plt.title(f"Class: {class_names[np.argmax(label)]}")
    plt.xlabel('Time')
    plt.ylabel('Frequency')

    # Add roughly 5 ticks per axis. The step is clamped to 1 because range()
    # rejects a zero step, which would happen for an axis with fewer than 5 entries.
    ax.set_xticks(range(0, data.shape[1], max(1, data.shape[1] // 5)))
    ax.set_yticks(range(0, data.shape[0], max(1, data.shape[0] // 5)))

    # Show the colorbar
    plt.colorbar(img, ax=ax)

    # Turn on axis
    plt.axis('on')
plt.tight_layout()
plt.show()


In the next block of code, we split the data into training, validation, and test sets and we batch the data. Splitting coefficients and batch size are modifiable hyperparameters.

In [None]:
batch_size = 32
shuffle_buffer_size = len(metadata)

train_split = 0.8
validation_split = 0.1

num_samples = len(metadata)
num_train_samples = int(num_samples * train_split)
num_val_samples = int(num_samples * validation_split)
num_classes = metadata['classID'].nunique()

# Cache the dataset
dataset = dataset.cache()

# Shuffle the dataset ONCE. With the default reshuffle_each_iteration=True the
# order would change every time the dataset is iterated, so the take()/skip()
# splits below would contain different samples on every epoch and test/validation
# samples would leak into training. A fixed seed keeps the order identical for
# the three derived datasets.
dataset = dataset.shuffle(buffer_size=shuffle_buffer_size, seed=123, reshuffle_each_iteration=False)

# Split the dataset into training, validation, and test sets
train_dataset = dataset.take(num_train_samples)
test_val_dataset = dataset.skip(num_train_samples)
val_dataset = test_val_dataset.take(num_val_samples)
test_dataset = test_val_dataset.skip(num_val_samples)

# Reshuffle only the training samples at every epoch (this cannot move samples
# between splits because it is applied after the split)
train_dataset = train_dataset.shuffle(buffer_size=max(1, num_train_samples))

# Batch the datasets
train_dataset = train_dataset.batch(batch_size)
val_dataset = val_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

# Fix the shape of the datasets (needed for accuracy metric to work during training, see https://github.com/tensorflow/tensorflow/issues/32912)
def _fixup_shape(images, labels):
    images.set_shape([None, None, None, 1]) # 2D images with 1 channel
    labels.set_shape([None, num_classes]) # one-hot labels, one column per class in the metadata
    return images, labels
train_dataset = train_dataset.map(_fixup_shape, num_parallel_calls=tf.data.AUTOTUNE)
val_dataset = val_dataset.map(_fixup_shape, num_parallel_calls=tf.data.AUTOTUNE)
test_dataset = test_dataset.map(_fixup_shape, num_parallel_calls=tf.data.AUTOTUNE)

# Prefetch the datasets
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
# Number of distinct class IDs in the metadata
num_classes = metadata['classID'].nunique()
# Pull one batch from the training set to read the shape of the model inputs
# (the first dimension is the batch dimension)
input_shape = next(iter(train_dataset))[0].shape

In [None]:
DROPOUT_RATE = 0.2

num_classes = metadata['classID'].nunique()
print(f"Number of classes: {num_classes}")
print(f"Input shape: {input_shape}")

# Two Conv2D -> MaxPooling -> BatchNorm -> Dropout stages followed by a dense
# classification head. input_shape[1:] drops the batch dimension.
model = tf.keras.models.Sequential([
    # First convolutional stage
    tf.keras.layers.Conv2D(32, (3, 2), activation='tanh', input_shape=input_shape[1:]),
    tf.keras.layers.MaxPooling2D((9, 1)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(DROPOUT_RATE),

    # Second convolutional stage
    tf.keras.layers.Conv2D(32, (3, 3), activation='tanh'),
    tf.keras.layers.MaxPooling2D((3, 1)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(DROPOUT_RATE),

    # Flatten the convolutional output for the dense head
    tf.keras.layers.Flatten(),

    # Classification head: one unit per class, softmax applied last
    tf.keras.layers.Dense(num_classes),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(DROPOUT_RATE),
    tf.keras.layers.Activation('softmax'),
])

# Display the model summary
model.summary()

In [None]:
# Adam optimizer with categorical cross-entropy (the labels are one-hot), tracking accuracy
model.compile(
    optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.005),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Stop once the validation loss has not improved for 5 epochs and restore the best weights
earlystopping = callbacks.EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=5,
    restore_best_weights=True,
)

# TensorBoard logs are written to a directory named after the current timestamp
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [None]:
# Train for at most 50 epochs, validating on val_dataset after each epoch.
# The callbacks write TensorBoard logs and apply early stopping on the validation loss.
model.fit(train_dataset,
          validation_data=val_dataset,
          epochs=50,
          callbacks=[tensorboard_callback, earlystopping])

In [None]:
# Evaluate the model on the test dataset: returns the loss followed by the
# 'accuracy' metric configured in model.compile
test_loss, test_accuracy = model.evaluate(test_dataset)

In [None]:
SAVED_MODELS_DIR = "./saved_models/"

# The directory name encodes the pipeline configuration and the test accuracy
# in percent with two decimals: <transform>-<normalization>-<accuracy>
formatted_accuracy = f"{test_accuracy * 100:.2f}"
model_name = f"{AUDIO_TRANSFORM}-{NORMALIZATION}-{formatted_accuracy}"
model_path = pathlib.Path(SAVED_MODELS_DIR, DATASET_NAME, model_name)

# Save the model
model.save(model_path)

In [None]:
def list_and_load_model(saved_models_dir, dataset_name, audio_transform, normalization):
    """Interactively pick and load a saved model for a given configuration.

    Looks in ``<saved_models_dir>/<dataset_name>`` for sub-directories named
    ``<audio_transform>-<normalization>-<accuracy>``, lists them sorted by
    accuracy (best first), asks the user for an index and loads that model.

    Args:
        saved_models_dir: Root directory of the saved models.
        dataset_name: Name of the dataset sub-directory.
        audio_transform: Transform part of the model directory name.
        normalization: Normalization part of the model directory name.

    Returns:
        The loaded Keras model, or ``None`` (after printing the reason) when
        the directory is missing, no model matches, or the selection is invalid.
    """
    # Construct the base directory path for the models
    base_model_dir = pathlib.Path(saved_models_dir) / dataset_name

    # Check if the base model directory exists
    if not base_model_dir.is_dir():
        print(f"No saved models found in {base_model_dir}")
        return None

    def accuracy_of(dir_name):
        # Accuracy is the text after the last '-'; None if it is not a number
        try:
            return float(dir_name.rsplit('-', 1)[-1])
        except ValueError:
            return None

    # Keep the sub-directories that belong to this exact configuration. The
    # trailing '-' prevents a normalization name from matching a longer one
    # that merely starts with it; names without a numeric accuracy are skipped.
    prefix = f"{audio_transform}-{normalization}-"
    candidates = [
        entry.name
        for entry in base_model_dir.iterdir()
        if entry.is_dir() and entry.name.startswith(prefix) and accuracy_of(entry.name) is not None
    ]

    if not candidates:
        print(f"No saved models matching '{audio_transform}-{normalization}' found in {base_model_dir}")
        return None

    # Sort the models by the accuracy embedded in the directory name
    candidates.sort(key=accuracy_of, reverse=True)

    # Print the available models with an index
    for idx, model_subdir in enumerate(candidates):
        print(f"{idx}: {model_subdir}")

    # Ask the user to select a model to load
    raw_selection = input("Enter the index of the model to load: ")
    try:
        selected_model_subdir = candidates[int(raw_selection)]
    except (ValueError, IndexError):
        print(f"Invalid selection: {raw_selection!r}")
        return None

    # Load the selected model
    full_model_path = base_model_dir / selected_model_subdir
    model = tf.keras.models.load_model(full_model_path)

    print(f"Loaded model from {full_model_path}")
    return model

In [None]:
# Load a saved model for the current configuration. The model has to match
# AUDIO_TRANSFORM / NORMALIZATION because test_dataset was built with them;
# a model trained on a different representation expects a different input shape.
loaded_model = list_and_load_model(SAVED_MODELS_DIR, DATASET_NAME, AUDIO_TRANSFORM, NORMALIZATION)

# Keep the current model when nothing could be loaded
if loaded_model is not None:
    model = loaded_model

Finally, we can compute the confusion matrix of our model and inspect the intermediate outputs of each layer.

In [None]:
#Confusion matrix function
def get_true_and_predicted_labels(model, dataset):
    """Collect true and predicted class indices over a batched dataset.

    Args:
        model: Model whose predictions are evaluated.
        dataset: Iterable of ``(image_batch, one_hot_label_batch)`` pairs.

    Returns:
        ``(correct_labels, predicted_labels)``: two 1-D int64 tensors with one
        class index per sample, in iteration order. Both are empty when the
        dataset yields no batches.
    """
    y_pred = []
    y_true = []

    for image_batch, label_batch in dataset:
        y_true.append(tf.argmax(label_batch, axis=-1))  # Convert one-hot encoded labels to class indices
        # predict_on_batch runs a single forward pass on the batch, avoiding the
        # per-call setup cost of predict() inside a loop
        preds = model.predict_on_batch(image_batch)
        y_pred.append(np.argmax(preds, axis=-1))

    # tf.concat needs at least one tensor
    if not y_true:
        empty = tf.zeros([0], dtype=tf.int64)
        return empty, empty

    correct_labels = tf.concat(y_true, axis=0)
    predicted_labels = tf.concat(y_pred, axis=0)

    return correct_labels, predicted_labels

def print_confusion_matrix(model, val_spectrogram_ds):
    """Plot the confusion matrix of `model` on `val_spectrogram_ds`.

    Rows are true classes and columns predicted classes, labelled with
    `class_names`.
    """
    correct_labels, predicted_labels = get_true_and_predicted_labels(model, val_spectrogram_ds)

    # Fix the matrix size to the number of classes: without num_classes the
    # matrix only spans up to the largest label seen, which no longer matches
    # class_names when the highest classes are absent from the data.
    confusion_mtx = tf.math.confusion_matrix(
        correct_labels, predicted_labels, num_classes=len(class_names)
    ).numpy()

    # Visualize the matrix with scikit-learn's ConfusionMatrixDisplay
    fig, ax = plt.subplots(figsize=(10, 10))
    matrix_display = ConfusionMatrixDisplay(confusion_mtx, display_labels=class_names)

    matrix_display.plot(xticks_rotation='vertical', ax=ax)
    plt.show()


print_confusion_matrix(model, test_dataset)

In [None]:
#function to get the output of a layer

def get_layer_output(model, layer_index, test_dataset: tf.data.Dataset):
    """Return the output of one layer for a randomly drawn example.

    Args:
        model: Model to probe.
        layer_index: Index into ``model.layers`` of the layer to read.
        test_dataset: Dataset the example is drawn from.

    Returns:
        ``(layer_output, label)``: the layer's output for the example and the
        example's class name looked up in ``class_names``.
    """
    # Model that shares the inputs of `model` but stops at the requested layer
    probe_model = tf.keras.Model(inputs=model.inputs, outputs=model.layers[layer_index].output)

    # Draw a single example: re-batch to size 1, shuffle, take the first element.
    # NOTE(review): the shuffle buffer is len(test_dataset), measured before
    # rebatch(1) - confirm this covers as many elements as intended.
    single_example = test_dataset.rebatch(1).shuffle(len(test_dataset)).take(1)
    example, one_hot = next(single_example.as_numpy_iterator())

    # Run the example through the truncated model
    layer_output = probe_model.predict(example)

    # Return the class name instead of the one-hot vector
    return layer_output, class_names[np.argmax(one_hot)]

#functions to plot feature maps

show_colorbar = True

def plot_feature_maps(model, layer_index, test_dataset: tf.data.Dataset):
    """Plot the output of one layer (or the input itself) for a random example.

    ``layer_index == -1`` plots the input example. Otherwise the layer output
    is plotted as a grid of images when it has 4 dimensions and as a bar
    chart when it has 2; other shapes only print a message.
    """
    # -1 is the sentinel for "show the input sample"
    if layer_index == -1:
        sample_iterator = test_dataset.rebatch(1).shuffle(len(test_dataset)).take(1).as_numpy_iterator()
        example, one_hot = next(sample_iterator)
        plot_conv_feature_maps(example, 1, layer_index, model, class_names[np.argmax(one_hot)])
        return

    feature_maps, label = get_layer_output(model, layer_index, test_dataset)
    output_rank = len(feature_maps.shape)

    # Check the dimensions of the layer output
    #if("CustomQuantizeLayer" in str(type(model.layers[layer_index]))):
    #    print("Specified layers is a CustomQuantizeLayer. Cannot plot feature maps.")
    #    return
    if output_rank == 4:  # Conv2D or MaxPooling2D layers
        plot_conv_feature_maps(feature_maps, feature_maps.shape[-1], layer_index, model, label)
    elif output_rank == 2:  # Dense layer
        plot_dense_feature_maps(feature_maps, layer_index, model, label)
    else:
        print(f"Layer {layer_index} has an unsupported output shape. Cannot plot feature maps.")

def plot_conv_feature_maps(feature_maps, num_feature_maps, layer_index, model, label, show_colorbar=show_colorbar):
    """Plot the channels of a 4-D activation as a grid of grayscale images.

    Args:
        feature_maps: Array indexed as ``[0, :, :, channel]``; only the first
            batch entry is plotted.
        num_feature_maps: Number of channels to plot.
        layer_index: Index of the layer the activation comes from, or -1 when
            `feature_maps` is the input sample itself.
        model: Model the layer belongs to (used for the layer name).
        label: Class name of the plotted sample.
        show_colorbar: Whether to attach a colorbar to every image.
    """
    # Grid layout: a single large panel for one map, otherwise 4 columns
    if num_feature_maps == 1:
        num_rows, num_cols = 1, 1
        figsize = (10, 7)  # Rectangular dimensions for single-channel input spectrogram
    else:
        num_cols = 4
        num_rows = num_feature_maps // num_cols + (num_feature_maps % num_cols > 0)
        figsize = (15, 15)  # Square dimensions for multi-channel feature maps

    # squeeze=False keeps `axes` two-dimensional for every grid size; without it
    # a single-row grid (2 to 4 feature maps) comes back as a 1-D array.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=figsize, squeeze=False)

    # Only -1 denotes the input sample; every real layer index (including 0)
    # gets the regular feature-map title.
    is_input = layer_index == -1
    if is_input:
        fig.suptitle(f'Input Spectrogram\nLabel: {label}')
    else:
        fig.suptitle(f'Feature Maps of Layer {layer_index}: {model.layers[layer_index].name}\nOutput shape {feature_maps.shape}\nLabel: {label}')

    # Plot each feature map, filling the grid row by row
    for idx, ax in enumerate(axes.flat):
        ax.axis('off')
        if idx >= num_feature_maps:
            continue  # leave unused grid cells blank

        img = ax.imshow(feature_maps[0, :, :, idx], cmap='gray')
        if not is_input:
            ax.set_title(f'Feature Map {idx}')

        # Add a colorbar legend to the right of each image if show_colorbar is True
        if show_colorbar:
            cbar = fig.colorbar(img, ax=ax)
            cbar.ax.tick_params(labelsize=8)

    # Display the plot
    plt.show()



def plot_dense_feature_maps(feature_maps, layer_index, model, label):
    """Plot a 2-D layer output as a bar chart, one bar per value.

    When `layer_index` refers to the last layer of `model`, the bars are
    labelled with `class_names`.
    """
    # Flatten the activation into a plain vector of values
    values = np.reshape(feature_maps, (-1,))
    positions = range(len(values))

    fig, ax = plt.subplots(figsize=(15, 5))

    # The figure title carries the layer name, the output shape and the sample label
    fig.suptitle(f'Feature Maps of Layer {layer_index}: {model.layers[layer_index].name}\nOutput shape {feature_maps.shape}\nLabel: {label}')

    ax.bar(positions, values)
    ax.set_xlabel('Feature Map Index')
    ax.set_ylabel('Value')

    # For the final layer each bar corresponds to a class, so label the ticks with class names
    is_last_layer = layer_index == len(model.layers) - 1
    if is_last_layer:
        ax.set_xticks(np.arange(len(values)), class_names)
        plt.xticks(rotation=90)

    # Display the plot
    plt.show()

To view the output of a specific layer we select a random (but constant between executions) sample by setting a seed.

To select a different layer, change the layer index in the last line of code (-1 shows the input sample).

In [None]:
# Fix the random seed so that the same sample is selected on every execution
tf.keras.utils.set_random_seed(399)
# Plot the output of layer 12 (pass -1 to plot the input sample instead)
plot_feature_maps(model, 12, test_dataset)

In [None]:
#Function to print all layers activations

def print_layers_activations(model: tf.keras.Sequential, test_ds: tf.data.Dataset, mode, index=None):
    """Print raw layer activations for examples drawn from `test_ds`.

    Args:
        model: Model to inspect.
        test_ds: Dataset the examples are drawn from.
        mode: ``"all"`` prints every layer's output; ``"single"`` prints only
            the layer selected by `index`.
        index: Layer index for ``"single"`` mode; -1 prints the input example
            itself.

    Raises:
        ValueError: If `mode` is neither ``"all"`` nor ``"single"``.
    """
    label = None

    if mode == "all":
        # NOTE: get_layer_output draws its own example on every call, so the
        # class printed at the end belongs to the last layer's example.
        for i, layer in enumerate(model.layers):
            print(f"Layer {i}: {layer.name}")
            feature_maps, label = get_layer_output(model, i, test_ds)
            print(feature_maps)
            print(feature_maps.shape)
            print("\n")
    elif mode == "single":
        if index == -1:
            example, label = test_ds.rebatch(1).shuffle(len(test_ds)).take(1).as_numpy_iterator().next()
            print("Spectrogram with label " + class_names[np.argmax(label)])
            print("\n")
            print(example)
            print(example.shape)
            return
        print(f"Layer {index}: {model.layers[index].name}")
        feature_maps, label = get_layer_output(model, index, test_ds)
        # Print the full array instead of numpy's abbreviated form
        np.set_printoptions(threshold=np.inf)
        print(feature_maps)
        print(feature_maps.shape)
        print("\n")
    else:
        raise ValueError(f"Invalid mode {mode!r}. Choose either 'all' or 'single'.")

    # get_layer_output already returns the class name, so print it directly;
    # applying np.argmax to that string would not look up the sample's class.
    if label is not None:
        print(label)

#Print all layers activations
print_layers_activations(model, test_dataset, "all")