In [1]:
import tensorflow as tf
from tensorflow.keras import layers, models

def build_cnn_model(input_shape, num_classes):
    """
    Builds a lightweight CNN model for audio classification.

    Args:
        input_shape (tuple): The shape of the input spectrograms (height, width, channels).
        num_classes (int): The number of output classes (e.g., 2 for melody vs. other).

    Returns:
        A TensorFlow Keras model.
    """
    model = models.Sequential([
        # Input Layer
        layers.Input(shape=input_shape),

        # First Convolutional Block
        # Using smaller filters (3x3) and fewer of them (8) to keep the model small.
        layers.Conv2D(8, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),

        # Second Convolutional Block
        layers.Conv2D(16, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),

        # Third Convolutional Block
        layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),

        # Flatten the feature map to feed into the dense layers
        layers.Flatten(),

        # Dense Layer for classification
        # A smaller dense layer (32 units) to reduce parameters
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.5), # Dropout helps prevent overfitting

        # Output Layer
        # The number of units equals the number of classes.
        # Use 'softmax' for multi-class or 'sigmoid' for binary classification.
        layers.Dense(num_classes, activation='softmax' if num_classes > 2 else 'sigmoid')
    ])

    return model

# Example of how to create the model
# These values will be determined during data preprocessing
INPUT_SHAPE = (128, 128, 1) # (n_mels, time_steps, channels)
NUM_CLASSES = 2 # (lg_melody, other_sounds)

model = build_cnn_model(input_shape=INPUT_SHAPE, num_classes=NUM_CLASSES)
model.summary()


2025-07-30 00:33:37.042408: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-30 00:33:37.047153: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-30 00:33:37.059174: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753850017.081605   99829 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753850017.087669   99829 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1753850017.104003   99829 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin

In [4]:
tf.keras.utils.get_file('esc-50.zip',
                        'https://github.com/karoldvl/ESC-50/archive/master.zip',
                        cache_dir='./',
                        cache_subdir='data/other_sounds',
                        extract=True)

Downloading data from https://github.com/karoldvl/ESC-50/archive/master.zip
645701632/Unknown [1m40s[0m 0us/step

'./data/other_sounds/esc-50_extracted'

In [13]:
import librosa
import numpy as np
import os
from sklearn.model_selection import train_test_split

# --- Configuration ---
DATA_PATH = "data/"


# --- Configuration ---
CONFIG = {
    "sample_rate": 16000,    # Hz
    "window_duration": 1.5,  # seconds (length of one spectrogram)
    "hop_duration": 0.5,     # seconds (how much to slide the window)
    "n_mels": 64,            # Number of Mel bands (reduced for ESP32 efficiency)
    "n_fft": 1024,           # Number of FFT points
    "max_spectrogram_width": 48 # Fixed width for spectrograms (time steps)
}


def process_audio_file(audio_path, class_label, config):
    """
    Loads an audio file and converts it into one or more Mel spectrograms.

    - If class_label is 'lg_melody', it uses a sliding window to generate
      multiple, overlapping spectrograms from the entire clip.
    - Otherwise, it generates a single spectrogram from the start of the clip.

    Args:
        audio_path (str): Path to the audio file.
        class_label (str): The name of the class (e.g., 'lg_melody').
        config (dict): A dictionary of processing parameters.

    Returns:
        A list of spectrograms. Returns an empty list if processing fails.
    """
    try:
        y, sr = librosa.load(audio_path, sr=config["sample_rate"])
    except Exception as e:
        print(f"Error loading {audio_path}: {e}")
        return []

    spectrograms = []
    window_samples = int(config["window_duration"] * config["sample_rate"])
    hop_samples = int(config["hop_duration"] * config["sample_rate"])

    if class_label == 'lg_melody':
        # --- Sliding Window for the Target Melody ---
        for start in range(0, len(y) - window_samples, hop_samples):
            end = start + window_samples
            chunk = y[start:end]
            
            # Generate Mel spectrogram for the chunk
            spectrogram = librosa.feature.melspectrogram(
                y=chunk, 
                sr=sr, 
                n_mels=config["n_mels"], 
                n_fft=config["n_fft"]
            )
            log_spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
            
            # Standardize spectrogram width
            if log_spectrogram.shape[1] > config["max_spectrogram_width"]:
                log_spectrogram = log_spectrogram[:, :config["max_spectrogram_width"]]
            else:
                pad_width = config["max_spectrogram_width"] - log_spectrogram.shape[1]
                log_spectrogram = np.pad(log_spectrogram, ((0, 0), (0, pad_width)), mode='constant')
            
            spectrograms.append(log_spectrogram)
    
    else:
        # --- Single Slice for Other Sounds ---
        # Truncate or pad the audio to the window duration
        if len(y) > window_samples:
            y = y[:window_samples]
        else:
            y = np.pad(y, (0, window_samples - len(y)), 'constant')

        # Generate a single Mel spectrogram
        spectrogram = librosa.feature.melspectrogram(
            y=y, 
            sr=config["sample_rate"], 
            n_mels=config["n_mels"], 
            n_fft=config["n_fft"]
        )
        log_spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
        
        # Standardize spectrogram width (same logic as above)
        if log_spectrogram.shape[1] > config["max_spectrogram_width"]:
            log_spectrogram = log_spectrogram[:, :config["max_spectrogram_width"]]
        else:
            pad_width = config["max_spectrogram_width"] - log_spectrogram.shape[1]
            log_spectrogram = np.pad(log_spectrogram, ((0, 0), (0, pad_width)), mode='constant')
            
        spectrograms.append(log_spectrogram)
        
    return spectrograms

def load_data(data_path):
    """Loads all audio files, converts them, and creates labels."""
    X, y = [], []
    class_map = {label: i for i, label in enumerate(os.listdir(data_path))}
    
    for label, class_idx in class_map.items():
        class_dir = os.path.join(data_path, label)
        for filename in os.listdir(class_dir):
            if filename.endswith(".wav"):
                filepath = os.path.join(class_dir, filename)
                spectrogram_list = process_audio_file(filepath, label, CONFIG)
                
                if spectrogram_list:
                    # Add all spectrograms from the list to our dataset
                    X.extend(spectrogram_list)
                    # Add a label for each spectrogram that was generated
                    y.extend([class_idx] * len(spectrogram_list))
                    
    return np.array(X), np.array(y), class_map

# --- Execute Data Preparation ---
X, y, class_map = load_data(DATA_PATH)
print("Class mapping:", class_map)

# Add a channel dimension for the CNN
X = X[..., np.newaxis]

# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.1, random_state=42, stratify=None)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")
print(f"Test set shape: {X_test.shape}")

Class mapping: {'other_sounds': 0, 'lg_melody': 1}
Training set shape: (1847, 64, 48, 1)
Validation set shape: (103, 64, 48, 1)
Test set shape: (103, 64, 48, 1)


In [15]:
# --- Compile the Model ---
model = build_cnn_model(input_shape=X_train.shape[1:], num_classes=len(class_map))

# Using Adam optimizer and SparseCategoricalCrossentropy because our labels are integers.
# If you one-hot encode your labels, use CategoricalCrossentropy.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# --- Train the Model ---
EPOCHS = 30
BATCH_SIZE = 32

history = model.fit(X, y,
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    validation_data=(X, y),
                    callbacks=[
                        # Stop training early if validation loss stops improving
                        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
                    ])

# --- Evaluate on Test Set ---
test_loss, test_acc = model.evaluate(X, y, verbose=2)
print(f'\nTest accuracy: {test_acc:.2f}')

Epoch 1/30
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - accuracy: 0.9635 - loss: 0.3276 - val_accuracy: 0.9742 - val_loss: 0.1145
Epoch 2/30
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.9742 - loss: 0.0773 - val_accuracy: 0.9742 - val_loss: 0.0289
Epoch 3/30
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.9742 - loss: 0.0371 - val_accuracy: 0.9742 - val_loss: 0.0268
Epoch 4/30
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.9742 - loss: 0.0445 - val_accuracy: 0.9742 - val_loss: 0.0228
Epoch 5/30
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.9742 - loss: 0.0347 - val_accuracy: 0.9742 - val_loss: 0.0218
Epoch 6/30
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.9742 - loss: 0.0319 - val_accuracy: 0.9742 - val_loss: 0.0208
Epoch 7/30
[1m65/65[0m [32m━━━━

In [None]:
# Create a representative dataset for quantization
def representative_dataset():
    for i in range(100): # Use a subset of the training data
      # Ensure the data type is float32
      yield [X_train[i:i+1].astype(np.float32)]

converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_dataset
# Ensure that if the ops are not supported by the TFLite runtime, the converter throws an error.
#converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
# Set the input and output tensors to uint8 (or int8)
#converter.inference_input_type = tf.uint8 # or tf.int8
#converter.inference_output_type = tf.uint8 # or tf.int8

tflite_quant_model = converter.convert()

# --- Save the Model ---
with open('lg_sound_model.tflite', 'wb') as f:
    f.write(tflite_quant_model)

print("Quantized TFLite model saved as lg_sound_model.tflite")

INFO:tensorflow:Assets written to: /tmp/tmp31_i1bhf/assets


INFO:tensorflow:Assets written to: /tmp/tmp31_i1bhf/assets


Saved artifact at '/tmp/tmp31_i1bhf'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 64, 48, 1), dtype=tf.float32, name='keras_tensor_33')
Output Type:
  TensorSpec(shape=(None, 2), dtype=tf.float32, name=None)
Captures:
  130746471511248: TensorSpec(shape=(), dtype=tf.resource, name=None)
  130746471508560: TensorSpec(shape=(), dtype=tf.resource, name=None)
  130746471504720: TensorSpec(shape=(), dtype=tf.resource, name=None)
  130746471508944: TensorSpec(shape=(), dtype=tf.resource, name=None)
  130746471498768: TensorSpec(shape=(), dtype=tf.resource, name=None)
  130746471512592: TensorSpec(shape=(), dtype=tf.resource, name=None)
  130746471498000: TensorSpec(shape=(), dtype=tf.resource, name=None)
  130746471508368: TensorSpec(shape=(), dtype=tf.resource, name=None)
  130746471511440: TensorSpec(shape=(), dtype=tf.resource, name=None)
  130746471505296: TensorSpec(shape=(), dtype=tf.resource, name=None)




Quantized TFLite model saved as lg_sound_model.tflite


W0000 00:00:1753853144.620440   99829 tf_tfl_flatbuffer_helpers.cc:365] Ignored output_format.
W0000 00:00:1753853144.620474   99829 tf_tfl_flatbuffer_helpers.cc:368] Ignored drop_control_dependency.
2025-07-30 01:25:44.620841: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /tmp/tmp31_i1bhf
2025-07-30 01:25:44.621559: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2025-07-30 01:25:44.621571: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: /tmp/tmp31_i1bhf
I0000 00:00:1753853144.627308   99829 mlir_graph_optimization_pass.cc:425] MLIR V1 optimization pass is not enabled
2025-07-30 01:25:44.628464: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2025-07-30 01:25:44.666378: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: /tmp/tmp31_i1bhf
2025-07-30 01:25:44.678584: I tensorflow/cc/saved_model/loader.cc:471] SavedModel 

In [14]:
y[-10:]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])