# Module 10 — Audio Features & Classification (Expanded)

This notebook covers a practical classroom-ready audio workflow:

- synthesize a tiny audio dataset (two classes: sine vs noise/chirp)
- compute mel-spectrograms using `librosa`
- build a `tf.data` pipeline that yields spectrogram images
- train a small CNN on spectrograms (few epochs for demo)
- evaluate, visualize predictions, and save the model

Note: If you have real audio files, skip the synthetic generation step (Section 2) and point the pipeline at your own uploaded files or Drive-mounted paths instead.

## 1 — Setup (install packages and imports)

In [None]:
!pip -q install -U librosa matplotlib numpy tensorflow --quiet

import os
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import tensorflow as tf
from tensorflow.keras import layers, models
# Report library versions so results can be reproduced later.
print('librosa version:', librosa.__version__)
print('TF version:', tf.__version__)

# Dataset root plus one sub-folder per class; exist_ok makes re-running the cell harmless.
for folder in (
    '/mnt/data/audio_dataset',
    '/mnt/data/audio_dataset/class_sine',
    '/mnt/data/audio_dataset/class_noise',
):
    os.makedirs(folder, exist_ok=True)


## 2 — Synthesize tiny audio dataset (sine wave vs noise/chirp)

In [None]:
import soundfile as sf
# Sample rate (samples per second) used for every synthetic clip generated and written below.
sr = 16000
# Length of each synthetic clip; sr * DURATION gives the number of samples per file.
DURATION = 1.0  # seconds

def make_sine(freq, path):
    """Write a pure sine tone of ``freq`` Hz to ``path``.

    The clip lasts ``DURATION`` seconds at sample rate ``sr`` (both module-level)
    and has a peak amplitude of 0.5.
    """
    n_samples = int(sr*DURATION)
    # endpoint=False keeps the sample spacing at exactly DURATION / n_samples.
    timeline = np.linspace(0, DURATION, n_samples, endpoint=False)
    tone = 0.5 * np.sin(2*np.pi*freq*timeline)
    sf.write(path, tone, sr)

def make_chirp(path):
    """Write a 300 Hz -> 3000 Hz chirp to ``path``.

    The clip lasts ``DURATION`` seconds at sample rate ``sr`` (both module-level)
    and is scaled by 0.5.
    """
    # Only the sample count is needed; librosa builds its own time axis.
    n_samples = int(sr*DURATION)
    sweep = librosa.chirp(fmin=300, fmax=3000, sr=sr, length=n_samples)
    sf.write(path, 0.5 * sweep, sr)

def make_noise(path):
    """Write Gaussian noise (standard deviation 0.5) to ``path``.

    The clip lasts ``DURATION`` seconds at sample rate ``sr`` (both module-level).
    Samples come from NumPy's global random state, so output differs between runs.
    """
    n_samples = int(sr*DURATION)
    hiss = 0.5 * np.random.randn(n_samples)
    sf.write(path, hiss, sr)

# Class "sine": 40 tones stepping from 400 Hz upward in 10 Hz increments.
for i in range(40):
    make_sine(400 + i*10, f'/mnt/data/audio_dataset/class_sine/sine_{i}.wav')

# Class "noise": even indices get a chirp, odd indices get Gaussian noise.
for i in range(40):
    writer = make_chirp if i%2==0 else make_noise
    writer(f'/mnt/data/audio_dataset/class_noise/noise_{i}.wav')

print('Created synthetic audio files:', len(os.listdir('/mnt/data/audio_dataset/class_sine')), len(os.listdir('/mnt/data/audio_dataset/class_noise')))

## 3 — Compute Mel-spectrograms and visualize examples

In [None]:
def wav_to_mel(path, sr=16000, n_mels=64, n_fft=1024, hop_length=256):
    """Load an audio file and return its mel-spectrogram in decibels.

    Args:
        path: Audio file to read.
        sr: Sample rate passed to ``librosa.load`` and to the mel computation.
        n_mels: Number of mel bands (rows of the result).
        n_fft: FFT window size.
        hop_length: Hop between successive frames, in samples.

    Returns:
        2-D array of dB values measured relative to the spectrogram's own
        maximum (``ref=np.max``).
    """
    waveform, _ = librosa.load(path, sr=sr)
    mel_power = librosa.feature.melspectrogram(
        y=waveform,
        sr=sr,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length,
    )
    return librosa.power_to_db(mel_power, ref=np.max)

# Plot one example per class side by side.
paths = [
    '/mnt/data/audio_dataset/class_sine/sine_0.wav',
    '/mnt/data/audio_dataset/class_noise/noise_1.wav',
]
plt.figure(figsize=(10,4))
for panel, p in enumerate(paths, start=1):
    S_db = wav_to_mel(p)
    plt.subplot(1, 2, panel)
    # hop_length must match the value used in wav_to_mel so the time axis is correct.
    librosa.display.specshow(S_db, sr=sr, hop_length=256, x_axis='time', y_axis='mel')
    plt.title(os.path.basename(p))
plt.show()


## 4 — Build `tf.data` pipeline (on-the-fly mel computation)

In [None]:
import glob

# Gather the WAV paths per class (sorted for a stable order); label 0 = sine, 1 = noise/chirp.
sine_files = sorted(glob.glob('/mnt/data/audio_dataset/class_sine/*.wav'))
noise_files = sorted(glob.glob('/mnt/data/audio_dataset/class_noise/*.wav'))
files = [*sine_files, *noise_files]
labels = [0 for _ in sine_files] + [1 for _ in noise_files]

# 80/20 train/validation split, stratified so both classes keep the same ratio in each part.
from sklearn.model_selection import train_test_split
train_files, val_files, train_labels, val_labels = train_test_split(
    files,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

def load_mel(path, label):
    """Load one audio file as a fixed-size, normalized mel-spectrogram.

    Meant to be wrapped by ``tf.py_function`` (which passes eager tensors),
    but also works with ``tf.numpy_function`` (bytes) or a direct Python call
    (str).

    Args:
        path: Audio file location as ``str``, UTF-8 ``bytes``, or a scalar
            string tensor exposing ``.numpy()``.
        label: Integer class id (Python int, NumPy scalar, or scalar tensor).

    Returns:
        Tuple ``(S, label)``: ``S`` is a ``float32`` array whose time axis is
        padded or cropped to 63 frames and which carries a trailing channel
        axis, i.e. ``(64, 63, 1)`` with ``wav_to_mel``'s default of 64 mel
        bands; ``label`` is an ``np.int64``.
    """
    target_shape = (64, 63)

    # tf.py_function passes EagerTensors, which have no .decode(); unwrap to
    # the underlying bytes / NumPy scalar before touching the value.
    if hasattr(path, 'numpy'):
        path = path.numpy()
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if hasattr(label, 'numpy'):
        label = label.numpy()

    S = wav_to_mel(path)

    # Min-max normalize to 0-1; the epsilon guards against a constant spectrogram.
    S = (S - S.min()) / (S.max() - S.min() + 1e-6)

    # Pad (with 0, the normalized minimum) or crop the time axis to the target width.
    W = S.shape[1]
    if W < target_shape[1]:
        S = np.pad(S, ((0, 0), (0, target_shape[1] - W)), mode='constant')
    else:
        S = S[:, :target_shape[1]]

    S = S.astype('float32')
    S = np.expand_dims(S, -1)  # channel
    return S, np.int64(label)

import tensorflow as tf

def tf_load_mel(path, label):
    """Wrap ``load_mel`` so it can be used inside ``tf.data.Dataset.map``.

    Returns a ``(spectrogram, label)`` pair of tensors with dtypes
    ``tf.float32`` and ``tf.int64`` and static shapes ``[64, 63, 1]`` and ``[]``.
    """
    mel, target = tf.py_function(
        func=load_mel,
        inp=[path, label],
        Tout=[tf.float32, tf.int64],
    )
    # The shapes are set by hand so that batching and Keras see fixed dimensions.
    mel.set_shape([64,63,1])
    target.set_shape([])
    return mel, target

BATCH = 16

# Training stream: shuffle the (path, label) pairs first, then decode audio in parallel.
train_ds = tf.data.Dataset.from_tensor_slices((train_files, train_labels))
train_ds = train_ds.shuffle(100)
train_ds = train_ds.map(tf_load_mel, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.batch(BATCH).prefetch(tf.data.AUTOTUNE)

# Validation stream: same decoding, no shuffling.
val_ds = tf.data.Dataset.from_tensor_slices((val_files, val_labels))
val_ds = val_ds.map(tf_load_mel, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_ds.batch(BATCH).prefetch(tf.data.AUTOTUNE)

# Sanity check: pull a single batch, print its shapes and show its first spectrogram.
for X, y in train_ds.take(1):
    print('Batch X shape:', X.shape, 'y shape:', y.shape)
    plt.figure(figsize=(6,3))
    plt.imshow(X[0,:,:,0], aspect='auto')
    plt.title('Sample mel (normalized)')
    plt.colorbar()
    plt.show()


## 5 — Build small CNN for spectrogram classification (Keras)

In [None]:
from tensorflow.keras import layers, models

def build_spec_cnn(input_shape=(64,63,1), num_classes=2):
    """Build and compile a small 2-D CNN for spectrogram classification.

    Args:
        input_shape: Shape of one input example (height, width, channels).
        num_classes: Size of the softmax output layer.

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimizer, sparse
        categorical cross-entropy loss, accuracy metric).
    """
    stack = [
        layers.Input(shape=input_shape),
        # Three conv stages with 16, 32 and 64 filters; the first two are followed by 2x2 pooling.
        layers.Conv2D(16, 3, activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(32, 3, activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, 3, activation='relu', padding='same'),
        # Global average pooling collapses the spatial dimensions before the dense head.
        layers.GlobalAveragePooling2D(),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(num_classes, activation='softmax'),
    ]
    model = models.Sequential(stack)
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Instantiate the CNN with its default input shape (64, 63, 1) and two output classes,
# then print the layer-by-layer summary.
model = build_spec_cnn()
model.summary()


## 6 — Train the CNN (few epochs demo)

In [None]:
EPOCHS = 8
history = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS)

# Learning curves: loss on the left, accuracy on the right.
plt.figure(figsize=(10,4))

plt.subplot(1,2,1)
plt.plot(history.history['loss'], label='train_loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.legend()
plt.title('Loss')

plt.subplot(1,2,2)
plt.plot(history.history['accuracy'], label='train_acc')
plt.plot(history.history['val_accuracy'], label='val_acc')
plt.legend()
plt.title('Accuracy')

plt.show()


## 7 — Evaluate and visualize predictions

In [None]:
loss, acc = model.evaluate(val_ds)
print('Val loss, acc:', loss, acc)

# Show up to eight examples from the first validation batch with true (T) and predicted (P) labels.
# take(1) yields exactly one batch, so the loop body runs once.
for Xb, yb in val_ds.take(1):
    preds = model.predict(Xb)
    pred_labels = np.argmax(preds, axis=1)
    plt.figure(figsize=(12,4))
    n_show = min(8, Xb.shape[0])
    for i in range(n_show):
        plt.subplot(2,8,i+1)
        plt.imshow(Xb[i,:,:,0], aspect='auto')
        plt.axis('off')
        plt.title(f'T:{yb[i].numpy()} P:{pred_labels[i]}')
    plt.suptitle('Validation batch: True vs Predicted')
    plt.show()


## 8 — Save model and tips

In [None]:
# Persist the trained model to a single file so it can be reloaded without retraining.
# NOTE(review): the .h5 suffix selects the HDF5 format; newer Keras releases
# recommend the native `.keras` format — confirm which one downstream loaders expect.
model.save('/mnt/data/audio_spec_cnn.h5')
print('Saved model to /mnt/data/audio_spec_cnn.h5')

# Tips: Replace the synthetic audio with a real dataset (Google Speech Commands, ESC-50, UrbanSound8K).
# Apply mel-spectrogram augmentation (time shifting, frequency masking; see SpecAugment) for robustness.


## 9 — Exercises & Instructor Notes

- Replace synthetic data with a small subset of Speech Commands or ESC-50 for real audio classification.
- Implement SpecAugment (time/frequency masking) in the tf.data pipeline and observe robustness gains.
- Experiment with 1D CNNs on raw waveform vs 2D CNN on spectrograms and compare performance.
