In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import librosa
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import json

# ======== PATHS ========
# Adjusted to your real structure
DATA_DIR = "../data/frog_clips"

OUTPUT_MODEL = "../model/vgg_frog_model.pth"
LABEL_MAP_FILE = "../model/label_mapping.json"

# ======== LABEL DEFINITIONS (2 CLASSES ONLY) ========
# IMPORTANT: You only have CONTROL and TOAD-WEST
class_to_idx = {
    "CONTROL": 0,
    "TOAD-WEST": 1
}

label_map = {
    "0": "no_frog",
    "1": "boreal_toad"
}

# ======== AUDIO SETTINGS ========
TARGET_SR = 22050
N_MELS = 128
FIXED_FRAMES = 128   # Force mel spectrogram width to 128
CLIP_SECONDS = 5     # Force audio length for training = 5 seconds

num_classes = len(class_to_idx)

#### Dataset Class

In [2]:
class FrogAudioDataset(Dataset):
    def __init__(self, root_dir=DATA_DIR):
        self.samples = []

        for folder, label in class_to_idx.items():
            fullpath = os.path.join(root_dir, folder)
            if not os.path.isdir(fullpath):
                print(f"WARNING: Folder missing → {fullpath}")
                continue
            
            for fn in os.listdir(fullpath):
                if fn.lower().endswith(".wav"):
                    self.samples.append((os.path.join(fullpath, fn), label))

        print(f"Total training samples: {len(self.samples)}")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        path, label = self.samples[idx]

        # Load and resample
        y, sr = librosa.load(path, sr=TARGET_SR)

        # Fix duration to CLIP_SECONDS
        required = TARGET_SR * CLIP_SECONDS
        if len(y) < required:
            y = np.pad(y, (0, required - len(y)))
        else:
            y = y[:required]

        # Compute mel spectrogram
        S = librosa.feature.melspectrogram(
            y=y, sr=TARGET_SR, n_mels=N_MELS
        )

        # === FIX: Old librosa padding ===
        current_frames = S.shape[1]

        if current_frames < FIXED_FRAMES:
            pad_width = FIXED_FRAMES - current_frames
            S = np.pad(S, ((0, 0), (0, pad_width)), mode="constant")
        else:
            S = S[:, :FIXED_FRAMES]

        # Convert to dB
        S_db = librosa.power_to_db(S, ref=np.max)

        # Convert to tensor: shape (1, 128, 128)
        x = torch.tensor(S_db, dtype=torch.float32).unsqueeze(0)
        y = torch.tensor(label, dtype=torch.long)

        return x, y

#### Model Definition (VGG-like CNN)

In [3]:
class VGGSmall(nn.Module):
    def __init__(self, num_classes=num_classes):
        super().__init__()

        self.features = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),   # 128 → 64

            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),   # 64 → 32

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),   # 32 → 16
        )

        # Final shape: (64, 16, 16) → 16384 features
        self.classifier = nn.Sequential(
            nn.Linear(64 * 16 * 16, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes)  # ONLY 2 CLASSES
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        return self.classifier(x)

#### Training

In [4]:
dataset = FrogAudioDataset()
loader = DataLoader(dataset, batch_size=8, shuffle=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Training on:", device)

model = VGGSmall().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

EPOCHS = 20

for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    total_correct = 0
    total_samples = 0

    for x, y in tqdm(loader):
        x = x.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        pred = model(x)
        loss = criterion(pred, y)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        total_correct += (pred.argmax(1) == y).sum().item()
        total_samples += y.size(0)

    acc = total_correct / total_samples
    print(f"Epoch {epoch+1}/{EPOCHS}  Loss={running_loss:.4f}  Acc={acc:.4f}")

Total training samples: 469
Training on: cpu


100%|██████████| 59/59 [00:10<00:00,  5.72it/s]


Epoch 1/20  Loss=20.3205  Acc=0.8913


100%|██████████| 59/59 [00:07<00:00,  7.64it/s]


Epoch 2/20  Loss=12.9867  Acc=0.9126


100%|██████████| 59/59 [00:08<00:00,  7.29it/s]


Epoch 3/20  Loss=8.2675  Acc=0.9488


100%|██████████| 59/59 [00:07<00:00,  7.41it/s]


Epoch 4/20  Loss=5.5677  Acc=0.9638


100%|██████████| 59/59 [00:08<00:00,  7.11it/s]


Epoch 5/20  Loss=4.5137  Acc=0.9723


100%|██████████| 59/59 [00:08<00:00,  7.28it/s]


Epoch 6/20  Loss=3.6975  Acc=0.9765


100%|██████████| 59/59 [00:08<00:00,  7.31it/s]


Epoch 7/20  Loss=2.5880  Acc=0.9893


100%|██████████| 59/59 [00:08<00:00,  7.32it/s]


Epoch 8/20  Loss=2.4554  Acc=0.9851


100%|██████████| 59/59 [00:08<00:00,  7.29it/s]


Epoch 9/20  Loss=1.6747  Acc=0.9893


100%|██████████| 59/59 [00:08<00:00,  7.27it/s]


Epoch 10/20  Loss=1.1627  Acc=0.9957


100%|██████████| 59/59 [00:07<00:00,  7.48it/s]


Epoch 11/20  Loss=1.0209  Acc=0.9936


100%|██████████| 59/59 [00:08<00:00,  7.24it/s]


Epoch 12/20  Loss=0.6605  Acc=1.0000


100%|██████████| 59/59 [00:08<00:00,  7.22it/s]


Epoch 13/20  Loss=0.3902  Acc=1.0000


100%|██████████| 59/59 [00:08<00:00,  7.11it/s]


Epoch 14/20  Loss=0.6302  Acc=1.0000


100%|██████████| 59/59 [00:08<00:00,  7.21it/s]


Epoch 15/20  Loss=0.2023  Acc=1.0000


100%|██████████| 59/59 [00:08<00:00,  6.88it/s]


Epoch 16/20  Loss=0.1263  Acc=1.0000


100%|██████████| 59/59 [00:08<00:00,  6.98it/s]


Epoch 17/20  Loss=0.1318  Acc=1.0000


100%|██████████| 59/59 [00:08<00:00,  7.04it/s]


Epoch 18/20  Loss=0.1286  Acc=1.0000


100%|██████████| 59/59 [00:08<00:00,  6.74it/s]


Epoch 19/20  Loss=0.0870  Acc=1.0000


100%|██████████| 59/59 [00:09<00:00,  6.53it/s]

Epoch 20/20  Loss=0.0796  Acc=1.0000





#### Save the Model

In [5]:
# Save model
torch.save(model.state_dict(), OUTPUT_MODEL)
print("Saved model to:", OUTPUT_MODEL)

# Save label mapping JSON
with open(LABEL_MAP_FILE, "w") as f:
    json.dump(label_map, f, indent=4)

print("Saved label mapping to:", LABEL_MAP_FILE)

Saved model to: ../model/vgg_frog_model.pth
Saved label mapping to: ../model/label_mapping.json
