In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import librosa
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

DATA_DIR = "../data/frog_clips"
OUTPUT_MODEL = "../model/vgg_frog_model.pth"
LABEL_MAP_FILE = "../model/label_mapping.json"

TARGET_SR = 22050
N_MELS = 128
DURATION = 5  # seconds
SAMPLES = TARGET_SR * DURATION

#### Dataset Class

In [2]:
class FrogAudioDataset(Dataset):
    def __init__(self):
        self.paths = []
        self.labels = []

        label_map = {"CONTROL": 0, "TOAD-WEST": 1}

        for cls in ["CONTROL", "TOAD-WEST"]:
            folder = os.path.join(DATA_DIR, cls)
            for f in os.listdir(folder):
                self.paths.append(os.path.join(folder, f))
                self.labels.append(label_map[cls])

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        path = self.paths[idx]
        y, sr = librosa.load(path, sr=TARGET_SR)

        if len(y) < SAMPLES:
            y = np.pad(y, (0, SAMPLES - len(y)))
        else:
            y = y[:SAMPLES]

        # Convert audio to Mel spectrogram
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=N_MELS)
        S_dB = librosa.power_to_db(S, ref=np.max)
        S_dB = np.expand_dims(S_dB, axis=0)

        return torch.tensor(S_dB, dtype=torch.float32), self.labels[idx]

#### Model Definition (VGG-like CNN)

In [3]:
class VGGSmall(nn.Module):
    def __init__(self):
        super().__init__()

        self.features = nn.Sequential(
            nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
        )

        # NEW — fixes mat1/mat2 issue
        self.pool = nn.AdaptiveAvgPool2d((4, 4))

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 4 * 4, 128),
            nn.ReLU(),
            nn.Linear(128, 2)  # binary classification
        )

    def forward(self, x):
        x = self.features(x)
        x = self.pool(x)   # ALWAYS outputs [batch, 64, 4, 4]
        x = self.classifier(x)
        return x

#### Training

In [4]:
dataset = FrogAudioDataset()
loader = DataLoader(dataset, batch_size=8, shuffle=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = VGGSmall().to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

EPOCHS = 20

for epoch in range(EPOCHS):
    total_loss = 0
    correct = 0
    for x, y in tqdm(loader):
        x, y = x.to(device), torch.tensor(y).to(device)

        optimizer.zero_grad()
        pred = model(x)
        loss = criterion(pred, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        correct += (pred.argmax(1) == y).sum().item()

    print(f"Epoch {epoch+1}/{EPOCHS}  Loss: {total_loss:.4f}  Acc: {correct/len(dataset):.4f}")

  x, y = x.to(device), torch.tensor(y).to(device)
100%|██████████| 59/59 [00:22<00:00,  2.61it/s]


Epoch 1/20  Loss: 18.9338  Acc: 0.8955


100%|██████████| 59/59 [00:08<00:00,  7.35it/s]


Epoch 2/20  Loss: 15.1286  Acc: 0.8977


100%|██████████| 59/59 [00:08<00:00,  7.22it/s]


Epoch 3/20  Loss: 13.5104  Acc: 0.9019


100%|██████████| 59/59 [00:08<00:00,  7.15it/s]


Epoch 4/20  Loss: 10.5800  Acc: 0.9275


100%|██████████| 59/59 [00:08<00:00,  7.11it/s]


Epoch 5/20  Loss: 8.8608  Acc: 0.9360


100%|██████████| 59/59 [00:08<00:00,  7.13it/s]


Epoch 6/20  Loss: 7.2187  Acc: 0.9595


100%|██████████| 59/59 [00:08<00:00,  7.35it/s]


Epoch 7/20  Loss: 4.9764  Acc: 0.9680


100%|██████████| 59/59 [00:08<00:00,  7.35it/s]


Epoch 8/20  Loss: 4.5615  Acc: 0.9765


100%|██████████| 59/59 [00:08<00:00,  7.30it/s]


Epoch 9/20  Loss: 4.3955  Acc: 0.9701


100%|██████████| 59/59 [00:08<00:00,  7.29it/s]


Epoch 10/20  Loss: 3.8366  Acc: 0.9829


100%|██████████| 59/59 [00:08<00:00,  6.89it/s]


Epoch 11/20  Loss: 3.5674  Acc: 0.9808


100%|██████████| 59/59 [00:07<00:00,  7.39it/s]


Epoch 12/20  Loss: 3.3320  Acc: 0.9765


100%|██████████| 59/59 [00:08<00:00,  6.97it/s]


Epoch 13/20  Loss: 3.3165  Acc: 0.9829


100%|██████████| 59/59 [00:08<00:00,  6.96it/s]


Epoch 14/20  Loss: 3.0375  Acc: 0.9787


100%|██████████| 59/59 [00:08<00:00,  7.09it/s]


Epoch 15/20  Loss: 3.4106  Acc: 0.9744


100%|██████████| 59/59 [00:08<00:00,  7.31it/s]


Epoch 16/20  Loss: 2.7578  Acc: 0.9808


100%|██████████| 59/59 [00:07<00:00,  7.45it/s]


Epoch 17/20  Loss: 2.7738  Acc: 0.9765


100%|██████████| 59/59 [00:07<00:00,  7.40it/s]


Epoch 18/20  Loss: 2.6368  Acc: 0.9851


100%|██████████| 59/59 [00:07<00:00,  7.42it/s]


Epoch 19/20  Loss: 2.6705  Acc: 0.9893


100%|██████████| 59/59 [00:08<00:00,  7.33it/s]

Epoch 20/20  Loss: 2.4498  Acc: 0.9808





#### Save the Model

In [5]:
torch.save(model.state_dict(), OUTPUT_MODEL)
print("Saved:", OUTPUT_MODEL)

Saved: ../model/vgg_frog_model.pth


In [6]:
import json
torch.save(model.state_dict(), "../model/vgg_frog_model.pth")

label_map = {
    "0": "no_frog",
    "1": "boreal_toad",
    "2": "boreal_chorus_frog"
}
with open("../model/label_mapping.json", "w") as f:
    json.dump(label_map, f, indent=4)