# Data Mining Final Project
## Identifying Bird Species from Audio Spectrograms using ML

text

In [None]:
# ============================================
# Imports
# ============================================
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import librosa

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# ============================================
# Config
# ============================================
SEED = 42
# Seed the Python, NumPy and PyTorch RNGs so runs are repeatable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Pick the compute backend: CUDA first, then Apple MPS, then CPU.
# getattr guards PyTorch builds that do not ship torch.backends.mps.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    DEVICE = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"

# Input locations (relative to the working directory).
METADATA_PATH = "train_metadata.csv"
TRAIN_AUDIO_DIR = "train_audio"

# Audio / feature settings.
SAMPLE_RATE = 32000                       # Hz; audio is resampled to this rate on load
CLIP_SECONDS = 5                          # length of each model input clip
NUM_SAMPLES = SAMPLE_RATE * CLIP_SECONDS  # samples per clip
N_MELS = 64                               # mel bands in the spectrogram

# Training settings.
BATCH_SIZE = 32
NUM_EPOCHS = 20
LEARNING_RATE = 1e-3
TOP_K = 20                                # keep only the TOP_K most frequent species

# ============================================
# Audio helpers
# ============================================
def load_audio(path):
    """Load the audio file at ``path`` via librosa.

    The file is requested as mono at ``SAMPLE_RATE``.

    Returns:
        The ``(audio, sr)`` pair returned by ``librosa.load``.
    """
    return librosa.load(path, mono=True, sr=SAMPLE_RATE)

def random_crop_or_pad(audio, num_samples=None):
    """Return ``audio`` cut or zero-padded to exactly ``num_samples`` samples.

    Args:
        audio: 1-D sequence of samples (anything supporting ``len`` and
            slicing, e.g. a NumPy array).
        num_samples: Target length. Defaults to the module-level
            ``NUM_SAMPLES`` when omitted, which preserves the original
            single-argument behaviour.

    Returns:
        A random contiguous window of ``num_samples`` samples when the input
        is longer, the input right-padded with zeros when it is shorter, or
        the input unchanged when it already has the target length.
    """
    if num_samples is None:
        num_samples = NUM_SAMPLES

    length = len(audio)
    if length > num_samples:
        # random.randint is inclusive on both ends, so the last valid start
        # (length - num_samples) can be drawn.
        start = random.randint(0, length - num_samples)
        return audio[start:start + num_samples]
    if length < num_samples:
        return np.pad(audio, (0, num_samples - length))
    return audio

def audio_to_mel_spectrogram(audio):
    """Convert a waveform into a dB-scaled mel spectrogram.

    Uses ``N_MELS`` mel bands at ``SAMPLE_RATE`` with a 2048-point FFT and a
    hop of 512 samples. The power spectrogram is converted to decibels
    relative to its own maximum (``ref=np.max``).
    """
    stft_settings = {"n_fft": 2048, "hop_length": 512}
    power_spec = librosa.feature.melspectrogram(
        y=audio, sr=SAMPLE_RATE, n_mels=N_MELS, **stft_settings
    )
    return librosa.power_to_db(power_spec, ref=np.max)

# ============================================
# Load + filter metadata
# ============================================
# Read the metadata and drop rows that lack a label or a filename.
metadata = pd.read_csv(METADATA_PATH).dropna(subset=["primary_label", "filename"])

# value_counts() sorts by frequency, so the first TOP_K index entries are
# the TOP_K most frequent species.
species_counts = metadata["primary_label"].value_counts()
top_species = species_counts.index[:TOP_K]

# Keep only recordings of those species and renumber the rows from 0.
keep_mask = metadata["primary_label"].isin(top_species)
metadata = metadata.loc[keep_mask].reset_index(drop=True)

# ============================================
# Label mapping (AFTER filtering)
# ============================================
# Sorted species codes give a stable label -> integer assignment.
unique_labels = sorted(metadata["primary_label"].unique())
NUM_CLASSES = len(unique_labels)

# Build the index -> label table first, then invert it.
index_to_label = dict(enumerate(unique_labels))
label_to_index = {name: position for position, name in index_to_label.items()}

# Integer class id for every remaining recording.
metadata["label_idx"] = metadata["primary_label"].map(label_to_index)

print("Number of classes:", NUM_CLASSES)

# ============================================
# Train / validation split
# ============================================
# 80/20 split stratified on the class id, with both halves re-indexed from 0
# so positional and label-based row access coincide.
train_df, val_df = (
    subset.reset_index(drop=True)
    for subset in train_test_split(
        metadata,
        test_size=0.2,
        random_state=SEED,
        stratify=metadata["label_idx"],
    )
)

# ============================================
# Dataset
# ============================================
class BirdClefDataset(Dataset):
    """Dataset yielding ``(mel_spectrogram, label)`` pairs for bird recordings.

    Each item is produced by loading the file named in the row's
    ``filename`` column, fixing its length to ``NUM_SAMPLES``, converting it
    to a mel spectrogram, and standardising it per clip.

    In training mode the clip is a random window and SpecAugment-style
    frequency/time masks are each applied with probability 0.5. In
    evaluation mode the clip is a deterministic centre window and no masking
    is applied, so validation metrics are reproducible between epochs.

    Args:
        df: DataFrame with ``filename`` and ``label_idx`` columns.
        audio_dir: Directory that the ``filename`` values are relative to.
        training: Enables random cropping and masking when true.
    """

    def __init__(self, df, audio_dir, training=True):
        self.df = df
        self.audio_dir = audio_dir
        self.training = training

        self.freq_mask = torchaudio.transforms.FrequencyMasking(freq_mask_param=8)
        self.time_mask = torchaudio.transforms.TimeMasking(time_mask_param=15)

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _center_crop_or_pad(audio, num_samples=None):
        """Deterministically cut (centre window) or zero-pad to ``num_samples``."""
        if num_samples is None:
            num_samples = NUM_SAMPLES

        surplus = len(audio) - num_samples
        if surplus > 0:
            start = surplus // 2
            return audio[start:start + num_samples]
        if surplus < 0:
            return np.pad(audio, (0, -surplus))
        return audio

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        path = os.path.join(self.audio_dir, row["filename"])

        audio, _ = load_audio(path)
        if self.training:
            # Random window: acts as data augmentation.
            audio = random_crop_or_pad(audio)
        else:
            # Fixed window: the same file always yields the same input.
            audio = self._center_crop_or_pad(audio)

        mel = audio_to_mel_spectrogram(audio)

        # Per-clip standardisation; the epsilon avoids dividing by zero on a
        # constant spectrogram.
        mel = (mel - mel.mean()) / (mel.std() + 1e-6)
        # Add a leading channel dimension for Conv2d.
        mel = torch.tensor(mel, dtype=torch.float32).unsqueeze(0)

        if self.training:
            if torch.rand(1).item() < 0.5:
                mel = self.freq_mask(mel)
            if torch.rand(1).item() < 0.5:
                mel = self.time_mask(mel)

        # CrossEntropyLoss expects integer (long) class indices.
        label = torch.tensor(int(row["label_idx"]), dtype=torch.long)
        return mel, label

# ============================================
# DataLoaders (macOS safe)
# ============================================
# Training set: augmentation on, shuffled every epoch.
train_dataset = BirdClefDataset(df=train_df, audio_dir=TRAIN_AUDIO_DIR, training=True)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,  # load in the main process (no worker subprocesses)
)

# Validation set: augmentation off, fixed order.
val_dataset = BirdClefDataset(df=val_df, audio_dir=TRAIN_AUDIO_DIR, training=False)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

# ============================================
# Simple CNN
# ============================================
class SimpleBirdCNN(nn.Module):
    """Three-stage convolutional classifier for single-channel spectrograms.

    Channel widths go 1 -> 16 -> 32 -> 64 using 3x3 convolutions with
    padding 1. The first two stages end in 2x2 max pooling; the third ends in
    global average pooling, so the linear head always receives 64 features
    regardless of the input's spatial size.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Layer order is kept fixed so the Sequential indices (and therefore
        # state_dict keys) stay conv.0, conv.3, conv.6.
        feature_layers = [
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(output_size=(1, 1)),
        ]
        self.conv = nn.Sequential(*feature_layers)
        self.fc = nn.Linear(in_features=64, out_features=num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        pooled = self.conv(x)
        # Collapse the (64, 1, 1) pooled map into a flat 64-vector per sample.
        flat = pooled.reshape(pooled.size(0), -1)
        return self.fc(flat)

model = SimpleBirdCNN(NUM_CLASSES).to(DEVICE)

# ============================================
# Loss + optimizer (fixed class weights)
# ============================================
# Per-class sample counts in the training split, ordered by class index.
class_counts = train_df["label_idx"].value_counts().sort_index()

# Inverse-frequency weights, rescaled so that they sum to the class count
# (i.e. they average to 1).
inverse_freq = 1.0 / class_counts.values
inverse_freq = inverse_freq * (len(inverse_freq) / inverse_freq.sum())
weights = torch.as_tensor(inverse_freq, dtype=torch.float32).to(DEVICE)

criterion = nn.CrossEntropyLoss(weight=weights)
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# ============================================
# Training loops
# ============================================
def train_one_epoch(model, loader, loss_fn=None, opt=None, device=None):
    """Run one optimisation pass over ``loader`` and report loss and accuracy.

    Args:
        model: Network to train; switched to training mode.
        loader: Sized iterable of ``(inputs, targets)`` batches.
        loss_fn: Loss callable. Defaults to the module-level ``criterion``.
        opt: Optimizer stepping ``model``'s parameters. Defaults to the
            module-level ``optimizer``.
        device: Device batches are moved to. Defaults to the module-level
            ``DEVICE``.

    Returns:
        ``(mean_batch_loss, accuracy)`` where the loss is the average of the
        per-batch loss values and accuracy is the fraction of samples whose
        arg-max prediction equals the target.
    """
    # Fall back to the notebook-level objects so existing two-argument calls
    # keep working unchanged.
    loss_fn = criterion if loss_fn is None else loss_fn
    opt = optimizer if opt is None else opt
    device = DEVICE if device is None else device

    model.train()
    total_loss, correct, total = 0, 0, 0

    for x, y in loader:
        x, y = x.to(device), y.to(device)
        opt.zero_grad()
        out = model(x)
        loss = loss_fn(out, y)
        loss.backward()
        opt.step()

        total_loss += loss.item()
        preds = out.argmax(1)
        correct += (preds == y).sum().item()
        total += y.size(0)

    return total_loss / len(loader), correct / total

def validate_one_epoch(model, loader, loss_fn=None, device=None):
    """Evaluate ``model`` on ``loader`` without updating its parameters.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        loader: Sized iterable of ``(inputs, targets)`` batches.
        loss_fn: Loss callable. Defaults to the module-level ``criterion``.
        device: Device batches are moved to. Defaults to the module-level
            ``DEVICE``.

    Returns:
        ``(mean_batch_loss, accuracy)`` where the loss is the average of the
        per-batch loss values and accuracy is the fraction of samples whose
        arg-max prediction equals the target.
    """
    # Fall back to the notebook-level objects so existing two-argument calls
    # keep working unchanged.
    loss_fn = criterion if loss_fn is None else loss_fn
    device = DEVICE if device is None else device

    model.eval()
    total_loss, correct, total = 0, 0, 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            out = model(x)
            loss = loss_fn(out, y)

            total_loss += loss.item()
            preds = out.argmax(1)
            correct += (preds == y).sum().item()
            total += y.size(0)

    return total_loss / len(loader), correct / total

# ============================================
# Train
# ============================================
for epoch in range(NUM_EPOCHS):
    # One optimisation pass, then an evaluation pass on the held-out split.
    tr_loss, tr_acc = train_one_epoch(model, train_loader)
    val_loss, val_acc = validate_one_epoch(model, val_loader)

    # Emit the three-line epoch report in a single call.
    print(
        f"Epoch {epoch+1}/{NUM_EPOCHS}",
        f"  Train Loss: {tr_loss:.4f} | Train Acc: {tr_acc:.4f}",
        f"  Val   Loss: {val_loss:.4f} | Val   Acc: {val_acc:.4f}",
        sep="\n",
    )

# ============================================
# Inference on a single audio file
# ============================================
def predict_audio(path, top_k=5):
    """Print the most probable species for the audio file at ``path``.

    The file is loaded, reduced to a deterministic centre window of
    ``NUM_SAMPLES`` samples (or zero-padded when shorter), converted to a
    standardised mel spectrogram, and passed through the module-level
    ``model``. Using a fixed window means repeated calls on the same file
    give the same predictions.

    Args:
        path: Path of the audio file to classify.
        top_k: Number of predictions to print. Values larger than the number
            of classes are reduced to the number of classes.

    Prints one ``<label> <probability>`` line per prediction, most probable
    first. Returns ``None``.
    """
    model.eval()
    audio, _ = load_audio(path)

    # Deterministic centre crop / right pad (no randomness at inference).
    surplus = len(audio) - NUM_SAMPLES
    if surplus > 0:
        start = surplus // 2
        audio = audio[start:start + NUM_SAMPLES]
    elif surplus < 0:
        audio = np.pad(audio, (0, -surplus))

    mel = audio_to_mel_spectrogram(audio)
    mel = (mel - mel.mean()) / (mel.std() + 1e-6)
    # Add batch and channel dimensions: (1, 1, n_mels, frames).
    mel = torch.tensor(mel, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(DEVICE)

    with torch.no_grad():
        out = model(mel)
        probs = torch.softmax(out, dim=1)

    # torch.topk raises when k exceeds the size of the dimension, so cap it
    # at the number of classes.
    k = min(top_k, probs.size(1))
    top_probs, top_idxs = torch.topk(probs, k)

    for p, i in zip(top_probs[0].tolist(), top_idxs[0].tolist()):
        print(index_to_label[i], f"{p:.3f}")

Number of classes: 20
Epoch 1/20
  Train Loss: 2.9975 | Train Acc: 0.0638
  Val   Loss: 2.9911 | Val   Acc: 0.0932
Epoch 2/20
  Train Loss: 2.9649 | Train Acc: 0.0845
  Val   Loss: 2.9299 | Val   Acc: 0.0980
Epoch 3/20
  Train Loss: 2.8603 | Train Acc: 0.1180
  Val   Loss: 2.8058 | Val   Acc: 0.1366
Epoch 4/20
  Train Loss: 2.7653 | Train Acc: 0.1592
  Val   Loss: 2.7322 | Val   Acc: 0.1850
Epoch 5/20
  Train Loss: 2.6857 | Train Acc: 0.1896
  Val   Loss: 2.6770 | Val   Acc: 0.1905
Epoch 6/20
  Train Loss: 2.6076 | Train Acc: 0.2170
  Val   Loss: 2.5737 | Val   Acc: 0.2243
Epoch 7/20
  Train Loss: 2.5261 | Train Acc: 0.2455
  Val   Loss: 2.5306 | Val   Acc: 0.2077
Epoch 8/20
  Train Loss: 2.4456 | Train Acc: 0.2697
  Val   Loss: 2.4311 | Val   Acc: 0.2767
Epoch 9/20
  Train Loss: 2.4049 | Train Acc: 0.2817
  Val   Loss: 2.4144 | Val   Acc: 0.2719
Epoch 10/20
  Train Loss: 2.3423 | Train Acc: 0.3109
  Val   Loss: 2.3326 | Val   Acc: 0.3347
