In [1]:
import os
import copy
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Audio, display

from src.models.classification import *
from src.models.generative import *
from src.preprocessing import *
from src.display import *
from src.metrics import *
from src.utils import *

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

from sklearn.metrics import ConfusionMatrixDisplay, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import KFold, ParameterGrid, StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

Usando dispositivo: mps


In [2]:
SPEC_CONFIG = {
    "SR": 2000,              # sampling rate
    "FFT_SAMPLES": 256,      # tamaño de la ventana
    "HOP_LENGTH": 63,        # salto entre frames
    "MEL_BINS": 64,          # frequency bins (resolución)
    "MAX_FREQ": 600          # máxima frecuencia para los espectrogramas
}

DATA_AUG_CONFIG = {
    'TIME_STRETCH_FACTORS': [0.9, 1.1],     # ±10%
    'PITCH_SHIFTS': [-2, 2],                # ±2 semitonos
    'NOISE_LEVEL': 0.001                    # ruido leve
}

SEED = 3    # semilla para reproducibilidad

In [3]:
train_dir = 'data/whale-detection-challenge/data/train'
test_dir = 'data/whale-detection-challenge/data/test'
labels_dir = 'data/whale-detection-challenge/data/train.csv'

audio_df, labels_df, test_files = load_data(train_dir, test_dir, labels_dir, sampling_rate=SPEC_CONFIG['SR'])
print(audio_df.columns)

Index(['clip_name', 'label', 'filepath', 'audio'], dtype='object')


In [4]:
normalize(audio_df, 'audio') # chequear esta linea

In [5]:
train_df, val_df = train_test_split(audio_df, test_size=0.2, random_state=SEED, stratify=audio_df['label'])

In [6]:
batch_size = 128
device = torch.device("mps" if torch.mps.is_available() else "cpu")
print('Using device:', device)

Using device: mps


In [7]:
train_mel_spec = []
for whale_audio in train_df['audio']: # emprolijar esto de abajo
    mel = get_melspectrogram(whale_audio, SPEC_CONFIG['SR'], SPEC_CONFIG['FFT_SAMPLES'], SPEC_CONFIG['HOP_LENGTH'], SPEC_CONFIG['MEL_BINS'], SPEC_CONFIG['MAX_FREQ']) # shape: (64, 64)
    mel = mel[np.newaxis, :, :] # shape: (1, 64, 64)
    train_mel_spec.append(mel)

In [8]:
val_mel_spec = []
for whale_audio in val_df['audio']: # emprolijar esto de abajo
    mel = get_melspectrogram(whale_audio, SPEC_CONFIG['SR'], SPEC_CONFIG['FFT_SAMPLES'], SPEC_CONFIG['HOP_LENGTH'], SPEC_CONFIG['MEL_BINS'], SPEC_CONFIG['MAX_FREQ']) # shape: (64, 64)
    mel = mel[np.newaxis, :, :] # shape: (1, 64, 64)
    val_mel_spec.append(mel)

In [9]:
X_train = np.array(train_mel_spec)
X_val = np.array(val_mel_spec)

In [14]:
X_train_std, X_val_std, train_mean, train_std = standarize_train_val(X_train, X_val)

Mean: -22.267026901245117, std: 10.529284487233888


In [15]:
y_train = train_df['label'].values
y_val = val_df['label'].values

In [16]:
X_train_tensor = torch.tensor(X_train_std, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_val_tensor = torch.tensor(X_val_std, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val_tensor, y_val_tensor), batch_size=batch_size)

#### CNN

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import copy

def get_class_weights(y):
    # Simple helper to compute class weights
    from sklearn.utils.class_weight import compute_class_weight
    import numpy as np
    classes = np.unique(y)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y)
    return torch.tensor(weights, dtype=torch.float)

class SpectrogramCNN(nn.Module):
    def __init__(self, output_dim=2):
        super(SpectrogramCNN, self).__init__()
        self.device = torch.device('mps' if torch.backends.mps.is_available() else
                                   'cuda' if torch.cuda.is_available() else 'cpu')
        
        self.features = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),  # (1, 64, 64) -> (16, 64, 64)
            nn.ReLU(),
            nn.MaxPool2d(2, 2),                         # (16, 64, 64) -> (16, 32, 32)

            nn.Conv2d(16, 32, kernel_size=3, padding=1), # -> (32, 32, 32)
            nn.ReLU(),
            nn.MaxPool2d(2, 2),                          # -> (32, 16, 16)

            nn.Conv2d(32, 64, kernel_size=3, padding=1), # -> (64, 16, 16)
            nn.ReLU(),
            nn.MaxPool2d(2, 2),                          # -> (64, 8, 8)
        )

        self.classifier = nn.Sequential(
            nn.Linear(64 * 8 * 8, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim)
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)  # Flatten
        return self.classifier(x)

    def train_model(self, train_loader, val_loader, epochs=50, lr=0.001, weight_decay=1e-5,
                    early_stopping_patience=None, use_class_weights=True):
        self.to(self.device)

        if use_class_weights:
            y_train = train_loader.dataset.tensors[1].cpu().numpy().flatten()
            weights = get_class_weights(y_train).to(self.device)
            loss_function = nn.CrossEntropyLoss(weight=weights)
        else:
            loss_function = nn.CrossEntropyLoss()

        optimizer = optim.Adam(self.parameters(), lr=lr, weight_decay=weight_decay)

        train_losses, val_losses = [], []
        best_val_loss, patience_counter = float('inf'), 0
        best_model_state = None

        for epoch in range(epochs):
            self.train()
            running_loss = 0.0
            for inputs, labels in train_loader:
                inputs, labels = inputs.to(self.device), labels.to(self.device)
                optimizer.zero_grad()
                outputs = self(inputs)
                loss = loss_function(outputs, labels)
                loss.backward()
                optimizer.step()
                running_loss += loss.item() * inputs.size(0)

            avg_train_loss = running_loss / len(train_loader.dataset)
            train_losses.append(avg_train_loss)

            val_loss, val_acc, val_f1, val_auc = self.evaluate(val_loader, return_metrics=True)
            val_losses.append(val_loss)

            print(f'Epoch {epoch+1}/{epochs} | Train Loss: {avg_train_loss:.4f} | '
                  f'Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}% | Val F1: {val_f1:.4f} | Val AUC: {val_auc:.4f}')

            if early_stopping_patience is not None:
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    best_model_state = copy.deepcopy(self.state_dict())
                    patience_counter = 0
                else:
                    patience_counter += 1
                    if patience_counter >= early_stopping_patience:
                        print("Early stopping triggered (val loss did not improve).")
                        break

        if best_model_state is not None:
            self.load_state_dict(best_model_state)

        return train_losses, val_losses

    def evaluate(self, val_loader, return_metrics=False):
        self.eval()
        val_loss = 0.0
        all_labels, all_probs, all_preds = [], [], []

        loss_function = nn.CrossEntropyLoss()
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(self.device), labels.to(self.device)
                outputs = self(inputs)
                loss = loss_function(outputs, labels)
                val_loss += loss.item() * inputs.size(0)

                probs = torch.softmax(outputs, dim=1)[:, 1]
                preds = torch.argmax(outputs, dim=1)

                all_labels.extend(labels.cpu().numpy())
                all_probs.extend(probs.cpu().numpy())
                all_preds.extend(preds.cpu().numpy())

        avg_val_loss = val_loss / len(val_loader.dataset)
        acc = accuracy_score(all_labels, all_preds)
        try:
            auc = roc_auc_score(all_labels, all_probs)
        except ValueError:
            auc = 0.0
        f1 = f1_score(all_labels, all_preds)

        if return_metrics:
            return avg_val_loss, acc * 100, f1, auc

        print(f'Validation Loss: {avg_val_loss:.4f}, Accuracy: {acc * 100:.2f}%, F1: {f1:.4f}, AUC: {auc:.4f}')


In [18]:
CNN = SpectrogramCNN(output_dim=2)
CNN.train_model(train_loader, val_loader, epochs=50, early_stopping_patience=5)
CNN.evaluate(val_loader) # el criterio de early stopping deberia ser por loss de val, no por f1score

Epoch 1/50 | Train Loss: 0.3247 | Val Loss: 0.2768 | Val Acc: 88.03% | Val F1: 0.7896 | Val AUC: 0.9600
Epoch 2/50 | Train Loss: 0.2154 | Val Loss: 0.2044 | Val Acc: 90.23% | Val F1: 0.8181 | Val AUC: 0.9702
Epoch 3/50 | Train Loss: 0.1911 | Val Loss: 0.1987 | Val Acc: 90.87% | Val F1: 0.8281 | Val AUC: 0.9738
Epoch 4/50 | Train Loss: 0.1760 | Val Loss: 0.1982 | Val Acc: 91.22% | Val F1: 0.8330 | Val AUC: 0.9750
Epoch 5/50 | Train Loss: 0.1652 | Val Loss: 0.1888 | Val Acc: 91.78% | Val F1: 0.8409 | Val AUC: 0.9755
Epoch 6/50 | Train Loss: 0.1522 | Val Loss: 0.1907 | Val Acc: 92.35% | Val F1: 0.8491 | Val AUC: 0.9758
Epoch 7/50 | Train Loss: 0.1408 | Val Loss: 0.2061 | Val Acc: 92.42% | Val F1: 0.8497 | Val AUC: 0.9756
Epoch 8/50 | Train Loss: 0.1296 | Val Loss: 0.1988 | Val Acc: 92.23% | Val F1: 0.8443 | Val AUC: 0.9743
Epoch 9/50 | Train Loss: 0.1166 | Val Loss: 0.2205 | Val Acc: 91.85% | Val F1: 0.8427 | Val AUC: 0.9752
Epoch 10/50 | Train Loss: 0.0990 | Val Loss: 0.2249 | Val Acc: 9

#### VAE

In [19]:
# VARIATIONAL AUTOENCODER MODEL

class BetaVAE(nn.Module):
    def __init__(self, latent_dim=32):
        super(BetaVAE, self).__init__()

        # Encoder
        self.enc = nn.Sequential(
            # (1, 64, 64) -> (32, 32, 32)
            nn.Conv2d(1, 32, kernel_size=4, stride=2, padding=1),
            nn.ReLU(),
            # (32, 32, 32) -> (64, 16, 16)
            nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=1),
            nn.ReLU(),
            # (64, 16, 16) -> (128, 8, 8)
            nn.Conv2d(64, 128, kernel_size=4, stride=2, padding=1),
            nn.ReLU(),
            # (128, 8, 8) -> (256, 4, 4)
            nn.Conv2d(128, 256, kernel_size=4, stride=2, padding=1),
            nn.ReLU(),
        )
        self.fc_mu = nn.Linear(256 * 4 * 4, latent_dim)
        self.fc_logvar = nn.Linear(256 * 4 * 4, latent_dim)

        # Decoder
        self.decoder_fc = nn.Linear(latent_dim, 256 * 4 * 4)
        self.dec = nn.Sequential(
            # (256, 4, 4) -> (128, 8, 8)
            nn.ConvTranspose2d(256, 128, kernel_size=4, stride=2, padding=1),
            nn.ReLU(),
            # (128, 8, 8) -> (64, 16, 16)
            nn.ConvTranspose2d(128, 64, kernel_size=4, stride=2, padding=1),
            nn.ReLU(),
            # (64, 16, 16) -> (32, 32, 32)
            nn.ConvTranspose2d(64, 32, kernel_size=4, stride=2, padding=1),
            nn.ReLU(),
            # (32, 32, 32) -> (1, 64, 64)
            nn.ConvTranspose2d(32, 1, kernel_size=4, stride=2, padding=1),
            # Sin activación final para MSE loss (usá Sigmoid para [0,1])
        )

    def encode(self, x):
        h = self.enc(x)
        h = h.view(h.size(0), -1)
        return self.fc_mu(h), self.fc_logvar(h)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        h = self.decoder_fc(z).view(-1, 256, 4, 4)
        return self.dec(h)

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        recon = self.decode(z)
        return recon, mu, logvar
        
    def bvae_loss(self, recon_x, x, mu, logvar, beta=1.0):
        recon_loss = F.mse_loss(recon_x, x, reduction='sum')
        kld_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        return recon_loss + beta * kld_loss, recon_loss, kld_loss


In [20]:
def load_model(path="saved_models/bvae.pt"):
    device = torch.device("mps" if torch.mps.is_available() else "cpu")
    model = BetaVAE().to(device)
    state_dict = torch.load(path, map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()
    print(f"[✔] Modelo cargado desde: {path}")
    return model

BVAE = load_model()

[✔] Modelo cargado desde: saved_models/bvae.pt


#### DATA AUGMENTATION

In [21]:
datos_totales = len(y_train)
audios_ballena = np.count_nonzero(y_train)
audios_ruido = datos_totales - audios_ballena
ballenas_faltantes = audios_ruido - audios_ballena

print(f'De los {datos_totales} datos que se usan para entrenar: {audios_ballena} son de ballena y {audios_ruido} son de ruido, por ende falta completar con {ballenas_faltantes} audios de ballena.')

De los 24000 datos que se usan para entrenar: 5622 son de ballena y 18378 son de ruido, por ende falta completar con 12756 audios de ballena.


In [22]:
print(f'X_train shape: {(X_train.shape)}')
print(f'y_train shape: {(y_train.shape)}')
latent_dim = 32

X_train shape: (24000, 1, 64, 64)
y_train shape: (24000,)


In [23]:
n_synth = ballenas_faltantes
with torch.no_grad():
    z = torch.randn(n_synth, latent_dim).to(device)
    synth_specs = BVAE.decode(z).cpu().numpy()  # shape: (n_synth, 1, 64, 64)
    synth_specs = synth_specs * 11.347515106201172 + -21.889528274536133 # estos son el mean y std con los q se estandarizaron los datos de train del vae

In [24]:
synth_labels = np.ones(n_synth, dtype=int)

In [25]:
X_train_aug = np.concatenate([X_train, synth_specs], axis=0)
y_train_aug = np.concatenate([y_train, synth_labels], axis=0)

In [26]:
X_train_aug_std = (X_train_aug - train_mean) / train_std

In [27]:
aug_X_train_tensor = torch.tensor(X_train_aug_std, dtype=torch.float32)
aug_y_train_tensor = torch.tensor(y_train_aug, dtype=torch.long)

aug_train_loader = DataLoader(TensorDataset(aug_X_train_tensor, aug_y_train_tensor), batch_size=batch_size, shuffle=True)

In [28]:
CNN = SpectrogramCNN(output_dim=2)
CNN.train_model(train_loader, val_loader, epochs=50, early_stopping_patience=5)
CNN.evaluate(val_loader)

Epoch 1/50 | Train Loss: 0.3422 | Val Loss: 0.2923 | Val Acc: 87.88% | Val F1: 0.7838 | Val AUC: 0.9581
Epoch 2/50 | Train Loss: 0.2228 | Val Loss: 0.2062 | Val Acc: 90.25% | Val F1: 0.8175 | Val AUC: 0.9702
Epoch 3/50 | Train Loss: 0.1933 | Val Loss: 0.2230 | Val Acc: 91.03% | Val F1: 0.8291 | Val AUC: 0.9721
Epoch 4/50 | Train Loss: 0.1788 | Val Loss: 0.2111 | Val Acc: 91.47% | Val F1: 0.8344 | Val AUC: 0.9734
Epoch 5/50 | Train Loss: 0.1700 | Val Loss: 0.1777 | Val Acc: 92.50% | Val F1: 0.8479 | Val AUC: 0.9742
Epoch 6/50 | Train Loss: 0.1607 | Val Loss: 0.1792 | Val Acc: 92.08% | Val F1: 0.8438 | Val AUC: 0.9753
Epoch 7/50 | Train Loss: 0.1482 | Val Loss: 0.1871 | Val Acc: 92.85% | Val F1: 0.8545 | Val AUC: 0.9756
Epoch 8/50 | Train Loss: 0.1364 | Val Loss: 0.2164 | Val Acc: 91.07% | Val F1: 0.8306 | Val AUC: 0.9732
Epoch 9/50 | Train Loss: 0.1219 | Val Loss: 0.2302 | Val Acc: 91.90% | Val F1: 0.8415 | Val AUC: 0.9743
Epoch 10/50 | Train Loss: 0.1108 | Val Loss: 0.2050 | Val Acc: 9