In [1]:
import numpy as np
import random
import librosa
import matplotlib.pyplot as plt
from IPython.display import Audio
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
from data import BirdClefTrainAudio, BirdClefHarmonics
from model import HarmonicModel
from mimir import training
import time
import pickle
import os

In [2]:
# Single source of truth for every RNG seeded below, so changing the seed
# cannot leave the libraries out of sync with each other.
SEED = 20250403

np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)
# Seed every CUDA device rather than only the current one; this call is
# silently ignored when CUDA is not available.
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and turn off its autotuner, which may
# otherwise select different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


In [3]:
# Directory handed to BirdClefTrainAudio as the location of the audio data.
data_folder: str = "data"

In [4]:
# Raw training-audio dataset; later cells also read `audio.n_labels` and
# `audio.label_weights()` from it.
# NOTE(review): max_duration is presumably in seconds and sr in Hz -- confirm
# against data.BirdClefTrainAudio.
audio = BirdClefTrainAudio(data_folder, max_duration=5, sr=16000)
cachefile="data.pkl"
# Build the harmonics dataset once and reuse it from a pickle cache afterwards.
# The cache is keyed only on the file's existence: delete `data.pkl` by hand
# after changing any of the arguments in this cell.
if os.path.isfile(cachefile):
    # SECURITY: pickle.load can execute arbitrary code. Only load a cache file
    # that this notebook produced itself; never one from an untrusted source.
    with open(cachefile, "rb") as f:
        ds = pickle.load(f)
else:
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    ds = BirdClefHarmonics(audio, fmin=500, fmax=4000)
    print(f"Loading data took {time.perf_counter()-start} seconds")
    # Write to a temporary file and rename it into place, so an interrupted or
    # failed dump never leaves a truncated cache that breaks the next run.
    tmpfile = cachefile + ".tmp"
    try:
        with open(tmpfile, "wb") as f:
            pickle.dump(ds, f)
        os.replace(tmpfile, cachefile)
    finally:
        # After a successful replace the temp file is gone; it only still
        # exists here when the dump failed part-way.
        if os.path.exists(tmpfile):
            os.remove(tmpfile)

In [5]:
# Hyper-parameters passed to training.train: one dict of keyword arguments for
# the model and one for the optimizer.
hps = training.HyperParameters(
    model_params=dict(
        n_labels=audio.n_labels,
        n_harmonics=10,
        num_filter_maps=128,
    ),
    optimizer_params=dict(lr=1e-3),
)

In [6]:
# Weighted cross-entropy: the per-label weights come from the audio dataset and
# are moved to the training device before being given to the loss.
class_weights = torch.tensor(audio.label_weights()).to(training.DEVICE)
loss = nn.CrossEntropyLoss(weight=class_weights)

In [None]:
# Start the training run; it returns the results together with the model.
results, model = training.train(
    data=ds,
    model_class=HarmonicModel,
    hyper_params=hps,
    loss_fn=loss,
    name="convmodel",
    pad=True,
    batch_size=16,
)

Epoch   0: Loss=5.25743 val, 5.23250 train 
Epoch   1: Loss=5.24952 val, 5.21542 train 
Epoch   2: Loss=5.25050 val, 5.20460 train 
Epoch   3: Loss=5.24500 val, 5.19536 train 
loss: 5.244151, [ 24016/ 25708]