In [2]:
import numpy as np
import tensorflow as tf
import librosa

In [3]:
from birdset.datamodule import DatasetConfig
from birdset.datamodule.birdset_datamodule import BirdSetDataModule
from birdset.datamodule import LoadersConfig, LoaderConfig

# Build the BirdSet data module for the "POW" configuration of the
# DBD-research-group/BirdSet dataset hosted on Hugging Face.
dm = BirdSetDataModule(
    dataset=DatasetConfig(
        data_dir="./datasets",  # relative path: resolved against the current working directory
        hf_path="DBD-research-group/BirdSet",
        hf_name="POW",
        n_workers=21,  # NOTE(review): hard-coded worker count; confirm it suits the host machine
        val_split=0.2,
        task="multiclass",
        classlimit=500,
        eventlimit=5,
        sample_rate=32000,  # same rate that preprocess_audio resamples inference audio to
    ),
    loaders=LoadersConfig(
        train=LoaderConfig(batch_size=8, shuffle=True),
        valid=LoaderConfig(batch_size=8, shuffle=False),
        # NOTE(review): the test loader is shuffled while the validation loader is not;
        # evaluation loaders are usually left unshuffled. Confirm this is intended.
        test=LoaderConfig(batch_size=8, shuffle=True),
    ),
)

# Prepare the data and set up the "fit" stage. The recorded cell output shows
# sampling and "Saving the dataset" progress bars being emitted by these calls.
dm.prepare_data()
dm.setup(stage="fit")

# NOTE(review): despite the *_loader names, these are bound to dm.train_dataset and
# dm.val_dataset (datasets), not to data loaders. Confirm before iterating in batches.
train_loader = dm.train_dataset
validation_loader = dm.val_dataset


  torchaudio.set_audio_backend("soundfile")
sampling: 100%|██████████| 48/48 [00:08<00:00,  5.64it/s]


Saving the dataset (0/1 shards):   0%|          | 0/41115 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10279 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/16052 [00:00<?, ? examples/s]

In [4]:
import librosa

def preprocess_audio(audio_path, sample_rate=32000, clip_seconds=1.0, image_size=(224, 224)):
    """Turn an audio file into a normalized 3-channel log-mel spectrogram image.

    Pipeline: load as mono at ``sample_rate`` -> center-crop or right-pad with
    zeros to ``clip_seconds`` -> STFT magnitude -> 128-bin mel projection
    (80 Hz to 7600 Hz) -> natural log -> bilinear resize to ``image_size`` ->
    replicate to 3 channels -> min-max scale.

    Args:
        audio_path: Path of the audio file, passed straight to ``librosa.load``.
        sample_rate: Rate in Hz the audio is resampled to. It must be at least
            twice the 7600 Hz upper mel edge, otherwise
            ``tf.signal.linear_to_mel_weight_matrix`` rejects it.
        clip_seconds: Length of the analysed clip in seconds. Longer recordings
            are cropped around their center; shorter ones are zero-padded at
            the end.
        image_size: ``(height, width)`` of the returned image.

    Returns:
        ``np.ndarray`` of dtype float32 and shape ``(height, width, 3)``.
        Rows are mel bins and columns are time frames. Values are min-max
        scaled, so they lie in ``[0, 1]``.

    Raises:
        ValueError: If the requested clip is shorter than one STFT frame, which
            would produce an empty spectrogram.
    """
    frame_length = 2048
    frame_step = 512
    num_mel_bins = 128
    lower_edge_hertz, upper_edge_hertz = 80.0, 7600.0

    clip_samples = int(round(sample_rate * clip_seconds))
    if clip_samples < frame_length:
        raise ValueError(
            f"clip of {clip_samples} samples is shorter than one STFT frame "
            f"({frame_length} samples)"
        )

    # librosa also returns the effective sample rate; it always equals `sample_rate` here.
    audio, _ = librosa.load(audio_path, sr=sample_rate, mono=True)

    # Force a fixed-length clip so every input yields the same number of STFT frames.
    if len(audio) > clip_samples:
        start = (len(audio) - clip_samples) // 2
        audio = audio[start:start + clip_samples]
    elif len(audio) < clip_samples:
        audio = np.pad(audio, (0, clip_samples - len(audio)), mode='constant')

    waveform = tf.convert_to_tensor(audio, dtype=tf.float32)
    magnitudes = tf.abs(
        tf.signal.stft(
            waveform,
            frame_length=frame_length,
            frame_step=frame_step,
            fft_length=frame_length,
        )
    )

    # Project the linear-frequency bins (last axis) onto the mel scale.
    mel_weights = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=num_mel_bins,
        num_spectrogram_bins=magnitudes.shape[-1],
        sample_rate=sample_rate,
        lower_edge_hertz=lower_edge_hertz,
        upper_edge_hertz=upper_edge_hertz,
    )
    mel = tf.tensordot(magnitudes, mel_weights, 1)

    # The small offset keeps log() finite for silent (all-zero) bins.
    log_mel = tf.math.log(mel + 1e-6)

    # (frames, mels) -> (mels, frames, 1): tf.image.resize expects a trailing channel axis.
    image = log_mel.numpy().T[..., np.newaxis]
    image = tf.image.resize(image, list(image_size), method='bilinear').numpy()
    image = np.repeat(image, 3, axis=-1)

    # Min-max scale; the epsilon avoids division by zero for a constant image.
    lowest, highest = image.min(), image.max()
    image = (image - lowest) / (highest - lowest + 1e-8)

    return image.astype(np.float32)

In [5]:
# Load the Keras model stored in birdset_model.h5 (relative path, resolved against
# the current working directory). predict() reads it through the global `model`.
model = tf.keras.models.load_model('birdset_model.h5')

def predict(audio_path, top_k=5):
    """Classify one audio file and return its highest-scoring labels.

    Relies on the module-level ``model`` (Keras model), ``dm`` (data module
    providing the label names) and ``preprocess_audio``.

    Args:
        audio_path: Path of the audio file to classify.
        top_k: Maximum number of labels to return. ``0`` yields an empty list;
            values above the number of classes return every class.

    Returns:
        List of ``(label, score)`` tuples ordered from highest to lowest score,
        where ``label`` is the class name string and ``score`` a Python float.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    # The model expects a batch, so add a leading batch axis of size 1.
    batch = np.expand_dims(preprocess_audio(audio_path), axis=0)
    scores = model.predict(batch)[0]

    # Sort descending first and truncate afterwards: slicing with [-top_k:]
    # would select every class when top_k == 0, because -0 == 0.
    ranked = np.argsort(scores)[::-1][:top_k]

    int2str = dm.train_dataset.features['labels'].int2str
    return [(int2str(int(idx)), float(scores[idx])) for idx in ranked]

In [6]:
# Classify a sample Northern Cardinal recording using predict()'s default top_k=5.
predictions = predict("./XC180282 - Northern Cardinal - Cardinalis cardinalis.mp3")

# Each entry is a (label, score) tuple, highest score first.
print(predictions)

  from pkg_resources import resource_filename


[('norcar', 0.872824490070343), ('amerob', 0.1103929653763771), ('comrav', 0.015611249022185802), ('bnhcow', 0.0007253950461745262), ('swathr', 0.0004327567294239998)]
