Hello fellow Kagglers,

This notebook demonstrates the inference process and is a work in progress.

Updates will follow soon.

[training notebook](https://www.kaggle.com/code/markwijkhuizen/birdclef-2024-efficientvit-training)

[preprocessing notebook](https://www.kaggle.com/code/markwijkhuizen/birdclef-2024-eda-preprocessed-dataset)

# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from torch import nn
# import noisereduce as nr
from tqdm.notebook import tqdm
import IPython.display as ipd

import torch
import math
import glob
import librosa
import re

# Configuration

In [2]:
class Config():
    # Horizontal melspectrogram resolution
    MELSPEC_H = 128
    # Competition Root Folder
    ROOT_FOLDER = '/kaggle/input/birdclef-2024'
    # Maximum decibel to clip audio to
    TOP_DB = 100
    # Minimum rating
    MIN_RATING = 3.0
    # Sample rate as provided in competition description
    SR = 32000
    N_FFT = 2000
    HOP_LENGTH = 500
    # Model input
    HEIGHT = 128
    WIDTH = 320
    # Duration
    DURATION_S = 240
    WINDOW_S = 5
    N_TEST_CHUNKS = DURATION_S // WINDOW_S
    
CONFIG = Config()

# Sample submission

In [3]:
sample_submission = pd.read_csv('/kaggle/input/birdclef-2024/sample_submission.csv')

# Set labels
CONFIG.LABELS = sample_submission.columns[1:]
CONFIG.N_CLASSES = len(CONFIG.LABELS)
print(f'# classes: {CONFIG.N_CLASSES}')

display(sample_submission.head())

# classes: 182


Unnamed: 0,row_id,asbfly,ashdro1,ashpri1,ashwoo2,asikoe2,asiope1,aspfly1,aspswi1,barfly1,...,whbwoo2,whcbar1,whiter2,whrmun,whtkin2,woosan,wynlau1,yebbab1,yebbul3,zitcis1
0,soundscape_1446779_5,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,...,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495
1,soundscape_1446779_10,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,...,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495
2,soundscape_1446779_15,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,...,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495


In [4]:
# def SpectralNoiseReduction(audio_data, sr, min_length_sec=5):
#     # Skip processing if the length of the audio data is less than the specified minimum length in seconds
#     if len(audio_data) < sr * min_length_sec:
#         return audio_data

#     # Calculate the transition of noise levels across the audio data using a window size of 3 seconds and an overlap of 1.5 seconds
#     hop_length = int(sr * 1.5)  # 1.5 seconds overlap
#     win_length = int(sr * 3)    # 3 seconds window size
#     rms = librosa.feature.rms(y=audio_data, frame_length=win_length, hop_length=hop_length)

#     # Identify the time with the smallest noise level
#     noise_sec = 1  # noise reference length in seconds
#     min_rms_idx = np.argmin(rms)  # index of the minimum RMS value
#     start_idx = min_rms_idx * hop_length
#     end_idx = start_idx + sr * noise_sec  # Extract 1 second of data around the time of minimum noise

#     # Adjust the indices to make sure they are within the bounds of the audio data
#     start_idx = max(0, start_idx)  # Ensure start index is not negative
#     end_idx = min(len(audio_data), end_idx)  # Ensure end index does not exceed the length of the audio data

#     # Use the extracted data as the reference noise data
#     noise_data = audio_data[start_idx:end_idx]

#     # Perform noise reduction
#     return nr.reduce_noise(y=audio_data, sr=sr, y_noise=noise_data)

# OGG → Melspectrogram Conversion

In [5]:
# Convert OOG audio files to melspectrogram encoded as PNG bytes
def ogg2melspectrogram(file_path):
    # Load the audio file
    y, _ = librosa.load(file_path, sr=CONFIG.SR)
#     y = SpectralNoiseReduction(y, CONFIG.SR)
    # Normalize audio
    y = librosa.util.normalize(y)
    # Convert to mel spectrogram
    spec = librosa.feature.melspectrogram(
        y=y,
        sr=CONFIG.SR, # sample rate
        n_fft=CONFIG.N_FFT, # number of samples in window 
        hop_length=CONFIG.HOP_LENGTH, # step size of window
        n_mels=CONFIG.MELSPEC_H, # horizontal resolution from fmin→fmax in log scale
        fmin=40, # minimum frequency
        fmax=15000, # maximum frequency
        power=2.0, # intensity^power for log scale
    )
    # Convert to Db
    spec = librosa.power_to_db(spec, ref=CONFIG.TOP_DB)
    # Normalize 0-min
    spec = spec - spec.min()
    # Normalize 0-255
    spec = (spec / spec.max() * 255).astype(np.uint8)
    
    return spec

# Model

In [6]:
# Count model parameters
def count_parameters(model):
    return sum([p.numel() for p in model.parameters()])

In [7]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        # ImageNet Normalize Input
        self.normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        
        # Backbone
        self.backbone = timm.create_model(
                CONFIG.BACKBONE,
                pretrained=True,
                num_classes=CONFIG.N_CLASSES,
            )
        
    def forward(self, inputs):
        # Go From HxW → 3xHxW
        inputs = inputs.unsqueeze(1).expand(-1, 3, -1, -1)
        # Normalize [0-255] → [0-1]
        inputs = inputs.float() / 255
        # Normalize
        inputs = self.normalize(inputs)
        
        return self.backbone(inputs)

In [8]:
# Load the saved model
model1 = torch.load('/kaggle/input/effvitfinal2steptrain/model.pth', map_location=torch.device('cpu'))
model1 = torch.jit.optimize_for_inference(torch.jit.script(model1.eval()))
# model2 = torch.load('/kaggle/input/final2steptrain/model.pth', map_location=torch.device('cpu'))
# model2 = torch.jit.optimize_for_inference(torch.jit.script(model2.eval()))

# Number of parameters
print(f'# Model Parameters: {count_parameters(model1):,}')

# Model Parameters: 0


In [9]:
def extract_center_samples(spectrogram, sample_width, k):
    _, spectrogram_width = spectrogram.shape
    start_index = (spectrogram_width - sample_width * k) // 2
    samples = []
    for i in range(k):
        sample_start = start_index + i * sample_width
        sample_end = sample_start + sample_width
        samples.append(spectrogram[:,sample_start:sample_end])
    return samples

# Inference Loop

In [10]:
# List to save inference rows in
INFERENCE_ROWS = []

# Hidden test files
if len(glob.glob(f'{CONFIG.ROOT_FOLDER}/test_soundscapes/*.ogg')) > 0:
    ogg_file_paths = glob.glob(f'{CONFIG.ROOT_FOLDER}/test_soundscapes/*.ogg')
else:
    ogg_file_paths = sorted(glob.glob(f'{CONFIG.ROOT_FOLDER}/unlabeled_soundscapes/*.ogg'))[:10]

# Iterate over OGG files
for i, file_path in enumerate(tqdm(ogg_file_paths)):
    # Extract filename
    row_id = re.search(r'/([^/]+)\.ogg$', file_path).group(1)
    # Read OGG file and convert to melspectrogram
    spec = ogg2melspectrogram(file_path)
    h, po = spec.shape
#     print(spec.shape)
    temp = np.zeros((h, math.ceil(po/CONFIG.WIDTH)*CONFIG.WIDTH))
#     print(temp.shape)
#     print(((int(po/CONFIG.WIDTH))*CONFIG.WIDTH))
    temp[:,:((int(po/CONFIG.WIDTH))*CONFIG.WIDTH)] = spec[:,:((int(po/CONFIG.WIDTH))*CONFIG.WIDTH)]
    if(po%CONFIG.WIDTH==0):
        continue
    else:
        work = spec[:,((int(po/CONFIG.WIDTH))*CONFIG.WIDTH):] 
        _, pu = work.shape
#         print(pu)
        while(pu<CONFIG.WIDTH):
            work = np.concatenate((work, work), axis=1)
            pu = pu * 2
        samples = extract_center_samples(work, CONFIG.WIDTH, 1)
#         print(samples[0].shape)
        temp[:,((int(po/CONFIG.WIDTH))*CONFIG.WIDTH):] = samples[0]
    spec = temp.reshape(CONFIG.HEIGHT,-1,CONFIG.WIDTH).transpose([1,0,2])
    # Convert spec from Numpy array on CPU to Torch Tensor on GPU
#     print(spec.shape)
    spec = torch.Tensor(spec)
    # Predict
    with torch.no_grad():
        outputs = model1(spec).numpy()
#         outputs2 = model2(spec).softmax(dim=1).numpy()
#         outputs = outputs1 + outputs2
    # Add to inference rows and limit to 4 minutes
    for t, o in zip(range(CONFIG.N_TEST_CHUNKS), outputs):
        # Predictions for each bird
        predictions = dict([ (l,p) for l, p in zip(CONFIG.LABELS, o) ])
        # Append to inference rows
        INFERENCE_ROWS.append(
            { 'row_id': f'{row_id}_{(t+1)*5}' } | predictions
        )

  0%|          | 0/10 [00:00<?, ?it/s]

# Submission

In [11]:
# Create pandas DataFrame from inference rows
submission_df = pd.DataFrame(INFERENCE_ROWS)

# Display submission DataFrame
display(submission_df.head(30))

Unnamed: 0,row_id,asbfly,ashdro1,ashpri1,ashwoo2,asikoe2,asiope1,aspfly1,aspswi1,barfly1,...,whbwoo2,whcbar1,whiter2,whrmun,whtkin2,woosan,wynlau1,yebbab1,yebbul3,zitcis1
0,1000170626_5,-8.929309,-6.077578,-7.637516,-7.513359,-5.335187,-9.21882,-10.60238,-9.541398,-9.603637,...,-9.853653,-6.225093,-5.341549,-8.926751,-6.253828,-5.164371,-10.256872,-7.322828,-10.558577,-5.557906
1,1000170626_10,-9.335729,-6.122895,-7.437583,-7.991006,-5.179772,-9.935601,-9.509924,-9.343244,-9.058521,...,-8.539062,-8.806579,-4.395478,-8.792713,-5.961372,-5.941516,-11.082971,-7.979632,-10.674315,-5.589745
2,1000170626_15,-10.60802,-8.517084,-8.181373,-9.940133,-6.774269,-10.540535,-9.548855,-10.004525,-8.688011,...,-8.667253,-10.269005,-5.395493,-8.891125,-6.495146,-5.578742,-11.292712,-8.875021,-10.674853,-6.075771
3,1000170626_20,-8.740638,-5.974727,-6.507977,-7.509148,-5.994956,-10.298247,-8.989426,-9.333335,-9.039713,...,-9.358559,-8.720445,-5.97178,-8.452797,-6.697258,-6.4496,-10.760261,-8.349264,-10.134686,-5.664535
4,1000170626_25,-10.796465,-8.674109,-7.219454,-9.06789,-6.091533,-10.657548,-9.593559,-10.347894,-9.796329,...,-8.776675,-10.368563,-5.544365,-8.746628,-7.170969,-5.796434,-11.526701,-8.425641,-10.293088,-5.82719
5,1000170626_30,-8.912533,-6.033548,-7.803911,-7.665478,-4.854063,-9.106267,-9.145471,-9.102528,-9.128641,...,-9.332915,-9.301294,-5.21958,-8.731805,-6.597445,-5.734265,-11.064372,-8.771551,-9.03011,-5.909963
6,1000170626_35,-9.506185,-6.338598,-7.600972,-8.132229,-6.856262,-10.345852,-9.400348,-10.330134,-8.664582,...,-8.858374,-9.308369,-5.664603,-8.236632,-6.308828,-6.706014,-9.989302,-8.521395,-10.463577,-6.575538
7,1000170626_40,-9.544999,-5.53356,-6.977248,-6.627059,-5.514023,-9.373538,-10.336889,-10.348187,-9.684292,...,-8.524874,-7.537401,-5.771317,-9.131155,-5.750741,-6.529095,-9.248564,-8.957293,-9.893583,-5.788542
8,1000170626_45,-8.732911,-3.984619,-6.602977,-5.008765,-5.148103,-8.730187,-9.844782,-9.230995,-8.397202,...,-8.455856,-7.908882,-4.583226,-8.376232,-5.021188,-5.908278,-8.840219,-8.211058,-9.944453,-5.689992
9,1000170626_50,-10.934342,-9.128482,-8.104716,-10.10853,-6.22085,-11.002259,-10.332516,-10.4544,-9.606098,...,-9.361746,-10.565107,-5.255722,-9.221024,-7.012263,-5.611922,-11.990186,-9.585859,-11.172178,-6.294067


In [12]:
# Write CSV
submission_df.to_csv('submission.csv', index=False)