In [None]:
import numpy as np 
import pandas as pd

from glob import glob
from tqdm import tqdm

import keras
import keras.backend as K
import keras_cv


import librosa
import IPython.display as ipd
import librosa.display as lid

import matplotlib.pyplot as plt
import matplotlib as mpl
import os, gc
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
import tensorflow as tf
import tensorflow_io as tfio

print('TensorFlow version =', tf.__version__)

# Pick a distribution strategy that matches the hardware that is actually visible.
gpus = tf.config.list_physical_devices('GPU')
if len(gpus) > 1:
    # Several GPUs: mirror the model across all of them.
    strategy = tf.distribute.MirroredStrategy()
    print(f'Using {len(gpus)} GPUs')
elif len(gpus) == 1:
    strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
    print(f'Using {len(gpus)} GPU')
else:
    # No GPU is visible, so "/gpu:0" does not exist; pin to the CPU instead.
    strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
    print('Using 0 GPUs (falling back to CPU)')

In [None]:
# `mpl.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9;
# `plt.get_cmap` is the supported way to look a colormap up by name.
cmap = plt.get_cmap('coolwarm')

In [None]:
class CFG:
    """Notebook-wide settings, read as plain class attributes (never instantiated here)."""

    seed = 42

    # Layout of one recording as fed to the batch loader, and the loader batch size.
    # img_size[0] sizes the frame axis of the batch array, img_size[1] is passed
    # to librosa as n_mels, img_size[2] is the remaining (time) axis.
    img_size = [48, 128, 321]
    batch_size = 128

    # Audio framing: clip length in seconds, sample rate in Hz, samples per clip.
    duration = 5
    sample_rate = 32000
    audio_len = sample_rate * duration

    # STFT / mel filter-bank parameters.
    nfft = 2000
    window = 2000
    hop_length = 500
    fmin = 40
    fmax = 15000

    # Training-related settings.
    epochs = 10
    preset = 'efficientnetv2_b0_imagenet'

    # Whether to apply data augmentation.
    augment = False

    # Class vocabulary for BirdCLEF 24: one class per sub-folder of train_audio,
    # in sorted order so that the integer labels are stable between runs.
    class_names = sorted(os.listdir('/kaggle/input/birdclef-2024/train_audio/'))
    num_classes = len(class_names)
    class_labels = list(range(num_classes))
    label2name = dict(enumerate(class_names))
    name2label = {name: label for label, name in enumerate(class_names)}

In [None]:
# Root directory of the competition data; the soundscape folders are globbed beneath it.
BASE_PATH = '/kaggle/input/birdclef-2024'

In [None]:
# `glob` returns paths in arbitrary order; sorting makes the file order (and the
# subset picked in the fallback below) reproducible between runs.
test_paths = sorted(glob(f'{BASE_PATH}/test_soundscapes/*ogg'))
# During commit use `unlabeled` data as there is no `test` data.
# During submission `test` data will automatically be populated.
if len(test_paths)==0:
    # NOTE(review): 129 is presumably one more than CFG.batch_size so that a
    # second, partial batch is exercised - confirm.
    test_paths = sorted(glob(f'{BASE_PATH}/unlabeled_soundscapes/*ogg'))[:129]
test_df = pd.DataFrame(test_paths, columns=['filepath'])
test_df.head()

In [None]:
def build_decoder1(with_labels=False, dim=CFG.audio_len):
    """Build a function that turns an audio file into per-frame mel spectrograms.

    Args:
        with_labels: Unused; kept so existing call sites keep working.
        dim: Unused; kept so existing call sites keep working. The frame
            length is derived from ``CFG.duration * CFG.sample_rate``.

    Returns:
        ``decode(path)``: loads ``path`` at ``CFG.sample_rate``, splits the
        signal into ``CFG.duration``-second frames and returns a list holding
        one ``uint8`` mel spectrogram (values 0-255) per frame.
    """

    def create_frames(audio, duration=5, sr=32000):
        """Zero-pad ``audio`` at the end to a whole number of frames and split it.

        Returns an array of shape ``[num_frames, duration * sr]``.
        """
        frame_size = int(duration * sr)
        remainder = np.shape(audio)[0] % frame_size
        if remainder != 0:
            audio = np.pad(audio, (0, frame_size - remainder))  # pad the end
        return np.reshape(audio, [-1, frame_size])

    def decode(path):
        """Load one recording and return its list of per-frame spectrograms."""
        y, _ = librosa.load(path, sr=CFG.sample_rate)
        y = librosa.util.normalize(y)
        # Frame with the configured values rather than duplicated literals.
        frames = create_frames(y, duration=CFG.duration, sr=CFG.sample_rate)

        specs = []
        for frame in frames:
            spec = librosa.feature.melspectrogram(
                y=frame,
                sr=CFG.sample_rate,         # sample rate
                n_fft=CFG.nfft,             # number of samples in window
                hop_length=CFG.hop_length,  # step size of window
                n_mels=CFG.img_size[1],     # number of mel bands between fmin and fmax
                fmin=CFG.fmin,              # minimum frequency
                fmax=CFG.fmax,              # maximum frequency
                power=2.0,                  # power spectrogram, as expected by power_to_db
            )
            # Convert to dB
            spec = librosa.power_to_db(spec, ref=100)
            # Shift so the minimum is 0
            spec = spec - spec.min()
            # Scale to 0-255. A constant frame (e.g. pure silence) has peak 0
            # after the shift; dividing would give NaN, and casting NaN to
            # uint8 is undefined, so such a frame is left as all zeros.
            peak = spec.max()
            if peak > 0:
                spec = spec / peak * 255
            specs.append(spec.astype(np.uint8))

        return specs

    return decode

In [None]:
# Audio-file -> list-of-spectrogram-frames function used to pre-compute all inputs.
decoder=build_decoder1()

In [None]:
# Decode every recording once up front and cache the stacked spectrogram
# frames by file path, so the batch loader only has to do dictionary lookups.
# tqdm already reports progress; a manual counter print would break up its bar.
all_specs = {}
for filepath in tqdm(test_df.filepath.tolist(), desc='decoding'):
    all_specs[filepath] = np.array(decoder(filepath))

In [None]:
# Visual sanity check on the first recording, using the cached frames.
row = test_df.iloc[0]
b = all_specs[row.filepath]  # (num_frames, n_mels, time_steps)
print(b.shape)

# Lay the frames side by side along the time axis. The frame axis has to be
# moved next to the time axis before flattening; reshaping the
# (frames, mels, time) array directly would interleave rows of different frames.
full_spec = b.transpose(1, 0, 2).reshape(b.shape[1], -1)

fig, ax = plt.subplots(figsize=(25, 10))
# Pass the real hop length and mel range so the axes are labelled correctly
# (specshow otherwise assumes hop_length=512 and draws no frequency axis).
img = librosa.display.specshow(full_spec,
                               x_axis="time",
                               y_axis="mel",
                               sr=CFG.sample_rate,
                               hop_length=CFG.hop_length,
                               fmin=CFG.fmin,
                               fmax=CFG.fmax,
                               ax=ax)
ax.set_title(os.path.basename(row.filepath))
fig.colorbar(img, ax=ax, format="%+2.f")
plt.show()

In [None]:
def build_decoder(with_labels=True):
    """Return a lookup function from file path to a 3-channel frame tensor.

    ``with_labels`` is accepted but not used.
    """

    def decode(filepath):
        """Fetch the cached frames for ``filepath`` and copy them into 3 channels."""
        frames = all_specs[filepath]
        n_mels, n_steps = CFG.img_size[1], CFG.img_size[2]

        # Append a channel axis, then repeat it three times.
        with_channel = tf.expand_dims(frames, axis=-1)
        rgb = tf.tile(with_channel, [1, 1, 1, 3])

        # Pin the spatial dimensions to the configured size.
        return tf.reshape(rgb, [rgb.shape[0], n_mels, n_steps, 3])

    return decode

In [None]:
class build_dataset(tf.keras.utils.Sequence):
    """Batch loader that yields zero-padded stacks of spectrogram frames per file.

    Each batch is a float32 array of shape
    ``(files_in_batch, CFG.img_size[0], CFG.img_size[1], CFG.img_size[2], 3)``.
    With the default ``CFG.img_size`` that is about 24 MB per file, so a
    batch of 128 files takes roughly 3 GB.

    Args:
        filepath: Sequence of file paths; each is passed to ``decode_fn``.
        batch_size: Number of files per batch.
        decode_fn: Callable mapping a path to an array-like of shape
            ``(num_frames, H, W, 3)``. Defaults to ``build_decoder()``.
        augment_fn: Callable applied to a whole batch when ``augment`` is True.
        augment: Whether to apply ``augment_fn`` to every batch.
        shuffle: Whether to reshuffle the file order at the end of each epoch.

    Raises:
        ValueError: If ``augment`` is True but no ``augment_fn`` is given.
    """

    def __init__(self,filepath, batch_size=64,
                  decode_fn=None, augment_fn=None,
                  augment=False, shuffle=False):
        super().__init__()
        # Accept plain lists as well as arrays / Series values.
        self.filepaths = np.asarray(filepath)
        self.batch_size = batch_size
        self.decode_fn = decode_fn
        self.augment_fn = augment_fn
        self.augment = augment
        self.shuffle = shuffle

        if self.decode_fn is None:
            self.decode_fn = build_decoder()
        if self.augment and self.augment_fn is None:
            raise ValueError('augment=True requires an augment_fn')

        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch."""
        return int(np.ceil(len(self.filepaths) / self.batch_size))

    def __getitem__(self, index):
        """Build batch number ``index``."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        X = self.__data_generation(indexes)
        if self.augment:
            X = self.__augment_batch(X)
        return X

    def __data_generation(self,indexes):
        """Decode the files at ``indexes`` into one zero-padded float32 array."""
        max_frames = CFG.img_size[0]
        X = np.zeros((len(indexes), max_frames, CFG.img_size[1], CFG.img_size[2], 3), dtype='float32')

        for j, i in enumerate(indexes):
            temp = self.decode_fn(self.filepaths[i])
            # Shorter recordings keep zero padding at the end; longer ones are
            # cut to the first `max_frames` frames instead of failing to broadcast.
            n_frames = min(temp.shape[0], max_frames)
            X[j, :n_frames] = temp[:n_frames]

        return X

    def __augment_batch(self, img_batch, y=None):
        """Apply ``augment_fn`` to a batch; ``y`` is accepted but not used."""
        return self.augment_fn(img_batch)

    def on_epoch_end(self):
        """Reset the file order, shuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.filepaths))
        if self.shuffle:
            np.random.shuffle(self.indexes)

In [None]:
!pip install --no-index --find-links=/kaggle/input/tf-efficientnet-whl-files /kaggle/input/tf-efficientnet-whl-files/efficientnet-1.1.1-py3-none-any.whl

In [None]:
import efficientnet.tfkeras as efn

def build_model():
    """Assemble and compile the classifier: EfficientNetB0 -> global average pool -> sigmoid head.

    Returns:
        A compiled ``tf.keras.Model`` that accepts 3-channel images of any
        spatial size and outputs ``CFG.num_classes`` values per image.
    """
    image_in = tf.keras.Input(shape=(None, None, 3))

    # Backbone without its classification head, initialised from the local weights file.
    backbone = efn.EfficientNetB0(include_top=False, weights=None, input_shape=None)
    backbone.load_weights('/kaggle/input/tf-efficientnet-imagenet-weights/efficientnet-b0_weights_tf_dim_ordering_tf_kernels_autoaugment_notop.h5')

    feature_map = backbone(image_in)
    pooled = tf.keras.layers.GlobalAveragePooling2D()(feature_map)
    class_probs = tf.keras.layers.Dense(
        CFG.num_classes, activation='sigmoid', dtype='float32'
    )(pooled)

    model = tf.keras.Model(inputs=image_in, outputs=class_probs)

    # Compile settings.
    # NOTE(review): a sigmoid head is paired with CategoricalCrossentropy here;
    # confirm this matches how the loaded checkpoints were trained.
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    loss_fn = tf.keras.losses.CategoricalCrossentropy()
    auc_metric = tf.keras.metrics.AUC(name='auc', multi_label=True, num_labels=CFG.num_classes)
    model.compile(loss=loss_fn, optimizer=optimizer, metrics=[auc_metric])

    return model

In [None]:
# Build the network, then load the weights stored in the first checkpoint file (EffNet_v1_f1).
model = build_model()
model.load_weights('/kaggle/input/bird-clef-models/EffNet_v1_f1.weights.h5')

In [None]:
# NOTE: this cell is repeated further down for the second weights file; keep the two in sync.
ids = []
batch_preds = []  # one (files*frames, classes) block per batch, concatenated once after the loop

# Build test dataset. The ID bookkeeping below relies on the loader using the
# same batch size and on it not shuffling (the build_dataset default).
test_paths = test_df.filepath.tolist()
test_ds = build_dataset(test_df.filepath.values, batch_size=CFG.batch_size)

# Iterate over batches of recordings
for idx, specs in enumerate(tqdm(iter(test_ds), desc='test ', total=len(test_ds))):
    # specs: (files_in_batch, frames, n_mels, time, 3)
    num_files, num_frames = specs.shape[0], specs.shape[1]

    # One predict call per frame position, so only one frame slice is handed
    # to the model at a time. Stacking on axis 1 yields (files, frames, classes).
    frame_preds = np.stack(
        [model.predict(specs[:, frame_id], verbose=0) for frame_id in range(num_frames)],
        axis=1,
    )

    # Create an ID for each frame of each recording: <file stem>_<end second>
    start = idx * CFG.batch_size
    for path in test_paths[start:start + num_files]:
        filename = os.path.splitext(os.path.basename(path))[0]
        ids.extend(f'{filename}_{(frame_id + 1) * CFG.duration}' for frame_id in range(num_frames))

    # Row order is file-major, frame-minor, matching the order of `ids`.
    batch_preds.append(frame_preds.reshape(num_files * num_frames, CFG.num_classes))

# Concatenate once instead of re-allocating the full array on every batch.
if batch_preds:
    preds = np.concatenate(batch_preds, axis=0)
else:
    preds = np.empty(shape=(0, CFG.num_classes), dtype='float32')

In [None]:
# Number of row IDs generated; expected to equal the number of prediction rows.
len(ids)

In [None]:
# (number of prediction rows, CFG.num_classes)
preds.shape

In [None]:
# Keep the predictions made with the first weights file; `preds` is rebuilt by the next cell.
preds1=preds.copy()

In [None]:
# Second weights file. NOTE: apart from the checkpoint path this repeats the
# inference cell above; keep the two in sync.
model = build_model()
model.load_weights('/kaggle/input/bird-clef-models/EffNet_v1_f0.weights.h5')

ids = []
batch_preds = []  # one (files*frames, classes) block per batch, concatenated once after the loop

# Build test dataset. The ID bookkeeping below relies on the loader using the
# same batch size and on it not shuffling (the build_dataset default).
test_paths = test_df.filepath.tolist()
test_ds = build_dataset(test_df.filepath.values, batch_size=CFG.batch_size)

# Iterate over batches of recordings
for idx, specs in enumerate(tqdm(iter(test_ds), desc='test ', total=len(test_ds))):
    # specs: (files_in_batch, frames, n_mels, time, 3)
    num_files, num_frames = specs.shape[0], specs.shape[1]

    # One predict call per frame position, so only one frame slice is handed
    # to the model at a time. Stacking on axis 1 yields (files, frames, classes).
    frame_preds = np.stack(
        [model.predict(specs[:, frame_id], verbose=0) for frame_id in range(num_frames)],
        axis=1,
    )

    # Create an ID for each frame of each recording: <file stem>_<end second>
    start = idx * CFG.batch_size
    for path in test_paths[start:start + num_files]:
        filename = os.path.splitext(os.path.basename(path))[0]
        ids.extend(f'{filename}_{(frame_id + 1) * CFG.duration}' for frame_id in range(num_frames))

    # Row order is file-major, frame-minor, matching the order of `ids`.
    batch_preds.append(frame_preds.reshape(num_files * num_frames, CFG.num_classes))

# Concatenate once instead of re-allocating the full array on every batch.
if batch_preds:
    preds = np.concatenate(batch_preds, axis=0)
else:
    preds = np.empty(shape=(0, CFG.num_classes), dtype='float32')

In [None]:
# Predictions made with the second weights file (EffNet_v1_f0).
preds

In [None]:
# Predictions made with the first weights file (EffNet_v1_f1), saved earlier.
preds1

In [None]:
# Both prediction arrays must line up with each other and with the row IDs
# before they are averaged; fail loudly here rather than write a misaligned file.
assert preds.shape == preds1.shape, f'fold shapes differ: {preds.shape} vs {preds1.shape}'
assert len(ids) == preds.shape[0], f'{len(ids)} ids for {preds.shape[0]} prediction rows'

# Simple mean of the two weight files' outputs, built as one frame in one go
# (row_id first, then one column per class).
pred_df = pd.DataFrame((preds + preds1) / 2, columns=CFG.class_names)
pred_df.insert(0, 'row_id', ids)
pred_df.to_csv('submission.csv',index=False)
pred_df.head()