[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nagocs/birdclef-challenge/blob/main/training_and_evaluation.ipynb)

In [4]:
!pip install -q gdown

import os

print("Letöltés...")

# train_audio.zip
zip_id = "1cxXQ1E-a3Rw0GMEeq0Vaq_3ZvN-dsumN"
zip_out = "train_audio.zip"

# train_metadata.csv
csv_id = "1ylq0SPuidYMHV3JqL3LV12zyf2Bp6WrN"
csv_out = "train_metadata.csv"

!gdown --id $zip_id -O $zip_out
!gdown --id $csv_id -O $csv_out

print("Letöltés kész!")

print("\nFájlméretek:")
!ls -lh train_audio.zip
!ls -lh train_metadata.csv

print("\nKicsomagolás...")
os.makedirs("train_audio", exist_ok=True)
!unzip -q train_audio.zip -d birdclef_2024

print("Kész! A dataset a 'birdclef_2024/' és 'train_metadata.csv' helyeken található.")


Letöltés...
Downloading...
From (original): https://drive.google.com/uc?id=1cxXQ1E-a3Rw0GMEeq0Vaq_3ZvN-dsumN
From (redirected): https://drive.google.com/uc?id=1cxXQ1E-a3Rw0GMEeq0Vaq_3ZvN-dsumN&confirm=t&uuid=b4def99b-a8b0-437c-acea-269aa380c1b1
To: /content/train_audio.zip
100% 7.78G/7.78G [01:37<00:00, 79.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ylq0SPuidYMHV3JqL3LV12zyf2Bp6WrN
To: /content/train_metadata.csv
100% 5.13M/5.13M [00:00<00:00, 175MB/s]
Letöltés kész!

Fájlméretek:
-rw-r--r-- 1 root root 7.3G Nov 27 10:16 train_audio.zip
-rw-r--r-- 1 root root 4.9M Nov  1 18:38 train_metadata.csv

Kicsomagolás...
Kész! A dataset a 'train_audio/' és 'train_metadata.csv' helyeken található.


In [20]:
# ---------------------------------------------------------
# 0. Libraries
# ---------------------------------------------------------
import os
import numpy as np
import pandas as pd
import librosa

from sklearn.model_selection import train_test_split
import tensorflow as tf


# ---------------------------------------------------------
# 1. CONFIG class
# ---------------------------------------------------------
class CONFIG:
    """Static settings shared by the data pipeline and the model.

    All attributes are evaluated once, when the class body runs; in particular
    the class list is read from ``birdclef_2024/train_audio/`` at that moment.
    """

    # Seed handed to the train/validation/test splits.
    seed = 42

    # Nominal spectrogram size and mini-batch size.
    # NOTE(review): only img_size[1] is used below (for hop_length);
    # get_spectrogram is called with n_mels=256, not img_size[0] - confirm intent.
    img_size = [128, 384]
    batch_size = 32

    # Clip length in seconds, sampling rate in Hz, and clip length in samples.
    duration = 15
    sample_rate = 32000
    audio_len = sample_rate * duration

    # STFT / mel parameters.
    # NOTE(review): nfft is 2028 while window is 2048 - possibly a typo; confirm.
    nfft = 2028
    window = 2048
    # Chosen so that audio_len samples span about img_size[1] frames.
    hop_length = audio_len // (img_size[1] - 1)
    fmin = 20
    fmax = 16000

    augment = True

    # One class per entry of the training-audio directory, in sorted order.
    class_names = sorted(os.listdir('birdclef_2024/train_audio/'))
    num_classes = len(class_names)

    # Integer ids 0..num_classes-1 and the lookup tables in both directions.
    class_labels = [*range(num_classes)]
    label2name = dict(enumerate(class_names))
    name2label = {name: label for label, name in label2name.items()}


# ---------------------------------------------------------
# 2. Metadata
# ---------------------------------------------------------
# Load the metadata table and derive the columns used by the data pipeline.
df = pd.read_csv("train_metadata.csv")

# Path of each recording relative to the notebook's working directory.
df["filepath"] = "birdclef_2024/train_audio/" + df["filename"]

# Integer class id of the primary label, using the mapping built in CONFIG.
df["target"] = df["primary_label"].map(CONFIG.name2label)

# A label without an entry in CONFIG.name2label would silently become NaN here
# and poison the targets later on, so fail early with the offending labels.
unmapped_labels = df.loc[df["target"].isna(), "primary_label"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"primary_label values missing from CONFIG.class_names: {sorted(map(str, unmapped_labels))}"
    )
df["target"] = df["target"].astype("int64")

# Keep only the file's base name, then take everything before the first dot as the id.
df["filename"] = df["filepath"].str.split("/").str[-1]
df["xc_id"] = df["filename"].str.split(".").str[0]

# The label space is defined by CONFIG (target ids go up to CONFIG.num_classes - 1).
# Counting the distinct targets in the table instead could give a smaller number
# when some class has no row, making the largest id exceed the model's output width.
num_classes = CONFIG.num_classes

# ---------------------------------------------------------
# 3. Audio file
# ---------------------------------------------------------
def load_audio(filepath):
    """Read an audio file with librosa, requesting CONFIG.sample_rate.

    Args:
        filepath: Path of the audio file to decode.

    Returns:
        The ``(audio, sr)`` pair produced by ``librosa.load``.
    """
    return librosa.load(filepath, sr=CONFIG.sample_rate)


# ---------------------------------------------------------
# 4. Spectrogram
# ---------------------------------------------------------
def get_spectrogram(audio, n_mels=256):
    """Compute a min-max normalised log-mel spectrogram of a waveform.

    Args:
        audio: 1-D waveform sampled at CONFIG.sample_rate.
        n_mels: Number of mel bands. Defaults to 256, the value that was
            previously hard-coded.
            NOTE(review): CONFIG.img_size[0] is 128 - confirm which height is intended.

    Returns:
        The dB-scaled mel spectrogram rescaled to the range [0, 1]. When the
        spectrogram is constant (e.g. silent input) an all-zero array of the
        same shape is returned.
    """
    spec = librosa.feature.melspectrogram(
        y=audio,
        sr=CONFIG.sample_rate,
        n_mels=n_mels,
        n_fft=CONFIG.nfft,
        hop_length=CONFIG.hop_length,
        fmax=CONFIG.fmax,
        fmin=CONFIG.fmin,
    )

    # Power -> decibels, relative to a reference power of 1.0.
    spec = librosa.power_to_db(spec, ref=1.0)

    min_db = spec.min()
    max_db = spec.max()
    if max_db == min_db:
        # A constant spectrogram cannot be min-max scaled (division by zero).
        # Returning the raw dB values would break the [0, 1] contract,
        # so map it to zeros instead.
        return np.zeros_like(spec)

    return (spec - min_db) / (max_db - min_db)


# ---------------------------------------------------------
# 5. Train / Validation / Test Split
# ---------------------------------------------------------
# First cut: 80 % training rows, 20 % held out, stratified on the class id.
train_df, temp_df = train_test_split(
    df, test_size=0.2, stratify=df["target"], random_state=CONFIG.seed
)

# Second cut: the held-out rows are halved into validation and test sets.
# This split is not stratified.
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=CONFIG.seed)

# ---------------------------------------------------------
# 6. Keras Dataset Generator
# ---------------------------------------------------------
class KerasAudioDataset(tf.keras.utils.Sequence):
    """Keras Sequence that turns metadata rows into spectrogram batches.

    Each row must provide a ``filepath`` and a ``target``. Every recording is
    loaded, forced to exactly CONFIG.audio_len samples (zero-padded at the end
    or truncated), converted with ``get_spectrogram`` and given a trailing
    channel axis.
    """

    def __init__(self, dataframe, batch_size=32, augment=False, **kwargs):
        """Store the rows (re-indexed from 0), the batch size and the augment flag."""
        super().__init__(**kwargs)
        self.df = dataframe.reset_index(drop=True)
        self.batch_size = batch_size
        self.augment = augment

    def __len__(self):
        """Number of batches; the last one may be smaller than batch_size."""
        n_rows = len(self.df)
        return int(np.ceil(n_rows / self.batch_size))

    def __getitem__(self, index):
        """Build batch ``index`` and return ``(spectrograms, labels)`` as arrays."""
        start = index * self.batch_size
        rows = self.df.iloc[start:start + self.batch_size]

        batch_specs, batch_labels = [], []
        for _, record in rows.iterrows():
            waveform, _sr = load_audio(record.filepath)

            # Fix the clip length: crop first, then zero-pad whatever is missing.
            waveform = waveform[:CONFIG.audio_len]
            missing = CONFIG.audio_len - len(waveform)
            if missing > 0:
                waveform = np.pad(waveform, (0, missing))

            image = get_spectrogram(waveform)
            if self.augment:
                image = self.apply_augment(image)

            # Add the channel axis expected by the convolutional model.
            batch_specs.append(image[..., np.newaxis].astype(np.float32))
            batch_labels.append(record.target)

        return np.array(batch_specs), np.array(batch_labels)

    def apply_augment(self, spec):
        """With probability 0.3, scale the spectrogram by a factor drawn from U(0.7, 1.3)."""
        if np.random.rand() >= 0.3:
            return spec
        return spec * np.random.uniform(0.7, 1.3)


# ---------------------------------------------------------
# 7. Create training and validation set
# ---------------------------------------------------------
# Only the training generator follows CONFIG.augment; validation and test
# batches are always produced without augmentation.
train_dataset = KerasAudioDataset(train_df, batch_size=CONFIG.batch_size, augment=CONFIG.augment)
val_dataset = KerasAudioDataset(val_df, batch_size=CONFIG.batch_size, augment=False)
test_dataset = KerasAudioDataset(test_df, batch_size=CONFIG.batch_size, augment=False)

# Materialise one batch; X.shape[1:] later serves as the model's input shape.
X, y = train_dataset[0]
print("Batch shape:", X.shape, y.shape)

# Row count and batch count of every split.
for split_name, split_df, split_dataset in (
    ("Train", train_df, train_dataset),
    ("Validation", val_df, val_dataset),
    ("Test", test_df, test_dataset),
):
    print(f"{split_name} set:", len(split_df), "=", len(split_dataset), "batch")

print("Number of classes:", num_classes)


Batch shape: (32, 256, 384, 1) (32,)
Train set: 19567 = 612 batch
Validation set: 2446 = 77 batch
Test set: 2446 = 77 batch
Number of classes: 182


In [21]:
import tensorflow as tf
from tensorflow.keras import layers, Model

# ----------------------------------------------------
#   Fused-MBConv block
# ----------------------------------------------------
def fused_mbconv(x, expand_channels, out_channels, stride):
    """Append a Fused-MBConv block to the functional graph.

    Args:
        x: Input feature-map tensor (channels last).
        expand_channels: Filters of the 3x3 expansion convolution.
        out_channels: Filters of the 1x1 projection convolution.
        stride: Stride of the 3x3 convolution.

    Returns:
        The block's output tensor. The input is added back as a residual
        only when ``stride == 1`` and the channel count is unchanged.
    """
    shortcut = x

    # Fused expansion: a single 3x3 convolution (no separate 1x1 expand step).
    y = layers.Conv2D(expand_channels, 3, stride, padding='same', use_bias=False)(x)
    y = layers.BatchNormalization()(y)
    y = layers.Activation('swish')(y)

    # 1x1 projection down to out_channels, without activation.
    y = layers.Conv2D(out_channels, 1, 1, padding='same', use_bias=False)(y)
    y = layers.BatchNormalization()(y)

    # Residual connection is only possible when the shapes match.
    shapes_match = stride == 1 and shortcut.shape[-1] == out_channels
    if not shapes_match:
        return y
    return layers.Add()([y, shortcut])

# ----------------------------------------------------
#   MBConv block (with Squeeze-and-Excitation)
# ----------------------------------------------------
def mbconv(x, expand_channels, out_channels, stride, se_ratio=0.25):
    """Append an MBConv block with Squeeze-and-Excitation to the functional graph.

    Args:
        x: Input feature-map tensor (channels last).
        expand_channels: Filters of the 1x1 expansion convolution; also the
            channel count seen by the depthwise convolution and the SE gate.
        out_channels: Filters of the final 1x1 projection convolution.
        stride: Stride of the 3x3 depthwise convolution.
        se_ratio: Width of the SE bottleneck as a fraction of
            ``expand_channels``. The bottleneck always has at least one unit.

    Returns:
        The block's output tensor. The input is added back as a residual
        only when ``stride == 1`` and the channel count is unchanged.
    """
    inp = x
    inp_channels = x.shape[-1]

    # Expand 1x1
    x = layers.Conv2D(expand_channels, 1, 1, padding='same', use_bias=False)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('swish')(x)

    # Depthwise 3x3 convolution (carries the stride)
    x = layers.DepthwiseConv2D(3, stride, padding='same', use_bias=False)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('swish')(x)

    # Squeeze and Excitation: global pooling -> bottleneck -> per-channel gate.
    # Clamp the bottleneck to >= 1 unit: int(expand_channels * se_ratio) is 0
    # for small widths/ratios, which would produce a Dense layer with no units.
    se_channels = max(1, int(expand_channels * se_ratio))
    se = layers.GlobalAveragePooling2D()(x)
    se = layers.Dense(se_channels, activation='swish')(se)
    se = layers.Dense(expand_channels, activation='sigmoid')(se)
    se = layers.Reshape((1, 1, expand_channels))(se)
    x = layers.Multiply()([x, se])

    # Projection 1x1, without activation
    x = layers.Conv2D(out_channels, 1, 1, padding='same', use_bias=False)(x)
    x = layers.BatchNormalization()(x)

    # Residual connection is only possible when the shapes match.
    if stride == 1 and inp_channels == out_channels:
        x = layers.Add()([x, inp])

    return x


# ----------------------------------------------------
#   EfficientNetV2-B0
# ----------------------------------------------------
def EfficientNetV2B0(input_shape = X.shape[1:], num_classes=num_classes):
    """Build the EfficientNetV2-style classifier as a Keras functional model.

    Layout: a strided 3x3 stem, three Fused-MBConv stages, three MBConv stages,
    a 1x1 head convolution with 1280 filters, global average pooling and a
    softmax classifier.

    NOTE(review): the per-stage repeat counts (2, 4, 4, 6, 9, 15) may not match
    the published B0 configuration - verify against the reference if exact
    parity matters.

    Args:
        input_shape: Shape of one input sample. The default is taken from the
            sample batch ``X`` at the time this function is defined.
        num_classes: Width of the softmax output. The default is the
            module-level ``num_classes`` at the time this function is defined.

    Returns:
        An uncompiled ``Model`` mapping inputs to class probabilities.
    """
    # (block builder, repeats, expand_channels, out_channels, stride of first block)
    stage_specs = (
        (fused_mbconv, 2, 32, 16, 1),    # Stage 1
        (fused_mbconv, 4, 64, 32, 2),    # Stage 2
        (fused_mbconv, 4, 96, 48, 2),    # Stage 3
        (mbconv, 6, 192, 96, 2),         # Stage 4
        (mbconv, 9, 384, 112, 1),        # Stage 5
        (mbconv, 15, 768, 192, 2),       # Stage 6
    )

    inputs = layers.Input(shape=input_shape)

    # Stem
    features = layers.Conv2D(32, 3, 2, padding='same', use_bias=False)(inputs)
    features = layers.BatchNormalization()(features)
    features = layers.Activation('swish')(features)

    # Stages: only the first block of a stage may downsample.
    for build_block, repeats, expand, out, first_stride in stage_specs:
        for block_idx in range(repeats):
            features = build_block(
                features,
                expand_channels=expand,
                out_channels=out,
                stride=first_stride if block_idx == 0 else 1,
            )

    # Head
    features = layers.Conv2D(1280, 1, 1, padding='same', use_bias=False)(features)
    features = layers.BatchNormalization()(features)
    features = layers.Activation('swish')(features)

    features = layers.GlobalAveragePooling2D()(features)
    outputs = layers.Dense(num_classes, activation='softmax')(features)

    return Model(inputs, outputs)


# ----------------------------------------------------
# Usage example: build the model with its default input
# shape and class count, then report its size.
# ----------------------------------------------------
model = EfficientNetV2B0()
# Uncomment to print the layer-by-layer summary:
#model.summary()
total_params = model.count_params()
print("Összes paraméter:", total_params)


Összes paraméter: 11598934


In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# Targets are integer class ids, hence the sparse categorical cross-entropy.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=["accuracy"],
)

# Save the model with the best validation loss to "best_cnn.keras".
checkpoint_cb = ModelCheckpoint("best_cnn.keras", monitor="val_loss", save_best_only=True)
# Stop after 5 epochs without improvement and roll back to the best weights.
early_stopping_cb = EarlyStopping(patience=5, restore_best_weights=True)
callbacks = [checkpoint_cb, early_stopping_cb]

history = model.fit(
    train_dataset,
    epochs=10,
    validation_data=val_dataset,
    callbacks=callbacks,
)


In [None]:
# ----------------------------------------------------
# Evaluation
# ----------------------------------------------------

# Reload the checkpoint written during training and score it on the test split.
model = tf.keras.models.load_model("best_cnn.keras")

loss, accuracy = model.evaluate(test_dataset)
for metric_name, metric_value in (("Loss", loss), ("Accuracy", accuracy)):
    print(f"Test {metric_name}: {metric_value:.4f}")