In [1]:
# import os
# import pandas as pd
# import torchaudio
# import torchaudio.transforms as T
# import matplotlib.pyplot as plt
# import numpy as np
# from PIL import Image
# from tqdm import tqdm

# def save_mel_spectrograms(audio_dir, metadata_csv, output_dir,
#                           target_sr=32000, n_mels=128):
#     # Load mapping
#     metadata = pd.read_csv(metadata_csv)
#     label_to_name = (
#         metadata[["primary_label", "common_name"]]
#         .drop_duplicates()
#         .set_index("primary_label")["common_name"]
#         .to_dict()
#     )

#     mel_transform = T.MelSpectrogram(
#         sample_rate=target_sr,
#         n_mels=n_mels
#     )

#     if not os.path.exists(output_dir):
#         os.makedirs(output_dir)

#     # Iterate over folders (primary labels)
#     for label_folder in tqdm(os.listdir(audio_dir), desc="Processing labels"):
#         folder_path = os.path.join(audio_dir, label_folder)
#         if not os.path.isdir(folder_path):
#             continue

#         common_name = label_to_name.get(label_folder, label_folder)
#         common_folder = os.path.join(output_dir, common_name)
#         os.makedirs(common_folder, exist_ok=True)

#         for fname in os.listdir(folder_path):
#             if not fname.endswith(".ogg"):
#                 continue

#             file_path = os.path.join(folder_path, fname)
#             waveform, sr = torchaudio.load(file_path)

#             # Resample if needed
#             if sr != target_sr:
#                 resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
#                 waveform = resampler(waveform)

#             # Convert to mono (optional)
#             if waveform.shape[0] > 1:
#                 waveform = waveform.mean(dim=0, keepdim=True)

#             # Mel spectrogram
#             mel_spec = mel_transform(waveform)
#             mel_spec = torchaudio.functional.amplitude_to_DB(
#                         mel_spec, 
#                         multiplier=20.0, 
#                         amin=1e-10, 
#                         db_multiplier=0.0
#                     )

#             # Normalize to 0–255 and convert to image
#             mel_spec = mel_spec.squeeze().numpy()
#             mel_spec -= mel_spec.min()
#             mel_spec /= mel_spec.max()
#             mel_spec = (mel_spec * 255).astype(np.uint8)

#             # Save as PNG
#             img = Image.fromarray(mel_spec)
#             img = img.convert("L")  # grayscale
#             out_name = os.path.splitext(fname)[0] + ".png"
#             img.save(os.path.join(common_folder, out_name))


In [2]:
# import os
# import shutil
# import random
# from pathlib import Path

# # Paths
# DATASET_DIR = Path("spectrogram_dataset")
# OUTPUT_DIR = Path("splitted_dataset")
# TRAIN_DIR = OUTPUT_DIR / "train"
# TEST_DIR = OUTPUT_DIR / "test"

# # Split ratio
# test_ratio = 0.2
# random.seed(42)

# # Create output directories
# for split_dir in [TRAIN_DIR, TEST_DIR]:
#     split_dir.mkdir(parents=True, exist_ok=True)

# # Iterate over each label folder
# for label_dir in DATASET_DIR.iterdir():
#     if label_dir.is_dir():
#         label = label_dir.name
#         files = list(label_dir.glob("*.png"))
#         random.shuffle(files)

#         # Calculate split index
#         split_idx = int(len(files) * (1 - test_ratio))
#         train_files = files[:split_idx]
#         test_files = files[split_idx:]

#         # Create label subdirectories
#         (TRAIN_DIR / label).mkdir(exist_ok=True)
#         (TEST_DIR / label).mkdir(exist_ok=True)

#         # Copy files
#         for f in train_files:
#             shutil.copy(f, TRAIN_DIR / label / f.name)

#         for f in test_files:
#             shutil.copy(f, TEST_DIR / label / f.name)

# print("✅ Done splitting dataset.")

In [3]:
config = {
    "use_aug" : False,
    "num_classes" : 264,
    "batch_size" : 64,
    "epochs" : 1,
    "PRECISION" : 16,
    "PATIENCE" : 8,    
    "seed" : 64,
    "model" : "tf_efficientnet_b1_ns",
    "pretrained" : True,            
    "weight_decay" : 1e-3,
    "use_mixup" : True,
    "mixup_alpha" : 0.6,  

    "train_images" : "/mnt/Stuff/phd_projects/birdclef-2023/splitted_dataset/train",
    "valid_images" : "/mnt/Stuff/phd_projects/birdclef-2023/splitted_dataset/test",
    "train_path" : "/kaggle/input/bc2023-train-val-df/train.csv",
    "valid_path" : "/kaggle/input/bc2023-train-val-df/valid.csv",
    
    "SR" : 32000,
    "DURATION" : 5,
    "MAX_READ_SAMPLES" : 5,
    "LR" : 1e-3
}

In [4]:
import os
from PIL import Image
import torch
from torch.utils.data import Dataset

class BirdSpectrogramDataset(Dataset):
    def __init__(self, spectrogram_dir, transform=None):
        """
        spectrogram_dir: root directory with folders per common name
        transform: torchvision transforms to apply to images
        """
        self.spectrogram_dir = spectrogram_dir
        self.transform = transform

        # Build class index
        self.classes = sorted(os.listdir(spectrogram_dir))
        self.class_to_idx = {cls_name: idx for idx, cls_name in enumerate(self.classes)}

        # Build file list
        self.samples = []
        for cls_name in self.classes:
            folder = os.path.join(spectrogram_dir, cls_name)
            for fname in os.listdir(folder):
                if fname.endswith(".png"):
                    self.samples.append({
                        "path": os.path.join(folder, fname),
                        "label": cls_name
                    })

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        img = Image.open(sample["path"]).convert("L")
        label_name = sample["label"]
        label_idx = self.class_to_idx[label_name]

        if self.transform:
            img = self.transform(img)

        return img, label_idx


In [5]:
def generate_label_map(root_dir):
    """
    Generates a mapping from string labels to integer indices
    based on the subfolder names in root_dir.
    """
    # Get all subfolders (each represents a class)
    labels = sorted([d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))])
    
    # Create the mapping
    label_to_idx = {label: idx for idx, label in enumerate(labels)}
    idx_to_label = {idx: label for label, idx in label_to_idx.items()}
    
    return label_to_idx, idx_to_label

# Example usage
root_dir = "spectrogram_dataset"  # folder where each class has its own subfolder
label_to_idx, idx_to_label = generate_label_map(root_dir)

print("Label to index mapping:")
print(label_to_idx)

Label to index mapping:
{'Abyssinian Thrush': 0, 'African Bare-eyed Thrush': 1, 'African Black-headed Oriole': 2, 'African Blue Flycatcher': 3, 'African Darter': 4, 'African Dusky Flycatcher': 5, 'African Emerald Cuckoo': 6, 'African Fish-Eagle': 7, 'African Goshawk': 8, 'African Gray Flycatcher': 9, 'African Gray Hornbill': 10, 'African Green-Pigeon': 11, 'African Jacana': 12, 'African Paradise-Flycatcher': 13, 'African Pied Wagtail': 14, 'African Pygmy Kingfisher': 15, 'African Sacred Ibis': 16, 'African Thrush': 17, 'Amethyst Sunbird': 18, 'Augur Buzzard': 19, 'Baglafecht Weaver': 20, 'Barn Swallow': 21, 'Beautiful Sunbird': 22, 'Black Crake': 23, 'Black Cuckoo': 24, 'Black Kite': 25, 'Black Sawwing': 26, 'Black-and-white Mannikin': 27, 'Black-and-white-casqued Hornbill': 28, 'Black-backed Puffback': 29, 'Black-collared Apalis': 30, 'Black-crowned Tchagra': 31, 'Black-faced Rufous-Warbler': 32, 'Black-fronted Bushshrike': 33, 'Black-headed Gonolek': 34, 'Black-headed Heron': 35, 'Bl

In [6]:
from torch.utils.data import DataLoader
from torchvision import transforms

audio_dir = "/mnt/Stuff/phd_projects/birdclef-2023/splitted_dataset/train"
metadata_csv = "/mnt/Stuff/phd_projects/birdclef-2023/train_metadata.csv"
train_dir = "/mnt/Stuff/phd_projects/birdclef-2023/splitted_dataset/train"
test_dir = "/mnt/Stuff/phd_projects/birdclef-2023/splitted_dataset/test"

# Step 1: Preprocess & save spectrograms
# save_mel_spectrograms(audio_dir, metadata_csv, spectrogram_dir)

# Step 2: Create dataset and dataloader
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # for CNN input
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])
])

train_dataset = BirdSpectrogramDataset(train_dir, transform=transform)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

test_dataset = BirdSpectrogramDataset(test_dir, transform=transform)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [7]:
import sklearn.metrics
import pandas as pd

def padded_cmap(solution, submission, padding_factor=5):
    solution = solution#.drop(['row_id'], axis=1, errors='ignore')
    submission = submission#.drop(['row_id'], axis=1, errors='ignore')
    new_rows = []
    for i in range(padding_factor):
        new_rows.append([1 for i in range(len(solution.columns))])
    new_rows = pd.DataFrame(new_rows)
    new_rows.columns = solution.columns
    padded_solution = pd.concat([solution, new_rows]).reset_index(drop=True).copy()
    padded_submission = pd.concat([submission, new_rows]).reset_index(drop=True).copy()
    score = sklearn.metrics.average_precision_score(
        padded_solution.values,
        padded_submission.values,
        average='macro',
    )
    return score

def map_score(solution, submission):
    solution = solution#.drop(['row_id'], axis=1, errors='ignore')
    submission = submission#.drop(['row_id'], axis=1, errors='ignore')
    score = sklearn.metrics.average_precision_score(
        solution.values,
        submission.values,
        average='micro',
    )
    return score

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from torchvision.models import efficientnet_b0
from torchvision.models.resnet import resnet18

class BirdClefModel(nn.Module):
    def __init__(self, model_name, num_classes, pretrained=True):
        super().__init__()
        self.num_classes = num_classes

        self.backbone = None
        self.in_features = None

        # Replace classifier head depending on model type
        if 'resnet' in model_name:
            self.backbone = resnet18(num_classes=num_classes)
            self.in_features = self.backbone.fc.in_features
            self.backbone.fc = nn.Linear(self.in_features, num_classes)
            
            self.backbone.bn1 = nn.BatchNorm2d(64)
            self.backbone.conv1 = nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        elif 'efficientnet' in model_name:
            self.backbone = efficientnet_b0(num_classes=num_classes)
            self.in_features = 1000

            self.backbone.features[0][0] = nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        else:
            raise ValueError(f"Unsupported model type for: {model_name}")
        
        if self.in_features is None:    
            raise ValueError("In Features cannot be None")

    def forward(self, x):
        x = self.backbone(x)
        return x

In [9]:
class FakeQuantize(nn.Module):
    def __init__(self, num_bits=8):
        super().__init__()
        self.num_bits = num_bits

    def forward(self, x):
        # Calculate min and max per tensor
        x_min = x.min()
        x_max = x.max()
        
        qmin = 0
        qmax = 2 ** self.num_bits - 1

        # Scale and zero point
        scale = (x_max - x_min) / (qmax - qmin + 1e-8)
        zero_point = qmin - x_min / (scale + 1e-8)
        zero_point = zero_point.round().clamp(qmin, qmax)

        # Quantize-dequantize
        q_x = ((x / scale + zero_point).round().clamp(qmin, qmax) - zero_point) * scale
        return q_x

In [10]:
class QuantizedLayer(nn.Module):
    def __init__(self, layer, num_bits=8):
        super().__init__()
        self.layer = layer
        self.fake_quant = FakeQuantize(num_bits)

    def forward(self, x):
        # Fake-quantize input
        x = self.fake_quant(x)
        # Apply layer
        x = self.layer(x)
        # Fake-quantize output
        x = self.fake_quant(x)
        return x

In [11]:
def apply_fake_quant(module, num_bits=8):
    """
    Recursively replaces Conv2d and Linear layers with quantized versions.
    """
    for name, child in module.named_children():
        if isinstance(child, nn.Conv2d) or isinstance(child, nn.Linear):
            # Wrap original layer with fake quant for weights and activations
            setattr(module, name, QuantizedLayer(child, num_bits=num_bits))
        else:
            apply_fake_quant(child, num_bits)

In [12]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = BirdClefModel("efficientnet", 264).to(device)

apply_fake_quant(model, num_bits=8)

In [13]:
# from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
#   get_symmetric_quantization_config,
#   XNNPACKQuantizer,
# )
# from torchao.quantization.pt2e.quantize_pt2e import (
#   prepare_qat_pt2e,
#   convert_pt2e,
#   prepare_pt2e
# )

# torch.save(model.state_dict(), 'orig_model.pth')

# model.eval()

# quantizer = XNNPACKQuantizer()

# sample_inputs = (torch.randn(2, 1, 224, 224).to(device),)

# exported_model = torch.export.export( 
#     model, 
#     sample_inputs).module()

# quantizer.set_global(get_symmetric_quantization_config(is_qat=True))

# model = prepare_qat_pt2e(exported_model, quantizer)

# torchao.quantization.pt2e.move_exported_model_to_eval(model)

# quantizer.set_global(get_symmetric_quantization_config())
# model = prepare_pt2e(exported_model, quantizer)
# quantized = convert_pt2e(model)

# torch.save(quantized.state_dict(), 'best_model_qauantized.pth')

In [14]:
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.AdamW(
            filter(lambda p: p.requires_grad, model.parameters()), 
            lr=config['LR'],
            weight_decay=config['weight_decay']
        )
    
# lr_scheduler = CosineAnnealingWarmRestarts(
#                     optimizer, 
#                     T_0=config['epochs'], 
#                     T_mult=1, 
#                     eta_min=1e-6, 
#                     last_epoch=-1
#                 )

lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3, factor=0.2)

birds = list(label_to_idx.keys())

In [15]:
train_acc_history = []
val_acc_history = []
train_losses = []
val_losses = []

val_df = pd.DataFrame(columns=birds)
pred_df = pd.DataFrame(columns=birds)

for epoch in range(config['epochs']):
    # =========================== TRAINING =========================== #
    model.train()
    total_train_loss, total_train_acc, n_train_batches = 0.0, 0.0, 0
    print(f'Epoch number: {epoch}')

    for images, targets in tqdm(train_dataloader, desc="Training"):
        images, targets = images.to(device), targets.to(device)

        optimizer.zero_grad()

        logits = model(images)

        # print("TRAIN")
        # print(logits)

        loss = criterion(logits, targets)

        loss.backward()
        optimizer.step()

        # accuracy
        preds = torch.argmax(logits, dim=1)
        acc = (preds == targets).float().mean()

        total_train_loss += loss.item()
        total_train_acc += acc.item()
        n_train_batches += 1
        

    avg_loss = total_train_loss / n_train_batches
    avg_acc = total_train_acc / n_train_batches
    train_acc_history.append(avg_acc)
    train_losses.append(avg_loss)

    print(f"Train Loss: {avg_loss:.4f} | Train Acc: {avg_acc:.4f}")

    # =========================== VALIDATION =========================== #
    model.eval()
    
    total_val_loss, total_val_acc, n_val_batches = 0.0, 0.0, 0
    all_logits, all_targets = [], []

    with torch.no_grad():
        for images, targets in tqdm(test_dataloader, desc="Validating"):
            images, targets = images.to(device), targets.to(device)
            logits = model(images)

            # print("VALIDATION")
            # print(logits)
                
            loss = criterion(logits, targets)

            preds = torch.argmax(logits, dim=1)
            acc = (preds == targets).float().mean()

            total_val_loss += loss.item()
            total_val_acc += acc.item()
            n_val_batches += 1

        avg_loss = total_val_loss / n_val_batches
        avg_acc = total_val_acc / n_val_batches
        val_acc_history.append(avg_acc)
        val_losses.append(avg_loss)

        print(f"Val Loss: {avg_loss:.4f} | Val Acc: {avg_acc:.4f}")

Epoch number: 0


Training: 100%|██████████| 841/841 [05:43<00:00,  2.45it/s]


Train Loss: 5.6591 | Train Acc: 0.0039


Validating: 100%|██████████| 219/219 [00:40<00:00,  5.38it/s]

Val Loss: 6.3838 | Val Acc: 0.0037





In [16]:
orig_model = BirdClefModel("efficientnet", 264)
torch.save(orig_model.state_dict(), 'orig_model.pth')

torch.save(model.state_dict(), 'qat_model.pth')

In [19]:
# from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
#     get_symmetric_quantization_config,
#     XNNPACKQuantizer,
# )

# from torchao.quantization.pt2e.quantize_pt2e import (
#   prepare_qat_pt2e,
# )

# example_inputs = (torch.rand(1, 1, 224, 224),)
# float_model = BirdClefModel("efficientnet", 264)
# float_model.eval()

# exported_model = torch.export.export(float_model, example_inputs).module()

# quantizer = XNNPACKQuantizer()
# quantizer.set_global(get_symmetric_quantization_config(is_qat=True))

# prepared_model = prepare_qat_pt2e(exported_model, quantizer)
# prepared_model.load_state_dict(torch.load("best_model_qat1.pth"))
import ai_edge_torch

sample_inputs = (torch.randn(2, 1, 224, 224),)
edge_model = ai_edge_torch.convert(orig_model.eval(), sample_inputs)

INFO:tensorflow:Assets written to: /tmp/tmpmntitsrb/assets


INFO:tensorflow:Assets written to: /tmp/tmpmntitsrb/assets
W0000 00:00:1760797964.924407    8075 tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
W0000 00:00:1760797964.924427    8075 tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.
2025-10-18 20:32:44.924690: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /tmp/tmpmntitsrb
2025-10-18 20:32:44.928897: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2025-10-18 20:32:44.928906: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: /tmp/tmpmntitsrb
I0000 00:00:1760797964.963965    8075 mlir_graph_optimization_pass.cc:437] MLIR V1 optimization pass is not enabled
2025-10-18 20:32:44.969347: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2025-10-18 20:32:45.250749: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: /tmp/tmpmntitsrb
2025-10-18 20:32:45.317

In [20]:
edge_model.export('birdcall_model.tflite')

In [18]:
plt.figure(figsize=(10, 6))
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training & Validation Losses')
plt.legend()
plt.savefig('train_val_loss_plot.png')
plt.close()

In [19]:
plt.figure(figsize=(10, 6))
plt.plot(train_acc_history, label='Training Accuracy')
plt.plot(val_acc_history, label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training & Validation Accuracy')
plt.ylim(0, 1)
plt.legend()
plt.savefig('accuracy_plot.png')
plt.close()

In [14]:
from executorch.exir import to_edge_transform_and_lower
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner

from torchao.quantization.pt2e.quantize_pt2e import (
  convert_pt2e,
)

quantized_model = convert_pt2e(prepared_model)

sample_inputs = (torch.randn(2, 1, 224, 224),)

et_program = to_edge_transform_and_lower( # (6)
    torch.export.export(quantized_model, sample_inputs),
    partitioner=[XnnpackPartitioner()],
).to_executorch()

# 3. Save for deployment
with open("model.pte", "wb") as f:
    f.write(et_program.buffer)

