In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split
import torch.nn.functional as F
import torchaudio
from torchaudio import transforms
from IPython.display import Audio
import torchvision
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
import os

import math, random


In [3]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and every CUDA device), and configures cuDNN for repeatable results.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so this assignment
    # only influences child processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs, not only the current one; harmless without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run,
    # which defeats the deterministic flag above, so switch it off.
    torch.backends.cudnn.benchmark = False

# Fix all random number generators once, before any data or model code runs.
seed = 0
seed_everything(seed=seed)

In [4]:
class CFG:
    """Notebook-wide configuration constants."""
    # Width of the classifier's output layer (one logit per class).
    num_classes = 264
    # Presumably the audio sample rate in Hz; not referenced by the visible
    # cells (SoundDS carries its own copy of the same value) -- TODO confirm.
    rate = 32000
    # When True the label column holds one-hot vectors, otherwise integer ids.
    isOneHot = False

In [6]:
class AudioUtil():
  """Stateless helpers for loading, shaping and augmenting audio clips.

  Throughout this class an "audio" value is a ``(signal, sample_rate)`` tuple
  whose signal is a 2-D tensor indexed as (channel, time).
  """

  @staticmethod
  def open(audio_file):
    """Load ``audio_file`` with torchaudio and return ``(signal, sample_rate)``."""
    sig, sr = torchaudio.load(audio_file)
    return (sig, sr)

  @staticmethod
  def rechannel(aud, new_channel):
    """Return the audio with exactly ``new_channel`` channels.

    Fewer channels are obtained by keeping the leading ones; more channels are
    obtained by cycling through the existing ones (mono -> 3 gives three
    copies, stereo -> 3 gives ``[ch0, ch1, ch0]``). The input tuple is
    returned untouched when the channel count already matches.
    """
    sig, sr = aud
    num_channels = sig.shape[0]
    if num_channels == new_channel:
      return aud

    if new_channel < num_channels:
      resig = sig[:new_channel, :]
    else:
      # Ceil-divide so the tiled signal has at least new_channel rows, then
      # trim; this honours new_channel for any input channel count.
      repeats = -(-new_channel // num_channels)
      resig = sig.repeat(repeats, 1)[:new_channel, :]
    return (resig, sr)

  @staticmethod
  def resample(aud, newsr):
    """Resample the audio to ``newsr`` Hz; a no-op when the rate already matches."""
    sig, sr = aud
    if sr == newsr:
      return aud
    # Resample operates along the last (time) axis, so all channels can be
    # converted in a single call with a single resampler instance.
    resig = torchaudio.transforms.Resample(sr, newsr)(sig)
    return (resig, newsr)

  @staticmethod
  def pad_trunc(aud, max_ms):
    """Force the signal to last exactly ``max_ms`` milliseconds.

    Longer signals keep their first ``max_len`` samples. Shorter signals are
    zero-padded on both sides, with the split between leading and trailing
    padding drawn from Python's ``random`` module.
    """
    sig, sr = aud
    num_rows, sig_len = sig.shape
    # Multiply before dividing so rates that are not a multiple of 1000 Hz
    # (e.g. 44100) are not rounded down to whole kHz first.
    max_len = int(sr * max_ms // 1000)

    if sig_len > max_len:
      sig = sig[:, :max_len]
    elif sig_len < max_len:
      pad_begin_len = random.randint(0, max_len - sig_len)
      pad_end_len = max_len - sig_len - pad_begin_len
      # new_zeros keeps the padding on the signal's dtype and device.
      pad_begin = sig.new_zeros((num_rows, pad_begin_len))
      pad_end = sig.new_zeros((num_rows, pad_end_len))
      sig = torch.cat((pad_begin, sig, pad_end), 1)

    return (sig, sr)

  @staticmethod
  def time_shift(aud, shift_limit):
    """Circularly shift the signal in time by a random amount.

    The shift is a random fraction in ``[0, shift_limit)`` of the signal
    length, drawn from Python's ``random`` module.
    """
    sig, sr = aud
    _, sig_len = sig.shape
    shift_amt = int(random.random() * shift_limit * sig_len)
    # Roll along the time axis only. Without ``dims`` torch.roll flattens the
    # tensor first, which would wrap samples from one channel into the next.
    return (sig.roll(shift_amt, dims=1), sr)

  @staticmethod
  def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
    """Convert the audio to a mel spectrogram expressed in decibels.

    ``hop_len=None`` defers to torchaudio's default hop length. The dynamic
    range is limited with ``top_db=80``.
    """
    sig, sr = aud
    top_db = 80
    spec = torchaudio.transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)
    spec = torchaudio.transforms.AmplitudeToDB(top_db=top_db)(spec)
    return (spec)

  @staticmethod
  def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
    """Apply frequency and time masking to a 3-D spectrogram.

    Each mask spans at most ``max_mask_pct`` of the corresponding axis and is
    filled with the mean of the input spectrogram.
    """
    _, n_mels, n_steps = spec.shape
    mask_value = spec.mean()
    aug_spec = spec

    freq_mask_param = max_mask_pct * n_mels
    for _ in range(n_freq_masks):
      aug_spec = torchaudio.transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)

    time_mask_param = max_mask_pct * n_steps
    for _ in range(n_time_masks):
      aug_spec = torchaudio.transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)

    return aug_spec

In [7]:
# Load the training metadata table (appears to hold one row per recording).
meta_df = pd.read_csv('/birdclef-2023/train_metadata.csv')
print(f'data shape: {meta_df.shape}')
meta_df.head()

data shape: (16941, 12)


Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename
0,abethr1,[],['song'],4.3906,38.2788,Turdus tephronotus,African Bare-eyed Thrush,Rolf A. de By,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/128013,abethr1/XC128013.ogg
1,abethr1,[],['call'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363501,abethr1/XC363501.ogg
2,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363502,abethr1/XC363502.ogg
3,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/363503,abethr1/XC363503.ogg
4,abethr1,[],"['call', 'song']",-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/363504,abethr1/XC363504.ogg


In [9]:
# Per-class sample counts with explicit column names. The column names produced
# by ``value_counts().reset_index()`` differ between pandas 1.x and 2.x, so name
# both columns here instead of relying on either convention.
class_counts = (
    meta_df['primary_label']
    .value_counts()
    .rename_axis('primary_label')
    .reset_index(name='n_samples')
)
print('Number of classes and number samples in each class:', class_counts.shape)
print('Number of classes with more than 1 sample:', class_counts.query('n_samples > 1').shape)

Number of classes and number samples in each class: (264, 2)
Number of classes with more than 1 sample: (257, 2)


In [10]:
# Attach the training target to the metadata as a 'label' column: a one-hot
# vector per row when CFG.isOneHot is set, otherwise an integer class id.
if CFG.isOneHot:
    # Densify with .toarray() rather than passing ``sparse=False``: that keyword
    # was renamed in newer scikit-learn releases, while the default sparse
    # output is available in all of them.
    onehot = OneHotEncoder()\
                            .fit_transform(meta_df['primary_label'].to_numpy().reshape(-1,1))\
                            .toarray()
    # Build the column on meta_df's own index so rows are matched by position
    # even if the frame is not indexed 0..n-1.
    meta_df['label'] = pd.Series(onehot.tolist(), index=meta_df.index)
else:
    meta_df['label'] = LabelEncoder().fit_transform(meta_df['primary_label'])
meta_df.head(2)

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,label
0,abethr1,[],['song'],4.3906,38.2788,Turdus tephronotus,African Bare-eyed Thrush,Rolf A. de By,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/128013,abethr1/XC128013.ogg,0
1,abethr1,[],['call'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363501,abethr1/XC363501.ogg,0


In [11]:
def cv_split(Xtrain, ytrain, n_folds, seed):
    """Assign every row of ``Xtrain`` to a stratified cross-validation fold.

    Args:
        Xtrain: DataFrame to annotate. It is modified in place: an integer
            ``'fold'`` column is added (or overwritten).
        ytrain: Per-row class labels used for stratification.
        n_folds: Number of folds.
        seed: Random state for the shuffled split.

    Returns:
        The same ``Xtrain`` object, now carrying the ``'fold'`` column.
    """
    kfold = StratifiedKFold(n_splits = n_folds, shuffle = True, random_state = seed)
    # kfold.split yields *positional* indices, so collect the fold ids in a
    # positional array instead of writing through label-based .loc, which is
    # only correct when the frame happens to be indexed 0..n-1.
    fold_ids = np.full(len(Xtrain), -1, dtype=int)
    for fold_num, (_, val_index) in enumerate(kfold.split(Xtrain, ytrain)):
        fold_ids[val_index] = fold_num
    Xtrain['fold'] = fold_ids
    return Xtrain
# Five stratified folds keyed on the primary label; show the first rows to
# check the new 'fold' column.
meta_df = cv_split(meta_df, meta_df['primary_label'], n_folds=5, seed=42)
meta_df.head(6)






Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,label,fold
0,abethr1,[],['song'],4.3906,38.2788,Turdus tephronotus,African Bare-eyed Thrush,Rolf A. de By,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/128013,abethr1/XC128013.ogg,0,3
1,abethr1,[],['call'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363501,abethr1/XC363501.ogg,0,3
2,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363502,abethr1/XC363502.ogg,0,0
3,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/363503,abethr1/XC363503.ogg,0,4
4,abethr1,[],"['call', 'song']",-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/363504,abethr1/XC363504.ogg,0,1
5,abethr1,['rbsrob1'],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/379322,abethr1/XC379322.ogg,0,2


In [12]:
class SoundDS(Dataset):
  """Dataset yielding ``(mel spectrogram, label)`` pairs for audio recordings.

  Args:
    df: DataFrame with a ``'filename'`` column (path relative to ``data_path``)
      and a ``'label'`` column. Rows are addressed by position, so the frame's
      index may be arbitrary.
    data_path: Directory that the filenames are relative to.
    mode: ``'train'`` enables the random time shift and spectrogram masking;
      any other value returns the un-augmented spectrogram.
    duration: Clip length in milliseconds after padding/truncation.
    sr: Sample rate in Hz that every clip is resampled to.
    channel: Number of channels every clip is converted to.
    shift_pct: Upper bound of the random time shift, as a fraction of the clip.
  """
  def __init__(self, df, data_path ='/birdclef-2023/train_audio/', mode='train',
               duration=8000, sr=32000, channel=3, shift_pct=0.4):
    self.df = df
    self.data_path = str(data_path)
    self.duration = duration
    self.sr = sr
    self.channel = channel
    self.shift_pct = shift_pct
    self.mode = mode

  def __len__(self):
    """Number of recordings in the dataset."""
    return len(self.df)

  def __getitem__(self, idx):
    """Load recording number ``idx`` and return ``(spectrogram, class_id)``."""
    # Positional lookup: a DataLoader hands out positions in [0, len), which
    # only coincide with index labels when the caller reset the index.
    filename = self.df['filename'].iloc[idx]
    class_id = np.array(self.df['label'].iloc[idx])
    # os.path.join works whether or not data_path ends with a separator.
    audio_file = os.path.join(self.data_path, filename)

    aud = AudioUtil.open(audio_file)
    aud = AudioUtil.resample(aud, self.sr)
    aud = AudioUtil.rechannel(aud, self.channel)
    aud = AudioUtil.pad_trunc(aud, self.duration)

    is_train = self.mode == 'train'
    if is_train:
      aud = AudioUtil.time_shift(aud, self.shift_pct)
    sgram = AudioUtil.spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None)
    if is_train:
      sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)
    return sgram, class_id
        

In [13]:
# Fold 0 is held out for evaluation; the remaining folds form the training set.
train_df = meta_df[meta_df.fold != 0]
test_df = meta_df[meta_df.fold == 0]

train_ds = SoundDS(train_df.reset_index(), mode='train')
test_ds = SoundDS(test_df.reset_index(), mode='test')
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=16, shuffle=True)
# Keep the evaluation order fixed: inputs are normalised per batch at test
# time, so shuffling would make the reported metrics vary between runs.
test_dl = torch.utils.data.DataLoader(test_ds, batch_size=16, shuffle=False)

print('train samples ', train_df.shape)
print('test samples ', test_df.shape)

train samples  (13552, 14)
test samples  (3389, 14)


In [14]:
# Sanity check: pull a single training batch and show the tensor shapes
# (spectrograms first, labels second).
first_batch = next(iter(train_dl))
a, b = first_batch
for batch_part in (a, b):
    print(batch_part.shape)

torch.Size([16, 3, 64, 501])
torch.Size([16])


# 5. Load the Pretrained Backbone Model

In [15]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# EfficientNet-B4 backbone initialised from torchvision's default pretrained weights.
pretrained_weights = torchvision.models.EfficientNet_B4_Weights.DEFAULT
model = torchvision.models.efficientnet_b4(weights=pretrained_weights, progress=True)
model.avgpool = nn.AdaptiveAvgPool2d(output_size=(1,1))

# Replace the stock classifier with a small MLP head that emits one logit per
# class. The layer order is kept as-is because it determines the parameter
# names stored in saved checkpoints.
head_layers = [
    nn.Flatten(),
    nn.Linear(1792, 256),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(256, CFG.num_classes),
]
model.classifier = nn.Sequential(*head_layers)

myModel = model.to(device)
next(myModel.parameters()).device

Downloading: "https://download.pytorch.org/models/efficientnet_b4_rwightman-7eb33cd5.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b4_rwightman-7eb33cd5.pth


  0%|          | 0.00/74.5M [00:00<?, ?B/s]

device(type='cuda', index=0)

# 6. Start Training 

In [16]:

def training(model, train_dl, num_epochs, log_every=1):
    """Train ``model`` with Adam and a one-cycle learning-rate schedule.

    Each batch is standardised with its own mean and standard deviation before
    the forward pass. After every epoch the average loss and the training
    accuracy are printed, and whenever the training accuracy improves the
    model's ``state_dict`` is saved to
    ``BirdSound_EfficientNet_B4_epoch{epoch}.pth`` (zero-based epoch).

    Args:
        model: Network to optimise; batches are moved to the device holding
            its parameters.
        train_dl: Sized iterable of batches where ``batch[0]`` holds the inputs
            and ``batch[1]`` the integer class labels.
        num_epochs: Number of passes over ``train_dl``.
        log_every: Print the running loss every this many steps. The default
            of 1 logs every step; 0 or ``None`` disables per-step logging.
    """
    # Derive the device from the model rather than relying on a notebook
    # global defined in another cell.
    run_device = next(model.parameters()).device
    steps_per_epoch = len(train_dl)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.01,
                                                    steps_per_epoch=steps_per_epoch,
                                                    epochs=num_epochs,
                                                    anneal_strategy='linear')

    best_acc = -1
    for epoch in range(num_epochs):
        # Make sure dropout / batch-norm are in training behaviour even if the
        # model was previously switched to eval mode.
        model.train()
        running_loss = 0.0
        correct_prediction = 0
        total_prediction = 0

        for step, batch in enumerate(train_dl):
            inputs, labels = batch[0].to(run_device), batch[1].to(run_device)
            # Standardise the batch using its own statistics.
            inputs = (inputs - inputs.mean()) / inputs.std()

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # The one-cycle schedule advances once per batch, not per epoch.
            scheduler.step()

            running_loss += loss.item()
            _, prediction = torch.max(outputs, 1)
            correct_prediction += (prediction == labels).sum().item()
            total_prediction += prediction.shape[0]

            if log_every and (step + 1) % log_every == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss : {:.4f}'
                      .format(epoch + 1, num_epochs, step + 1, steps_per_epoch, running_loss/(step + 1)))

        avg_loss = running_loss / steps_per_epoch
        acc = correct_prediction/total_prediction
        print(f'Epoch: {epoch + 1}, Loss: {avg_loss:.4f}, Accuracy: {acc:.4f}')

        # "Best" is judged on training accuracy; no validation data is seen here.
        if (best_acc < acc):
            best_acc = acc
            print("Saving best model!")
            torch.save(model.state_dict(), f'BirdSound_EfficientNet_B4_epoch{epoch}.pth')

    print('Finished Training')
  
# Launch the training run on the full training loader.
num_epochs = 15
training(model=myModel, train_dl=train_dl, num_epochs=num_epochs)

Epoch [1/15], Step [1/847], Loss : 5.5790
Epoch [1/15], Step [2/847], Loss : 5.5837
Epoch [1/15], Step [3/847], Loss : 5.5873
Epoch [1/15], Step [4/847], Loss : 5.5949
Epoch [1/15], Step [5/847], Loss : 5.5852
Epoch [1/15], Step [6/847], Loss : 5.5875
Epoch [1/15], Step [7/847], Loss : 5.5842
Epoch [1/15], Step [8/847], Loss : 5.5825
Epoch [1/15], Step [9/847], Loss : 5.5806
Epoch [1/15], Step [10/847], Loss : 5.5792
Epoch [1/15], Step [11/847], Loss : 5.5765
Epoch [1/15], Step [12/847], Loss : 5.5713
Epoch [1/15], Step [13/847], Loss : 5.5687
Epoch [1/15], Step [14/847], Loss : 5.5657
Epoch [1/15], Step [15/847], Loss : 5.5598
Epoch [1/15], Step [16/847], Loss : 5.5600
Epoch [1/15], Step [17/847], Loss : 5.5561
Epoch [1/15], Step [18/847], Loss : 5.5521
Epoch [1/15], Step [19/847], Loss : 5.5467
Epoch [1/15], Step [20/847], Loss : 5.5462
Epoch [1/15], Step [21/847], Loss : 5.5427
Epoch [1/15], Step [22/847], Loss : 5.5418
Epoch [1/15], Step [23/847], Loss : 5.5385
Epoch [1/15], Step [

In [21]:
from sklearn.metrics import precision_score, recall_score, average_precision_score, f1_score
def testing(model, test_dl):
    criterion = nn.CrossEntropyLoss()
    gt = torch.tensor([])
    pred = torch.tensor([])
    gt = gt.to(device)
    pred = pred.to(device)
    with torch.no_grad():
        correct = 0
        total = 0
        val_loss = 0 
        for idx, data_ in enumerate(test_dl):
            inputs, labels = data_[0].to(device), data_[1].to(device)

            inputs_m, inputs_s = inputs.mean(), inputs.std()
            inputs = (inputs - inputs_m) / inputs_s

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            max_values, prediction = torch.max(outputs,1)
            
            predi = torch.softmax(outputs, dim = -1)
        
            gt = torch.cat((gt, labels), dim = 0)

            pred = torch.cat((pred, prediction), dim = 0)
            correct += (prediction == labels).sum().item()
            total += prediction.shape[0]

        gt = gt.to('cpu')
        pred = pred.to('cpu')
        
        precision = precision_score(gt, pred, average='weighted',  zero_division=0)
        recall = recall_score(gt, pred, average='weighted', zero_division=0)
        f1 = f1_score(gt, pred, average='weighted', zero_division=0)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 Score:", f1)        
        final_score = 100 * correct / total
        print("Final Test Accuracy", final_score)