In [None]:
from googleapiclient.discovery import build
import io, os
from googleapiclient.http import MediaIoBaseDownload
from google.colab import auth

# Authenticate this Colab session so the Drive API client below can act as the signed-in user.
auth.authenticate_user()

drive_service = build('drive', 'v3')

# Search Drive for the Kaggle API token by file name; only the file ids are requested.
results = drive_service.files().list(
        q="name = 'kaggle.json'", fields="files(id)").execute()
kaggle_api_key = results.get('files', [])
if not kaggle_api_key:
    # Fail with a clear message instead of an IndexError on kaggle_api_key[0] below.
    raise FileNotFoundError("No file named 'kaggle.json' was found in Google Drive.")

filename = "/root/.kaggle/kaggle.json"
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Stream the first matching file to disk; the context manager guarantees the handle is
# flushed and closed before the permissions are changed.
request = drive_service.files().get_media(fileId=kaggle_api_key[0]['id'])
with io.FileIO(filename, 'wb') as fh:
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        status, done = downloader.next_chunk()
        print("Download %d%%." % int(status.progress() * 100))

# The mode must be an octal literal: decimal 600 is 0o1130, not owner read/write.
os.chmod(filename, 0o600)

In [None]:
# List the Kaggle config directory (including hidden files and permissions) to check the downloaded token.
!ls -la ~/.kaggle

In [None]:
# Download the data for the `birdclef-2022` competition with the Kaggle CLI (the archive is unzipped in a later cell).
!kaggle competitions download -c birdclef-2022

In [None]:
!unzip birdclef-2022.zip

# Import Libraries

In [None]:
# Boilerplate from the Kaggle notebook template.
# The template targets the kaggle/python Docker image: https://github.com/kaggle/docker-python

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Print the full path of every file found under /kaggle/input (the Kaggle read-only input mount).
# NOTE(review): this notebook authenticates through google.colab and reads its data from the
# working directory, so this path presumably does not exist there and the loop prints nothing — confirm.

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# On Kaggle, up to 20GB written to the current directory (/kaggle/working/) is preserved as output when a version is created with "Save & Run All".
# Temporary files can be written to /kaggle/temp/, but they are not saved outside of the current session.

In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import soundfile as sf

import torchaudio
from torchaudio import transforms
import torchvision

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm
from collections import defaultdict

from sklearn.metrics import plot_confusion_matrix, precision_recall_fscore_support

import warnings
warnings.filterwarnings('ignore')

# Load Data

In [None]:
# Dataset root; the metadata CSV is read from directly beneath it.
root_dir = '.'
train_df = pd.read_csv(f'{root_dir}/train_metadata.csv')
train_df.head()

In [None]:
# Directory holding the training recordings, relative to the dataset root.
train_audio_dir = f'{root_dir}/train_audio'

In [None]:
# Distinct primary labels, in order of first appearance in the metadata.
labels = train_df['primary_label'].unique().tolist()

In [None]:
# Map each label string to its position in `labels`; that position is the integer class id.
label2id = {name: idx for idx, name in enumerate(labels)}

Two-channel recordings are more common in the dataset, so mono (1-channel) audio is converted to stereo (2-channel) audio by duplicating its single channel.

In [None]:
def MonoToStereo(aud, num_channel=2):
    """Return the audio with exactly ``num_channel`` channels.

    Args:
        aud: ``(sig, sr)`` tuple where ``sig`` is a 2-D tensor laid out as
            (channels, samples) and ``sr`` is the sample rate.
        num_channel: number of channels the returned signal must have.

    Returns:
        ``aud`` itself when the channel count already matches; otherwise a new
        ``(sig, sr)`` tuple. A mono signal is replicated ``num_channel`` times;
        a signal with more channels than requested keeps its first
        ``num_channel`` channels.

    Raises:
        ValueError: if the signal has no channels.
    """
    sig, sr = aud
    cur_channels = sig.shape[0]
    if cur_channels == num_channel:
        return aud
    if cur_channels == 0:
        raise ValueError("Cannot change the channel count of a signal with no channels.")

    # Tile the existing channels until there are at least `num_channel` of them, then trim.
    # For mono -> stereo this is exactly torch.cat([sig, sig]).
    repeats = -(-num_channel // cur_channels)  # ceiling division
    out_sig = sig.repeat(repeats, 1)[:num_channel]

    return (out_sig, sr)

All signals need to have the same length, so each signal is either zero-padded or truncated to a fixed duration.

In [None]:
# Fixed length applied to every audio clip (a hyperparameter to be tuned).
max_len_ms = 10000 # 10k milliseconds = 10 seconds

def pad_signal(aud, max_len_ms):
    """Truncate or zero-pad a signal to exactly ``max_len_ms`` milliseconds.

    Args:
        aud: ``(sig, sr)`` tuple where ``sig`` is a 2-D tensor laid out as
            (channels, samples) and ``sr`` is the sample rate in Hz.
        max_len_ms: target duration in milliseconds.

    Returns:
        ``(sig, sr)`` where ``sig`` has ``sr * max_len_ms // 1000`` samples per
        channel. Longer signals keep their first samples; shorter signals are
        padded with zeros, with the padding split at a random point between the
        beginning and the end.
    """
    sig, sr = aud
    num_channels, sig_len = sig.shape
    # Multiply before dividing: `sr // 1000 * max_len_ms` drops the sub-kHz part of the
    # sample rate (e.g. 22050 Hz would yield 220000 samples for 10 s instead of 220500).
    max_len = sr * max_len_ms // 1000

    if sig_len > max_len:
        sig = sig[:, :max_len]
    elif sig_len < max_len:
        # Randomly split the missing length between the two ends of the signal.
        pad_begin_len = random.randint(0, max_len - sig_len)
        pad_end_len = max_len - sig_len - pad_begin_len

        # new_zeros keeps the padding on the same dtype/device as the signal.
        pad_begin = sig.new_zeros((num_channels, pad_begin_len))
        pad_end = sig.new_zeros((num_channels, pad_end_len))

        sig = torch.cat((pad_begin, sig, pad_end), 1)

    return (sig, sr)

In [None]:
# Data Augmentation on raw audio: Time shift
def time_shift(aud, shift_limit):
    """Circularly shift a signal along the time axis by a random amount.

    Args:
        aud: ``(sig, sr)`` tuple where ``sig`` is a 2-D tensor laid out as
            (channels, samples).
        shift_limit: upper bound of the shift as a fraction of the signal length.

    Returns:
        ``(shifted_sig, sr)``; samples pushed past the end wrap around to the
        beginning of the same channel.
    """
    sig, sr = aud
    _, sig_len = sig.shape
    shift_amt = int(random.random() * shift_limit * sig_len)
    # Roll along the last (time) axis only. Without `dims`, roll() flattens the tensor
    # first, which would wrap samples from one channel into the next.
    return (sig.roll(shift_amt, dims=-1), sr)

In [None]:
# Convert augmented audio to Mel Spectrogram
def mel_spec(aud, n_mels=64, n_fft=1024, hop_len=None):
    """Turn an ``(sig, sr)`` audio tuple into a decibel-scaled mel spectrogram.

    The result has one more channel than the input signal: the extra, last
    channel is the mean of the per-channel spectrograms.
    """
    sig, sr = aud

    to_mel = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)
    to_db = transforms.AmplitudeToDB(top_db=80)

    # (channels, n_mels, time), expressed in decibels
    spec_db = to_db(to_mel(sig))

    # Append the cross-channel mean as an additional channel
    channel_mean = spec_db.mean(dim=0, keepdim=True)
    return torch.cat([spec_db, channel_mean])

In [None]:
# Data augmentation on mel spectrogram: Time and Frequency Masking
def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
    """Apply frequency masks, then time masks, filled with the spectrogram mean.

    Each mask parameter is ``max_mask_pct`` of the corresponding axis size
    (mel bins for frequency masks, time steps for time masks).
    """
    _, n_mels, n_steps = spec.shape
    fill_value = spec.mean()

    freq_masker = transforms.FrequencyMasking(max_mask_pct * n_mels)
    time_masker = transforms.TimeMasking(max_mask_pct * n_steps)

    masked = spec
    for _ in range(n_freq_masks):
        masked = freq_masker(masked, fill_value)
    for _ in range(n_time_masks):
        masked = time_masker(masked, fill_value)

    return masked

Preprocessing on one signal sample

# Building Custom Dataset

In [None]:
class CustomDataset(Dataset):
    """Dataset that turns one metadata row into an (augmented) mel-spectrogram image and class id.

    Args:
        audio_dir: directory that the ``filename`` column is relative to.
        df: metadata frame with ``filename`` and ``primary_label`` columns.
        max_len_ms: every clip is padded/truncated to this many milliseconds.
        shift_limit: maximum random time shift, as a fraction of the clip length.
        input_size: the spectrogram is resized to ``input_size x input_size``.
        augment: when True (the default, matching the original behaviour) the random
            time shift and spectrogram masking are applied; pass False for
            evaluation data. The random placement of zero padding in
            ``pad_signal`` is not affected by this flag.
    """

    def __init__(self, audio_dir, df, max_len_ms=10000, shift_limit=0.4, input_size=224, augment=True):
        self.train_audio_dir = audio_dir
        self.train_df = df
        self.max_len_ms = max_len_ms
        self.shift_limit = shift_limit
        self.input_size = input_size
        self.augment = augment

    def __len__(self):
        """Number of rows in the metadata frame."""
        return len(self.train_df)

    def __getitem__(self, index):
        """Load, preprocess and return ``(spectrogram, class_id)`` for the row at position ``index``."""
        # Dataset indices are positions in [0, len); .iloc keeps that true even when the
        # frame's index labels are not 0..n-1 (e.g. after filtering rows).
        row = self.train_df.iloc[index]
        filename = row['filename']
        label = row['primary_label']

        sig, sr = torchaudio.load(os.path.join(self.train_audio_dir, filename))
        audio = (sig, sr)

        ##################
        # process signal #
        ##################

        # mono to stereo (if mono)
        audio = MonoToStereo(audio)

        # Pad or Truncate
        audio = pad_signal(audio, self.max_len_ms)

        # Augment raw audio
        if self.augment:
            audio = time_shift(audio, shift_limit=self.shift_limit)

        # Convert to Mel Spectrogram
        spec = mel_spec(audio)

        # resize Mel Spec
        spec = torchvision.transforms.Resize((self.input_size, self.input_size))(spec)

        # Augment mel spec
        if self.augment:
            spec = spectro_augment(spec)

        # label2id is the module-level label -> class id mapping.
        return spec, torch.tensor(label2id[label])

In [None]:
# Preprocessing hyperparameters: clip length (ms), max time-shift fraction, output image side.
max_len_ms, shift_limit, input_size = 10000, 0.4, 224
dataset = CustomDataset(
    audio_dir=train_audio_dir,
    df=train_df,
    max_len_ms=max_len_ms,
    shift_limit=shift_limit,
    input_size=input_size,
)

In [None]:
# 80/20 train/validation split.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
# A seeded generator makes the split identical on every run; an unseeded split would put
# different samples in the validation set each time the notebook is executed.
# NOTE(review): metrics for a checkpoint trained elsewhere are only meaningful if training
# used this same split — confirm the seed against the training notebook.
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size], generator=torch.Generator().manual_seed(42))

In [None]:
BATCH_SIZE = 32

# Only the training split is reshuffled each epoch; validation order stays fixed.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=BATCH_SIZE)

In [None]:
# Pull one batch from the training loader to sanity-check tensor shapes.
# Presumably inputs is (BATCH_SIZE, 3, input_size, input_size) — two audio channels plus the
# mean channel added by mel_spec — and targets is (BATCH_SIZE,); TODO confirm against the output.
inputs, targets = next(iter(train_loader))
inputs.shape, targets.shape

Cool!!

Let's build model architecture.

# Model Building

In [None]:
# Number of distinct primary labels, i.e. the number of classes in label2id.
print(len(labels))

In [None]:
import torchvision
# ResNet-50 backbone. The class count comes from the data rather than a hard-coded number,
# so it cannot drift from the labels found in the metadata.
model = torchvision.models.resnet50(num_classes=len(labels))
model

In [None]:
# Classification head: one raw logit per class.
# A single sigmoid unit cannot represent a multi-class prediction — argmax over one output
# is always 0 — and CrossEntropyLoss expects unnormalised logits, so no activation is applied.
# The Sequential wrapper is kept so the parameter names stay `fc.0.weight` / `fc.0.bias`.
# NOTE(review): a checkpoint loaded into this model must have a head of the same output size — confirm.
model.fc = torch.nn.Sequential(
    torch.nn.Linear(
        in_features=2048,
        out_features=len(labels)
    )
)

In [None]:
# Forward the sample batch through the model and display the output shape as a sanity check.
output = model(inputs)
output.shape

In [None]:
LEARNING_RATE = 1e-4
LERANING_RATE = LEARNING_RATE  # backward-compatible alias for the original misspelled name

# Defining loss and optimizer
# CrossEntropyLoss applies log-softmax internally, so it expects raw logits from the model.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Model Score

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read through the filesystem.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Move the model to the CPU.
# NOTE(review): the following cell moves the model to DEVICE again, so this step looks redundant — confirm.
model = model.to('cpu')

In [None]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(DEVICE)
# map_location remaps the stored tensors onto DEVICE, so a checkpoint saved on a GPU
# can still be loaded on a CPU-only runtime.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
# NOTE(review): assumes the file holds a bare state_dict rather than a dict wrapping one — confirm.
model.load_state_dict(
    torch.load('/content/drive/MyDrive/colab/BirdCLEF/models/best_model.pth.tar',
               map_location=DEVICE))

In [None]:
device = DEVICE
model.eval()

# Collect predictions and ground truth as plain Python ints.
y_pred = []
y_true = []
with torch.no_grad():
    for x, y in tqdm(val_loader):
        x = x.to(device)

        output = model(x)

        # Predicted class = index of the largest score along the class dimension.
        _, preds = torch.max(output, dim=1)
        y_pred.extend(preds.cpu().tolist())
        y_true.extend(y.tolist())

# precision_recall_fscore_support returns four values (precision, recall, fbeta, support);
# support is not needed here.
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="macro")

print(f"precision: {precision}, recall: {recall}, f1: {f1}")

In [None]:
# Normalise to plain ints; int() works for both Python numbers and 0-dim tensors,
# so this cell can be re-run safely.
y_true = [int(y) for y in y_true]
y_pred = [int(y) for y in y_pred]

# average=None returns one precision/recall/f1 value per class instead of a single aggregate.
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average=None)

print(f"precision: {precision}, recall: {recall}, f1: {f1}")

In [None]:
# Position of the label 'skylar' in `labels`, which is also its class id in label2id.
labels.index('skylar')

In [None]:
# Reverse lookup: the label string whose class id is 128.
labels[128]

In [None]:
# Wrap the most recently computed `f1` in a DataFrame for display.
# NOTE(review): presumably this is the per-class result (average=None), whose rows cover only
# the classes present in y_true/y_pred — confirm before aligning rows with `labels`.
f1_df = pd.DataFrame(f1)
f1_df

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the collected labels vs. predictions, wrapped in a DataFrame for display.
cm = confusion_matrix(y_true, y_pred)
cm_df = pd.DataFrame(cm)
cm_df

In [None]:
# Peek at the first few ground-truth ids instead of dumping the whole list into the output.
y_true[:10]