## About This Notebook
- When not run as a Kaggle submission, this notebook evaluates on the twenty `.ogg` files in `train_soundscapes/`.

# Notes

In this kernel, I'm going to use a classical **ResNeSt50** for bird identification.

* The inference is based on these [resnest50 weights](https://www.kaggle.com/kneroma/kkiller-birdclef-models-public). Please, don't forget upvoting the dataset to make it more visible for others
* The inference pipeline is optimized as much as I can in order to reduce execution time

**If the only thing you want to change is `hyperparams`, please consider commenting instead of spamming with stupid forks!**

<h2><font color="blue">If you find this work useful, please don't forget upvoting :)</font></h2>

In [1]:
!which pip

/home/phunc20/.config/miniconda3/envs/torch1.8/bin/pip


In [2]:
have_resnest = !pip list | grep resnest
have_resnest

['resnest             0.0.6b20210508']

In [3]:
if len(have_resnest) == 0:
    print("pip installing ...")
    !pip install -q "./resnest"
else:
    print("resnest already installed")

resnest already installed


In [6]:
import numpy as np
import librosa as lb
import soundfile as sf
import pandas as pd
#import cv2
from pathlib import Path
import re

import torch
from torch import nn
from  torch.utils.data import Dataset, DataLoader

from tqdm.notebook import tqdm

import time
from resnest.torch import resnest50

# Configs

In [7]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("DEVICE:", DEVICE)

DEVICE: cuda


In [8]:
PATH_DATASET = Path("$HOME/datasets/kaggle/birdclef-2021")
PATH_DATASET.exists()

False

In [9]:
Path.home()

PosixPath('/home/phunc20')

In [10]:
PATH_DATASET = Path.home() / "datasets/kaggle/birdclef-2021"
PATH_DATASET.exists()

True

In [11]:
# Species codes are the names of the sub-directories of train_short_audio/.
# iterdir() yields entries in arbitrary order, so sort explicitly; stray
# non-directory entries are skipped so they are not counted as classes
# when NUM_CLASSES is derived from len(L_birds).
L_birds = sorted(path.name for path
                 in (PATH_DATASET / "train_short_audio").iterdir()
                 if path.is_dir())
L_birds[:10]

['acafly',
 'acowoo',
 'aldfly',
 'ameavo',
 'amecro',
 'amegfi',
 'amekes',
 'amepip',
 'amered',
 'amerob']

In [12]:
#TEST_AUDIO_ROOT = Path("../input/birdclef-2021/test_soundscapes")
#SAMPLE_SUB_PATH = "../input/birdclef-2021/sample_submission.csv"
TEST_AUDIO_ROOT = PATH_DATASET / "test_soundscapes"
SAMPLE_SUB_PATH = PATH_DATASET / "sample_submission.csv"
TARGET_PATH = None

In [13]:
#NUM_CLASSES = 397
NUM_CLASSES = len(L_birds)
print(f"NUM_CLASSES = {NUM_CLASSES}")
SR = 32_000
DURATION = 5
THRESH = 0.25

NUM_CLASSES = 397


In [14]:
TEST_AUDIO_ROOT.glob("*.ogg")

<generator object Path.glob at 0x7f270fc2f950>

If nothing is in this glob, then just change `TEST_AUDIO_ROOT` to `train_soundscapes/`

In [35]:
# When test_soundscapes/ holds no .ogg file (e.g. when this is not run as a
# Kaggle submission), evaluate on the labelled training soundscapes instead:
# no sample submission is used and the ground-truth labels become available.
if not any(TEST_AUDIO_ROOT.glob("*.ogg")):
    print("No test ogg files: TEST_AUDIO_ROOT set to train_soundscapes/")
    TEST_AUDIO_ROOT = PATH_DATASET / "train_soundscapes"
    SAMPLE_SUB_PATH = None
    TARGET_PATH = PATH_DATASET / "train_soundscape_labels.csv"

No test ogg files: TEST_AUDIO_ROOT set to train_soundscapes/


Stopped here 2021/05/07 (金)

# Data

In [20]:
class MelSpecComputer:
    """Callable that converts a waveform into a dB-scaled mel spectrogram.

    Args:
        sr: Sampling rate of the waveforms, in Hz.
        n_mels: Number of mel bands.
        fmin: Lowest frequency of the mel filter bank, in Hz.
        fmax: Highest frequency of the mel filter bank, in Hz.
        **kwargs: Extra keyword arguments forwarded to
            ``librosa.feature.melspectrogram``. When not supplied,
            ``n_fft`` defaults to ``sr // 10`` and ``hop_length`` to
            ``sr // 40``.
    """

    def __init__(self, sr, n_mels, fmin, fmax, **kwargs):
        self.sr = sr
        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax
        # Only fill in the STFT parameters the caller did not provide.
        kwargs.setdefault("n_fft", self.sr // 10)
        kwargs.setdefault("hop_length", self.sr // (10 * 4))
        self.kwargs = kwargs

    def __call__(self, y):
        """Return the mel spectrogram of waveform ``y`` in dB, as float32."""
        # The waveform is passed by keyword: recent librosa releases no
        # longer accept it positionally, and older releases accept `y=` too.
        melspec = lb.feature.melspectrogram(
            y=y, sr=self.sr, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax, **self.kwargs,
        )

        return lb.power_to_db(melspec).astype(np.float32)

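**(?)** Why are the default `n_fft` and `hop_length` set to `self.sr//10` and `self.sr//(10*4)`?<br>
**(R)** The author's rationale is not stated, but these defaults tie the STFT to time rather than to a sample count: the window always spans 1/10 s and the hop 1/40 s (75% overlap), whatever `sr` is. With `SR = 32_000` this gives `n_fft = 3200` and `hop_length = 800`, so a 5-second clip (160000 samples) yields `1 + 160000 // 800 = 201` frames, which is the last dimension of `test_data[0].shape` further below.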

In [21]:
def mono_to_color(X, eps=1e-6, mean=None, std=None):
    """Standardize an array and rescale it to 8-bit gray levels.

    Args:
        X: Array to convert (in this notebook, a dB-scaled mel spectrogram).
        eps: Small constant added to ``std`` to avoid dividing by zero; it is
            also the minimum value range below which the standardized input
            is treated as constant.
        mean: Value subtracted from ``X`` (scalar or array broadcastable to
            ``X``). Defaults to ``X.mean()``.
        std: Value ``X`` is divided by (scalar or array broadcastable to
            ``X``). Defaults to ``X.std()``.

    Returns:
        ``np.uint8`` array of the same shape as ``X`` with values in
        ``[0, 255]``; all zeros when the standardized input is (nearly)
        constant.
    """
    # Test against None instead of using `or`: an explicit 0.0 must be
    # honored, and array-valued statistics have no single truth value.
    if mean is None:
        mean = X.mean()
    if std is None:
        std = X.std()
    X = (X - mean) / (std + eps)

    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        # Min-max scale to [0, 255]; the cast truncates toward zero.
        V = 255 * (X - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        V = np.zeros_like(X, dtype=np.uint8)

    return V

def crop_or_pad(y, length):
    """Force a 1-D signal to exactly ``length`` samples.

    Args:
        y: 1-D sequence of samples.
        length: Desired number of samples.

    Returns:
        ``y`` itself when it already has ``length`` samples, its first
        ``length`` samples when it is longer, or ``y`` followed by trailing
        zeros (same dtype) when it is shorter.
    """
    n_samples = len(y)
    if n_samples < length:
        # Append exactly the missing number of zero samples at the end.
        y = np.pad(y, (0, length - n_samples))
    elif n_samples > length:
        y = y[:length]
    return y

In [22]:
class BirdCLEFDataset(Dataset):
    """Dataset mapping each audio file to a stack of mel-spectrogram images.

    Item ``idx`` is read from ``data.loc[idx, "filepath"]``, split into
    fixed-length clips, and every clip is converted to a 3-channel image.

    Args:
        data: Table with a ``"filepath"`` column, addressable through
            ``data.loc[idx, "filepath"]``.
        sr: Target sampling rate in Hz.
        n_mels: Number of mel bands.
        fmin: Lowest mel frequency in Hz.
        fmax: Highest mel frequency in Hz; falls back to ``sr // 2``.
        duration: Clip length in seconds.
        step: Offset in samples between consecutive clips; falls back to the
            clip length, i.e. non-overlapping clips.
        res_type: Resampling method handed to ``librosa.resample``.
        resample: Whether to resample files whose rate differs from ``sr``.
    """

    def __init__(self,
                 data,
                 sr=SR,
                 n_mels=128,
                 fmin=0,
                 fmax=None,
                 duration=DURATION,
                 step=None,
                 res_type="kaiser_fast",
                 resample=True):

        self.data = data

        self.sr = sr
        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax or self.sr//2

        self.duration = duration
        self.audio_length = self.duration*self.sr  # clip length in samples
        self.step = step or self.audio_length

        self.res_type = res_type
        self.resample = resample

        self.mel_spec_computer = MelSpecComputer(sr=self.sr,
                                                 n_mels=self.n_mels,
                                                 fmin=self.fmin,
                                                 fmax=self.fmax)

    def __len__(self):
        """Number of audio files (rows of ``data``)."""
        return len(self.data)

    @staticmethod
    def normalize(image):
        """Scale a uint8-range image to [0, 1] and repeat it on 3 channels."""
        image = image.astype("float32", copy=False) / 255.0
        image = np.stack([image, image, image])
        return image

    def audio_to_image(self, audio):
        """Convert one clip into a float32 image of shape (3, n_mels, n_frames)."""
        melspec = self.mel_spec_computer(audio)
        image = mono_to_color(melspec)
        image = self.normalize(image)
        return image

    def read_file(self, filepath):
        """Load one file and return the images of all its full-length clips.

        Returns:
            Array of shape ``(n_clips, 3, n_mels, n_frames)``.

        Raises:
            ValueError: If the audio is too short to contain a single
                full-length clip.
        """
        audio, orig_sr = sf.read(filepath, dtype="float32")

        if self.resample and orig_sr != self.sr:
            # Rates are passed by keyword: recent librosa releases no longer
            # accept them positionally, and older releases accept both forms.
            audio = lb.resample(audio, orig_sr=orig_sr, target_sr=self.sr,
                                res_type=self.res_type)

        # Slide a window of `audio_length` samples over the signal, moving
        # `step` samples each time. Only windows that fit entirely inside
        # the signal are kept, so a trailing partial clip is dropped.
        clip_ends = range(self.audio_length, len(audio) + 1, self.step)
        if len(clip_ends) == 0:
            raise ValueError(
                f"{filepath}: audio has {len(audio)} samples, fewer than "
                f"one clip of {self.audio_length} samples"
            )

        images = [self.audio_to_image(audio[end - self.audio_length:end])
                  for end in clip_ends]
        return np.stack(images)

    def __getitem__(self, idx):
        """Return the clip images of the file stored in row ``idx``."""
        return self.read_file(self.data.loc[idx, "filepath"])

**(?)** By default, how long is `self.audio_length`?<br>
**(R)** By default, `self.audio_length = DURATION * SR`, i.e. `5` seconds with sampling rate `32_000` (samples/sec).

In [23]:
DURATION * SR, DURATION

(160000, 5)

In [36]:
# One row per soundscape file. File stems have the form "<id>_<site>_<date>"
# (e.g. "10534_SSW_20170429"), so splitting on "_" gives the three fields.
# glob() yields files in a filesystem-dependent order; sorting keeps the row
# order, and therefore the order of the predictions, reproducible.
data = pd.DataFrame(
    [(path.stem, *path.stem.split("_"), path) for path
     in sorted(Path(TEST_AUDIO_ROOT).glob("*.ogg"))],
    columns=["filename", "id", "site", "date", "filepath"],
)
print(data.shape)
data.head()

(20, 5)


Unnamed: 0,filename,id,site,date,filepath
0,10534_SSW_20170429,10534,SSW,20170429,/home/phunc20/datasets/kaggle/birdclef-2021/tr...
1,11254_COR_20190904,11254,COR,20190904,/home/phunc20/datasets/kaggle/birdclef-2021/tr...
2,14473_SSW_20170701,14473,SSW,20170701,/home/phunc20/datasets/kaggle/birdclef-2021/tr...
3,18003_COR_20190904,18003,COR,20190904,/home/phunc20/datasets/kaggle/birdclef-2021/tr...
4,20152_SSW_20170805,20152,SSW,20170805,/home/phunc20/datasets/kaggle/birdclef-2021/tr...


In [26]:
df_train = pd.read_csv(PATH_DATASET / "train_metadata.csv")

# Class index <-> species code lookup tables. Indices follow the sorted
# order of the distinct primary labels.
# NOTE(review): presumably this matches the class order used when the
# checkpoint was trained - confirm against the training code.
LABEL_IDS = dict(
    (label, label_id) for label_id, label
    in enumerate(sorted(df_train["primary_label"].unique()))
)
INV_LABEL_IDS = {label_id: label for label, label_id in LABEL_IDS.items()}
LABEL_IDS

{'acafly': 0,
 'acowoo': 1,
 'aldfly': 2,
 'ameavo': 3,
 'amecro': 4,
 'amegfi': 5,
 'amekes': 6,
 'amepip': 7,
 'amered': 8,
 'amerob': 9,
 'amewig': 10,
 'amtspa': 11,
 'andsol1': 12,
 'annhum': 13,
 'astfly': 14,
 'azaspi1': 15,
 'babwar': 16,
 'baleag': 17,
 'balori': 18,
 'banana': 19,
 'banswa': 20,
 'banwre1': 21,
 'barant1': 22,
 'barswa': 23,
 'batpig1': 24,
 'bawswa1': 25,
 'bawwar': 26,
 'baywre1': 27,
 'bbwduc': 28,
 'bcnher': 29,
 'belkin1': 30,
 'belvir': 31,
 'bewwre': 32,
 'bkbmag1': 33,
 'bkbplo': 34,
 'bkbwar': 35,
 'bkcchi': 36,
 'bkhgro': 37,
 'bkmtou1': 38,
 'bknsti': 39,
 'blbgra1': 40,
 'blbthr1': 41,
 'blcjay1': 42,
 'blctan1': 43,
 'blhpar1': 44,
 'blkpho': 45,
 'blsspa1': 46,
 'blugrb1': 47,
 'blujay': 48,
 'bncfly': 49,
 'bnhcow': 50,
 'bobfly1': 51,
 'bongul': 52,
 'botgra': 53,
 'brbmot1': 54,
 'brbsol1': 55,
 'brcvir1': 56,
 'brebla': 57,
 'brncre': 58,
 'brnjay': 59,
 'brnthr': 60,
 'brratt1': 61,
 'brwhaw': 62,
 'brwpar1': 63,
 'btbwar': 64,
 'btnwar': 6

In [28]:
sorted(L_birds) == L_birds

True

In [32]:
df_train.iloc[np.random.choice(len(df_train), 5)]

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,date,filename,license,rating,time,url
20502,easmea,[],"['call', 'song']",36.8461,-96.4229,Sturnella magna,Eastern Meadowlark,Paul Marvin,2013-01-26,XC164065.ogg,Creative Commons Attribution-NonCommercial-Sha...,5.0,09:36,https://www.xeno-canto.org/164065
32301,linwoo1,[],"['male', 'song']",-16.5631,-49.285,Dryocopus lineatus,Lineated Woodpecker,JAYRSON ARAUJO DE OLIVEIRA,2020-08-14,XC591264.ogg,Creative Commons Attribution-NonCommercial-Sha...,5.0,16:05,https://www.xeno-canto.org/591264
54678,tropew1,[],['song'],8.9389,-78.4651,Contopus cinereus,Tropical Pewee,Mike Nelson,2009-01-18,XC47009.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.5,8:45am,https://www.xeno-canto.org/47009
1340,amered,[],['song'],51.7942,-117.3863,Setophaga ruticilla,American Redstart,James Bradley,2016-05-28,XC325262.ogg,Creative Commons Attribution-NonCommercial-Sha...,4.0,14:00,https://www.xeno-canto.org/325262
41266,purgal2,['botgra'],['call'],29.951,-93.079,Porphyrio martinica,Purple Gallinule,Dan Lane,2017-04-13,XC368525.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.5,07:00,https://www.xeno-canto.org/368525


# Inference

In [37]:
test_data = BirdCLEFDataset(data=data)
len(test_data), test_data[0].shape

(20, (120, 3, 128, 201))

In [38]:
def load_net(checkpoint_path, num_classes=NUM_CLASSES):
    """Build a ResNeSt-50 classifier and load trained weights into it.

    Args:
        checkpoint_path: Location of a checkpoint holding a state dict whose
            keys may carry a leading ``"model."`` prefix.
        num_classes: Output size of the replacement classification layer.

    Returns:
        The network, moved to ``DEVICE`` and switched to eval mode.
    """
    net = resnest50(pretrained=False)
    net.fc = nn.Linear(net.fc.in_features, num_classes)

    # Deserialize onto the CPU first; the network is moved to DEVICE below.
    # NOTE(security): torch.load unpickles the file - only load checkpoints
    # that come from a trusted source.
    state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"))

    # Strip the wrapper prefix so the keys match the bare network. Only a
    # leading "model." is removed, never an occurrence inside a key.
    # list() is required because the dict is modified while being walked.
    prefix = "model."
    for key in list(state_dict.keys()):
        if key.startswith(prefix):
            state_dict[key[len(prefix):]] = state_dict.pop(key)

    net.load_state_dict(state_dict)
    net = net.to(DEVICE)
    net = net.eval()
    return net

In [39]:
checkpoint_paths = [
    Path("../weights/birdclef_resnest50_fold0_epoch_10_f1_val_06471_20210417161101.pth"),
]

nets = [
    load_net(checkpoint_path.as_posix()) for checkpoint_path
    in checkpoint_paths
]

In [42]:
p = Path.cwd()
type(p), type(p.as_posix()), p.as_posix()

(pathlib.PosixPath,
 str,
 '/home/phunc20/git-repos/phunc20/competitions/kaggle/birdclef_2021/others_code/kkiller/infer')

In [44]:
p._str

'/home/phunc20/git-repos/phunc20/competitions/kaggle/birdclef_2021/others_code/kkiller/infer'

In [93]:
@torch.no_grad()
def get_thresh_preds(out, thresh=None):
    """List, for each row, the class indices scoring above a threshold.

    Args:
        out: 2-D tensor of scores, one row per clip and one column per class.
        thresh: Scores must be strictly greater than this value to be kept.
            ``None`` selects the notebook-level ``THRESH``; an explicit
            ``0`` is honored.

    Returns:
        A list with one list of ints per row of ``out``: the indices whose
        score exceeds ``thresh``, ordered by decreasing score.
    """
    # Test against None instead of `thresh or THRESH`, which would silently
    # replace a deliberate threshold of 0 with the default.
    if thresh is None:
        thresh = THRESH

    # Negating the scores makes the ascending argsort rank them descending.
    order = (-out).argsort(1).tolist()
    # The number of kept indices can differ from row to row.
    npreds = (out > thresh).sum(1).tolist()
    return [row[:npred] for row, npred in zip(order, npreds)]

Because `out` can be a batched output of the model, the number of predictions `npreds` may differ from row to row; I think that is why the author used `zip` in the for loop. That is,
> `preds` is a list of lists; each inner list contains the indices of one row whose values are `> thresh`, sorted by value in descending order

In [94]:
get_thresh_preds(torch.tensor([[0.9, 0.1, 0.05, 0.25, 0.5],
                               [0.1, 0.1, 0.2, 0.3, 0.7]]),
                 thresh=0.09)

[[0, 4, 3, 1], [4, 3, 2, 0, 1]]

**(?)** What is the input arg `out`? And why `-out`? Is the sign flipped so that `argsort` returns the indices in descending order of the values?

In [84]:
a = np.arange(10, 0, -2)
print(f"a = {a}")
a.argsort()

a = [10  8  6  4  2]


array([4, 3, 2, 1, 0])

In [85]:
a = np.arange(0,10,2)
print(f"a = {a}")
a.argsort()

a = [0 2 4 6 8]


array([0, 1, 2, 3, 4])

In [86]:
np.arange(10).argsort

<function ndarray.argsort>

In [87]:
30 or 1

30

In [88]:
30 or 100

30

Probably the best way to understand what `get_thresh_preds` does is simply to run it.

In [89]:
get_thresh_preds(torch.tensor([[0.9, 0.1, 0.05, 0.25, 0.5]]),
                 thresh=0.09)

o[:npreds] = tensor([[0, 4, 3, 1, 2]])


[[0, 4, 3, 1]]

In [49]:
def get_bird_names(preds):
    """Translate lists of class indices into submission label strings.

    Args:
        preds: Iterable of lists of class indices, one list per clip.

    Returns:
        One string per clip: the species codes looked up in
        ``INV_LABEL_IDS`` joined by single spaces, or ``"nocall"`` when the
        clip has no predicted class.
    """
    return [
        " ".join(INV_LABEL_IDS[bird_id] for bird_id in pred) if pred else "nocall"
        for pred in preds
    ]

In [51]:
def predict(nets, test_data, names=True):
    """Run the ensemble over every item of ``test_data``.

    Args:
        nets: Models whose sigmoid outputs are averaged.
        test_data: Indexable dataset; each item is a numpy batch of clips.
        names: When true, convert the averaged scores into label strings
            using the default threshold.

    Returns:
        A list with one entry per item of ``test_data``: either the tensor
        of averaged scores or, when ``names`` is true, the list of label
        strings for its clips.
    """
    results = []
    with torch.no_grad():
        for item_idx in tqdm(range(len(test_data))):
            batch = torch.from_numpy(test_data[item_idx]).to(DEVICE)

            # Sum the per-model probabilities, then average them.
            scores = 0.
            for model in nets:
                scores = scores + torch.sigmoid(model(batch))
            scores = scores / len(nets)

            results.append(
                get_bird_names(get_thresh_preds(scores)) if names else scores
            )
    return results

In [52]:
pred_probas = predict(nets, test_data, names=False)
print(len(pred_probas))

  0%|          | 0/20 [00:00<?, ?it/s]

20


In [53]:
preds = [get_bird_names(get_thresh_preds(pred, thresh=THRESH)) for pred in pred_probas]
preds[:10]

[['nocall',
  'swaspa',
  'nocall',
  'nocall',
  'swaspa',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'bkcchi',
  'dowwoo',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'bkcchi',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'bkcchi',
  'bkcchi',
  'nocall',
  'nocall',
  'bkcchi',
  'bkcchi',
  'bkcchi',
  'nocall',
  'nocall',
  'herthr',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'tuftit',
  'nocall',
  'nocall',
  'bkcchi',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'tuftit',
  'nocall',
  'rewbla',
  'nocall',
  'nocall',
  'nocall',
  'dowwoo',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'nocall',
  'n

In [102]:
SAMPLE_SUB_PATH

In [95]:
def preds_as_df(data, preds):
    """Arrange per-file predictions as a submission-style DataFrame.

    Args:
        data: DataFrame with ``id`` and ``site`` columns, one row per file.
        preds: One list of label strings per row of ``data``, one string
            per consecutive clip.

    Returns:
        DataFrame with columns ``row_id`` and ``birds``. The ``row_id`` of
        the i-th clip of a file (counting from 1) is
        ``"<id>_<site>_<5*i>"``. When ``SAMPLE_SUB_PATH`` is set, the result
        is aligned on the row ids of that file and rows without a
        prediction get ``"nocall"``.
    """
    row_ids, birds = [], []
    for record, labels in zip(data.itertuples(False), preds):
        row_ids.extend(
            f"{record.id}_{record.site}_{5*i}" for i in range(1, len(labels)+1)
        )
        birds.extend(labels)

    sub = pd.DataFrame({"row_id": row_ids, "birds": birds})

    if not SAMPLE_SUB_PATH:
        return sub

    # Keep exactly the rows, and the row order, of the sample submission.
    sample_sub = pd.read_csv(SAMPLE_SUB_PATH, usecols=["row_id"])
    sub = sample_sub.merge(sub, on="row_id", how="left")
    sub["birds"] = sub["birds"].fillna("nocall")
    return sub

In [96]:
sub = preds_as_df(data, preds)
print(sub.shape)
sub

(2400, 2)


Unnamed: 0,row_id,birds
0,10534_SSW_5,nocall
1,10534_SSW_10,swaspa
2,10534_SSW_15,nocall
3,10534_SSW_20,nocall
4,10534_SSW_25,swaspa
...,...,...
2395,7954_COR_580,nocall
2396,7954_COR_585,nocall
2397,7954_COR_590,nocall
2398,7954_COR_595,nocall


In [None]:
sub.to_csv("submission.csv", index=False)

# Small validation

In [97]:
def get_metrics(s_true, s_pred):
    """Compare one true label string with one predicted label string.

    Args:
        s_true: Space-separated true labels, e.g. "acafly herthr stejay".
        s_pred: Space-separated predicted labels, e.g. "nocall".

    Returns:
        Dict with keys ``f1``, ``prec``, ``rec`` (set-based scores),
        ``n_true`` and ``n_pred`` (number of distinct true / predicted
        labels) and ``n`` (number of labels in common). A score whose
        denominator is zero is reported as 0 instead of raising.
    """
    true_labels = set(s_true.split())
    pred_labels = set(s_pred.split())

    n = len(true_labels & pred_labels)
    n_true = len(true_labels)
    n_pred = len(pred_labels)

    # Guard the ratios: an empty label string would otherwise divide by 0.
    prec = n / n_pred if n_pred else 0.0
    rec = n / n_true if n_true else 0.0
    f1 = 2*prec*rec/(prec + rec) if prec + rec else 0

    return {"f1": f1, "prec": prec, "rec": rec, "n_true": n_true, "n_pred": n_pred, "n": n}

In [98]:
TARGET_PATH

PosixPath('/home/phunc20/datasets/kaggle/birdclef-2021/train_soundscape_labels.csv')

In [99]:
if TARGET_PATH:
    sub_target = pd.read_csv(TARGET_PATH)
    sub_target = sub_target.merge(sub, how="left", on="row_id")

    # After the merge, columns with colliding names get suffixes:
    # "birds" => "birds_x" and "birds_y", where in this particular example
    #   x means from sub_target (ground truth)
    #   y means from sub (predictions)
    # Sanity check: every row must have both a true label and a prediction.
    print(sub_target["birds_x"].notnull().sum(), sub_target["birds_y"].notnull().sum())
    assert sub_target["birds_x"].notnull().all()
    assert sub_target["birds_y"].notnull().all()

    df_metrics = pd.DataFrame([get_metrics(s_true, s_pred) for s_true, s_pred in zip(sub_target.birds_x, sub_target.birds_y)])

    print(df_metrics.mean())

2400 2400
f1        0.685153
prec      0.693333
rec       0.681576
n_true    1.130000
n_pred    1.002083
n         0.694583
dtype: float64


In [100]:
sub_target[sub_target.birds_y != "nocall"]

Unnamed: 0,row_id,site,audio_id,seconds,birds_x,birds_y
12,7019_COR_65,COR,7019,65,nocall,bucmot2
77,7019_COR_390,COR,7019,390,nocall,whtspa
241,11254_COR_10,COR,11254,10,nocall,wbwwre1
242,11254_COR_15,COR,11254,15,rubwre1,wbwwre1
244,11254_COR_25,COR,11254,25,rubwre1,rubwre1
...,...,...,...,...,...,...
2267,51010_SSW_540,SSW,51010,540,bkcchi rebwoo,bkcchi
2270,51010_SSW_555,SSW,51010,555,bkcchi,bkcchi
2271,51010_SSW_560,SSW,51010,560,bkcchi,bkcchi
2273,51010_SSW_570,SSW,51010,570,bkcchi norcar,bkcchi


In [101]:
sub_target[sub_target.birds_x != "nocall"]

Unnamed: 0,row_id,site,audio_id,seconds,birds_x,birds_y
240,11254_COR_5,COR,11254,5,rubwre1,nocall
242,11254_COR_15,COR,11254,15,rubwre1,wbwwre1
244,11254_COR_25,COR,11254,25,rubwre1,rubwre1
267,11254_COR_140,COR,11254,140,obnthr1,nocall
268,11254_COR_145,COR,11254,145,obnthr1,nocall
...,...,...,...,...,...,...
2391,54955_SSW_560,SSW,54955,560,grycat,nocall
2393,54955_SSW_570,SSW,54955,570,grycat,nocall
2394,54955_SSW_575,SSW,54955,575,chswar,nocall
2396,54955_SSW_585,SSW,54955,585,grycat,nocall
