In [1]:
# Install timm and torchlibrosa from copies bundled under ../input rather than
# from PyPI (presumably because the kernel runs without internet access - confirm).
# timm is built from a source tree; torchlibrosa is installed from a prebuilt wheel.
!cp -r ../input/timm-pytorch-image-models /kaggle/working/
!pip install /kaggle/working/timm-pytorch-image-models/pytorch-image-models-master/
!cp -r ../input/torchlibrosa /kaggle/working/
!pip install /kaggle/working/torchlibrosa/torchlibrosa-0.0.9-py2.py3-none-any.whl

Processing ./timm-pytorch-image-models/pytorch-image-models-master
  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: timm
  Building wheel for timm (setup.py) ... [?25l- \ | done
[?25h  Created wheel for timm: filename=timm-0.5.4-py3-none-any.whl size=431993 sha256=c12089f31e48251f821a4511fa4fabc58b97160ace7dc78220e7a462d19edb77
  Stored in directory: /root/.cache/pip/wheels/29/8f/83/790cac9d0753c6ccaef1d2cf55fbd2d3617cea79af844a3f88
Successfully built timm
Installing collected packages: timm
Successfully installed timm-0.5.4
Processing ./torchlibrosa/torchlibrosa-0.0.9-py2.py3-none-any.whl
Installing collected packages: torchlibrosa
Successfully installed torchlibrosa-0.0.9


In [2]:
import re
import cv2
import audioread
import logging
import os
import random
import time
import warnings

import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as torchdata

from contextlib import contextmanager
from pathlib import Path
from typing import Optional

from albumentations.core.transforms_interface import ImageOnlyTransform
from torchlibrosa.stft import LogmelFilterBank, Spectrogram
from torchlibrosa.augmentation import SpecAugmentation
from tqdm import tqdm

In [3]:
def set_seed(seed: int = 42):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        seed: Value shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU; silently does nothing when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # Benchmark mode auto-tunes the convolution algorithm per input shape and
    # may select a different one from run to run, which defeats the
    # deterministic flag above - so it has to be off for reproducible results.
    torch.backends.cudnn.benchmark = False  # type: ignore
    
    
def get_logger(out_file=None):
    """Configure and return the root logger.

    Any handlers already attached to the root logger are removed and closed,
    then a stream handler (and optionally a file handler) at INFO level is
    installed.

    Args:
        out_file: Optional path of a log file to write to in addition to the
            stream handler.

    Returns:
        The configured root ``logging.Logger``.
    """
    logger = logging.getLogger()
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

    # Close the old handlers instead of just dropping the list: re-running the
    # cell would otherwise leave the previous log file handle open.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()
    logger.setLevel(logging.INFO)

    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)

    if out_file is not None:
        fh = logging.FileHandler(out_file)
        fh.setFormatter(formatter)
        fh.setLevel(logging.INFO)
        logger.addHandler(fh)
    logger.info("logger set up")
    return logger
    
    
@contextmanager
def timer(name: str, logger: Optional[logging.Logger] = None):
    """Context manager that reports when a block starts and how long it took.

    Args:
        name: Label inserted into both messages.
        logger: Logger whose ``info`` receives the messages; when ``None`` the
            messages are printed instead.

    The closing message is emitted even when the wrapped block raises, so the
    elapsed time of a failing step is still reported; the exception then
    propagates unchanged.
    """
    t0 = time.time()
    emit = print if logger is None else logger.info
    emit(f"[{name}] start")
    try:
        yield
    finally:
        emit(f"[{name}] done in {time.time() - t0:.2f} s")

In [4]:
# Module-level logger (also written to main.log); it is read as a global by
# prediction(). The RNG seed is fixed before any model is built.
logger = get_logger("main.log")
set_seed(1213)

2022-05-01 05:21:33,741 - INFO - logger set up


## Config

In [5]:
class CFG:
    """Static configuration shared by the model, dataset and inference code.

    Only class attributes are defined; the class is used as a namespace and is
    never instantiated in the visible cells.
    """
    ######################
    # Dataset #
    ######################
    # Waveform transforms per phase; each entry names a class looked up by
    # get_transforms() and may carry an optional "params" dict.
    transforms = {
        "test": [{"name": "Normalize"}]
    }
    # NOTE(review): `period` is not referenced in the visible cells.
    period = 20
    # Spectrogram / log-mel front-end settings read by Simple.__init__.
    n_mels = 128
    fmin = 20
    fmax = 16000
    n_fft = 2048
    hop_length = 512
    sample_rate = 32000
    # NOTE(review): not referenced in the visible cells, and its values differ
    # from n_mels / fmin above - confirm which set the checkpoints were trained with.
    melspectrogram_parameters = {
        "n_mels": 224,
        "fmin": 2000,
        "fmax": 16000
    }

    # Class names in model-output order: prediction_for_clip() maps output
    # index i to target_columns[i], so the order must not change.
    target_columns = [
        "afrsil1","akekee","akepa1","akiapo","akikik","amewig","aniani","apapan",
        "arcter","barpet","bcnher","belkin1","bkbplo","bknsti","bkwpet","blkfra",
        "blknod","bongul","brant","brnboo","brnnod","brnowl","brtcur","bubsan",
        "buffle","bulpet","burpar","buwtea","cacgoo1","calqua","cangoo","canvas",
        "caster1","categr","chbsan","chemun","chukar","cintea","comgal1","commyn",
        "compea","comsan","comwax","coopet","crehon","dunlin","elepai","ercfra",
        "eurwig","fragul","gadwal","gamqua","glwgul","gnwtea","golphe","grbher3",
        "grefri","gresca","gryfra","gwfgoo","hawama","hawcoo","hawcre","hawgoo",
        "hawhaw","hawpet1","hoomer","houfin","houspa","hudgod","iiwi","incter1",
        "jabwar","japqua","kalphe","kauama","laugul","layalb","lcspet","leasan",
        "leater1","lessca","lesyel","lobdow","lotjae","madpet","magpet1","mallar3",
        "masboo","mauala","maupar","merlin","mitpar","moudov","norcar","norhar2",
        "normoc","norpin","norsho","nutman","oahama","omao","osprey","pagplo",
        "palila","parjae","pecsan","peflov","perfal","pibgre","pomjae","puaioh",
        "reccar","redava","redjun","redpha1","refboo","rempar","rettro","ribgul",
        "rinduc","rinphe","rocpig","rorpar","rudtur","ruff","saffin","sander",
        "semplo","sheowl","shtsan","skylar","snogoo","sooshe","sooter1","sopsku1",
        "sora","spodov","sposan","towsol","wantat1","warwhe1","wesmea","wessan",
        "wetshe","whfibi","whiter","whttro","wiltur","yebcar","yefcan","zebdov",
    ] 

    # Species written to the submission. create_df() uses the value as the
    # position of the species inside each per-segment group of rows.
    scored_birds =  {"akiapo" : 0, "aniani" : 1, "apapan" : 2, "barpet" : 3, "crehon" : 4,
                    "elepai" : 5, "ercfra" : 6, "hawama" : 7, "hawcre": 8, "hawgoo" : 9, 
                    "hawhaw" : 10, "hawpet1" : 11, "houfin" : 12, "iiwi" :13, "jabwar" : 14,
                    "maupar" : 15, "omao" : 16, "puaioh" : 17, "skylar" : 18, "warwhe1" : 19, "yefcan" : 20}


    # NOTE(review): not referenced in the visible cells - prediction_for_clip()
    # builds its DataLoader with a hard-coded batch_size=1.
    loader_params = {
        "test": {
            "batch_size": 64,
            "num_workers": 20,
            "shuffle": False
        }
    }

    # NOTE(review): `pooling` and `pretrained` are not referenced in the visible
    # cells (Simple always uses GeM pooling; prediction() passes pretrained=False).
    pooling = "max"
    pretrained = True
    # Passed to Simple by prediction().
    num_classes = 152
    in_channels = 1


## Define Model

In [6]:
def init_layer(layer):
    """Xavier-uniform initialise ``layer.weight`` and zero the bias when one exists."""
    nn.init.xavier_uniform_(layer.weight)

    bias = getattr(layer, "bias", None)
    if bias is not None:
        bias.data.fill_(0.)


def init_bn(bn):
    """Reset a batch-norm layer's affine parameters: bias to 0 and weight to 1."""
    for param, value in ((bn.bias, 0.), (bn.weight, 1.0)):
        param.data.fill_(value)


def init_weights(model):
    """Initialise one module in place, chosen by its class name.

    Written to be usable as ``module.apply(init_weights)``:

    * ``*Conv2d*``    - Xavier-uniform weights (gain sqrt(2)), zero bias.
    * ``*BatchNorm*`` - weights ~ N(1, 0.02), zero bias.
    * ``*GRU*``       - orthogonal init for every parameter with more than one
      dimension.
    * ``*Linear*``    - weights ~ N(0, 0.01), zero bias.

    Modules of any other class are left untouched. Missing parameters
    (``bias=False`` layers, non-affine batch norm) are skipped rather than
    raising.
    """
    classname = model.__class__.__name__
    weight = getattr(model, "weight", None)
    bias = getattr(model, "bias", None)

    if "Conv2d" in classname:
        nn.init.xavier_uniform_(model.weight, gain=np.sqrt(2))
        if bias is not None:
            bias.data.fill_(0)
    elif "BatchNorm" in classname:
        if weight is not None:
            weight.data.normal_(1.0, 0.02)
        if bias is not None:
            bias.data.fill_(0)
    elif "GRU" in classname:
        for param in model.parameters():
            if len(param.size()) > 1:
                # Was misspelled `orghogonal_`, which raised AttributeError.
                nn.init.orthogonal_(param.data)
    elif "Linear" in classname:
        model.weight.data.normal_(0, 0.01)
        if bias is not None:
            bias.data.zero_()


class GeM(nn.Module):
    """Generalised-mean pooling over the last two dimensions.

    The exponent ``p`` is a learnable one-element parameter; ``eps`` is the
    lower clamp applied to the input before it is raised to ``p``.
    """

    def __init__(self, p=3, eps=1e-6):
        super().__init__()
        self.p = nn.Parameter(p * torch.ones(1))
        self.eps = eps

    def forward(self, x):
        """Pool ``x`` with the module's own exponent and epsilon."""
        return self.gem(x, p=self.p, eps=self.eps)

    def gem(self, x, p=3, eps=1e-6):
        """Return ``mean(clamp(x, eps) ** p) ** (1 / p)`` taken over the last two dims."""
        height, width = x.size(-2), x.size(-1)
        powered = x.clamp(min=eps).pow(p)
        pooled = F.avg_pool2d(powered, (height, width))
        return pooled.pow(1. / p)


class Simple(nn.Module):
    """Waveform classifier: log-mel front end, timm CNN encoder, GeM pooling, linear head.

    Front-end settings (FFT size, hop length, mel range, sample rate) are read
    from ``CFG``.

    Args:
        base_model_name: Name handed to ``timm.create_model``.
        pretrained: Whether timm should load pretrained backbone weights.
        num_classes: Number of output logits.
        in_channels: Number of input channels of the backbone's first layer.
    """

    def __init__(self, base_model_name: str, pretrained=False, num_classes=152, in_channels=1):
        super().__init__()
        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(n_fft=CFG.n_fft, hop_length=CFG.hop_length,
                                                 win_length=CFG.n_fft, window="hann", center=True, pad_mode="reflect",
                                                 freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=CFG.sample_rate, n_fft=CFG.n_fft,
                                                 n_mels=CFG.n_mels, fmin=CFG.fmin, fmax=CFG.fmax, ref=1.0, amin=1e-10, top_db=None,
                                                 freeze_parameters=True)

        # Spec augmenter (only applied in training mode, see forward)
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
                                               freq_drop_width=8, freq_stripes_num=2)

        # One batch-norm channel per mel bin; forward() moves the mel axis into
        # the channel position before applying it.
        self.bn0 = nn.BatchNorm2d(CFG.n_mels)

        base_model = timm.create_model(
            base_model_name, pretrained=pretrained, in_chans=in_channels)
        # Drop the backbone's last two child modules (presumably its global
        # pooling and classifier head - confirm per architecture) and keep the
        # rest as the feature encoder.
        layers = list(base_model.children())[:-2]
        self.encoder = nn.Sequential(*layers)

        if hasattr(base_model, "fc"):
            in_features = base_model.fc.in_features
        else:
            in_features = base_model.classifier.in_features
        self.fc1 = nn.Linear(in_features, num_classes, bias=True)
        self.gem = GeM()
        self.init_weight()

    def init_weight(self):
        """Initialise the layers added on top of the backbone."""
        init_layer(self.fc1)
        init_bn(self.bn0)

    def forward(self, input):
        """Map a batch of waveforms to class logits of shape (batch_size, num_classes)."""
        # (batch_size, 1, time_steps, freq_bins)
        x = self.spectrogram_extractor(input)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)

        # Normalise per mel bin: swap the mel axis into the channel position,
        # apply batch norm, then swap back.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        if self.training:
            x = self.spec_augmenter(x)

        x = x.transpose(2, 3)
        # (batch_size, channels, freq, frames)
        x = self.encoder(x)

        # (batch_size, channels). Flatten only the pooled spatial axes: a bare
        # torch.squeeze would also drop the batch axis when batch_size == 1,
        # which is the batch size the inference loader uses.
        x = self.gem(x).flatten(1)
        x = F.dropout(x, p=0.5, training=self.training)
        logit = self.fc1(x)

        return logit

## Dataset

In [7]:
class TestDataset(torchdata.Dataset):
    """Fixed-length segments cut from one audio clip, one per row of ``df``.

    Args:
        df: Frame with a ``row_id`` column and a ``seconds`` column holding the
            end time of each segment in seconds.
        clip: Samples of the whole clip, indexed along the first axis.
        waveform_transforms: Optional callable applied to each segment.
        sample_rate: Samples per second of ``clip``.
        segment_seconds: Length of each returned segment in seconds.
    """

    def __init__(self, df: pd.DataFrame, clip: np.ndarray,
                 waveform_transforms=None,
                 sample_rate: int = 32000, segment_seconds: int = 5):
        self.df = df
        self.clip = clip
        self.waveform_transforms = waveform_transforms
        self.sample_rate = sample_rate
        self.segment_seconds = segment_seconds

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx: int):
        """Return ``(segment, row_id)`` for the ``idx``-th row (by position)."""
        # Positional lookup: the sampler yields 0..len-1, which only matches
        # the index labels when df happens to carry a default RangeIndex.
        sample = self.df.iloc[idx]
        row_id = sample.row_id

        end_seconds = int(sample.seconds)
        # Clamp at 0 so a negative start cannot wrap around to the clip's end.
        start_seconds = max(end_seconds - self.segment_seconds, 0)

        start_index = self.sample_rate * start_seconds
        end_index = self.sample_rate * end_seconds

        y = self.clip[start_index:end_index].astype(np.float32)
        y = np.nan_to_num(y)

        # Zero-pad segments that run past the end of the clip, so every item
        # has the same length and the transforms never see an empty array.
        segment_len = self.sample_rate * self.segment_seconds
        if len(y) < segment_len:
            padded = np.zeros((segment_len,) + y.shape[1:], dtype=np.float32)
            padded[:len(y)] = y
            y = padded

        if self.waveform_transforms:
            y = self.waveform_transforms(y)

        # Transforms may reintroduce NaN/inf (e.g. dividing by a zero peak).
        y = np.nan_to_num(y)

        return y, row_id

In [8]:
def get_transforms(phase: str):
    """Build the waveform transform pipeline configured for ``phase``.

    Reads ``CFG.transforms``. Each entry's ``"name"`` is resolved against this
    module's globals and the class is instantiated with the entry's optional
    ``"params"`` dict.

    Args:
        phase: Key into ``CFG.transforms`` (e.g. ``"test"``).

    Returns:
        A ``Compose`` of the instantiated transforms, or ``None`` when nothing
        is configured for the phase or no configured name could be resolved.
    """
    transforms = CFG.transforms
    if transforms is None:
        return None

    # .get: a phase with no entry is treated like a phase explicitly set to None.
    phase_conf = transforms.get(phase)
    if phase_conf is None:
        return None

    trns_list = []
    for trns_conf in phase_conf:
        trns_name = trns_conf["name"]
        trns_params = trns_conf.get("params") or {}
        trns_cls = globals().get(trns_name)
        if trns_cls is None:
            # Still skipped (best effort), but no longer silently: a typo in
            # the config would otherwise drop a transform without any trace.
            logging.getLogger().warning(
                "Unknown waveform transform %r for phase %r - skipped",
                trns_name, phase)
            continue
        trns_list.append(trns_cls(**trns_params))

    if len(trns_list) > 0:
        return Compose(trns_list)
    return None


def get_waveform_transforms(config: dict, phase: str):
    """Return the waveform transform pipeline for ``phase``.

    Args:
        config: Unused; kept so existing call sites keep working. The
            pipeline is always built from ``CFG.transforms``.
        phase: Phase name forwarded to ``get_transforms``.
    """
    # get_transforms takes only `phase`; also forwarding `config` raised
    # TypeError on every call.
    return get_transforms(phase)


def get_spectrogram_transforms(config: dict, phase: str):
    """Build the spectrogram transform pipeline described by ``config``.

    Args:
        config: Mapping that may contain a ``"spectrogram_transforms"`` entry
            of the form ``{phase: [{"name": ..., "params": {...}}, ...]}``.
        phase: Phase whose transform list should be built.

    Returns:
        An ``albumentations.Compose`` (``p=1.0``) of the resolved transforms,
        or ``None`` when nothing is configured for the phase or no configured
        name could be resolved. Names are looked up in albumentations first,
        then in this module's globals.
    """
    transforms = config.get('spectrogram_transforms')
    if transforms is None:
        return None

    # .get: a phase with no entry is treated like a phase explicitly set to None.
    phase_conf = transforms.get(phase)
    if not phase_conf:
        return None

    # `A` was referenced without ever being imported (NameError). Imported
    # here, after the early exits, so configurations without spectrogram
    # transforms do not need albumentations at all.
    import albumentations as A

    trns_list = []
    for trns_conf in phase_conf:
        trns_name = trns_conf["name"]
        trns_params = trns_conf.get("params") or {}
        trns_cls = getattr(A, trns_name, None)
        if trns_cls is None:
            trns_cls = globals().get(trns_name)
        if trns_cls is None:
            # Still skipped (best effort), but no longer silently.
            logging.getLogger().warning(
                "Unknown spectrogram transform %r for phase %r - skipped",
                trns_name, phase)
            continue
        trns_list.append(trns_cls(**trns_params))

    if len(trns_list) > 0:
        return A.Compose(trns_list, p=1.0)
    return None


class Normalize:
    """Peak-normalise a waveform so that its largest absolute sample is 1."""

    def __call__(self, y: np.ndarray):
        """Return ``y / max(|y|)`` as a Fortran-ordered array.

        Empty and all-zero inputs are returned unscaled: there is no peak to
        normalise to, and dividing by zero would fill the output with NaN.
        """
        if y.size == 0:
            return np.asfortranarray(y)
        max_vol = np.abs(y).max()
        if max_vol == 0:
            return np.asfortranarray(y)
        y_vol = y * 1 / max_vol
        return np.asfortranarray(y_vol)


class NewNormalize:
    """Remove the mean from a waveform, then scale its peak magnitude to 1."""

    def __call__(self, y: np.ndarray):
        """Return ``(y - mean(y)) / max(|y - mean(y)|)``.

        A constant input has no peak after centring and is returned as the
        centred (all-zero) signal instead of being divided by zero.
        """
        y_mm = y - y.mean()
        # Built-in abs(): numpy arrays have no `.abs()` method, so the
        # previous `y_mm.abs()` raised AttributeError for ndarray input.
        peak = abs(y_mm).max()
        if peak == 0:
            return y_mm
        return y_mm / peak


class Compose:
    """Chain transforms: each one receives the output of the one before it."""

    def __init__(self, transforms: list):
        self.transforms = transforms

    def __call__(self, y: np.ndarray):
        """Run ``y`` through every transform in list order and return the result."""
        result = y
        for step in self.transforms:
            result = step(result)
        return result

## Get model

In [9]:
def prepare_model_for_inference(model, path: Path):
    """Load the checkpoint at ``path`` into ``model`` and switch it to eval mode.

    Without CUDA the checkpoint is mapped onto the CPU while loading;
    otherwise it is loaded with ``torch.load``'s default placement.

    Returns:
        The same ``model`` instance.
    """
    load_kwargs = {} if torch.cuda.is_available() else {"map_location": "cpu"}
    state_dict = torch.load(path, **load_kwargs)
    model.load_state_dict(state_dict)
    model.eval()
    return model

In [10]:
def prediction_for_clip(test_df: pd.DataFrame,
                        clip: np.ndarray,
                        model,
                        threshold=0.5):
    """Predict the species present in every segment of one clip.

    Args:
        test_df: One row per segment (``row_id`` and ``seconds`` columns).
        clip: Samples of the whole clip.
        model: Network mapping a waveform batch to per-class logits.
        threshold: Minimum sigmoid probability for a class to count as present.

    Returns:
        Dict mapping each ``row_id`` to the list of ``CFG.target_columns``
        names whose probability reached ``threshold`` (empty list if none).
    """
    segments = TestDataset(df=test_df,
                           clip=clip,
                           waveform_transforms=get_transforms(phase="test"))
    # One segment per batch, in row order.
    loader = torchdata.DataLoader(segments, batch_size=1, shuffle=False)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.eval()
    prediction_dict = {}
    for waveform, row_ids in tqdm(loader):
        with torch.no_grad():
            logits = model(waveform.to(device))
            proba = torch.sigmoid(logits).detach().cpu().numpy().reshape(-1)

        # Translate the indices of the classes at or above the threshold into names.
        prediction_dict[row_ids[0]] = [
            CFG.target_columns[class_idx]
            for class_idx, is_hit in enumerate(proba >= threshold)
            if is_hit
        ]
    return prediction_dict

In [11]:
def create_df(prediction_dict : dict):
    """Expand per-segment detections into submission rows.

    For every ``<a>_<b>_<c>`` key of ``prediction_dict`` one row per scored
    bird is produced, with ``row_id`` ``<a>_<b>_<bird>_<c>`` and a boolean
    ``target`` telling whether that bird is among the key's detections.

    Returns:
        A single frame with ``row_id`` and ``target`` columns and a fresh index.
    """
    n_scored = len(CFG.scored_birds)
    frames = []
    for row_key, detected in prediction_dict.items():
        parts = row_key.split("_")
        row_ids = [
            "_".join([parts[0], parts[1], bird, parts[2]])
            for bird in CFG.scored_birds.keys()
        ]

        # The scored_birds value is the bird's position within this group.
        flags = [False] * n_scored
        for bird in detected:
            if bird not in CFG.scored_birds:
                continue
            flags[CFG.scored_birds[bird]] = True

        frames.append(pd.DataFrame({"row_id": row_ids, "target": flags}))

    combined = pd.concat(frames, axis=0, sort=False)
    return combined.reset_index(drop=True)
        

def prediction(test_audios, weights_path: Path, base_model, threshold=0.5):
    """Run one checkpoint over every audio file and collect submission rows.

    Args:
        test_audios: Iterable of ``Path`` objects pointing at audio files.
        weights_path: Checkpoint loaded into the ``Simple`` model.
        base_model: timm backbone name used to build the model.
        threshold: Probability threshold forwarded to ``prediction_for_clip``.

    Returns:
        The per-file frames from ``create_df`` concatenated with a fresh index.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    network = Simple(base_model_name=base_model,
                     pretrained=False,
                     num_classes=CFG.num_classes,
                     in_channels=CFG.in_channels)
    network = prepare_model_for_inference(network, weights_path).to(device)

    warnings.filterwarnings("ignore")
    per_file_frames = []
    for audio_path in test_audios:
        with timer(f"Loading {str(audio_path)}", logger):
            clip, _ = sf.read(audio_path)

        # Row ids are "<first two name parts>_<end second>" for the window
        # end times 5, 10, ..., 60.
        file_prefix = "_".join(re.split("[_.]", audio_path.name)[:2])
        seconds = list(range(5, 65, 5))
        row_ids = [file_prefix + f"_{second}" for second in seconds]
        test_df = pd.DataFrame({
            "row_id": row_ids,
            "seconds": seconds
        })

        with timer(f"Prediction on {audio_path}", logger):
            prediction_dict = prediction_for_clip(test_df,
                                                  clip=clip,
                                                  model=network,
                                                  threshold=threshold)

        per_file_frames.append(create_df(prediction_dict))

    combined = pd.concat(per_file_frames, axis=0, sort=False)
    return combined.reset_index(drop=True)

In [12]:
# Checkpoints of the two ensemble members.
# NOTE(review): `exp2` points at exp5.pth - the variable name and file name disagree; confirm this is intended.
exp1 = Path('../input/birdclef2022-pth3/exp1.pth')
exp2 = Path('../input/birdclef2022-pth3/exp5.pth')

In [13]:
sample = pd.read_csv('../input/birdclef-2022/sample_submission.csv')
DATADIR = Path("../input/birdclef-2022/test_soundscapes")
all_audios = list(DATADIR.glob("*.ogg"))

# One prediction frame per checkpoint, each with its own decision threshold.
pred1 = prediction(test_audios=all_audios, weights_path=exp1,
                   base_model='resnet101d', threshold=0.09)
pred2 = prediction(test_audios=all_audios, weights_path=exp2,
                   base_model='resnext50d_32x4d', threshold=0.11)

# Merging on row_id yields target_x (pred1) and target_y (pred2); a row is
# positive in the ensemble when either model flags it.
preds = pd.merge(pred1, pred2, on='row_id')
preds['target'] = preds['target_x'] | preds['target_y']

submission = preds[['row_id', 'target']]
submission.to_csv("submission.csv", index=False)

  fft_window = librosa.util.pad_center(fft_window, n_fft)
2022-05-01 05:21:41,799 - INFO - [Loading ../input/birdclef-2022/test_soundscapes/soundscape_453028782.ogg] start
2022-05-01 05:21:41,874 - INFO - [Loading ../input/birdclef-2022/test_soundscapes/soundscape_453028782.ogg] done in 0.07 s
2022-05-01 05:21:41,876 - INFO - [Prediction on ../input/birdclef-2022/test_soundscapes/soundscape_453028782.ogg] start
100%|██████████| 12/12 [00:05<00:00,  2.04it/s]
2022-05-01 05:21:47,781 - INFO - [Prediction on ../input/birdclef-2022/test_soundscapes/soundscape_453028782.ogg] done in 5.90 s
2022-05-01 05:21:50,832 - INFO - [Loading ../input/birdclef-2022/test_soundscapes/soundscape_453028782.ogg] start
2022-05-01 05:21:50,886 - INFO - [Loading ../input/birdclef-2022/test_soundscapes/soundscape_453028782.ogg] done in 0.05 s
2022-05-01 05:21:50,887 - INFO - [Prediction on ../input/birdclef-2022/test_soundscapes/soundscape_453028782.ogg] start
100%|██████████| 12/12 [00:00<00:00, 60.88it/s]
202

In [14]:
# Number of positive / negative rows predicted by the first checkpoint alone.
pred1.target.value_counts()

False    173
True      79
Name: target, dtype: int64

In [15]:
# Number of positive / negative rows predicted by the second checkpoint alone.
pred2.target.value_counts()

False    190
True      62
Name: target, dtype: int64

In [16]:
# Number of positive / negative rows after OR-ing both models' predictions.
submission.target.value_counts()

False    161
True      91
Name: target, dtype: int64