In [1]:
# Install third-party packages from local paths under the Kaggle input directory
# (source trees and pre-built wheels rather than packages pulled from an index).
# torchlibrosa: provides the Spectrogram / LogmelFilterBank / SpecAugmentation layers imported below.
!pip install ../input/bird-panns/torchlibrosa-master/torchlibrosa-master/
# noisereduce: imported below as `nr`.
!pip install /kaggle/input/noisereduce/noisereduce-1.0.1-py3-none-any.whl

# Keras_Applications is presumably a dependency of efficientnet (TODO confirm);
# efficientnet itself is imported below as `efficientnet.tfkeras`.
!pip install ../input/keras-application/Keras_Applications-1.0.8-py3-none-any.whl
!pip install ../input/efficientnettf/efficientnet-master

Processing /kaggle/input/bird-panns/torchlibrosa-master/torchlibrosa-master
Building wheels for collected packages: torchlibrosa
  Building wheel for torchlibrosa (setup.py) ... [?25l- \ done
[?25h  Created wheel for torchlibrosa: filename=torchlibrosa-0.0.4-py3-none-any.whl size=8864 sha256=52d61ef7df26eee4b6bb637bb56d7a1fa63b5fff5e9290e51be2151772176368
  Stored in directory: /root/.cache/pip/wheels/52/35/08/7a90aa926e1403318b7b36ba5a03ad940bd5002dad343b8c1f
Successfully built torchlibrosa
Installing collected packages: torchlibrosa
Successfully installed torchlibrosa-0.0.4
Processing /kaggle/input/noisereduce/noisereduce-1.0.1-py3-none-any.whl
Installing collected packages: noisereduce
Successfully installed noisereduce-1.0.1
Processing /kaggle/input/efficientnettf/efficientnet-master
Building wheels for collected packages: efficientnet
  Building wheel for efficientnet (setup.py) ... [?25l- \ done
[?25h  Created wheel for efficientnet: filename=efficientne

In [2]:
import os
import gc
import time
import math
import shutil
import random
import warnings
import typing as tp
from pathlib import Path
from contextlib import contextmanager

import yaml
import logging
from joblib import delayed, Parallel

import cv2
import librosa
import audioread
import soundfile as sf
import noisereduce as nr

import numpy as np
import pandas as pd

from fastprogress import progress_bar
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Conv2d, Module, Linear, BatchNorm2d, ReLU
from torch.nn.modules.utils import _pair
import torch.utils.data as data
from torchlibrosa.stft import Spectrogram, LogmelFilterBank
from torchlibrosa.augmentation import SpecAugmentation

import tensorflow as tf, re, math
import tensorflow.keras.backend as K
from sklearn.model_selection import KFold
import efficientnet.tfkeras as efn 
import scipy.signal
from pathlib import Path
from typing import Optional

pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

  from tqdm.autonotebook import tqdm


In [3]:
def set_seed(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for reproducible results.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # Benchmark mode auto-tunes the convolution algorithm per run, so the
    # selected kernel (and therefore the numerics) may differ between runs.
    # It must be off for the deterministic flag above to give repeatable output.
    torch.backends.cudnn.benchmark = False  # type: ignore
    
    
def get_logger(out_file=None):
    """Reset the root logger and attach fresh INFO-level handlers.

    Any handlers already attached to the root logger are detached and closed
    first, so calling this function repeatedly (e.g. when re-running the cell)
    neither duplicates log lines nor leaks open log-file descriptors.

    Args:
        out_file: Optional path of a log file. When given, records are written
            to that file in addition to the console stream.

    Returns:
        The configured root ``logging.Logger``.
    """
    logger = logging.getLogger()
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

    # Detach *and close* stale handlers; merely dropping the list would leave
    # a previously opened FileHandler's file descriptor open.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    logger.setLevel(logging.INFO)

    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)

    if out_file is not None:
        fh = logging.FileHandler(out_file)
        fh.setFormatter(formatter)
        fh.setLevel(logging.INFO)
        logger.addHandler(fh)
    logger.info("logger set up")
    return logger
    
    
@contextmanager
def timer(name: str, logger: Optional[logging.Logger] = None):
    """Context manager reporting the wall-clock duration of its body.

    A ``[name] start`` line is emitted on entry and a
    ``[name] done in <seconds> s`` line on normal exit. Messages go to
    ``logger.info`` when a logger is supplied and to ``print`` otherwise.
    If the body raises, the exception propagates and no "done" line is emitted.

    Args:
        name: Label embedded in both messages.
        logger: Optional logger receiving the messages.
    """
    started_at = time.time()
    emit = print if logger is None else logger.info

    emit(f"[{name}] start")
    yield
    emit(f"[{name}] done in {time.time() - started_at:.2f} s")

In [4]:
# Reset root logging (console stream + "main.log" file) and seed all RNGs
# before any later cell runs.
logger = get_logger("main.log")
set_seed(42)

2020-09-14 23:30:07,165 - INFO - logger set up


In [5]:
# Sample rate constant; TestDataset below uses the same value (32000).
SR = 32000
# All input paths are resolved relative to the parent of the current working directory.
ROOT = Path.cwd().parent
INPUT_ROOT = ROOT / "input"
RAW_DATA = INPUT_ROOT / "birdsong-recognition"
TRAIN_AUDIO_DIR = RAW_DATA / "train_audio"
# TRAIN_RESAMPLED_AUDIO_DIRS = [
#   INPUT_ROOT / "birdsong-resampled-train-audio-{:0>2}".format(i)  for i in range(5)
# ]
TEST_AUDIO_DIR = RAW_DATA / "test_audio"


# NOTE(review): TARGET_SR duplicates SR above with the same value — confirm both are needed.
TARGET_SR = 32000
# True only when the competition's test_audio directory exists on disk.
TEST = Path("../input/birdsong-recognition/test_audio").exists()

# Without the real test audio, fall back to the "birdcall-check" dataset.
if TEST:
    DATA_DIR = Path("../input/birdsong-recognition/")
else:
    DATA_DIR = Path("../input/birdcall-check/")
    
# Test metadata and audio directory taken from whichever DATA_DIR was selected above.
test_tf = pd.read_csv(DATA_DIR / "test.csv")
test_audio = DATA_DIR / "test_audio"

In [6]:
# Amplitude threshold constant; presumably passed as `threshold` to
# `envelope` by later cells (usage not visible here).
ENVELOPE = 0.02

def envelope(y, rate, threshold):
    """Flag the samples whose local peak amplitude exceeds ``threshold``.

    The absolute signal is passed through a sliding-maximum filter whose
    window is ``rate // 20`` samples (at least 1), zero-padded at the borders
    (``mode="constant"``), and each filtered value is compared to
    ``threshold``.

    Args:
        y: Audio samples (array-like).
        rate: Value the window size is derived from (``rate // 20``).
        threshold: Amplitude above which a sample is kept.

    Returns:
        A list with one boolean entry per element along the first axis of
        ``y``; ``True`` where the filtered amplitude is above ``threshold``.
    """
    # Imported locally: the notebook's import cell does not bring this name
    # into scope, so the function would otherwise fail with a NameError.
    from scipy.ndimage import maximum_filter1d

    # A window of 0 samples (rate < 20) is rejected by scipy; clamp to 1.
    window = max(rate // 20, 1)
    y_peak = maximum_filter1d(np.abs(y), mode="constant", size=window)
    # Vectorised comparison; list() keeps the original list-of-bools return type.
    return list(y_peak > threshold)

In [7]:
# Training metadata table shipped with the competition data.
train = pd.read_csv(RAW_DATA / "train.csv")

In [8]:
# When the competition's test_audio directory is absent, point TEST_AUDIO_DIR
# at the "birdcall-check" dataset and read its test.csv instead.
if not TEST_AUDIO_DIR.exists():
    TEST_AUDIO_DIR = INPUT_ROOT / "birdcall-check" / "test_audio"
    test = pd.read_csv(INPUT_ROOT / "birdcall-check" / "test.csv")
else:
    test = pd.read_csv(RAW_DATA / "test.csv")

In [9]:
# Write the sample submission unchanged as a placeholder so that a
# submission.csv exists even if a later cell fails.
sub = pd.read_csv("../input/birdsong-recognition/sample_submission.csv")
sub.to_csv("submission.csv", index=False)  # this will be overwritten if everything goes well

In [10]:
# Maps each bird species code (string) to its integer class index.
# The indices are contiguous, running from 0 to 263 (264 classes).
BIRD_CODE = {
    'aldfly': 0, 'ameavo': 1, 'amebit': 2, 'amecro': 3, 'amegfi': 4,
    'amekes': 5, 'amepip': 6, 'amered': 7, 'amerob': 8, 'amewig': 9,
    'amewoo': 10, 'amtspa': 11, 'annhum': 12, 'astfly': 13, 'baisan': 14,
    'baleag': 15, 'balori': 16, 'banswa': 17, 'barswa': 18, 'bawwar': 19,
    'belkin1': 20, 'belspa2': 21, 'bewwre': 22, 'bkbcuc': 23, 'bkbmag1': 24,
    'bkbwar': 25, 'bkcchi': 26, 'bkchum': 27, 'bkhgro': 28, 'bkpwar': 29,
    'bktspa': 30, 'blkpho': 31, 'blugrb1': 32, 'blujay': 33, 'bnhcow': 34,
    'boboli': 35, 'bongul': 36, 'brdowl': 37, 'brebla': 38, 'brespa': 39,
    'brncre': 40, 'brnthr': 41, 'brthum': 42, 'brwhaw': 43, 'btbwar': 44,
    'btnwar': 45, 'btywar': 46, 'buffle': 47, 'buggna': 48, 'buhvir': 49,
    'bulori': 50, 'bushti': 51, 'buwtea': 52, 'buwwar': 53, 'cacwre': 54,
    'calgul': 55, 'calqua': 56, 'camwar': 57, 'cangoo': 58, 'canwar': 59,
    'canwre': 60, 'carwre': 61, 'casfin': 62, 'caster1': 63, 'casvir': 64,
    'cedwax': 65, 'chispa': 66, 'chiswi': 67, 'chswar': 68, 'chukar': 69,
    'clanut': 70, 'cliswa': 71, 'comgol': 72, 'comgra': 73, 'comloo': 74,
    'commer': 75, 'comnig': 76, 'comrav': 77, 'comred': 78, 'comter': 79,
    'comyel': 80, 'coohaw': 81, 'coshum': 82, 'cowscj1': 83, 'daejun': 84,
    'doccor': 85, 'dowwoo': 86, 'dusfly': 87, 'eargre': 88, 'easblu': 89,
    'easkin': 90, 'easmea': 91, 'easpho': 92, 'eastow': 93, 'eawpew': 94,
    'eucdov': 95, 'eursta': 96, 'evegro': 97, 'fiespa': 98, 'fiscro': 99,
    'foxspa': 100, 'gadwal': 101, 'gcrfin': 102, 'gnttow': 103, 'gnwtea': 104,
    'gockin': 105, 'gocspa': 106, 'goleag': 107, 'grbher3': 108, 'grcfly': 109,
    'greegr': 110, 'greroa': 111, 'greyel': 112, 'grhowl': 113, 'grnher': 114,
    'grtgra': 115, 'grycat': 116, 'gryfly': 117, 'haiwoo': 118, 'hamfly': 119,
    'hergul': 120, 'herthr': 121, 'hoomer': 122, 'hoowar': 123, 'horgre': 124,
    'horlar': 125, 'houfin': 126, 'houspa': 127, 'houwre': 128, 'indbun': 129,
    'juntit1': 130, 'killde': 131, 'labwoo': 132, 'larspa': 133, 'lazbun': 134,
    'leabit': 135, 'leafly': 136, 'leasan': 137, 'lecthr': 138, 'lesgol': 139,
    'lesnig': 140, 'lesyel': 141, 'lewwoo': 142, 'linspa': 143, 'lobcur': 144,
    'lobdow': 145, 'logshr': 146, 'lotduc': 147, 'louwat': 148, 'macwar': 149,
    'magwar': 150, 'mallar3': 151, 'marwre': 152, 'merlin': 153, 'moublu': 154,
    'mouchi': 155, 'moudov': 156, 'norcar': 157, 'norfli': 158, 'norhar2': 159,
    'normoc': 160, 'norpar': 161, 'norpin': 162, 'norsho': 163, 'norwat': 164,
    'nrwswa': 165, 'nutwoo': 166, 'olsfly': 167, 'orcwar': 168, 'osprey': 169,
    'ovenbi1': 170, 'palwar': 171, 'pasfly': 172, 'pecsan': 173, 'perfal': 174,
    'phaino': 175, 'pibgre': 176, 'pilwoo': 177, 'pingro': 178, 'pinjay': 179,
    'pinsis': 180, 'pinwar': 181, 'plsvir': 182, 'prawar': 183, 'purfin': 184,
    'pygnut': 185, 'rebmer': 186, 'rebnut': 187, 'rebsap': 188, 'rebwoo': 189,
    'redcro': 190, 'redhea': 191, 'reevir1': 192, 'renpha': 193, 'reshaw': 194,
    'rethaw': 195, 'rewbla': 196, 'ribgul': 197, 'rinduc': 198, 'robgro': 199,
    'rocpig': 200, 'rocwre': 201, 'rthhum': 202, 'ruckin': 203, 'rudduc': 204,
    'rufgro': 205, 'rufhum': 206, 'rusbla': 207, 'sagspa1': 208, 'sagthr': 209,
    'savspa': 210, 'saypho': 211, 'scatan': 212, 'scoori': 213, 'semplo': 214,
    'semsan': 215, 'sheowl': 216, 'shshaw': 217, 'snobun': 218, 'snogoo': 219,
    'solsan': 220, 'sonspa': 221, 'sora': 222, 'sposan': 223, 'spotow': 224,
    'stejay': 225, 'swahaw': 226, 'swaspa': 227, 'swathr': 228, 'treswa': 229,
    'truswa': 230, 'tuftit': 231, 'tunswa': 232, 'veery': 233, 'vesspa': 234,
    'vigswa': 235, 'warvir': 236, 'wesblu': 237, 'wesgre': 238, 'weskin': 239,
    'wesmea': 240, 'wessan': 241, 'westan': 242, 'wewpew': 243, 'whbnut': 244,
    'whcspa': 245, 'whfibi': 246, 'whtspa': 247, 'whtswi': 248, 'wilfly': 249,
    'wilsni1': 250, 'wiltur': 251, 'winwre3': 252, 'wlswar': 253, 'wooduc': 254,
    'wooscj2': 255, 'woothr': 256, 'y00475': 257, 'yebfly': 258, 'yebsap': 259,
    'yehbla': 260, 'yelwar': 261, 'yerwar': 262, 'yetvir': 263
}

# Reverse lookup: integer class index -> species code.
INV_BIRD_CODE = {v: k for k, v in BIRD_CODE.items()}

In [11]:
# Test-time dataset: serves 5-second segments of a clip; for "site_3" a short final segment is zero-padded to 5 seconds.

class TestDataset(data.Dataset):
    """Serve 5-second segments of a single audio clip, one item per row of ``df``.

    Each row must provide ``site`` and ``row_id``; rows whose site is not
    ``"site_3"`` must also provide ``seconds``, the end time (in seconds) of
    the 5-second window to extract.

    Args:
        df: Metadata rows belonging to ``clip``.
        clip: 1-D array of audio samples at 32 kHz.
    """

    def __init__(self, df: pd.DataFrame, clip: np.ndarray):
        self.df = df
        self.clip = clip

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx: int):
        """Return ``(audio, row_id, site)`` for the ``idx``-th row (positional).

        For ``"site_3"`` the whole clip is split into consecutive 5-second
        segments (the last one zero-padded) and returned as a 2-D array with
        one row per segment. For any other site a single window ending at
        ``seconds`` is returned. In both cases the samples are repeated twice
        along the last axis.
        """
        SR = 32000
        segment_len = SR * 5

        # Positional lookup: a Dataset is indexed 0..len-1, whereas label-based
        # `.loc` raises KeyError when `df` keeps the index of a filtered frame.
        sample = self.df.iloc[idx]
        site = sample.site
        row_id = sample.row_id

        if site == "site_3":
            y = self.clip.astype(np.float32)
            y_all = []
            for start in range(0, len(y), segment_len):
                y_batch = y[start:start + segment_len]
                if len(y_batch) != segment_len:
                    # Only the final segment can be short: zero-pad it to 5 s.
                    y_pad = np.zeros(segment_len, dtype=np.float32)
                    y_pad[:len(y_batch)] = y_batch
                    y_batch = y_pad
                y_all.append(y_batch)
            y_all = np.asarray(y_all)
            # Repeat every segment twice along the sample axis.
            y_all = np.tile(y_all, (2,))
            return y_all, row_id, site

        end_seconds = int(sample.seconds)
        # Clamp at 0: a negative start would be interpreted by NumPy as an
        # offset from the END of the clip and silently select the wrong audio.
        start_seconds = max(end_seconds - 5, 0)
        start_index = SR * start_seconds
        end_index = SR * end_seconds
        y = self.clip[start_index:end_index].astype(np.float32)
        y = np.tile(y, (2,))
        return y, row_id, site

In [12]:
def init_layer(layer):
    """Xavier-uniform initialise ``layer.weight`` and zero its bias, if any.

    Intended for Linear / Convolutional layers. A missing ``bias`` attribute
    or a ``bias`` of ``None`` is tolerated and simply skipped.
    """
    nn.init.xavier_uniform_(layer.weight)

    bias = getattr(layer, 'bias', None)
    if bias is not None:
        bias.data.fill_(0.)
            
    
def init_bn(bn):
    """Reset a batch-norm layer's affine parameters: bias to 0, weight to 1."""
    for tensor, value in ((bn.bias, 0.), (bn.weight, 1.)):
        tensor.data.fill_(value)
def _resnet_conv3x3(in_planes, out_planes):
    """Build a bias-free 3x3 convolution (stride 1, padding 1)."""
    conv_options = dict(kernel_size=3, stride=1, padding=1,
                        groups=1, bias=False, dilation=1)
    return nn.Conv2d(in_planes, out_planes, **conv_options)


def _resnet_conv1x1(in_planes, out_planes):
    """Build a bias-free 1x1 convolution with stride 1."""
    conv_options = dict(kernel_size=1, stride=1, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_options)
class ConvBlock(nn.Module):
    """Two 3x3 conv -> batch-norm -> ReLU stages followed by a 2-D pooling step."""

    def __init__(self, in_channels, out_channels):
        super(ConvBlock, self).__init__()

        # Both convolutions share their geometry; only the input width differs.
        conv_options = dict(kernel_size=(3, 3), stride=(1, 1),
                            padding=(1, 1), bias=False)
        self.conv1 = nn.Conv2d(in_channels=in_channels,
                               out_channels=out_channels, **conv_options)
        self.conv2 = nn.Conv2d(in_channels=out_channels,
                               out_channels=out_channels, **conv_options)

        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.init_weight()

    def init_weight(self):
        """Re-initialise both convolutions, then both batch-norm layers."""
        for conv in (self.conv1, self.conv2):
            init_layer(conv)
        for norm in (self.bn1, self.bn2):
            init_bn(norm)

    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
        """Apply both conv stages, then pool.

        Args:
            input: 4-D tensor accepted by ``nn.Conv2d``.
            pool_size: Kernel size of the pooling step.
            pool_type: ``'max'``, ``'avg'`` or ``'avg+max'`` (sum of both).

        Raises:
            Exception: If ``pool_type`` is none of the supported values.
        """
        features = F.relu_(self.bn1(self.conv1(input)))
        features = F.relu_(self.bn2(self.conv2(features)))

        if pool_type == 'avg':
            return F.avg_pool2d(features, kernel_size=pool_size)
        if pool_type == 'max':
            return F.max_pool2d(features, kernel_size=pool_size)
        if pool_type == 'avg+max':
            averaged = F.avg_pool2d(features, kernel_size=pool_size)
            peaked = F.max_pool2d(features, kernel_size=pool_size)
            return averaged + peaked
        raise Exception('Incorrect argument!')
class _ResnetBasicBlock(nn.Module):
    """Residual block of two 3x3 convolutions with an optional shortcut projection.

    When ``stride == 2`` the input is average-pooled (2x2) before the first
    convolution; the convolutions themselves always use stride 1.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(_ResnetBasicBlock, self).__init__()
        make_norm = nn.BatchNorm2d if norm_layer is None else norm_layer
        if not (groups == 1 and base_width == 64):
            raise ValueError('_ResnetBasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in _ResnetBasicBlock")

        # `stride` is only consulted in forward() to decide on the pre-pooling.
        self.stride = stride

        self.conv1 = _resnet_conv3x3(inplanes, planes)
        self.bn1 = make_norm(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = _resnet_conv3x3(planes, planes)
        self.bn2 = make_norm(planes)
        self.downsample = downsample

        self.init_weights()

    def init_weights(self):
        """Initialise conv/bn pairs in order, then zero the last batch-norm scale."""
        for conv, norm in ((self.conv1, self.bn1), (self.conv2, self.bn2)):
            init_layer(conv)
            init_bn(norm)
        # A zero scale on bn2 makes the residual branch start out as all zeros.
        nn.init.constant_(self.bn2.weight, 0)

    def forward(self, x):
        shortcut = x

        out = F.avg_pool2d(x, kernel_size=(2, 2)) if self.stride == 2 else x

        out = self.relu(self.bn1(self.conv1(out)))
        out = F.dropout(out, p=0.1, training=self.training)
        out = self.bn2(self.conv2(out))

        if self.downsample is not None:
            shortcut = self.downsample(shortcut)

        out += shortcut
        return self.relu(out)

class _ResNet(nn.Module):
    """Stack of four residual stages with 64, 128, 256 and 512 output planes.

    ``zero_init_residual`` is accepted for signature compatibility but is not
    read anywhere in this class.
    """

    def __init__(self, block, layers, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None):
        super(_ResNet, self).__init__()

        self._norm_layer = nn.BatchNorm2d if norm_layer is None else norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # One flag per strided stage (layer2..layer4): True swaps the
            # 2x2 stride for a dilated convolution.
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group

        self.layer1 = self._make_layer(block, 64, layers[0], stride=1)
        # Remaining stages: (attribute name, planes) paired with their dilate flag.
        strided_stages = (("layer2", 128), ("layer3", 256), ("layer4", 512))
        for position, (attr_name, planes) in enumerate(strided_stages):
            stage = self._make_layer(block, planes, layers[position + 1], stride=2,
                                     dilate=replace_stride_with_dilation[position])
            setattr(self, attr_name, stage)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        """Build one stage of ``blocks`` residual blocks producing ``planes`` channels."""
        norm_layer = self._norm_layer
        out_planes = planes * block.expansion
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1

        downsample = None
        needs_projection = stride != 1 or self.inplanes != out_planes
        if needs_projection and stride in (1, 2):
            # Shortcut projection: optional 2x2 average pooling (stride 2),
            # then a 1x1 convolution and a normalisation layer.
            stages = [nn.AvgPool2d(kernel_size=2)] if stride == 2 else []
            projection = _resnet_conv1x1(self.inplanes, out_planes)
            projection_norm = norm_layer(out_planes)
            downsample = nn.Sequential(*stages, projection, projection_norm)
            init_layer(projection)
            init_bn(projection_norm)

        stage_blocks = [block(self.inplanes, planes, stride, downsample, self.groups,
                              self.base_width, previous_dilation, norm_layer)]
        self.inplanes = out_planes
        for _ in range(1, blocks):
            stage_blocks.append(block(self.inplanes, planes, groups=self.groups,
                                      base_width=self.base_width, dilation=self.dilation,
                                      norm_layer=norm_layer))

        return nn.Sequential(*stage_blocks)

    def forward(self, x):
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        return x
class ResNet38(nn.Module):
    """Log-mel spectrogram front end followed by a residual CNN and two linear heads.

    The waveform is converted to a log-mel spectrogram, passed through one
    ConvBlock, the `_ResNet` stack ([3, 4, 6, 3] basic blocks), a second
    ConvBlock widening to 2048 channels, pooled over frequency and time, and
    finally through `fc1` and `fc_audioset`.
    """
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
        
        super(ResNet38, self).__init__()

        # Fixed front-end settings forwarded to the torchlibrosa extractors below.
        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, 
            win_length=window_size, window=window, center=center, pad_mode=pad_mode, 
            freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, 
            n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, 
            freeze_parameters=True)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, 
            freq_drop_width=8, freq_stripes_num=2)

        # Applied in forward() with the last axis moved to the channel position,
        # so that axis must have size 64 (i.e. mel_bins == 64, going by the
        # shape comments in forward()).
        self.bn0 = nn.BatchNorm2d(64)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        # self.conv_block2 = ConvBlock(in_channels=64, out_channels=64)

        # NOTE(review): `zero_init_residual` is accepted by _ResNet.__init__ but
        # not read in the _ResNet code visible in this notebook.
        self.resnet = _ResNet(block=_ResnetBasicBlock, layers=[3, 4, 6, 3], zero_init_residual=True)

        self.conv_block_after1 = ConvBlock(in_channels=512, out_channels=2048)

        self.fc1 = nn.Linear(2048, 2048)
        self.fc_audioset = nn.Linear(2048, classes_num, bias=True)

        self.init_weights()

    def init_weights(self):
        """Initialise the input batch-norm and both linear heads."""
        init_bn(self.bn0)
        init_layer(self.fc1)
        init_layer(self.fc_audioset)


    def forward(self, input, mixup_lambda=None):
        """
        Input: (batch_size, data_length)

        Returns a dict with 'clipwise_output' (sigmoid of `fc_audioset`) and
        'embedding' (the `fc1` activations after dropout)."""

        x = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)
        
        # Swap axes 1 and 3 so bn0 normalises per mel bin, then swap back.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)
        
        if self.training:
            x = self.spec_augmenter(x)

        # Mixup on spectrogram
        # NOTE(review): `do_mixup` is not defined in the part of the notebook
        # visible here — confirm it is defined before training-mode use.
        if self.training and mixup_lambda is not None:
            x = do_mixup(x, mixup_lambda)
        
        x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training, inplace=True)
        x = self.resnet(x)
        x = F.avg_pool2d(x, kernel_size=(2, 2))
        x = F.dropout(x, p=0.2, training=self.training, inplace=True)
        x = self.conv_block_after1(x, pool_size=(1, 1), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training, inplace=True)
        # Average over the last axis, then combine max and mean over the
        # remaining non-channel axis into one 2048-d vector per clip.
        x = torch.mean(x, dim=3)
        
        (x1, _) = torch.max(x, dim=2)
        x2 = torch.mean(x, dim=2)
        x = x1 + x2
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu_(self.fc1(x))
        # The classifier below reads `x` (pre-dropout); only the returned
        # embedding has dropout applied.
        embedding = F.dropout(x, p=0.5, training=self.training)
        clipwise_output = torch.sigmoid(self.fc_audioset(x))
        
        output_dict = {'clipwise_output': clipwise_output, 'embedding': embedding}

        return output_dict

In [13]:
class ConvPreWavBlock(nn.Module):
    """Two 1-D conv -> batch-norm -> ReLU stages followed by 1-D max pooling.

    The second convolution is dilated (dilation 2, padding 2); both use a
    kernel of 3 and stride 1, so only the pooling changes the sequence length.
    """

    def __init__(self, in_channels, out_channels):
        super(ConvPreWavBlock, self).__init__()

        shared_options = dict(out_channels=out_channels, kernel_size=3,
                              stride=1, bias=False)
        self.conv1 = nn.Conv1d(in_channels=in_channels, padding=1,
                               **shared_options)
        self.conv2 = nn.Conv1d(in_channels=out_channels, dilation=2, padding=2,
                               **shared_options)

        self.bn1 = nn.BatchNorm1d(out_channels)
        self.bn2 = nn.BatchNorm1d(out_channels)

        self.init_weight()

    def init_weight(self):
        """Re-initialise both convolutions, then both batch-norm layers."""
        for conv in (self.conv1, self.conv2):
            init_layer(conv)
        for norm in (self.bn1, self.bn2):
            init_bn(norm)

    def forward(self, input, pool_size):
        """Run both conv stages and max-pool the result with ``pool_size``."""
        hidden = F.relu_(self.bn1(self.conv1(input)))
        hidden = F.relu_(self.bn2(self.conv2(hidden)))
        return F.max_pool1d(hidden, kernel_size=pool_size)
    
class Wavegram_Logmel_Cnn14(nn.Module):
    """CNN combining a learned waveform branch ("Wavegram") with a log-mel branch.

    The Wavegram branch applies strided / pooled 1-D convolutions to the raw
    waveform and reshapes the result into a 2-D feature map; the log-mel
    branch computes a spectrogram. Both are concatenated along the channel
    axis and processed by five further ConvBlocks and two linear heads.
    """
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, 
        fmax, classes_num):
        
        super(Wavegram_Logmel_Cnn14, self).__init__()

        # Fixed front-end settings forwarded to the torchlibrosa extractors below.
        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None

        # Wavegram branch: strided 1-D conv, three pooled 1-D blocks, one 2-D block.
        self.pre_conv0 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=11, stride=5, padding=5, bias=False)
        self.pre_bn0 = nn.BatchNorm1d(64)
        self.pre_block1 = ConvPreWavBlock(64, 64)
        self.pre_block2 = ConvPreWavBlock(64, 128)
        self.pre_block3 = ConvPreWavBlock(128, 128)
        # in_channels=4: forward() reshapes the 128 channels into 4 groups of 32.
        self.pre_block4 = ConvBlock(in_channels=4, out_channels=64)

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, 
            win_length=window_size, window=window, center=center, pad_mode=pad_mode, 
            freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, 
            n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, 
            freeze_parameters=True)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, 
            freq_drop_width=8, freq_stripes_num=2)

        # Applied in forward() with the last axis moved to the channel position,
        # so that axis must have size 64 (i.e. mel_bins == 64, going by the
        # shape comments in forward()).
        self.bn0 = nn.BatchNorm2d(64)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        # in_channels=128: 64 log-mel channels + 64 Wavegram channels after concatenation.
        self.conv_block2 = ConvBlock(in_channels=128, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)

        self.fc1 = nn.Linear(2048, 2048, bias=True)
        self.fc_audioset = nn.Linear(2048, classes_num, bias=True)
        
        self.init_weight()

    def init_weight(self):
        """Initialise the layers not already initialised by their own block classes."""
        init_layer(self.pre_conv0)
        init_bn(self.pre_bn0)
        init_bn(self.bn0)
        init_layer(self.fc1)
        init_layer(self.fc_audioset)
 
    def forward(self, input, mixup_lambda=None):
        """
        Input: (batch_size, data_length)

        Returns a dict with 'clipwise_output' (sigmoid of `fc_audioset`) and
        'embedding' (the `fc1` activations after dropout)."""

        # Wavegram
        a1 = F.relu_(self.pre_bn0(self.pre_conv0(input[:, None, :])))
        a1 = self.pre_block1(a1, pool_size=4)
        a1 = self.pre_block2(a1, pool_size=4)
        a1 = self.pre_block3(a1, pool_size=4)
        # Split the channel axis into groups of 32 and move the sequence axis
        # in front of it, giving a 4-D map that a 2-D ConvBlock can consume.
        a1 = a1.reshape((a1.shape[0], -1, 32, a1.shape[-1])).transpose(2, 3)
        a1 = self.pre_block4(a1, pool_size=(2, 1))

        # Log mel spectrogram
        x = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)
        
        # Swap axes 1 and 3 so bn0 normalises per mel bin, then swap back.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        if self.training:
            x = self.spec_augmenter(x)

        # Mixup on spectrogram
        # NOTE(review): `do_mixup` is not defined in the part of the notebook
        # visible here — confirm it is defined before training-mode use.
        if self.training and mixup_lambda is not None:
            x = do_mixup(x, mixup_lambda)
            a1 = do_mixup(a1, mixup_lambda)
        
        x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')

        # Concatenate Wavegram and Log mel spectrogram along the channel dimension
        x = torch.cat((x, a1), dim=1)

        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        # Average over the last axis, then combine max and mean over the
        # remaining non-channel axis into one 2048-d vector per clip.
        x = torch.mean(x, dim=3)
        
        (x1, _) = torch.max(x, dim=2)
        x2 = torch.mean(x, dim=2)
        x = x1 + x2
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu_(self.fc1(x))
        # The classifier below reads `x` (pre-dropout); only the returned
        # embedding has dropout applied.
        embedding = F.dropout(x, p=0.5, training=self.training)
        clipwise_output = torch.sigmoid(self.fc_audioset(x))
        
        output_dict = {'clipwise_output': clipwise_output, 'embedding': embedding}

        return output_dict

In [14]:
def interpolate(x: torch.Tensor, ratio: int):
    """Upsample along the time axis by repeating every frame ``ratio`` times.

    Used to undo the temporal resolution lost to the CNN's downsampling.

    Args:
      x: (batch_size, time_steps, classes_num)
      ratio: int, number of copies made of each time step
    Returns:
      upsampled: (batch_size, time_steps * ratio, classes_num)
    """
    batch_size, time_steps, classes_num = x.shape
    # Insert a helper axis after time, copy along it, then fold it back into time.
    copies = x.unsqueeze(2).repeat(1, 1, ratio, 1)
    return copies.reshape(batch_size, time_steps * ratio, classes_num)


def pad_framewise_output(framewise_output: torch.Tensor, frames_num: int):
    """Extend ``framewise_output`` along time to ``frames_num`` frames.

    The missing frames are filled with copies of the last available frame.

    Args:
      framewise_output: (batch_size, frames_num, classes_num)
      frames_num: int, target number of frames after padding
    Outputs:
      output: (batch_size, frames_num, classes_num)
    """
    missing_frames = frames_num - framewise_output.shape[1]

    # Tensor used for padding: the final frame repeated once per missing frame.
    last_frame = framewise_output[:, -1:, :]
    tail = last_frame.repeat(1, missing_frames, 1)

    # (batch_size, frames_num, classes_num)
    return torch.cat((framewise_output, tail), dim=1)

class AttBlock(nn.Module):
    """Attention pooling head over the time axis.

    Two parallel 1x1 convolutions produce attention logits (``att``) and
    per-time-step class scores (``cla``). The logits are clamped to
    [-10, 10] and soft-maxed over time; the clip-level output is the
    attention-weighted sum of the class scores.

    Args:
        in_features: Number of input channels.
        out_features: Number of output classes.
        activation: ``'linear'`` or ``'sigmoid'``; applied to the class scores.
        temperature: Stored on the instance; not used by ``forward``.
    """

    def __init__(self,
                 in_features: int,
                 out_features: int,
                 activation="linear",
                 temperature=1.0):
        super().__init__()

        self.activation = activation
        self.temperature = temperature
        self.att = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)
        self.cla = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)

        # Not applied in forward(); kept so the module's parameters and
        # state_dict keys stay unchanged.
        self.bn_att = nn.BatchNorm1d(out_features)
        self.init_weights()

    def init_weights(self):
        """Initialise both 1x1 convolutions and the batch-norm layer."""
        init_layer(self.att)
        init_layer(self.cla)
        init_bn(self.bn_att)

    def forward(self, x):
        """Pool ``x`` over time.

        Args:
            x: (n_samples, n_in, n_time)

        Returns:
            Tuple ``(pooled, norm_att, cla)``: the attention-weighted sum over
            time, the attention weights, and the per-time-step class scores.
        """
        # x: (n_samples, n_in, n_time)
        norm_att = torch.softmax(torch.clamp(self.att(x), -10, 10), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla

    def nonlinear_transform(self, x):
        """Apply the configured activation to ``x``.

        Raises:
            ValueError: If ``self.activation`` is neither ``'linear'`` nor
                ``'sigmoid'``. Without this check the method would return
                ``None`` and fail later with an unrelated ``TypeError``.
        """
        if self.activation == 'linear':
            return x
        elif self.activation == 'sigmoid':
            return torch.sigmoid(x)
        raise ValueError(
            "Unsupported activation {!r}; expected 'linear' or 'sigmoid'".format(
                self.activation))

In [15]:
class Transfer_Cnn14(nn.Module):
    """Wavegram-Logmel-CNN14 backbone with an attention head for a new label set.

    The backbone is built with 527 output classes so that a pretrained
    checkpoint can be loaded into it; `fc1` and `att_block` form the
    task-specific head producing `classes_num` outputs.
    """

    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
        super(Transfer_Cnn14, self).__init__()
        audioset_classes_num = 527
        
        self.base = Wavegram_Logmel_Cnn14(sample_rate, window_size, 
                                          hop_size, mel_bins, fmin,
                                          fmax, audioset_classes_num)

        # Transfer to another task layer
        self.fc1 = nn.Linear(2048, 2048, bias=True)
        self.att_block = AttBlock(2048, classes_num, activation='sigmoid')
        # Factor by which segment-level outputs are repeated in forward().
        self.interpolate_ratio = 32
        self.init_weight()

    def init_weight(self):
        """Initialise the task-specific linear layer."""
        init_layer(self.fc1)
        
    def load_from_pretrain(self, pretrained_checkpoint_path):
        """Load backbone weights from a checkpoint holding a ``'model'`` state dict.

        Args:
            pretrained_checkpoint_path: Path of the checkpoint file.
        """
        # Deserialize onto the CPU so checkpoints saved from a GPU also load on
        # CPU-only hosts; load_state_dict then copies the values onto whatever
        # device the backbone's parameters live on.
        checkpoint = torch.load(pretrained_checkpoint_path, map_location='cpu')
        self.base.load_state_dict(checkpoint['model'])

    def forward(self, input, mixup_lambda=None):
        """Input: (batch_size, data_length)

        Returns a dict with 'framewise_output' and 'clipwise_output'.
        """
        base_output = self.base(input, mixup_lambda)
        # NOTE(review): the code below needs a 3-D 'embedding' (it is pooled
        # with 1-D kernels and transposed on axes 1/2) plus a 'frames_num'
        # entry. The Wavegram_Logmel_Cnn14.forward defined earlier in this
        # notebook returns only 'clipwise_output' and a 2-D 'embedding' —
        # confirm which backbone definition is in effect when this runs.
        x = base_output['embedding']
        frames_num = base_output['frames_num']
        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = x1 + x2

        x = F.dropout(x, p=0.5, training=self.training)
        # nn.Linear acts on the last axis, so move channels there and back.
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)
        (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)
        segmentwise_output = segmentwise_output.transpose(1, 2)

        # Get framewise output
        framewise_output = interpolate(segmentwise_output,
                                       self.interpolate_ratio)
        framewise_output = pad_framewise_output(framewise_output, frames_num)

        output_dict = {
            'framewise_output': framewise_output,
            'clipwise_output': clipwise_output
        }

        return output_dict

In [16]:
# NOTE: `init_layer` is also defined in an earlier cell of this notebook;
# this later definition shadows that one once the cell has run.
def init_layer(layer):
    """Xavier-uniform initialise ``layer.weight`` and zero its bias, if any.

    Intended for Linear / Convolutional layers. A missing ``bias`` attribute
    or a ``bias`` of ``None`` is tolerated and simply skipped.
    """
    nn.init.xavier_uniform_(layer.weight)

    bias = getattr(layer, 'bias', None)
    if bias is not None:
        bias.data.fill_(0.)
            
    
# NOTE: `init_bn` is also defined in an earlier cell of this notebook;
# this later definition shadows that one once the cell has run.
def init_bn(bn):
    """Reset a batch-norm layer's affine parameters: bias to 0, weight to 1."""
    for tensor, value in ((bn.bias, 0.), (bn.weight, 1.)):
        tensor.data.fill_(value)

class ConvBlock(nn.Module):
    """Two 3x3 convolution + batch-norm + ReLU stages followed by 2-D pooling."""

    def __init__(self, in_channels, out_channels):
        """Create the two bias-free 3x3 convolutions and their batch norms.

        Args:
            in_channels: Channels of the block input.
            out_channels: Channels produced by both convolutions.
        """
        super(ConvBlock, self).__init__()

        # Both convolutions share the same geometry: 3x3, stride 1, padding 1.
        conv_kwargs = dict(kernel_size=(3, 3), stride=(1, 1),
                           padding=(1, 1), bias=False)
        self.conv1 = nn.Conv2d(in_channels=in_channels,
                               out_channels=out_channels, **conv_kwargs)
        self.conv2 = nn.Conv2d(in_channels=out_channels,
                               out_channels=out_channels, **conv_kwargs)

        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.init_weight()

    def init_weight(self):
        """Apply ``init_layer`` to both convolutions and ``init_bn`` to both norms."""
        for conv in (self.conv1, self.conv2):
            init_layer(conv)
        for norm in (self.bn1, self.bn2):
            init_bn(norm)

    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
        """Apply conv -> bn -> ReLU twice, then pool.

        Args:
            input: Tensor fed to the first convolution.
            pool_size: Kernel size handed to the pooling function(s).
            pool_type: ``'avg'``, ``'max'`` or ``'avg+max'`` (sum of both).

        Raises:
            Exception: If ``pool_type`` is none of the three supported values.
        """
        hidden = F.relu_(self.bn1(self.conv1(input)))
        hidden = F.relu_(self.bn2(self.conv2(hidden)))

        if pool_type == 'avg':
            return F.avg_pool2d(hidden, kernel_size=pool_size)
        if pool_type == 'max':
            return F.max_pool2d(hidden, kernel_size=pool_size)
        if pool_type == 'avg+max':
            averaged = F.avg_pool2d(hidden, kernel_size=pool_size)
            maxed = F.max_pool2d(hidden, kernel_size=pool_size)
            return averaged + maxed
        raise Exception('Incorrect argument!')
    
class Cnn14(nn.Module):
    """Six-ConvBlock CNN over log-mel features computed from raw waveforms.

    ``forward`` returns clip-level sigmoid scores and the ``fc1`` embedding.
    """

    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, 
        fmax, classes_num):
        """Build the feature extractors, the convolutional trunk and the head.

        Args:
            sample_rate: Passed to ``LogmelFilterBank`` as ``sr``.
            window_size: Used as both ``n_fft`` and ``win_length``.
            hop_size: Passed to ``Spectrogram`` as ``hop_length``.
            mel_bins: Number of mel bands; also the channel count of ``bn0``.
            fmin: Lower frequency bound of the mel filter bank.
            fmax: Upper frequency bound of the mel filter bank.
            classes_num: Output size of the ``fc_audioset`` layer.
        """
        super(Cnn14, self).__init__()

        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, 
            win_length=window_size, window=window, center=center, pad_mode=pad_mode, 
            freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, 
            n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, 
            freeze_parameters=True)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, 
            freq_drop_width=8, freq_stripes_num=2)

        # bn0 normalizes over the mel axis (forward moves it to dim 1), so its
        # channel count must follow mel_bins rather than a fixed 64.
        self.bn0 = nn.BatchNorm2d(mel_bins)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)

        self.fc1 = nn.Linear(2048, 2048, bias=True)
        self.fc_audioset = nn.Linear(2048, classes_num, bias=True)
        
        self.init_weight()

    def init_weight(self):
        """Initialize ``bn0`` and the two fully connected layers."""
        init_bn(self.bn0)
        init_layer(self.fc1)
        init_layer(self.fc_audioset)
 
    def forward(self, input, mixup_lambda=None):
        """Compute clip-level scores and the embedding for a batch of waveforms.

        Input: (batch_size, data_length)

        Args:
            input: Waveform batch fed to the spectrogram extractor.
            mixup_lambda: Optional mixup coefficients; only applied in
                training mode.

        Returns:
            dict with ``'clipwise_output'`` (sigmoid of ``fc_audioset``) and
            ``'embedding'`` (``fc1`` activations after dropout).
        """
        x = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)

        # Swap dims 1 and 3 so BatchNorm2d treats the mel axis as channels,
        # then swap back.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)
        
        if self.training:
            x = self.spec_augmenter(x)

        # Mixup on spectrogram
        if self.training and mixup_lambda is not None:
            x = do_mixup(x, mixup_lambda)

        # Five blocks halve both spatial axes; the last one keeps the resolution.
        trunk = (
            (self.conv_block1, (2, 2)),
            (self.conv_block2, (2, 2)),
            (self.conv_block3, (2, 2)),
            (self.conv_block4, (2, 2)),
            (self.conv_block5, (2, 2)),
            (self.conv_block6, (1, 1)),
        )
        for block, pool_size in trunk:
            x = block(x, pool_size=pool_size, pool_type='avg')
            x = F.dropout(x, p=0.2, training=self.training)

        # Collapse dim 3 by mean, then dim 2 by max + mean.
        x = torch.mean(x, dim=3)
        (x1, _) = torch.max(x, dim=2)
        x2 = torch.mean(x, dim=2)
        x = x1 + x2

        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu_(self.fc1(x))
        embedding = F.dropout(x, p=0.5, training=self.training)
        # The classifier reads the pre-dropout activations, not `embedding`.
        clipwise_output = torch.sigmoid(self.fc_audioset(x))
        
        output_dict = {'clipwise_output': clipwise_output, 'embedding': embedding}

        return output_dict


In [17]:
def get_model(weight_path=None):
    """Build a ResNet38 in eval mode, optionally restoring trained weights.

    Args:
        weight_path: Optional checkpoint path. When truthy, the file is read
            with ``torch.load`` and its ``'model_state_dict'`` entry is loaded
            into the model.

    Returns:
        The model in eval mode, on the GPU when CUDA is available and on the
        CPU otherwise.
    """
    model_config = {
        "sample_rate": 32000,
        "window_size": 1024,
        "hop_size": 320,
        "mel_bins": 64,
        "fmin": 50,
        "fmax": 14000,
        "classes_num":264
        }

    model = ResNet38(**model_config)
    if weight_path:
        print("load pretrain weight: {}".format(weight_path))
        weights = torch.load(weight_path, map_location=torch.device('cpu'))
        model.load_state_dict(weights['model_state_dict'])

    # Same fallback as prediction_for_clip: an unconditional .cuda() would
    # raise on a CPU-only host.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    
    return model

In [18]:
def get_model_cnn14(weight_path=None):
    """Build a Cnn14 in eval mode, optionally restoring trained weights.

    Args:
        weight_path: Optional checkpoint path. When truthy, the file is read
            with ``torch.load`` and its ``'model_state_dict'`` entry is loaded
            into the model.

    Returns:
        The model in eval mode, on the GPU when CUDA is available and on the
        CPU otherwise.
    """
    model_config = {
        "sample_rate": 32000,
        "window_size": 1024,
        "hop_size": 320,
        "mel_bins": 64,
        "fmin": 50,
        "fmax": 14000,
        "classes_num":264
        }

    model = Cnn14(**model_config)
    if weight_path:
        print("load pretrain weight: {}".format(weight_path))
        weights = torch.load(weight_path, map_location=torch.device('cpu'))
        model.load_state_dict(weights['model_state_dict'])

    # Same fallback as prediction_for_clip: an unconditional .cuda() would
    # raise on a CPU-only host.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    
    return model

## Prediction loop

In [19]:
# Per-model blending weights, in the order the checkpoints are loaded by
# prediction().
_DEFAULT_ENSEMBLE_WEIGHTS = (0.045, 0.05, 0.12, 0.13, 0.035,
                             0.09, 0.08, 0.21, 0.19, 0.05)


def _blend_clipwise_output(models, weights, batch):
    """Return the weighted sum of each model's ``clipwise_output`` for ``batch``.

    All models are evaluated first; the weighted terms are then added left to
    right in model order.
    """
    with torch.no_grad():
        outputs = [
            model(batch)['clipwise_output'].detach().cpu().numpy()
            for model in models
        ]
    blended = outputs[0] * weights[0]
    for output, weight in zip(outputs[1:], weights[1:]):
        blended = blended + output * weight
    return blended


def prediction_for_clip(test_df: pd.DataFrame, 
                        clip: np.ndarray, 
                        models, 
                        threshold=0.5,
                        weights=None):
    """Blend the ensemble's clip-level probabilities for every row of one clip.

    Args:
        test_df: Rows belonging to a single audio file; handed to
            ``TestDataset`` (defined elsewhere in the notebook).
        clip: Waveform of that audio file, handed to ``TestDataset``.
        models: Sequence of models whose output dict has a
            ``'clipwise_output'`` entry. Only the first ``len(weights)``
            models are used.
        threshold: Unused; kept so existing callers keep working.
        weights: Optional per-model blending weights. Defaults to the ten
            hard-coded ensemble weights.

    Returns:
        dict mapping each ``row_id`` to a numpy array of blended
        probabilities. For ``site_1``/``site_2`` rows the array is flattened
        to 1-D; for other sites it is the unflattened blend of one batch.

    Raises:
        ValueError: If ``weights`` is empty.
        IndexError: If fewer models than weights are supplied.
    """
    weights = tuple(_DEFAULT_ENSEMBLE_WEIGHTS if weights is None else weights)
    if not weights:
        raise ValueError("weights must not be empty")
    if len(models) < len(weights):
        raise IndexError("expected at least {} models, got {}".format(
            len(weights), len(models)))
    active_models = [models[idx] for idx in range(len(weights))]

    dataset = TestDataset(df=test_df, clip=clip)
    loader = data.DataLoader(dataset, batch_size=1, shuffle=False)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for model in active_models:
        model.eval()

    prediction_dict = {}
    for image, row_id, site in progress_bar(loader):
        site = site[0]
        row_id = row_id[0]
        image = image.to(device).float()
        if site in {"site_1", "site_2"}:
            proba = _blend_clipwise_output(
                active_models, weights, image).reshape(-1)
        else:
            # to avoid prediction on large batch
            segments = image.squeeze(0)
            batch_size = 16
            # NOTE(review): `proba` is overwritten on every pass, so only the
            # LAST batch of segments is stored for the row, and a row with no
            # segments leaves `proba` unset (or stale from the previous row).
            # Kept as-is because the code consuming this dict is not visible
            # here -- confirm whether the batches should be concatenated.
            for start in range(0, segments.size(0), batch_size):
                batch = segments[start:start + batch_size]
                if batch.ndim == 3:
                    batch = batch.unsqueeze(0)
                proba = _blend_clipwise_output(
                    active_models, weights, batch.to(device))

        prediction_dict[row_id] = proba

    return prediction_dict

In [20]:
def prediction(test_df: pd.DataFrame,
               test_audio: Path,
               weight_path: Path,
               target_sr: int,
               threshold=0.5):
    """Run the PyTorch ensemble over every audio file referenced by ``test_df``.

    Args:
        test_df: Test metadata with an ``audio_id`` column.
        test_audio: Directory that holds ``<audio_id>.mp3`` files.
        weight_path: Indexable collection of checkpoint paths. Entries 0-4 are
            loaded with ``get_model`` and entries 5-9 with ``get_model_cnn14``.
        target_sr: Sample rate the audio is resampled to on load.
        threshold: Forwarded to ``prediction_for_clip``.

    Returns:
        DataFrame with ``row_id`` and ``birds`` columns, where ``birds`` holds
        the values returned per row by ``prediction_for_clip``.
    """
    # One loader per checkpoint slot, in checkpoint order.
    loaders = [get_model] * 5 + [get_model_cnn14] * 5
    models_resnets = []
    for slot, load_model in enumerate(loaders):
        models_resnets.append(load_model(weight_path[slot]))

    audio_ids = test_df.audio_id.unique()
    warnings.filterwarnings("ignore")

    per_audio_frames = []
    for audio_id in audio_ids:
        with timer(f"Loading {audio_id}"):
            clip, _ = librosa.load(test_audio / (audio_id + ".mp3"),
                                   sr=target_sr,
                                   mono=True,
                                   res_type="kaiser_fast")

        rows_for_audio = test_df.query(
            f"audio_id == '{audio_id}'").reset_index(drop=True)
        with timer(f"Prediction on {audio_id}"):
            prediction_dict = prediction_for_clip(rows_for_audio,
                                                  clip=clip,
                                                  models=models_resnets,
                                                  threshold=threshold)

        per_audio_frames.append(pd.DataFrame({
            "row_id": list(prediction_dict.keys()),
            "birds": list(prediction_dict.values())
        }))

    return pd.concat(per_audio_frames, axis=0, sort=False).reset_index(drop=True)

## Prediction

In [21]:
# Run the ten-checkpoint PyTorch ensemble over the test set. prediction() loads
# the first five paths with get_model and the last five with get_model_cnn14;
# the trailing comment on each path is the cv score noted by the author.
# NOTE(review): threshold is forwarded to prediction_for_clip, whose body never
# reads it, so 0.56 has no effect on the returned probabilities.
submission_torch = prediction(test_df=test,
                           test_audio=TEST_AUDIO_DIR,
                           weight_path=["../input/resnest-38-fold-0-epochs-80/panns/best-acc-checkpoint.bin",# cv 0.71340
                                        "../input/resnest-38-fold-1-85-epochs/panns/best-acc-checkpoint.bin",# cv 0.71453
                                        "../input/resnest-38-fold-2-85-epochs/panns/best-acc-checkpoint.bin",# cv 0.72323
                                        "../input/resnest-38-fold-3-85-epochs/panns/best-acc-checkpoint.bin",# cv 0.72410
                                        "../input/resnest-38-fold-4-85-epochs/panns/best-acc-checkpoint.bin",# cv 0.70925
                                        "../input/cnn14-fold-0-80-epochs/panns/best-acc-checkpoint.bin",# cv 0.72459
                                        "../input/cnn-14-80epochs/best-acc-checkpoint (2).bin", # cv 0.721
                                        "../input/cnn14-fold-2/panns/best-acc-checkpoint.bin",# cv 0.73896
                                        "../input/cnn14-fold3-40-epochs/panns/best-acc-checkpoint.bin",# cv 0.73809
                                        "../input/cnn14-80epochs-fold4/best-acc-checkpoint.bin" # cv 0.71574
                                       ],
                           target_sr=32000,
                           threshold=0.56)
# submission.to_csv("submission.csv", index=False)

load pretrain weight: ../input/resnest-38-fold-0-epochs-80/panns/best-acc-checkpoint.bin
load pretrain weight: ../input/resnest-38-fold-1-85-epochs/panns/best-acc-checkpoint.bin
load pretrain weight: ../input/resnest-38-fold-2-85-epochs/panns/best-acc-checkpoint.bin
load pretrain weight: ../input/resnest-38-fold-3-85-epochs/panns/best-acc-checkpoint.bin
load pretrain weight: ../input/resnest-38-fold-4-85-epochs/panns/best-acc-checkpoint.bin
load pretrain weight: ../input/cnn14-fold-0-80-epochs/panns/best-acc-checkpoint.bin
load pretrain weight: ../input/cnn-14-80epochs/best-acc-checkpoint (2).bin
load pretrain weight: ../input/cnn14-fold-2/panns/best-acc-checkpoint.bin
load pretrain weight: ../input/cnn14-fold3-40-epochs/panns/best-acc-checkpoint.bin
load pretrain weight: ../input/cnn14-80epochs-fold4/best-acc-checkpoint.bin
[Loading 41e6fe6504a34bf6846938ba78d13df1] start


2020-09-14 23:30:39,392 - INFO - NumExpr defaulting to 2 threads.


[Loading 41e6fe6504a34bf6846938ba78d13df1] done in 1.37 s
[Prediction on 41e6fe6504a34bf6846938ba78d13df1] start


[Prediction on 41e6fe6504a34bf6846938ba78d13df1] done in 1.98 s
[Loading cce64fffafed40f2b2f3d3413ec1c4c2] start
[Loading cce64fffafed40f2b2f3d3413ec1c4c2] done in 0.80 s
[Prediction on cce64fffafed40f2b2f3d3413ec1c4c2] start


[Prediction on cce64fffafed40f2b2f3d3413ec1c4c2] done in 0.83 s
[Loading 99af324c881246949408c0b1ae54271f] start
[Loading 99af324c881246949408c0b1ae54271f] done in 0.84 s
[Prediction on 99af324c881246949408c0b1ae54271f] start


[Prediction on 99af324c881246949408c0b1ae54271f] done in 0.82 s
[Loading 6ab74e177aa149468a39ca10beed6222] start
[Loading 6ab74e177aa149468a39ca10beed6222] done in 0.71 s
[Prediction on 6ab74e177aa149468a39ca10beed6222] start


[Prediction on 6ab74e177aa149468a39ca10beed6222] done in 0.66 s
[Loading b2fd3f01e9284293a1e33f9c811a2ed6] start
[Loading b2fd3f01e9284293a1e33f9c811a2ed6] done in 0.73 s
[Prediction on b2fd3f01e9284293a1e33f9c811a2ed6] start


[Prediction on b2fd3f01e9284293a1e33f9c811a2ed6] done in 0.76 s
[Loading de62b37ebba749d2abf29d4a493ea5d4] start
[Loading de62b37ebba749d2abf29d4a493ea5d4] done in 0.36 s
[Prediction on de62b37ebba749d2abf29d4a493ea5d4] start


[Prediction on de62b37ebba749d2abf29d4a493ea5d4] done in 0.12 s
[Loading 8680a8dd845d40f296246dbed0d37394] start
[Loading 8680a8dd845d40f296246dbed0d37394] done in 0.86 s
[Prediction on 8680a8dd845d40f296246dbed0d37394] start


[Prediction on 8680a8dd845d40f296246dbed0d37394] done in 0.97 s
[Loading 940d546e5eb745c9a74bce3f35efa1f9] start
[Loading 940d546e5eb745c9a74bce3f35efa1f9] done in 1.24 s
[Prediction on 940d546e5eb745c9a74bce3f35efa1f9] start


[Prediction on 940d546e5eb745c9a74bce3f35efa1f9] done in 1.51 s
[Loading 07ab324c602e4afab65ddbcc746c31b5] start
[Loading 07ab324c602e4afab65ddbcc746c31b5] done in 0.79 s
[Prediction on 07ab324c602e4afab65ddbcc746c31b5] start


[Prediction on 07ab324c602e4afab65ddbcc746c31b5] done in 0.61 s
[Loading 899616723a32409c996f6f3441646c2a] start
[Loading 899616723a32409c996f6f3441646c2a] done in 0.93 s
[Prediction on 899616723a32409c996f6f3441646c2a] start


[Prediction on 899616723a32409c996f6f3441646c2a] done in 1.10 s
[Loading 9cc5d9646f344f1bbb52640a988fe902] start
[Loading 9cc5d9646f344f1bbb52640a988fe902] done in 3.74 s
[Prediction on 9cc5d9646f344f1bbb52640a988fe902] start


[Prediction on 9cc5d9646f344f1bbb52640a988fe902] done in 4.61 s
[Loading a56e20a518684688a9952add8a9d5213] start
[Loading a56e20a518684688a9952add8a9d5213] done in 0.70 s
[Prediction on a56e20a518684688a9952add8a9d5213] start


[Prediction on a56e20a518684688a9952add8a9d5213] done in 1.43 s
[Loading 96779836288745728306903d54e264dd] start
[Loading 96779836288745728306903d54e264dd] done in 0.51 s
[Prediction on 96779836288745728306903d54e264dd] start


[Prediction on 96779836288745728306903d54e264dd] done in 1.03 s
[Loading f77783ba4c6641bc918b034a18c23e53] start
[Loading f77783ba4c6641bc918b034a18c23e53] done in 0.40 s
[Prediction on f77783ba4c6641bc918b034a18c23e53] start


[Prediction on f77783ba4c6641bc918b034a18c23e53] done in 0.79 s
[Loading 856b194b097441958697c2bcd1f63982] start
[Loading 856b194b097441958697c2bcd1f63982] done in 0.65 s
[Prediction on 856b194b097441958697c2bcd1f63982] start


[Prediction on 856b194b097441958697c2bcd1f63982] done in 0.44 s


In [22]:
def memory_cleanup():
    """Run the garbage collector, then release PyTorch's cached CUDA memory.

    Only unreachable objects are reclaimed: a tensor or model that is still
    referenced (for example through a global variable) stays alive. Drop such
    references before calling this function.
    """
    # The former loop over gc.get_objects() with `del obj` merely unbound the
    # loop variable; it released no tensor and probed every tracked object
    # along the way.
    gc.collect()
    torch.cuda.empty_cache()
    
memory_cleanup()

## TensorFlow (EfficientNet-B5) inference

In [23]:
def rescale(x):
    """Min-max scale ``x`` so its minimum maps to 0.

    When the value range exceeds 1e-8 the maximum maps to 1; otherwise the
    shifted values are divided by 1e-8 to avoid dividing by a (near-)zero
    range.
    """
    lowest = np.min(x)
    value_range = np.max(x) - lowest
    if value_range > 1e-8:
        return (x - lowest) / value_range
    return (x - lowest) / (1e-8)
    
def preEmphasis(signal, p=0.97):
    """Apply the first-order filter ``y[n] = x[n] - p * x[n-1]`` to ``signal``."""
    numerator = [1.0, -p]
    return scipy.signal.lfilter(numerator, 1, signal)

def mono_to_color(X: np.ndarray,
                  Y: np.ndarray,
                  Z: np.ndarray,
                  mean=None,
                  std=None,
                  norm_max=None,
                  norm_min=None,
                  eps=1e-6):
    """Stack three 2-D arrays into a 3-channel image scaled by 255.

    The channels are ordered ``[Z, Y, X]`` along the last axis and each one is
    passed through ``rescale`` independently before the multiplication.
    ``mean``, ``std``, ``norm_max``, ``norm_min`` and ``eps`` are accepted but
    not used.
    """
    stacked = np.stack([Z, Y, X], axis=-1)

    for channel in range(3):
        stacked[:, :, channel] = rescale(stacked[:, :, channel])

    return stacked * 255
   


class TestDatasetTF(data.Dataset):
    """Dataset yielding mel-spectrogram images for the TensorFlow models.

    Each item is ``(image_or_images, row_id, site)``. For ``site_3`` rows the
    whole clip is cut into consecutive 5-second segments and one image per
    full segment is returned; for any other site a single image is built from
    the 5 seconds that end at the row's ``seconds`` value.
    """

    SAMPLE_RATE = 32000
    SEGMENT_SECONDS = 5

    def __init__(self, df: pd.DataFrame, clip: np.ndarray,
                 img_size=313, melspectrogram_parameters=None):
        """Store the rows, the waveform and the mel-spectrogram settings.

        Args:
            df: Rows with ``site``, ``row_id`` and ``seconds`` columns,
                indexed 0..len(df)-1.
            clip: Waveform of the audio file the rows belong to.
            img_size: Stored on the instance; not read by this class.
            melspectrogram_parameters: Extra keyword arguments for
                ``librosa.feature.melspectrogram``; ``None`` means none.
        """
        self.df = df
        self.clip = clip
        self.img_size = img_size
        # None instead of a shared mutable `{}` default.
        self.melspectrogram_parameters = (
            {} if melspectrogram_parameters is None
            else melspectrogram_parameters)
        
    def __len__(self):
        return len(self.df)

    def _waveform_to_image(self, y):
        """Turn one waveform segment into a float32 3-channel image."""
        y = preEmphasis(y)
        peak = np.max(np.abs(y))
        # Silent audio has a zero peak; dividing by it would fill the image
        # with NaN, so leave an all-zero segment untouched.
        if peak > 0:
            y = y / peak
        # `y` is passed by keyword: newer librosa releases reject it as a
        # positional argument.
        melspec = librosa.feature.melspectrogram(
            y=y, sr=self.SAMPLE_RATE, **self.melspectrogram_parameters)
        melspec = librosa.power_to_db(melspec).astype(np.float32)
        image = mono_to_color(melspec, melspec, melspec)
        return (image / 255.0).astype(np.float32)
    
    def __getitem__(self, idx: int):
        """Return ``(image(s), row_id, site)`` for row ``idx``."""
        sample = self.df.loc[idx, :]
        site = sample.site
        row_id = sample.row_id
        segment_len = self.SAMPLE_RATE * self.SEGMENT_SECONDS
        
        if site == "site_3":
            y = self.clip.astype(np.float32)
            # Only full-length segments are used; a shorter tail is dropped.
            images = [
                self._waveform_to_image(
                    y[start:start + segment_len].astype(np.float32))
                for start in range(0, len(y) - segment_len + 1, segment_len)
            ]
            return np.asarray(images), row_id, site

        end_seconds = int(sample.seconds)
        start_seconds = int(end_seconds - self.SEGMENT_SECONDS)
        start_index = self.SAMPLE_RATE * start_seconds
        end_index = self.SAMPLE_RATE * end_seconds

        y = self.clip[start_index:end_index].astype(np.float32)
        return self._waveform_to_image(y), row_id, site

In [24]:
def round_filters(filters, width_coefficient, depth_divisor):
    """Scale a filter count by the width multiplier and snap it to the divisor.

    The scaled count is rounded to the nearest multiple of ``depth_divisor``
    (never below one multiple). If that rounding lost more than 10% of the
    scaled count, one extra multiple is added.
    """
    scaled = filters * width_coefficient
    snapped = int(scaled + depth_divisor / 2) // depth_divisor * depth_divisor
    if snapped < depth_divisor:
        snapped = depth_divisor
    # Make sure that round down does not go down by more than 10%.
    if snapped < 0.9 * scaled:
        snapped += depth_divisor
    return int(snapped)

# Keras initializer spec handed to the two 'keep_res' Conv2D layers created in
# build_model.
CONV_KERNEL_INITIALIZER = {
    'class_name': 'VarianceScaling',
    'config': {
        'scale': 2.0,
        'mode': 'fan_out',
        # EfficientNet actually uses an untruncated normal distribution for
        # initializing conv layers, but keras.initializers.VarianceScaling use
        # a truncated distribution.
        # We decided against a custom initializer for better serializability.
        'distribution': 'normal'
    }
}

# NOTE(review): these two values equal the (width_coefficient,
# depth_coefficient) pair listed for "efficientnet-b5" in efficientnet_params,
# so the name `depth_divisor` looks mislabeled. Neither global is read by the
# code in this part of the notebook -- confirm before relying on them.
width_coefficient = 1.6
depth_divisor = 2.2

def efficientnet_params(model_name):
    """Return the scaling parameters of a named EfficientNet variant.

    Returns:
        Tuple ``(width_coefficient, depth_coefficient, resolution,
        dropout_rate)``.

    Raises:
        KeyError: If ``model_name`` is not one of ``efficientnet-b0`` ..
            ``efficientnet-b7``.
    """
    # (name, width_coefficient, depth_coefficient, resolution, dropout_rate)
    variants = (
        ("efficientnet-b0", 1.0, 1.0, 224, 0.2),
        ("efficientnet-b1", 1.0, 1.1, 240, 0.2),
        ("efficientnet-b2", 1.1, 1.2, 260, 0.3),
        ("efficientnet-b3", 1.2, 1.4, 300, 0.3),
        ("efficientnet-b4", 1.4, 1.8, 380, 0.4),
        ("efficientnet-b5", 1.6, 2.2, 456, 0.4),
        ("efficientnet-b6", 1.8, 2.6, 528, 0.5),
        ("efficientnet-b7", 2.0, 3.1, 600, 0.5),
    )
    lookup = {name: tuple(values) for name, *values in variants}
    return lookup[model_name]

# channels_last = 3

def build_model():
    """Build and compile the EfficientNet-B5 based classifier.

    The (128, 313, 3) input passes through two bias-free 3x3 Conv2D +
    BatchNormalization pairs, then the EfficientNetB5 base (no top, no
    pretrained weights), global average pooling and a 264-unit sigmoid layer.
    The model is compiled with Adam (learning rate 0.001) and binary
    cross-entropy with label smoothing 0.05.
    """
    inp = tf.keras.layers.Input(shape=(128,313,3))
    base = efn.EfficientNetB5(weights = None, include_top = False)

    # Two identical stem stages that differ only in their layer names.
    stem_names = (('keep_res', 'res_bn'), ('keep_res2', 'res_bn2'))
    x = inp
    for conv_name, bn_name in stem_names:
        x = tf.keras.layers.Conv2D(3, 3,
                                   strides=(1, 1),
                                   padding='same',
                                   use_bias=False,
                                   kernel_initializer=CONV_KERNEL_INITIALIZER,
                                   name=conv_name)(x)
        x = tf.keras.layers.BatchNormalization(axis=3, name=bn_name)(x)

    x = base(x)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.Dense(264,activation='sigmoid', dtype='float32')(x)

    model = tf.keras.Model(inputs=inp,outputs=x)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=0.05),
        metrics=[])
    return model

In [25]:
# Build one model per fold (0-3) and load that fold's trained weights.
# `models` is read as a module-level global by prediction_for_clip_tf.
models = []
for i in range(4):
    model = build_model()
    model.load_weights(f"../input/efficientnet-weights/b5augs_fold_{i}.h5")
    models.append(model)
    del model  # unbind the loop-local name; the list keeps the model alive

In [26]:
# Kept for any other cell that refers to it; prediction_for_clip_tf itself no
# longer moves data through this device.
device = torch.device("cuda")


def _average_tf_predictions(batch, flatten):
    """Average ``model.predict(batch)`` over the global ``models`` list.

    Each model's output is divided by the model count and accumulated in list
    order; with ``flatten`` set it is reshaped to 1-D first.
    """
    averaged = 0.0
    for model in models:
        output = model.predict(batch)
        if flatten:
            output = output.reshape(-1)
        averaged += output / len(models)
    return averaged


def prediction_for_clip_tf(test_df: pd.DataFrame, 
                           clip: np.ndarray, 
                           mel_params: dict, 
                           threshold=0.5):
    """Average the TensorFlow models' probabilities for every row of one clip.

    Args:
        test_df: Rows belonging to a single audio file.
        clip: Waveform of that audio file.
        mel_params: Mel-spectrogram keyword arguments for ``TestDatasetTF``.
        threshold: Unused; kept so existing callers keep working.

    Returns:
        dict mapping each ``row_id`` to the averaged probabilities. For
        ``site_1``/``site_2`` rows this is a flattened 1-D array; for other
        sites it is the unflattened average for one batch of segments.
    """
    dataset = TestDatasetTF(df=test_df, 
                          clip=clip,
                          img_size=313,
                          melspectrogram_parameters=mel_params)
    loader = data.DataLoader(dataset, batch_size=1, shuffle=False)

    prediction_dict = {}
    for image, row_id, site in progress_bar(loader):
        site = site[0]
        row_id = row_id[0]
        if site in {"site_1", "site_2"}:
            # The Keras models consume numpy arrays, so the tensor is
            # converted directly instead of being copied to CUDA and back.
            proba = _average_tf_predictions(
                image.detach().cpu().numpy(), flatten=True)
        else:
            # to avoid prediction on large batch
            segments = image.squeeze(0)
            batch_size = 16
            # NOTE(review): `proba` is overwritten on every pass, so only the
            # LAST batch of segments is stored for the row, and a row with no
            # segments leaves `proba` unset (or stale from the previous row).
            # Kept as-is because the code consuming this dict is not visible
            # here -- confirm whether the batches should be concatenated.
            for start in range(0, segments.size(0), batch_size):
                batch = segments[start:start + batch_size]
                if batch.ndim == 3:
                    batch = batch.unsqueeze(0)
                proba = _average_tf_predictions(
                    batch.detach().cpu().numpy(), flatten=False)

        prediction_dict[row_id] = proba
    return prediction_dict

In [27]:
def prediction_tf(test_df: pd.DataFrame,
                  test_audio: Path,
                  model_config: dict,
                  mel_params: dict,
                  weights_path: str,
                  threshold=0.5):
    """Run the TF ensemble over every audio file referenced in `test_df`.

    Each unique `audio_id` is loaded from `test_audio/<audio_id>.mp3`,
    resampled to `TARGET_SR` as mono, and scored with
    `prediction_for_clip_tf`. The per-clip results are stacked into a single
    frame with columns `row_id` and `birds`.

    `model_config` and `weights_path` are part of the signature but are not
    read by this function.
    """
    # Silences all warnings for the rest of the session, not just this call.
    warnings.filterwarnings("ignore")

    per_clip_frames = []
    for audio_id in test_df.audio_id.unique():
        with timer(f"Loading {audio_id}", logger):
            clip, _ = librosa.load(
                test_audio / (audio_id + ".mp3"),
                sr=TARGET_SR,
                mono=True,
                res_type="kaiser_fast",
            )

        rows_for_clip = test_df.query(f"audio_id == '{audio_id}'")
        rows_for_clip = rows_for_clip.reset_index(drop=True)

        with timer(f"Prediction on {audio_id}", logger):
            clip_predictions = prediction_for_clip_tf(
                rows_for_clip,
                clip=clip,
                mel_params=mel_params,
                threshold=threshold,
            )

        per_clip_frames.append(
            pd.DataFrame({
                "row_id": list(clip_predictions.keys()),
                "birds": list(clip_predictions.values()),
            })
        )

    stacked = pd.concat(per_clip_frames, axis=0, sort=False)
    return stacked.reset_index(drop=True)

In [28]:
# Model description handed to prediction_tf.
model_config = dict(
    base_model_name="efficientnet",
    pretrained=False,
    num_classes=264,
)

# Mel-spectrogram settings forwarded to the test dataset.
melspectrogram_parameters = dict(
    n_mels=128,
    fmin=20,
    fmax=16000,
)

In [29]:
# Score every test clip with the TensorFlow ensemble; the result has one row
# per row_id, with the prediction stored in the `birds` column.
submission_tf = prediction_tf(
    test_df=test_tf,
    test_audio=test_audio,
    model_config=model_config,
    mel_params=melspectrogram_parameters,
    weights_path=None,
    threshold=0.75,
)

2020-09-14 23:32:11,619 - INFO - [Loading 41e6fe6504a34bf6846938ba78d13df1] start
2020-09-14 23:32:12,271 - INFO - [Loading 41e6fe6504a34bf6846938ba78d13df1] done in 0.65 s
2020-09-14 23:32:12,281 - INFO - [Prediction on 41e6fe6504a34bf6846938ba78d13df1] start


2020-09-14 23:32:30,871 - INFO - [Prediction on 41e6fe6504a34bf6846938ba78d13df1] done in 18.59 s
2020-09-14 23:32:30,875 - INFO - [Loading cce64fffafed40f2b2f3d3413ec1c4c2] start
2020-09-14 23:32:31,769 - INFO - [Loading cce64fffafed40f2b2f3d3413ec1c4c2] done in 0.89 s
2020-09-14 23:32:31,776 - INFO - [Prediction on cce64fffafed40f2b2f3d3413ec1c4c2] start


2020-09-14 23:32:33,529 - INFO - [Prediction on cce64fffafed40f2b2f3d3413ec1c4c2] done in 1.75 s
2020-09-14 23:32:33,531 - INFO - [Loading 99af324c881246949408c0b1ae54271f] start
2020-09-14 23:32:34,453 - INFO - [Loading 99af324c881246949408c0b1ae54271f] done in 0.92 s
2020-09-14 23:32:34,460 - INFO - [Prediction on 99af324c881246949408c0b1ae54271f] start


2020-09-14 23:32:35,944 - INFO - [Prediction on 99af324c881246949408c0b1ae54271f] done in 1.48 s
2020-09-14 23:32:35,950 - INFO - [Loading 6ab74e177aa149468a39ca10beed6222] start
2020-09-14 23:32:36,798 - INFO - [Loading 6ab74e177aa149468a39ca10beed6222] done in 0.85 s
2020-09-14 23:32:36,807 - INFO - [Prediction on 6ab74e177aa149468a39ca10beed6222] start


2020-09-14 23:32:38,057 - INFO - [Prediction on 6ab74e177aa149468a39ca10beed6222] done in 1.25 s
2020-09-14 23:32:38,059 - INFO - [Loading b2fd3f01e9284293a1e33f9c811a2ed6] start
2020-09-14 23:32:38,925 - INFO - [Loading b2fd3f01e9284293a1e33f9c811a2ed6] done in 0.87 s
2020-09-14 23:32:38,934 - INFO - [Prediction on b2fd3f01e9284293a1e33f9c811a2ed6] start


2020-09-14 23:32:40,389 - INFO - [Prediction on b2fd3f01e9284293a1e33f9c811a2ed6] done in 1.45 s
2020-09-14 23:32:40,394 - INFO - [Loading de62b37ebba749d2abf29d4a493ea5d4] start
2020-09-14 23:32:40,890 - INFO - [Loading de62b37ebba749d2abf29d4a493ea5d4] done in 0.50 s
2020-09-14 23:32:40,897 - INFO - [Prediction on de62b37ebba749d2abf29d4a493ea5d4] start


2020-09-14 23:32:41,127 - INFO - [Prediction on de62b37ebba749d2abf29d4a493ea5d4] done in 0.23 s
2020-09-14 23:32:41,132 - INFO - [Loading 8680a8dd845d40f296246dbed0d37394] start
2020-09-14 23:32:42,136 - INFO - [Loading 8680a8dd845d40f296246dbed0d37394] done in 1.00 s
2020-09-14 23:32:42,145 - INFO - [Prediction on 8680a8dd845d40f296246dbed0d37394] start


2020-09-14 23:32:44,245 - INFO - [Prediction on 8680a8dd845d40f296246dbed0d37394] done in 2.10 s
2020-09-14 23:32:44,247 - INFO - [Loading 940d546e5eb745c9a74bce3f35efa1f9] start
2020-09-14 23:32:45,615 - INFO - [Loading 940d546e5eb745c9a74bce3f35efa1f9] done in 1.37 s
2020-09-14 23:32:45,622 - INFO - [Prediction on 940d546e5eb745c9a74bce3f35efa1f9] start


2020-09-14 23:32:48,492 - INFO - [Prediction on 940d546e5eb745c9a74bce3f35efa1f9] done in 2.87 s
2020-09-14 23:32:48,498 - INFO - [Loading 07ab324c602e4afab65ddbcc746c31b5] start
2020-09-14 23:32:49,271 - INFO - [Loading 07ab324c602e4afab65ddbcc746c31b5] done in 0.77 s
2020-09-14 23:32:49,288 - INFO - [Prediction on 07ab324c602e4afab65ddbcc746c31b5] start


2020-09-14 23:32:50,491 - INFO - [Prediction on 07ab324c602e4afab65ddbcc746c31b5] done in 1.20 s
2020-09-14 23:32:50,493 - INFO - [Loading 899616723a32409c996f6f3441646c2a] start
2020-09-14 23:32:51,542 - INFO - [Loading 899616723a32409c996f6f3441646c2a] done in 1.05 s
2020-09-14 23:32:51,552 - INFO - [Prediction on 899616723a32409c996f6f3441646c2a] start


2020-09-14 23:32:53,608 - INFO - [Prediction on 899616723a32409c996f6f3441646c2a] done in 2.06 s
2020-09-14 23:32:53,611 - INFO - [Loading 9cc5d9646f344f1bbb52640a988fe902] start
2020-09-14 23:32:58,020 - INFO - [Loading 9cc5d9646f344f1bbb52640a988fe902] done in 4.41 s
2020-09-14 23:32:58,028 - INFO - [Prediction on 9cc5d9646f344f1bbb52640a988fe902] start


2020-09-14 23:33:00,908 - INFO - [Prediction on 9cc5d9646f344f1bbb52640a988fe902] done in 2.88 s
2020-09-14 23:33:00,911 - INFO - [Loading a56e20a518684688a9952add8a9d5213] start
2020-09-14 23:33:01,745 - INFO - [Loading a56e20a518684688a9952add8a9d5213] done in 0.83 s
2020-09-14 23:33:01,753 - INFO - [Prediction on a56e20a518684688a9952add8a9d5213] start


2020-09-14 23:33:02,418 - INFO - [Prediction on a56e20a518684688a9952add8a9d5213] done in 0.66 s
2020-09-14 23:33:02,424 - INFO - [Loading 96779836288745728306903d54e264dd] start
2020-09-14 23:33:03,088 - INFO - [Loading 96779836288745728306903d54e264dd] done in 0.66 s
2020-09-14 23:33:03,096 - INFO - [Prediction on 96779836288745728306903d54e264dd] start


2020-09-14 23:33:03,697 - INFO - [Prediction on 96779836288745728306903d54e264dd] done in 0.60 s
2020-09-14 23:33:03,701 - INFO - [Loading f77783ba4c6641bc918b034a18c23e53] start
2020-09-14 23:33:04,230 - INFO - [Loading f77783ba4c6641bc918b034a18c23e53] done in 0.53 s
2020-09-14 23:33:04,240 - INFO - [Prediction on f77783ba4c6641bc918b034a18c23e53] start


2020-09-14 23:33:04,475 - INFO - [Prediction on f77783ba4c6641bc918b034a18c23e53] done in 0.24 s
2020-09-14 23:33:04,481 - INFO - [Loading 856b194b097441958697c2bcd1f63982] start
2020-09-14 23:33:05,293 - INFO - [Loading 856b194b097441958697c2bcd1f63982] done in 0.81 s
2020-09-14 23:33:05,310 - INFO - [Prediction on 856b194b097441958697c2bcd1f63982] start


2020-09-14 23:33:05,765 - INFO - [Prediction on 856b194b097441958697c2bcd1f63982] done in 0.46 s


## Merging the two submissions (PyTorch and TensorFlow)

In [30]:
# Right join on row_id: every TensorFlow row is kept. The PyTorch predictions
# land in `birds_x` and the TensorFlow predictions in `birds_y`.
combined_submission = pd.merge(
    submission_torch,
    submission_tf,
    how='right',
    on='row_id',
)
combined_submission.head()

Unnamed: 0,row_id,birds_x,birds_y
0,site_1_41e6fe6504a34bf6846938ba78d13df1_5,"[0.56462324, 0.003785146, 0.003734492, 0.00089...","[0.8932539, 0.028735187, 0.023875736, 0.024248..."
1,site_1_41e6fe6504a34bf6846938ba78d13df1_10,"[0.7370924, 0.002508626, 0.0013510796, 0.00079...","[0.82841647, 0.027257076, 0.025744388, 0.02424..."
2,site_1_41e6fe6504a34bf6846938ba78d13df1_15,"[0.7645668, 0.0019941668, 0.0020704062, 0.0007...","[0.84920436, 0.027776375, 0.025984261, 0.02615..."
3,site_1_41e6fe6504a34bf6846938ba78d13df1_20,"[0.008946858, 0.0007821293, 0.0015165815, 0.00...","[0.26875567, 0.026624205, 0.029162815, 0.02495..."
4,site_1_41e6fe6504a34bf6846938ba78d13df1_25,"[0.64669853, 0.006563817, 0.0013686274, 0.0009...","[0.9369532, 0.026381966, 0.025555395, 0.025422..."


In [31]:
# Blend weights and decision thresholds for the final submission.
torch_weight = 0.85       # weight of the `birds_x` (PyTorch) probabilities
tf_weight = 0.15          # weight of the `birds_y` (TensorFlow) probabilities
combine_threshold = 0.58  # cut-off on the blended site_1 / site_2 score
threshold_torch = 0.3     # per-segment cut-off on `birds_x` for other sites
threshold_tf = 0.35       # per-segment cut-off on `birds_y` for other sites
prediction_dict = {}

for _, merged_row in combined_submission.iterrows():
    rid = merged_row['row_id']

    if any(tag in rid for tag in ('site_1', 'site_2')):
        # One probability vector per model: blend, then threshold once.
        blended = merged_row['birds_x']*torch_weight + merged_row['birds_y']*tf_weight
        labels = np.argwhere(blended >= combine_threshold).reshape(-1).tolist()
    else:
        # One probability row per segment and per model: a class is reported
        # when either model exceeds its own threshold on any segment.
        detected = set()
        for column, cutoff in (('birds_x', threshold_torch),
                               ('birds_y', threshold_tf)):
            hits = merged_row[column] >= cutoff
            for segment in range(len(hits)):
                detected.update(np.argwhere(hits[segment, :]).reshape(-1).tolist())
        labels = list(detected)

    if labels:
        prediction_dict[rid] = " ".join(INV_BIRD_CODE[code] for code in labels)
    else:
        prediction_dict[rid] = "nocall"

row_id = list(prediction_dict.keys())
birds = list(prediction_dict.values())
submission = pd.DataFrame({"row_id": row_id, "birds": birds})

In [32]:
# Write the final predictions to submission.csv without the index column.
submission.to_csv(path_or_buf="submission.csv", index=False)
submission

Unnamed: 0,row_id,birds
0,site_1_41e6fe6504a34bf6846938ba78d13df1_5,aldfly
1,site_1_41e6fe6504a34bf6846938ba78d13df1_10,aldfly
2,site_1_41e6fe6504a34bf6846938ba78d13df1_15,aldfly
3,site_1_41e6fe6504a34bf6846938ba78d13df1_20,nocall
4,site_1_41e6fe6504a34bf6846938ba78d13df1_25,aldfly
5,site_1_cce64fffafed40f2b2f3d3413ec1c4c2_5,aldfly
6,site_1_cce64fffafed40f2b2f3d3413ec1c4c2_10,nocall
7,site_1_cce64fffafed40f2b2f3d3413ec1c4c2_15,nocall
8,site_1_cce64fffafed40f2b2f3d3413ec1c4c2_20,nocall
9,site_1_cce64fffafed40f2b2f3d3413ec1c4c2_25,nocall


In [33]:
# Sanity check: how often each predicted label string occurs.
submission["birds"].value_counts()

aldfly           37
nocall           34
aldfly hamfly     2
amerob            1
aldfly horlar     1
btnwar            1
Name: birds, dtype: int64