In [1]:
import os
import shutil
import warnings

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import torchaudio

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings(action="ignore")
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
import json

import geopandas
import librosa
import librosa.display
import matplotlib.pyplot as plt
import scipy.stats as st
import seaborn as sns
import soundfile
import tensorflow as tf
import tensorflow_addons as tfa
import torch

%matplotlib inline
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from torchaudio.transforms import MelSpectrogram
from tqdm import tqdm

from src.MEL_Gen import MEL_Gen, Mel_Provider
from src.prepare_dataset import (
    choose_ids,
    make_dict_birds,
    make_intervals,
    make_intervals_upsampling,
    prepare_dataset,
)

In [2]:
# Notebook-wide configuration.
SEED = 42  # random_state for sampling, shuffling and the train/valid split; also MEL_Gen's seed
IMG_SIZE = 224  # used as n_mels, as mel_image_size and as the model's input height/width
SAMPLE_RATE = 32000  # passed as sample_rate to Mel_Provider / MEL_Gen (presumably Hz — confirm)
N_FFT = 2048  # passed as n_fft to Mel_Provider
SIGNAL_LENGTH = 5  # passed as signal_lenght (presumably seconds per clip — confirm)
FREQ_MIN = 0  # passed as min_frequency to Mel_Provider
FREQ_MAX = 16000  # passed as max_frequency to Mel_Provider
WIN_LENGHT = 1024  # passed as win_length to Mel_Provider (identifier keeps its original spelling)
BATCH_SIZE = 50  # NOTE: reassigned to 410 in the MEL_Gen section before it is first used
# Recordings removed from all_audio by filename right after the metadata is loaded.
list_drop = [
    "XC509721.ogg",
    "XC428067.ogg",
    "XC523831.ogg",
    "XC523960.ogg",
    "XC237870.ogg",
    "XC129924.ogg",
    "XC576851.ogg",
    "XC579430.ogg",
    "XC590621.ogg",
]

In [3]:
# Read the per-segment bird/nocall predictions (`final`) and the nocall table (`nocall`).
final, nocall = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/app/_data/labels_nocall/nocall_predictions.csv",
        "/app/_data/labels_nocall/all_nocalls.csv",
    )
)

In [4]:
# Keep the untouched metadata in `all_audio_init`; `all_audio` is the filtered working copy.
all_audio_init = pd.read_csv("/app/_data/all_audio_initial.csv")
all_audio = (
    all_audio_init
    .query("filename not in @list_drop")  # drop the files listed in list_drop
    .query("year >=1980 and year<=2021")  # keep recordings dated 1980..2021 inclusive
    .reset_index(drop=True)
)

all_audio.shape

(65128, 24)

# dict_birds and weights

In [5]:
# Project helper: returns the label dictionary and an updated all_audio frame.
# NOTE(review): presumably maps label name -> integer id and adds the label_id /
# secondary_labels_id columns selected further down — confirm in src.prepare_dataset.
dict_birds, all_audio = make_dict_birds(all_audio)

In [6]:
# Number of entries in dict_birds; this value later becomes NUM_CLASSES.
# The dictionary includes a "nocall" key (used when labelling the nocall rows).
len(dict_birds)

398

In [7]:
# Per-recording sampling weight: inverse frequency of the primary label, scaled by rating.
# dict_weights maps each primary_label to 1 / (number of recordings with that label).
dict_weights = (1 / all_audio["primary_label"].value_counts()).to_dict()
# Series.map does a direct key lookup and returns a numeric column; Series.replace is a
# value-substitution API and is much slower when handed a dictionary with hundreds of keys.
all_audio["class_weights"] = all_audio["primary_label"].map(dict_weights)
# Higher-rated recordings get proportionally more weight.
all_audio["class_weights"] = all_audio["class_weights"] * all_audio["rating"]

## calls

In [8]:
# row_ids flagged as both bird (== 1) and nocall (== 1); these rows are removed next.
drop_list_final = final.loc[
    (final["bird"] == 1) & (final["nocall"] == 1), "row_id"
].tolist()

In [9]:
# Remove the contradictory rows, then keep only segments flagged as containing a bird.
final = (
    final
    .query("row_id not in @drop_list_final")
    .query("bird == 1")
)

In [10]:
# Attach each segment's primary label and sampling weight from its source recording.
# Left join: segments whose file is missing from all_audio keep NaN in the new columns.
final = pd.merge(
    final,
    all_audio[["filename", "primary_label", "class_weights"]],
    on="filename",
    how="left",
)

In [11]:
# Draw 400000 call segments without replacement, weighted by class_weights.
final_sample = final.sample(
    n=400000,
    weights="class_weights",
    random_state=SEED,
)

In [12]:
# Show one random row as a quick sanity check (no random_state, so it varies per run).
final_sample.sample()

Unnamed: 0,nocall,bird,row_id,filename,end_sec,primary_label,class_weights
1279190,0,1,XC117661_169_0,XC117661.ogg,169.0,kebtou1,0.078125


In [13]:
# Keep only the segment-level columns, then attach the recording-level metadata
# (labels, duration, rating, year, path and the encoded month/location features)
# with a left join on filename.
final_sample = pd.merge(
    final_sample[["row_id", "filename", "end_sec", "primary_label", "class_weights"]],
    all_audio[
        [
            "filename",
            "secondary_labels",
            "label_id",
            "secondary_labels_id",
            "duration",
            "rating",
            "year",
            "file_path",
            "sin_month",
            "cos_month",
            "sin_longitude",
            "cos_longitude",
            "latitude",
            "norm_latitude",
        ]
    ],
    how="left",
    on="filename",
)

In [14]:
# Display the merged sample of call segments.
final_sample

Unnamed: 0,row_id,filename,end_sec,primary_label,class_weights,secondary_labels,label_id,secondary_labels_id,duration,rating,year,file_path,sin_month,cos_month,sin_longitude,cos_longitude,latitude,norm_latitude
0,XC423534_13_0,XC423534.ogg,13.0,gcrwar,0.027624,,143,,18.796219,5.0,2018,/app/_data/train_short_audio/gcrwar/XC423534.ogg,1.224647e-16,-1.000000e+00,-0.754768,0.655992,-14.6250,0.418750
1,XC375498_428_0,XC375498.ogg,428.0,woothr,0.015284,yebcha spotow bkbmag1 wesmea cliswa,379,382 319 33 357 100,504.804375,3.5,2017,/app/_data/train_short_audio/woothr/XC375498.ogg,1.224647e-16,-1.000000e+00,-0.965118,-0.261814,40.5886,0.725492
2,XC478859_226_0,XC478859.ogg,226.0,rudtur,0.035714,sander,290,298,1976.908844,4.5,2018,/app/_data/train_short_audio/rudtur/XC478859.ogg,-8.660254e-01,-5.000000e-01,-0.385751,0.922603,64.0818,0.856010
3,XC63429_19_0,XC63429.ogg,19.0,orbeup1,0.027027,,234,,24.726000,4.0,2001,/app/_data/train_short_audio/orbeup1/XC63429.ogg,-8.660254e-01,5.000000e-01,-0.948636,0.316371,-13.0501,0.427499
4,XC269518_504_0,XC269518.ogg,504.0,brnthr,0.030201,rewbla,60,271,596.749000,4.5,2015,/app/_data/train_short_audio/brnthr/XC269518.ogg,-5.000000e-01,-8.660254e-01,-0.987984,0.154553,32.1779,0.678766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399995,XC451992_45_0,XC451992.ogg,45.0,trokin,0.017361,,341,,59.991312,5.0,2019,/app/_data/train_short_audio/trokin/XC451992.ogg,5.000000e-01,8.660254e-01,-0.642496,0.766289,-7.2395,0.459781
399996,XC614092_188_0,XC614092.ogg,188.0,wiltur,0.050000,moudov whwdov,376,214 372,304.645000,3.5,2020,/app/_data/train_short_audio/wiltur/XC614092.ogg,8.660254e-01,-5.000000e-01,-0.944638,-0.328113,31.9060,0.677256
399997,XC353210_14_0,XC353210.ogg,14.0,whfpar1,0.044304,whwdov normoc houspa houfin,366,372 223 179 178,384.719469,3.5,2016,/app/_data/train_short_audio/whfpar1/XC353210.ogg,1.000000e+00,6.123234e-17,-0.945887,-0.324497,27.0149,0.650083
399998,XC161891_6_0,XC161891.ogg,6.0,coohaw,0.027523,,114,,26.371719,3.0,2010,/app/_data/train_short_audio/coohaw/XC161891.ogg,-5.000000e-01,-8.660254e-01,-0.960682,0.277652,41.2004,0.728891


In [15]:
# Segments per primary label after weighted sampling (sorted most to least frequent).
final_sample['primary_label'].value_counts()

normoc     3520
yelgro     2924
bulori     2721
brnthr     2714
cubthr     2412
           ... 
gretin1     186
whcpar      160
wegspa1     133
hofwoo1     130
stvhum2      52
Name: primary_label, Length: 397, dtype: int64

# nocall

In [16]:
# Remove rows of the nocall table that are exact duplicates across all columns.
nocall = nocall.drop_duplicates()

In [17]:
# Downcast the weights to float32; this runs after class_weights was already merged
# into `final`, so only later reads of all_audio["class_weights"] see the float32 values.
all_audio["class_weights"] = all_audio["class_weights"].astype("float32")

In [18]:
# Attach recording-level metadata to every nocall row (left join on filename),
# then drop any rows that are exact duplicates after the join.
nocall = pd.merge(
    nocall,
    all_audio[
        [
            "filename",
            "rating",
            "year",
            "sin_month",
            "cos_month",
            "sin_longitude",
            "cos_longitude",
            "latitude",
            "norm_latitude",
        ]
    ],
    how="left",
    on="filename",
).drop_duplicates()

In [19]:
# Every nocall row gets the id stored under "nocall" in dict_birds and no secondary labels.
nocall = nocall.assign(
    label_id=dict_birds["nocall"],
    secondary_labels_id=np.nan,
)

In [20]:
# Give all nocall rows one constant weight: the median class weight over all_audio.
nocall["class_weights"] = all_audio["class_weights"].median()

In [21]:
# Strip the 'birdclef-2021/' directory component from the stored paths.
# regex=False makes this a literal substring replacement regardless of the pandas
# version (the default of `regex` differs between pandas 1.x and 2.x).
nocall['file_path'] = nocall['file_path'].str.replace('birdclef-2021/', '', regex=False)

In [22]:
# Uniform random draw of 20000 nocall rows (no weights), reproducible through SEED.
nocall_sample = nocall.sample(n=20000, random_state=SEED)

# concat

In [23]:
# Stack the call and nocall samples into one frame with a fresh 0..n-1 index ...
final_audio = pd.concat([final_sample, nocall_sample], axis=0, ignore_index=True)
# ... then shuffle all rows reproducibly and renumber the index again.
final_audio = final_audio.sample(frac=1, random_state=SEED).reset_index(drop=True)
final_audio.shape



(420000, 20)

In [24]:
# The 20 least represented primary labels in the combined dataset.
final_audio["primary_label"].value_counts().tail(20)

norsho     387
grhcha1    386
leasan     383
sthwoo1    372
bkbplo     362
rebsap     353
whimbr     332
goowoo1    330
lotduc     304
sander     289
rehbar1    259
runwre1    236
rthhum     219
heptan     210
bucmot2    202
gretin1    186
whcpar     160
wegspa1    133
hofwoo1    130
stvhum2     52
Name: primary_label, dtype: int64

In [25]:
# Random 80/20 row-level split; both parts get a fresh 0..n-1 index.
# NOTE(review): rows are segments and the split ignores `filename`, so segments of the
# same recording can presumably land in both train and valid — confirm this is intended.
train, valid = (
    part.reset_index(drop=True)
    for part in train_test_split(final_audio, train_size=0.8, random_state=SEED)
)

# MEL_Gen

In [26]:
# One output unit per entry of dict_birds (the dictionary includes the "nocall" key).
NUM_CLASSES = len(dict_birds)
# Overrides the BATCH_SIZE = 50 from the configuration cell; 410 is the value used by
# both generators and by the steps_per_epoch / validation_steps computation.
BATCH_SIZE = 410

In [27]:
# Shared mel-spectrogram provider, handed to both data generators below.
mel_pr = Mel_Provider(
    # audio framing
    sample_rate=SAMPLE_RATE,
    signal_lenght=SIGNAL_LENGTH,
    # STFT settings
    n_fft=N_FFT,
    win_length=WIN_LENGHT,
    # mel filter bank and output image size (both driven by IMG_SIZE)
    n_mels=IMG_SIZE,
    mel_image_size=IMG_SIZE,
    min_frequency=FREQ_MIN,
    max_frequency=FREQ_MAX,
)

In [28]:
# shutil.rmtree('/app/_data/npy/short_mels')

In [29]:
def _build_mel_gen(df, do_shuffle):
    """Create a MEL_Gen over `df` with the settings shared by training and validation.

    The two generators differ only in the source frame and in whether the data is
    shuffled; keeping every other argument in one place prevents the two
    configurations from drifting apart.

    Args:
        df: frame of segments to serve (`train` or `valid`).
        do_shuffle: forwarded to MEL_Gen's `shuffle` argument.

    Returns:
        The configured MEL_Gen instance.
    """
    return MEL_Gen(
        df=df,
        n_mels=IMG_SIZE,
        seed=SEED,
        sample_rate=SAMPLE_RATE,
        mel_image_size=IMG_SIZE,
        batch_size=BATCH_SIZE,
        n_classes=NUM_CLASSES,
        signal_lenght=SIGNAL_LENGTH,
        mel_provider=mel_pr,
        wave_dir="/app/_data/npy/waves_npy/",
        short_mel_dir="/app/_data/npy/short_mels_224_uint8/",
        convert_to_rgb=True,
        shuffle=do_shuffle,
        return_primary_labels=False,
        return_concat_labels=True,
        augment=False,
        img_dtype="uint8",
    )


# Training data is shuffled; validation data is served in a fixed order.
gen_train = _build_mel_gen(train, do_shuffle=True)
gen_valid = _build_mel_gen(valid, do_shuffle=False)

# model

In [30]:
def get_model():
    """Build and compile the EfficientNetB0-based multi-output classifier.

    Architecture: (IMG_SIZE, IMG_SIZE, 3) input -> EfficientNetB0 backbone with
    ImageNet weights and no classification head -> global average pooling ->
    Dense(NUM_CLASSES) with a sigmoid activation.

    The model is compiled with binary cross-entropy, Adam (learning rate 5e-4) and
    accuracy / recall / precision / micro-averaged F1 metrics.

    Returns:
        keras.Model: the compiled model, ready for `fit`.
    """
    inputs = keras.Input(shape=(IMG_SIZE, IMG_SIZE, 3))
    base_model = keras.applications.EfficientNetB0(
        weights="imagenet", include_top=False
    )
    x = base_model(inputs)
    x = keras.layers.GlobalAveragePooling2D(name="avg_pool")(x)
    # Kept so the layer list matches previously saved models; the pooled tensor is
    # already 2-D, so this layer does not change the data.
    x = keras.layers.Flatten(name="flatten")(x)
    # The notebook enables the mixed_float16 policy before this function is called.
    # Pinning the output layer to float32 keeps the probabilities, the loss and the
    # metrics out of float16, as the Keras mixed-precision guide recommends.
    outputs = keras.layers.Dense(
        NUM_CLASSES, activation="sigmoid", dtype="float32"
    )(x)
    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(
        loss="binary_crossentropy",
        # `lr` is a deprecated alias; `learning_rate` is the supported argument name.
        optimizer=Adam(learning_rate=0.0005),
        metrics=[
            "acc",
            keras.metrics.Recall(),
            keras.metrics.Precision(),
            # NOTE(review): no `threshold` is given, so F1Score scores only the argmax
            # of each prediction. If the targets are multi-hot, a threshold such as 0.5
            # may be the intended behaviour — confirm before changing, since the
            # callbacks monitor this metric.
            tfa.metrics.F1Score(num_classes=NUM_CLASSES, average="micro"),
        ],
    )
    return model

# Enable float16 compute with float32 variables for every layer created from here on.
# Prefer the stable API (available since TF 2.4); fall back to the experimental one,
# which newer TensorFlow releases no longer ship, only when the stable API is missing.
if hasattr(keras.mixed_precision, "set_global_policy"):
    policy = keras.mixed_precision.Policy("mixed_float16")
    keras.mixed_precision.set_global_policy(policy)
else:
    policy = keras.mixed_precision.experimental.Policy("mixed_float16")
    keras.mixed_precision.experimental.set_policy(policy)

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 3090, compute capability 8.6


In [31]:
# Training callbacks. The first three all track the validation F1 score and treat
# larger values as better.
callbacks = [
    # Stop once val_f1_score has not improved for 12 epochs and roll back to the best weights.
    keras.callbacks.EarlyStopping(
        monitor="val_f1_score",
        mode="max",
        patience=12,
        restore_best_weights=True,
        verbose=1,
    ),
    # Save the full model (not just weights) each time an epoch sets a new best score.
    keras.callbacks.ModelCheckpoint(
        "/app/_data/models/from_nocall/Eff0_1/Eff0_1.h5",
        monitor="val_f1_score",
        mode="max",
        save_best_only=True,
        save_weights_only=False,
        save_freq="epoch",
        verbose=1,
    ),
    # Multiply the learning rate by 0.9 after 5 epochs without an improvement of at least 1e-4.
    keras.callbacks.ReduceLROnPlateau(
        monitor="val_f1_score",
        mode="max",
        factor=0.9,
        patience=5,
        min_delta=1e-4,
        min_lr=0.00000001,
        verbose=1,
    ),
    # Write TensorBoard logs without weight histograms.
    keras.callbacks.TensorBoard(
        log_dir="/app/.tensorboard/Eff0_1_nc/",
        histogram_freq=0,
    ),
    # Keep a backup of the training state in this directory so a run can be resumed.
    keras.callbacks.experimental.BackupAndRestore(
        "/app/_data/models/from_nocall/Eff0_1/backup/"
    ),
    # Abort training as soon as the loss becomes NaN.
    keras.callbacks.TerminateOnNaN(),
]

In [None]:
# Build a fresh model and train it for up to 100 epochs; the callbacks may stop earlier.
model = get_model()
history = model.fit(
    gen_train,
    validation_data=gen_valid,
    callbacks=callbacks,
    epochs=100,
    # Whole batches only: any remainder smaller than BATCH_SIZE is not counted.
    steps_per_epoch=len(train) // BATCH_SIZE,
    validation_steps=len(valid) // BATCH_SIZE,
    # Batch loading runs on 20 workers with up to 50 batches queued.
    workers=20,
    max_queue_size=50,
    verbose=1,
)

Epoch 1/100
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 00001: val_f1_score improved from -inf to 0.24712, saving model to /app/_data/models/from_nocall/Eff0_1/Eff0_1.h5
Epoch 2/100
Epoch 00002: val_f1_score improved from 0.24712 to 0.52181, saving model to /app/_data/models/from_nocall/Eff0_1/Eff0_1.h5
Epoch 3/100
Epoch 00003: val_f1_score improved from 0.52181 to 0.59953, saving model to /app/_data/models/from_nocall/Eff0_1/Eff0_1.h5
Epoch 4/100
Epoch 00004: val_f1_score improved from 0.59953 to 0.63013, saving model to /app/_data/models/from_nocall/Eff0_1/Eff0_1.h5
Epoch 5/100
Epoch 00005: val_f1_score improved from 0.63013 to 0.65345, saving model to /app/_data/models/from_nocall/Eff0_1/Eff0_1.h5
Epoch 6/100
Epoch 00006: val_f1_score improved from 0.65345 to 0.66923, saving model to /app/_data/models/from_nocall/Eff0_1/Eff0_1.h5
Epoch 7/100
Epoch 00007: val_f1_score improved from 0.66923 to 0.67755, saving model to /app/_data/models/from_nocall/Eff