In [1]:
from pathlib import Path
from tqdm.notebook import tqdm
import librosa
import librosa.display
import numpy as np
import soundfile
import pandas as pd
import joblib
import tensorflow as tf
import tensorflow.keras as keras


import random
import os

## Parameters
Let's introduce a few convenience variables.

In [2]:
# Presumably the audio sampling rate (Hz) and clip length (s) -- TODO confirm units.
SR = 32000
DURATION = 5
# Seed handed to every random number generator seeded below.
SEED = 42
# Small constant that keeps the F1 denominators away from zero.
EPSILON = 10 ** -6

In [3]:
def seed_everything(seed=SEED):
    """Seed the Python, NumPy and TensorFlow RNGs and set ``PYTHONHASHSEED``."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)


seed_everything()

In [4]:
# Dataset root, resolved under the current user's home directory.
PATH_DATASET = Path.home().joinpath("datasets/kaggle/birdclef-2021")
# Displayed so that a missing dataset is noticed right away.
PATH_DATASET.exists()

True

In [5]:
# Each sub-directory of train_short_audio is named after one species label.
# Path.iterdir() yields entries in arbitrary (filesystem-dependent) order, so
# sort here to keep the label -> index mapping stable across machines, and skip
# stray files (e.g. hidden OS files) that would otherwise become bogus labels.
L_birds = sorted(
    path.name
    for path in (PATH_DATASET / "train_short_audio").iterdir()
    if path.is_dir()
)
L_birds[:10]

['acafly',
 'acowoo',
 'aldfly',
 'ameavo',
 'amecro',
 'amegfi',
 'amekes',
 'amepip',
 'amered',
 'amerob']

In [6]:
# Is the directory listing already in alphabetical order?
L_birds == sorted(L_birds)

True

In [7]:
# The listing is sometimes sorted and sometimes not, so sort it to be safe.
L_birds.sort()

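- In `01.ipynb`, `L_birds` did not seem to need sorting.
- Here, in `03.ipynb`, it does need to be sorted.

**Interesting fact?**: Maybe it's due to **seeding**? More likely not: `Path.iterdir()` yields directory entries in arbitrary, filesystem-dependent order, which is unrelated to random seeds.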

In [8]:
# Map each species code to its position in L_birds.
D_label_index = dict(zip(L_birds, range(len(L_birds))))
D_label_index

{'acafly': 0,
 'acowoo': 1,
 'aldfly': 2,
 'ameavo': 3,
 'amecro': 4,
 'amegfi': 5,
 'amekes': 6,
 'amepip': 7,
 'amered': 8,
 'amerob': 9,
 'amewig': 10,
 'amtspa': 11,
 'andsol1': 12,
 'annhum': 13,
 'astfly': 14,
 'azaspi1': 15,
 'babwar': 16,
 'baleag': 17,
 'balori': 18,
 'banana': 19,
 'banswa': 20,
 'banwre1': 21,
 'barant1': 22,
 'barswa': 23,
 'batpig1': 24,
 'bawswa1': 25,
 'bawwar': 26,
 'baywre1': 27,
 'bbwduc': 28,
 'bcnher': 29,
 'belkin1': 30,
 'belvir': 31,
 'bewwre': 32,
 'bkbmag1': 33,
 'bkbplo': 34,
 'bkbwar': 35,
 'bkcchi': 36,
 'bkhgro': 37,
 'bkmtou1': 38,
 'bknsti': 39,
 'blbgra1': 40,
 'blbthr1': 41,
 'blcjay1': 42,
 'blctan1': 43,
 'blhpar1': 44,
 'blkpho': 45,
 'blsspa1': 46,
 'blugrb1': 47,
 'blujay': 48,
 'bncfly': 49,
 'bnhcow': 50,
 'bobfly1': 51,
 'bongul': 52,
 'botgra': 53,
 'brbmot1': 54,
 'brbsol1': 55,
 'brcvir1': 56,
 'brebla': 57,
 'brncre': 58,
 'brnjay': 59,
 'brnthr': 60,
 'brratt1': 61,
 'brwhaw': 62,
 'brwpar1': 63,
 'btbwar': 64,
 'btnwar': 6

In [9]:
# Reverse lookup: class index -> species code.
D_index_label = dict(zip(D_label_index.values(), D_label_index.keys()))
D_index_label

{0: 'acafly',
 1: 'acowoo',
 2: 'aldfly',
 3: 'ameavo',
 4: 'amecro',
 5: 'amegfi',
 6: 'amekes',
 7: 'amepip',
 8: 'amered',
 9: 'amerob',
 10: 'amewig',
 11: 'amtspa',
 12: 'andsol1',
 13: 'annhum',
 14: 'astfly',
 15: 'azaspi1',
 16: 'babwar',
 17: 'baleag',
 18: 'balori',
 19: 'banana',
 20: 'banswa',
 21: 'banwre1',
 22: 'barant1',
 23: 'barswa',
 24: 'batpig1',
 25: 'bawswa1',
 26: 'bawwar',
 27: 'baywre1',
 28: 'bbwduc',
 29: 'bcnher',
 30: 'belkin1',
 31: 'belvir',
 32: 'bewwre',
 33: 'bkbmag1',
 34: 'bkbplo',
 35: 'bkbwar',
 36: 'bkcchi',
 37: 'bkhgro',
 38: 'bkmtou1',
 39: 'bknsti',
 40: 'blbgra1',
 41: 'blbthr1',
 42: 'blcjay1',
 43: 'blctan1',
 44: 'blhpar1',
 45: 'blkpho',
 46: 'blsspa1',
 47: 'blugrb1',
 48: 'blujay',
 49: 'bncfly',
 50: 'bnhcow',
 51: 'bobfly1',
 52: 'bongul',
 53: 'botgra',
 54: 'brbmot1',
 55: 'brbsol1',
 56: 'brcvir1',
 57: 'brebla',
 58: 'brncre',
 59: 'brnjay',
 60: 'brnthr',
 61: 'brratt1',
 62: 'brwhaw',
 63: 'brwpar1',
 64: 'btbwar',
 65: 'btnwar

In [10]:
# Soundscape label table, including date/location columns and .npy paths.
df_train_soundscape = pd.read_csv(
    filepath_or_buffer="./rich_train_soundscape_labels.csv",
)
df_train_soundscape.head(5)

Unnamed: 0,row_id,site,audio_id,seconds,birds,is_test,year,month,day,longitude,latitude,npy_parent,npy_path
0,7019_COR_5,COR,7019,5,nocall,True,2019,9,4,-84.51,10.12,testSoundScapes,testSoundScapes/7019_COR_5.npy
1,7019_COR_10,COR,7019,10,nocall,True,2019,9,4,-84.51,10.12,testSoundScapes,testSoundScapes/7019_COR_10.npy
2,7019_COR_15,COR,7019,15,nocall,True,2019,9,4,-84.51,10.12,testSoundScapes,testSoundScapes/7019_COR_15.npy
3,7019_COR_20,COR,7019,20,nocall,True,2019,9,4,-84.51,10.12,testSoundScapes,testSoundScapes/7019_COR_20.npy
4,7019_COR_25,COR,7019,25,nocall,True,2019,9,4,-84.51,10.12,testSoundScapes,testSoundScapes/7019_COR_25.npy


In [11]:
# Columns kept from the soundscape table for the metadata features.
L_useful_features = [
    #"longitude",
    "latitude",
    "month",
    "day",
    "is_test",
]
# Take an explicit copy: new columns are added to this frame further down, and
# writing into a selection of df_train_soundscape can raise SettingWithCopyWarning.
df_useful_features = df_train_soundscape.loc[:, L_useful_features].copy()
df_useful_features

Unnamed: 0,latitude,month,day,is_test
0,10.12,9,4,True
1,10.12,9,4,True
2,10.12,9,4,True
3,10.12,9,4,True
4,10.12,9,4,True
...,...,...,...,...
2395,42.47,6,17,True
2396,42.47,6,17,True
2397,42.47,6,17,True
2398,42.47,6,17,True


## Dataset (Features)

In [12]:
# Training rows are those not flagged as test.
is_train = df_train_soundscape["is_test"].eq(False)

In [13]:
# Sanity check: `== False` and `~` produce the same mask.
(~df_train_soundscape["is_test"]).eq(is_train).all()

True

In [14]:
# Is the negated mask False everywhere, i.e. is every row a test row?
(~df_train_soundscape["is_test"]).eq(False).all()

False

In [15]:
# Final training mask: the element-wise negation of the is_test column.
is_train = np.logical_not(df_train_soundscape["is_test"])

In [16]:
# Distinct months present in the data.
df_useful_features.loc[:, "month"].unique()

array([ 9, 10,  7,  3,  4,  8,  5,  6])

In [17]:
# Distinct days of the month present in the data.
df_useful_features.loc[:, "day"].unique()

array([ 4, 23,  1, 25, 29,  5,  8, 13, 17])

In [18]:
# Rescale the calendar/location metadata to values of magnitude below 1.
# `assign` builds a new frame instead of writing columns into a selection of
# df_train_soundscape, which avoids SettingWithCopyWarning and keeps the cell
# safe to re-run.
df_useful_features = df_useful_features.assign(
    month_normalized=lambda df: (df["month"] - 6.5) / 12,
    day_normalized=lambda df: (df["day"] - 16) / 31,
    latitude_normalized=lambda df: df["latitude"] / 90,
)

df_useful_features.loc[:,
    ["month", "month_normalized",
     "day", "day_normalized",
     "latitude", "latitude_normalized"]]

Unnamed: 0,month,month_normalized,day,day_normalized,latitude,latitude_normalized
0,9,0.208333,4,-0.387097,10.12,0.112444
1,9,0.208333,4,-0.387097,10.12,0.112444
2,9,0.208333,4,-0.387097,10.12,0.112444
3,9,0.208333,4,-0.387097,10.12,0.112444
4,9,0.208333,4,-0.387097,10.12,0.112444
...,...,...,...,...,...,...
2395,6,-0.041667,17,0.032258,42.47,0.471889
2396,6,-0.041667,17,0.032258,42.47,0.471889
2397,6,-0.041667,17,0.032258,42.47,0.471889
2398,6,-0.041667,17,0.032258,42.47,0.471889


In [19]:
# Split the normalised metadata into train/test rows with the is_train mask.
df_normalized = df_useful_features[["latitude_normalized", "month_normalized", "day_normalized"]]
XX_train_normalized = df_normalized[is_train]
XX_test_normalized = df_normalized[~is_train]
XX_train_normalized.shape, XX_test_normalized.shape

((1920, 3), (480, 3))

In [20]:
# Display the normalised metadata features of the training rows.
XX_train_normalized

Unnamed: 0,latitude_normalized,month_normalized,day_normalized
120,0.112444,0.208333,0.225806
121,0.112444,0.208333,0.225806
122,0.112444,0.208333,0.225806
123,0.112444,0.208333,0.225806
124,0.112444,0.208333,0.225806
...,...,...,...
2275,0.471889,-0.125000,-0.096774
2276,0.471889,-0.125000,-0.096774
2277,0.471889,-0.125000,-0.096774
2278,0.471889,-0.125000,-0.096774


In [21]:
# Display the normalised metadata features of the test rows.
XX_test_normalized

Unnamed: 0,latitude_normalized,month_normalized,day_normalized
0,0.112444,0.208333,-0.387097
1,0.112444,0.208333,-0.387097
2,0.112444,0.208333,-0.387097
3,0.112444,0.208333,-0.387097
4,0.112444,0.208333,-0.387097
...,...,...,...
2395,0.471889,-0.041667,0.032258
2396,0.471889,-0.041667,0.032258
2397,0.471889,-0.041667,0.032258
2398,0.471889,-0.041667,0.032258


In [54]:
# Convert to a plain ndarray for Keras. np.asarray leaves an existing ndarray
# untouched, so re-running this cell is harmless (`.values` would raise
# AttributeError on the second run).
XX_train_normalized = np.asarray(XX_train_normalized)

## Dataset (Images)
**(?)** How large can the training dataset be if we load it entirely into RAM?<br>
**(R)** Let's say

- the `dtype` will be `float32`
- each mels will be of shape `(128, 201)`
- there will be `1920` mels

In [22]:
# (rows, columns) of the training part of the label table.
df_train_soundscape[is_train].shape

(1920, 13)

In [24]:
# Load the spectrogram of row 0 as a sample to inspect its dtype and shape.
sample_npy_path = df_train_soundscape.at[0, "npy_path"]
random_npy = np.load(sample_npy_path)
random_npy.dtype, random_npy.shape

(dtype('uint8'), (128, 201))

In [25]:
# Bytes needed to hold every training spectrogram as float32 (4 bytes per value).
# np.prod replaces np.product, which is deprecated and removed in NumPy 2.0.
4 * np.prod(random_npy.shape) * df_train_soundscape.loc[is_train].shape[0]

197591040

In [26]:
# Compute the size explicitly instead of reading IPython's `_` (the previous
# cell's output), which changes silently if any other cell runs in between.
n_bytes = 4 * int(np.prod(random_npy.shape)) * int(is_train.sum())
print(f"That's {n_bytes:,} bytes.")

That's 197,591,040 bytes.


In [27]:
# Whole mebibytes: 1_048_576 == 2**20 bytes.
n_MB = n_bytes // 1_048_576
print(f"That's {n_MB:,} MB.")

That's 188 MB.


This is a relatively small memory footprint. Even when we turn them into 3-channel images, the data still takes less than `600 MB` (3 × 188 MB ≈ 565 MB). I think **we can afford to cache it in RAM**.

**(?)** In `kkiller`'s code, the `.npy` are saved in `dtype=uint8` and only before going into model is the `dtype` converted again to `float32`.
Why is that?<br>
**(R)** I guess it is because `kkiller` wants to minimize the disk space and loading time of the `.npy` files: `uint8` takes a quarter of the space of `float32`.

In [28]:
# Pre-allocate the channel-last training tensor; it is filled in a later cell.
n_train_instances = len(df_train_soundscape.loc[is_train])
X_train = np.empty(
    shape=(n_train_instances,) + random_npy.shape + (3,),
    dtype="float32",
)
X_train.shape

(1920, 128, 201, 3)

In [29]:
# Iterating over a DataFrame directly yields its column labels, not its rows.
for i, column_name in enumerate(df_train_soundscape.loc[is_train]):
    print(i, column_name, sep="\n")

0
row_id
1
site
2
audio_id
3
seconds
4
birds
5
is_test
6
year
7
month
8
day
9
longitude
10
latitude
11
npy_parent
12
npy_path


In [30]:
# Which iteration helpers does a DataFrame offer?
[name for name in dir(df_train_soundscape) if name[:4] == "iter"]

['iteritems', 'iterrows', 'itertuples']

In [31]:
# Does the filtered frame's index restart at 0? No: the original row labels are kept.
df_train_soundscape.loc[is_train].head()

Unnamed: 0,row_id,site,audio_id,seconds,birds,is_test,year,month,day,longitude,latitude,npy_parent,npy_path
120,7954_COR_5,COR,7954,5,nocall,False,2019,9,23,-84.51,10.12,trainSoundScapes,trainSoundScapes/7954_COR_5.npy
121,7954_COR_10,COR,7954,10,nocall,False,2019,9,23,-84.51,10.12,trainSoundScapes,trainSoundScapes/7954_COR_10.npy
122,7954_COR_15,COR,7954,15,nocall,False,2019,9,23,-84.51,10.12,trainSoundScapes,trainSoundScapes/7954_COR_15.npy
123,7954_COR_20,COR,7954,20,nocall,False,2019,9,23,-84.51,10.12,trainSoundScapes,trainSoundScapes/7954_COR_20.npy
124,7954_COR_25,COR,7954,25,nocall,False,2019,9,23,-84.51,10.12,trainSoundScapes,trainSoundScapes/7954_COR_25.npy


In [32]:
# Peek at the first three training rows as namedtuples.
for i, row in enumerate(df_train_soundscape.loc[is_train].itertuples()):
    if i >= 3:
        break
    print(i)
    print(row)

0
Pandas(Index=120, row_id='7954_COR_5', site='COR', audio_id=7954, seconds=5, birds='nocall', is_test=False, year=2019, month=9, day=23, longitude=-84.51, latitude=10.12, npy_parent='trainSoundScapes', npy_path='trainSoundScapes/7954_COR_5.npy')
1
Pandas(Index=121, row_id='7954_COR_10', site='COR', audio_id=7954, seconds=10, birds='nocall', is_test=False, year=2019, month=9, day=23, longitude=-84.51, latitude=10.12, npy_parent='trainSoundScapes', npy_path='trainSoundScapes/7954_COR_10.npy')
2
Pandas(Index=122, row_id='7954_COR_15', site='COR', audio_id=7954, seconds=15, birds='nocall', is_test=False, year=2019, month=9, day=23, longitude=-84.51, latitude=10.12, npy_parent='trainSoundScapes', npy_path='trainSoundScapes/7954_COR_15.npy')


In [33]:
# Slice instead of calling `.head(3)`: an earlier cell replaces this DataFrame
# with its `.values` ndarray, which has no `.head`, so Restart & Run All would
# fail here. `[:3]` returns the first three rows for either type.
XX_train_normalized[:3]

Unnamed: 0,latitude_normalized,month_normalized,day_normalized
120,0.112444,0.208333,0.225806
121,0.112444,0.208333,0.225806
122,0.112444,0.208333,0.225806


**(?)** If `M` is a `(2,2)` ndarray, what do we do if we want three copies of `M` to make up 3 channels like an image?

In [34]:
# Toy 2x2 matrix used to try out ways of replicating an array into 3 channels.
M = np.arange(1, 5).reshape(2, 2)
# First attempt: stack three copies along a new leading axis.
np.stack((M, M, M))

array([[[1, 2],
        [3, 4]],

       [[1, 2],
        [3, 4]],

       [[1, 2],
        [3, 4]]])

In [35]:
# Spell the expression out instead of using `_`, which depends on which cell ran last.
np.stack([M, M, M]).shape

(3, 2, 2)

In [36]:
# Same stack, with the three copies written as a repeated tuple.
np.stack((M,) * 3)

array([[[1, 2],
        [3, 4]],

       [[1, 2],
        [3, 4]],

       [[1, 2],
        [3, 4]]])

Wrong shape: we want **channel-last**.

In [37]:
# Add a leading axis of length 1.
np.expand_dims(M, axis=0)

array([[[1, 2],
        [3, 4]]])

In [38]:
# Without `axis`, np.repeat flattens its input before repeating each element.
np.repeat(M[None, ...], repeats=3)

array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4])

In [39]:
# Repeat along the new leading axis: channel-first copies.
np.repeat(M[None, ...], repeats=3, axis=0)

array([[[1, 2],
        [3, 4]],

       [[1, 2],
        [3, 4]],

       [[1, 2],
        [3, 4]]])

In [40]:
# Spell the expression out instead of using `_`, which depends on which cell ran last.
np.repeat(M[np.newaxis], 3, axis=0).shape

(3, 2, 2)

In [41]:
# Channel-last version: add a trailing axis, then repeat along it.
MM = np.repeat(np.expand_dims(M, axis=-1), repeats=3, axis=-1)
MM.shape

(2, 2, 3)

In [42]:
# The first channel should be M itself.
MM[:, :, 0]

array([[1, 2],
       [3, 4]])

In [43]:
# Leading axis of length 1 again, as the starting point for the next attempt.
M[None, :, :]

array([[[1, 2],
        [3, 4]]])

In [44]:
# Repeating a (1, 2, 2) array along its last axis widens the rows instead of adding channels.
np.repeat(np.expand_dims(M, axis=0), repeats=3, axis=-1)

array([[[1, 1, 1, 2, 2, 2],
        [3, 3, 3, 4, 4, 4]]])

In [45]:
# Spell the expression out instead of using `_`, which depends on which cell ran last.
np.repeat(M[np.newaxis], 3, axis=-1).shape

(1, 2, 6)

In [46]:
# Repeat each element 3 times (flat), then fold the copies into a trailing channel axis.
np.repeat(M, 3).reshape(M.shape + (3,))

array([[[1, 1, 1],
        [2, 2, 2]],

       [[3, 3, 3],
        [4, 4, 4]]])

In [47]:
# Name the array explicitly instead of using `_`, which depends on which cell ran last.
M_channels_last = np.repeat(M, 3).reshape((*M.shape, 3))
# The first and last channels should both equal M.
M_channels_last[..., 0], M_channels_last[..., -1]

(array([[1, 2],
        [3, 4]]),
 array([[1, 2],
        [3, 4]]))

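Note on the above: `reshape` does not broadcast. It works here only because `np.repeat(M, 3)` already places the three copies of each element next to each other, so reshaping to `(*M.shape, 3)` puts them along the last (channel) axis.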

In [48]:
# Load every training spectrogram into X_train, copied into all three channels.
train_rows = df_train_soundscape.loc[is_train]
for i, npy_path in enumerate(train_rows["npy_path"]):
    mels_i = np.load(npy_path).astype(np.float32, copy=False)
    # Assigning an (H, W, 1) array into an (H, W, 3) slot broadcasts over channels.
    X_train[i] = mels_i[..., np.newaxis]
# Divide by 255; the displayed (max, min) shows the resulting value range.
X_train /= 255.0
X_train.max(), X_train.min()

(1.0, 0.0)

In [49]:
def birds_to_ndarry(series, label_index=None):
    """Encode whitespace-separated bird labels as a multi-hot matrix.

    Parameters
    ----------
    series : pandas.Series of str
        One entry per sample: either ``"nocall"`` or one or more species codes
        separated by whitespace. Only ``len(series)`` and ``series.values`` are used.
    label_index : dict, optional
        Mapping from species code to column index. Defaults to the notebook-level
        ``D_label_index``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(series), len(label_index))``. A cell is 1 when
        the sample mentions that label and 0 otherwise; ``"nocall"`` rows are all 0.

    Raises
    ------
    KeyError
        If an entry contains a label that is not a key of ``label_index``.
    """
    if label_index is None:
        label_index = D_label_index
    multi_hot = np.zeros((len(series), len(label_index)))
    for i, string in enumerate(series.values):
        if string == "nocall":
            continue
        # split() without an argument tolerates repeated/leading/trailing whitespace.
        L_indices = [label_index[label] for label in string.split()]
        # Assign instead of summing one-hot rows so a repeated label stays at 1.
        multi_hot[i, L_indices] = 1
    return multi_hot

In [50]:
# Multi-hot labels for every row of the soundscape table (train and test).
A = birds_to_ndarry(df_train_soundscape["birds"])
A.shape

(2400, 397)

In [51]:
# Labels of the rows that are not flagged as test.
y_train = A[df_train_soundscape["is_test"].eq(False)]
y_train.shape

(1920, 397)

In [52]:
# Labels of the rows flagged as test.
y_test = A[df_train_soundscape["is_test"].eq(True)]
y_test.shape

(480, 397)

## Models
- EfficientNet

In [55]:
from tensorflow.keras.applications import EfficientNetB0

In [56]:
# Channel-last shape of a single model input image.
random_npy.shape + (3,)

(128, 201, 3)

In [57]:
# Two inputs: the 3-channel mel image and the normalised metadata vector.
input_mels = keras.layers.Input(shape=(*random_npy.shape, 3), name="input_mels")
input_spacetime = keras.layers.Input(shape=(XX_train_normalized.shape[1],),
                                     name="input_spacetime")

# keras.applications EfficientNet models contain their own Rescaling and
# Normalization layers and expect raw pixel values in [0, 255]. The spectrograms
# were already divided by 255 when X_train was filled, so map them back to
# [0, 255] here; otherwise the pretrained backbone sees inputs about 255x
# smaller than the ones it was trained on.
mels_0_255 = keras.layers.Lambda(lambda t: t * 255.0, name="rescale_to_0_255")(input_mels)
output_efficient = EfficientNetB0(include_top=False, weights="imagenet")(mels_0_255)

In [58]:
# Public attributes of the backbone's symbolic output tensor.
[name for name in dir(output_efficient) if name[:1] != "_"]

['dtype',
 'experimental_ref',
 'from_tensor',
 'get_shape',
 'is_tensor_like',
 'name',
 'op',
 'ref',
 'set_shape',
 'shape',
 'type_spec']

In [59]:
# Show the shape and dtype of the backbone's symbolic output tensor.
output_efficient.type_spec

TensorSpec(shape=(None, 4, 6, 1280), dtype=tf.float32, name=None)

`(4, 6, 1280)` is a weird shape... What does the structure of EfficientNet look like?

In [60]:
# Build the full classifier (top included) only to inspect its layer structure.
m = EfficientNetB0(include_top=True, weights='imagenet')
m.summary()

Model: "efficientnetb0"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
rescaling_1 (Rescaling)         (None, 224, 224, 3)  0           input_2[0][0]                    
__________________________________________________________________________________________________
normalization_1 (Normalization) (None, 224, 224, 3)  7           rescaling_1[0][0]                
__________________________________________________________________________________________________
stem_conv_pad (ZeroPadding2D)   (None, 225, 225, 3)  0           normalization_1[0][0]            
_____________________________________________________________________________________

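I guess we get this weird shape of `(4, 6, 1280)` because of our own choice of input: we used an `Input` of shape `(128, 201, 3)`, while the full
EfficientNetB0 (with its classification top) expects square images of shape `(224, 224, 3)`. With `include_top=False` the network is convolutional only, so its output is a spatial grid whose size depends on the input size.<br>
_However, it seems that `kkiller` used ResNeSt with this same weird shape without reshaping_.<br>
Let's explore a little bit.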

In [61]:
# Average over the spatial grid to get one feature vector per sample.
pooling_layer = keras.layers.GlobalAveragePooling2D()
pooled = pooling_layer(output_efficient)
pooled.shape

TensorShape([None, 1280])

In [62]:
# Append the metadata features to the pooled image features (last axis).
concatenated = keras.layers.Concatenate(axis=-1)([pooled, input_spacetime])

In [63]:
# Apply dropout to the joint image + metadata vector before the classifier.
dropout_layer = keras.layers.Dropout(rate=0.2)
dropped = dropout_layer(concatenated)
dropped.shape

TensorShape([None, 1283])

In [64]:
# One sigmoid unit per species: independent per-label probabilities.
classifier_layer = keras.layers.Dense(units=len(L_birds), activation="sigmoid")
output_CNN = classifier_layer(dropped)
output_CNN.shape

TensorShape([None, 397])

In [65]:
# Despite its name, this model takes both the mel image and the metadata vector.
model_image_only = keras.Model(
    inputs=[input_mels, input_spacetime],
    outputs=[output_CNN],
)
# Name the metrics explicitly: Keras auto-generates unique names, so re-running
# this cell in the same kernel would yield "precision_1"/"recall_1" and break
# the logs["precision"] / logs["recall"] lookups done by PrintF1Score.
model_image_only.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[
        keras.metrics.Precision(name="precision"),
        keras.metrics.Recall(name="recall"),
    ],
)

In [66]:
# Keep only the best model on disk, and stop after 10 epochs without improvement.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    filepath="model1.h5",
    save_best_only=True,
)
early_stopping_cb = keras.callbacks.EarlyStopping(
    patience=10,
    restore_best_weights=True,
)

class PrintF1Score(keras.callbacks.Callback):
    """Print the train and validation F1 scores at the end of every epoch.

    F1 is derived from the "precision"/"recall" (and "val_precision"/"val_recall")
    entries of the Keras logs. EPSILON in the denominator avoids a division by
    zero when both precision and recall are 0.
    """

    @staticmethod
    def _f1(precision, recall):
        """Return the harmonic mean of precision and recall (EPSILON-stabilised)."""
        return 2 * precision * recall / (precision + recall + EPSILON)

    def on_epoch_end(self, epoch, logs=None):
        """Print ``f1_score`` and, when validation metrics exist, ``val_f1_score``.

        ``logs`` defaults to None, matching the Keras callback signature. A
        precision/recall pair that is absent from ``logs`` (e.g. no validation
        data) is skipped instead of raising KeyError.
        """
        logs = logs or {}
        for prefix in ("", "val_"):
            precision = logs.get(f"{prefix}precision")
            recall = logs.get(f"{prefix}recall")
            if precision is None or recall is None:
                continue
            print(f"{prefix}f1_score: {self._f1(precision, recall)}")

In [None]:
# Train on (spectrograms, metadata) -> multi-hot labels, holding out 20% for validation.
fit_options = dict(
    batch_size=32,
    epochs=100,
    validation_split=0.2,
    callbacks=[checkpoint_cb, early_stopping_cb, PrintF1Score()],
)
history = model_image_only.fit((X_train, XX_train_normalized), y_train, **fit_options)

Epoch 1/100


## Test Set

In [None]:
# `is_test` was never defined as a variable (only `is_train` was), so derive the
# mask from the column here; otherwise this cell fails with a NameError.
is_test = df_train_soundscape["is_test"]

n_test_instances = df_train_soundscape.loc[is_test].shape[0]
X_test = np.empty((n_test_instances, *random_npy.shape, 3), dtype=np.float32)
print(f"X_test.shape = {X_test.shape}")

# fill in X_test
for i, row in enumerate((df_train_soundscape.loc[is_test]).itertuples()):
    mels_i = np.load(row.npy_path).astype(np.float32, copy=False)
    # Copy the single-channel spectrogram into three identical trailing channels.
    X_test[i] = np.repeat(mels_i[..., np.newaxis], 3, axis=-1)
# Same /255 scaling as applied to X_train.
X_test /= 255.0
X_test.max(), X_test.min()