In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

In [None]:
# Input directory from which the split CSVs and the .npy files are read below.
path_naive = Path("../input") / "birdclef-2021-naive-npy"
# Cell output: True when that directory is present.
path_naive.exists()

In [None]:
# Read the five split tables in one pass; the order of the file names matches
# the order of the targets (short-audio train/val, soundscape train/val/test).
(
    df_shortaudio_train,
    df_shortaudio_val,
    df_soundscape_train,
    df_soundscape_val,
    df_soundscape_test,
) = (
    pd.read_csv(path_naive / csv_name)
    for csv_name in (
        "shortaudio_train.csv",
        "shortaudio_val.csv",
        "soundscape_train.csv",
        "soundscape_val.csv",
        "soundscape_test.csv",
    )
)

In [None]:
# Peek at the first rows of the soundscape validation table.
df_soundscape_val.head()

In [None]:
# Peek at the first rows of the short-audio training table.
df_shortaudio_train.head()

I forgot to add `month_x`, etc. to `shortaudio_{train,val,test}.csv`. Let's make up for that here.

This is not necessarily a bad thing -- by leaving these columns out, our `.csv` files are more lightweight.

In [None]:
def cyclicize_number(number, max_, min_):
    r"""Encode a periodic quantity as a point on the unit circle.

    The angle is ``theta = 2 * pi * number / (max_ - min_)``; ``min_`` only
    enters through the period and is not subtracted from ``number``.

    (Raw docstring: the ``\in`` below would otherwise be an invalid escape
    sequence.)

    args
        number, int
            \in {min_, min_ + 1, ..., max_}
            e.g. hour => min_ = 0, max_ = 24
                 longitude => min_ = -180, max_ = 180
        max_, int
        min_, int
    return
        (x, y), tuple of float
            (cos(theta), sin(theta))
    """
    period = max_ - min_
    # min_ is not subtracted from number: doing so would only rotate every
    # encoded point by the same constant angle.
    theta = 2 * np.pi * (number / period)
    return np.cos(theta), np.sin(theta)

# N.B. Applying the next function element by element to df_soundscape_train
#      is not efficient, since there are only 4 distinct longitudes.
def cyclicize_series(series, max_, min_):
    """Apply ``cyclicize_number`` to every element of ``series``.

    Returns a list with one ``(x, y)`` tuple per element, in input order.
    """
    return [cyclicize_number(number, max_, min_) for number in series]

In [None]:
# Add the cyclic month/day/longitude encodings and the scaled latitude to the
# short-audio training table. Each entry: (target columns, source column,
# max_, min_) as passed to cyclicize_series.
for xy_columns, source_column, max_, min_ in (
    (["month_x", "month_y"], "month", 12, 0),
    (["day_coarse_x", "day_coarse_y"], "day", 31, 0),
    (["longitude_x", "longitude_y"], "longitude", 180, -180),
):
    df_shortaudio_train[xy_columns] = cyclicize_series(
        df_shortaudio_train[source_column], max_, min_
    )
df_shortaudio_train["latitude_normalized"] = df_shortaudio_train["latitude"].div(90)
df_shortaudio_train.head()

In [None]:
# Same feature columns as for the training table, now for the short-audio
# validation table. Each entry: (target columns, source column, max_, min_).
for xy_columns, source_column, max_, min_ in (
    (["month_x", "month_y"], "month", 12, 0),
    (["day_coarse_x", "day_coarse_y"], "day", 31, 0),
    (["longitude_x", "longitude_y"], "longitude", 180, -180),
):
    df_shortaudio_val[xy_columns] = cyclicize_series(
        df_shortaudio_val[source_column], max_, min_
    )
df_shortaudio_val["latitude_normalized"] = df_shortaudio_val["latitude"].div(90)

- Produce common columns for the two different types of dataframes
- Build `df_train`, `df_val`, `df_test`
- Build `XX_train`, `XX_val`, `XX_test`

In [None]:
# Numeric space/time features fed to the model next to the image input.
L_feature_columns = [
    "month_x", "month_y",
    "day_coarse_x", "day_coarse_y",
    "longitude_x", "longitude_y",
    "latitude_normalized",
]

# Columns shared by the short-audio and soundscape tables: the features plus
# the file name and the label string.
L_common_columns = [*L_feature_columns, "npy_filename", "primary_label"]

In [None]:
# Soundscape rows are keyed by row_id; derive the .npy file name from it.
for df_soundscape in (df_soundscape_train, df_soundscape_val, df_soundscape_test):
    df_soundscape["npy_filename"] = df_soundscape["row_id"] + ".npy"

In [None]:
# Give the soundscape label column the same name as in the short-audio tables
# (renamed in place on each of the three frames).
for df_soundscape in (df_soundscape_train, df_soundscape_val, df_soundscape_test):
    df_soundscape.rename(columns={"birds": "primary_label"}, inplace=True)
# Cell output: confirms the renamed column is present.
"primary_label" in df_soundscape_train.columns

In [None]:
# Stack the short-audio and soundscape training rows, restricted to the
# shared columns.
train_parts = [
    df_shortaudio_train[L_common_columns],
    df_soundscape_train[L_common_columns],
]
df_train = pd.concat(train_parts)
# Cell output: shapes of the result and of both sources, plus column count.
df_train.shape, df_shortaudio_train.shape, df_soundscape_train.shape, len(L_common_columns)

In [None]:
# Peek at the first rows of the combined training table.
df_train.head()

In [None]:
# Stack the short-audio and soundscape validation rows, restricted to the
# shared columns.
val_parts = [
    df_shortaudio_val[L_common_columns],
    df_soundscape_val[L_common_columns],
]
df_val = pd.concat(val_parts)
# Cell output: shapes of the result and of both sources, plus column count.
df_val.shape, df_shortaudio_val.shape, df_soundscape_val.shape, len(L_common_columns)

In [None]:
# The test table comes from the soundscape test split only.
df_test = df_soundscape_test.loc[:, L_common_columns]
df_test.shape

Maybe we need to shuffle before assigning `df_train[L_feature_columns].values` to `XX_train`.

In [None]:
# Shuffle the rows once. A fixed random_state makes the row order -- and with
# it XX_* and y_*, which are derived from these frames in later cells --
# identical on every Restart & Run All.
df_train = df_train.sample(frac=1, random_state=42)
df_val = df_val.sample(frac=1, random_state=42)

In [None]:
# Feature matrices (one row per sample, one column per feature), taken in the
# current row order of the train and validation tables.
XX_train, XX_val = (
    frame[L_feature_columns].values for frame in (df_train, df_val)
)

In [None]:
# Display the training table after shuffling.
df_train

## Dataset Generator
Why switch to using a generator? How large is our data this time? Could you make an estimate?



In [None]:
import tensorflow as tf
import tensorflow.keras as keras

In [None]:
# Print the first five .npy file names of the training table, by position.
for i in range(5):
    npy_filename_i = df_train["npy_filename"].iloc[i]
    print(npy_filename_i)

In [None]:
# Same file names as a Series, including the index left over from the shuffle.
df_train["npy_filename"].head()

In [None]:
# Compare the shape of the feature matrix with the shape of its source table.
XX_train.shape, df_train.shape

In [None]:
# Inspect a single feature row.
XX_train[100]

In [None]:
# Shape of one feature row, i.e. (number of feature columns,).
XX_train[100].shape

In [None]:
PATH_DATASET = Path("../input/birdclef-2021")

# The sorted entry names under train_short_audio define the label vocabulary;
# a label's index is its position in that sorted list.
L_birds = sorted(
    path.name for path in (PATH_DATASET / "train_short_audio").iterdir()
)
D_label_index = dict(zip(L_birds, range(len(L_birds))))
D_index_label = dict(enumerate(L_birds))

def label(series):
    """Multi-hot encode label strings.

    args
        series, pandas.Series (or any sized iterable) of str
            Each entry is either "nocall" or one or more label names
            separated by whitespace; every name must be a key of
            ``D_label_index``.
    return
        y, np.ndarray of float32, shape (len(series), len(D_label_index))
            ``y[i, j] == 1`` iff the label with index ``j`` is listed in
            entry ``i``; "nocall" rows stay all-zero.
    raises
        KeyError if an entry names a label missing from ``D_label_index``.
    """
    y = np.zeros((len(series), len(D_label_index)), dtype=np.float32)
    for i, string in enumerate(series):
        if string == "nocall":
            continue
        # split() without an argument tolerates repeated, leading or trailing
        # whitespace, which split(" ") would turn into empty-string "labels"
        # (and hence a KeyError).
        L_indices = [D_label_index[name] for name in string.split()]
        y[i, L_indices] = 1
    return y

y_train = label(df_train["primary_label"])
# Cell output: the distinct numbers of positive labels per row.
np.unique(y_train.sum(axis=-1))

In [None]:
# (number of training rows, number of labels).
y_train.shape

In [None]:
# Encode the validation labels with the same label -> index mapping.
y_val = label(df_val["primary_label"])
#y_test = label(df_test["primary_label"])

In [None]:
import random
# Pick one entry of the train_npy directory at random; shown as cell output.
random.choice(list((path_naive / "train_npy").iterdir()))

In [None]:
# Pick the sample file explicitly instead of reading IPython's `_` (the last
# cell output), which only holds the path if the previous cell was the most
# recent one to produce an output. This may be a different file than the one
# displayed above; only its dtype and shape are inspected below.
random_npy_path = random.choice(list((path_naive / "train_npy").iterdir()))

In [None]:
# Load the sampled file and show its dtype and shape.
random_npy = np.load(random_npy_path)
random_npy.dtype, random_npy.shape

In [None]:
# Unpacking into two names requires random_npy to be 2-D; h and w are later
# passed to DatasetGenerator.
h, w = random_npy.shape

In [None]:
from tensorflow.keras.utils import Sequence
from joblib import Parallel, delayed

class DatasetGenerator(Sequence):
    """Keras ``Sequence`` yielding ``([images, features], labels)`` batches.

    Batch ``idx`` covers rows ``idx * batch_size`` to ``(idx + 1) * batch_size``
    (by position) of ``df``, ``XX`` and ``y``, so the three must be row-aligned.

    args
        df, pandas.DataFrame
            Must contain an "npy_filename" column.
        XX, array-like
            Per-row features; sliced along the first axis.
        y, array-like
            Per-row labels; sliced along the first axis.
        h, w, int
            Height and width of every loaded array.
        is_train, bool
            True  => files are read from ``path_naive / "train_npy"``,
            False => files are read from ``path_naive / "val_npy"``.
        batch_size, int
    """

    def __init__(self, df, XX, y, h, w, is_train=True, batch_size=32):
        self.df = df
        self.XX = XX
        self.y = y
        self.h = h
        self.w = w
        self.is_train = is_train
        self.batch_size = batch_size

    def __len__(self):
        # Floor division: rows beyond the last full batch are not served.
        return self.df.shape[0] // self.batch_size

    def _load_image(self, npy_filename):
        """Load one .npy file and return it as a float32 (h, w, 3) array."""
        if self.is_train:
            npy_path = path_naive / f"train_npy/{npy_filename}"
        else:
            npy_path = path_naive / f"val_npy/{npy_filename}"
        image = np.load(npy_path).astype(np.float32, copy=False)
        image /= 255.0
        # Repeat the single channel three times: (h, w) -> (h, w, 3).
        return np.repeat(image[..., np.newaxis], 3, axis=-1)

    def image_processing(self, i):
        """Legacy helper: fill ``self.batch_image[i]`` from ``self.batch_npy_filename``.

        ``__getitem__`` no longer sets those two attributes (it keeps its
        batch in local variables), so a caller of this method has to assign
        them first.
        """
        self.batch_image[i] = self._load_image(self.batch_npy_filename.iloc[i])

    def __getitem__(self, idx):
        """Return ``([batch_image, batch_features], batch_y)`` for batch ``idx``."""
        rows = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        batch_npy_filename = self.df["npy_filename"].iloc[rows]

        # The batch lives in locals rather than on self, so building one batch
        # cannot clobber another. The buffer is sized from the actual slice so
        # images, features and labels always have the same leading dimension.
        batch_image = np.zeros(
            (len(batch_npy_filename), self.h, self.w, 3), dtype=np.float32
        )
        for i, npy_filename in enumerate(batch_npy_filename):
            batch_image[i] = self._load_image(npy_filename)

        batch_X = [batch_image, self.XX[rows]]
        batch_y = self.y[rows]
        return batch_X, batch_y
 

In [None]:
# Smoke test: print the shapes of the image batch, the feature batch and the
# label batch for the first 11 training batches.
sanity_generator = DatasetGenerator(df_train, XX_train, y_train, h, w)
k = 0
for (i, f), y in sanity_generator:
    if k > 10:
        break
    for batch_part in (i, f, y):
        print(batch_part.shape)
    k += 1


In [None]:
from tensorflow.keras.applications import EfficientNetB0
import tensorflow.keras as keras

In [None]:
# Two inputs: the 3-channel image and the space/time feature vector.
input_mels = keras.layers.Input(shape=(*random_npy.shape, 3), name="input_mels")
input_spacetime = keras.layers.Input(shape=(XX_train.shape[1],),
                                     name="input_spacetime")

# NOTE(review): the keras.applications EfficientNet models are documented as
# including their own input rescaling and expecting pixel values in [0, 255],
# whereas DatasetGenerator divides the images by 255 -- confirm which scaling
# is intended.
output_efficient = EfficientNetB0(include_top=False, weights="imagenet")(input_mels)
pooled = keras.layers.GlobalAveragePooling2D()(output_efficient)
# The pooled image features and the space/time features are classified jointly.
concatenated = keras.layers.Concatenate()([pooled, input_spacetime])
dropped = keras.layers.Dropout(.2)(concatenated)
# One sigmoid unit per label (multi-label output).
output_CNN = keras.layers.Dense(len(L_birds), activation="sigmoid")(dropped)
model = keras.Model(
    inputs=[input_mels, input_spacetime],
    outputs=[output_CNN],
)
# The metrics are named explicitly: unnamed metric objects get an
# auto-incremented suffix ("precision_1", ...) when this cell is run again in
# the same kernel, which would break the logs["precision"] / logs["recall"]
# lookups in the F1 callback.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[
        keras.metrics.Precision(name="precision"),
        keras.metrics.Recall(name="recall"),
    ],
)

In [None]:
# Write the model to model1.h5, keeping only the best one seen so far.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    filepath="model1.h5",
    save_best_only=True,
)
# Stop after 10 epochs without improvement and restore the best weights.
early_stopping_cb = keras.callbacks.EarlyStopping(
    patience=10,
    restore_best_weights=True,
)

# Added to the F1 denominator so that precision + recall == 0 does not divide
# by zero.
EPSILON = 1e-6


class PrintF1Score(keras.callbacks.Callback):
    """Print the F1 score derived from the logged precision and recall.

    Reads "precision"/"recall" and "val_precision"/"val_recall" from the
    epoch logs (``print(logs.keys())`` shows which keys are available). A pair
    that is missing -- e.g. the validation pair when no validation data is
    used -- is skipped instead of raising ``KeyError``.
    """

    @staticmethod
    def _f1(precision, recall):
        """Harmonic mean of precision and recall, stabilised by EPSILON."""
        return 2 * precision * recall / (precision + recall + EPSILON)

    def on_epoch_end(self, epoch, logs=None):
        # logs defaults to None, matching the base Callback signature.
        logs = logs or {}
        for prefix in ("", "val_"):
            precision = logs.get(f"{prefix}precision")
            recall = logs.get(f"{prefix}recall")
            if precision is None or recall is None:
                continue
            print(f"{prefix}f1_score: {self._f1(precision, recall)}")

In [None]:
# Print the layer-by-layer summary of the two-input model.
model.summary()

In [None]:
# The validation generator must be built with is_train=False: with the default
# (True) DatasetGenerator reads the validation file names from train_npy/
# instead of val_npy/.
# batch_size is given to the generators rather than to fit(): Keras documents
# that batch_size must not be specified for Sequence inputs, since they
# produce the batches themselves.
history = model.fit(
    DatasetGenerator(df_train, XX_train, y_train, h, w,
                     is_train=True, batch_size=32),
    epochs=100,
    validation_data=DatasetGenerator(df_val, XX_val, y_val, h, w,
                                     is_train=False, batch_size=32),
    callbacks=[checkpoint_cb, early_stopping_cb, PrintF1Score()],
)

### Debug
The problem might be that the dataset generator should not have been rewritten so simply, i.e. as a plain generator using `yield`.

In [None]:
# Debug: check that Concatenate accepts a (32, 1280) and a (32, 7) input.
keras.layers.Concatenate()([np.zeros((32, 1280)), np.zeros((32, 7))])

In [None]:
# NOTE(review): `trainset_generator` is not defined in any cell visible in
# this notebook, so this cell presumably relies on a definition that has since
# been removed -- confirm, or iterate DatasetGenerator instead.
# Prints the shapes of the image, feature and label batches for the first 11
# items it yields.
k = 0
for ((i, f), y) in trainset_generator():
    if k > 10:
        break
    print(i.shape)
    print(f.shape)
    print(y.shape)
    k += 1

In [None]:
# Number of values in a 128 x 201 x 3 array -- presumably the size of one
# (h, w, 3) model input; confirm against random_npy.shape.
128 * 201 * 3