In [1]:
import glob
import json
import os
import random as rnd
import shutil

import keras.backend as K
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from IPython.core.interactiveshell import InteractiveShell
from sklearn.model_selection import StratifiedGroupKFold
from tensorflow import keras
from tqdm import tqdm

InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Notebook-wide configuration. Every later cell reads these names.
root_dir = "/app/_data/artist_data/"

# Cross-validation: SEED is the random_state of the fold splitter.
SEED = 39
N_FOLDS = 6

# Input geometry: IMG_SIZE is the (rows, cols) target of each loaded array and
# INPUT_SIZE appends the channel count for the Keras input layers.
IMG_SIZE = (512, 81)
N_CHANELS = 1
INPUT_SIZE = (*IMG_SIZE, N_CHANELS)

# Batching and preprocessing: NORM toggles min-max scaling in the generator.
BATCH_SIZE = 32
NORM = False

In [3]:
# Load the training table; later cells read its "artistid", "artistid_count",
# "trackid" and "path" columns.
train_csv_path = os.path.join(root_dir, "train.csv")
train = pd.read_csv(train_csv_path)

## train_val_split

In [4]:
# Tag every row with the index of the fold in which it is a validation row.
# The split is grouped by "artistid" and stratified on "artistid_count".
gkf = StratifiedGroupKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
fold_splits = gkf.split(
    X=train[["artistid", "artistid_count"]],
    y=train["artistid_count"],
    groups=train["artistid"],
)
for n, (train_ids, val_ids) in enumerate(fold_splits):
    # Only the validation half is needed: each row is a validation row once.
    train.loc[val_ids, "fold"] = n



In [5]:
# Build per-artist lookup tables for the chosen validation fold.
# Each entry maps an artist id to {"tracks": [...], "paths": [...], "count": n_rows}.
FOLD = 0
artist_ids = train["artistid"].unique().tolist()
train_set = train[train["fold"] != FOLD]["artistid"].unique()
val_set = train[train["fold"] == FOLD]["artistid"].unique()

# `in` on a NumPy array scans the whole array on every test; a set is O(1).
train_artist_lookup = set(train_set.tolist())
# Group the frame once instead of re-filtering all rows for every artist.
rows_by_artist = train.groupby("artistid", sort=False)

train_data, val_data = {}, {}
for art_id in tqdm(artist_ids):
    new_df = rows_by_artist.get_group(art_id)
    paths = new_df["path"].values.tolist()
    tracks = new_df["trackid"].values.tolist()
    artist_entry = {"tracks": tracks, "paths": paths, "count": new_df.shape[0]}
    if art_id in train_artist_lookup:
        train_data[art_id] = artist_entry
    else:
        # Artists with no row outside the validation fold.
        val_data[art_id] = artist_entry

100% 18468/18468 [00:43<00:00, 423.37it/s]


## DataGenerator

In [7]:
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` yielding batches of array pairs for a siamese model.

    ``data`` maps an artist id to a dict with the keys ``"paths"`` (files
    readable by ``np.load``), ``"tracks"`` and ``"count"``. Batch ``b`` walks
    the artists at positions ``b * batch_size`` to ``(b + 1) * batch_size - 1``
    and draws one pair per artist.

    Labels: ``0`` when both arrays come from the same artist, ``1`` when they
    come from different artists.

    Args:
        data: per-artist dictionary described above.
        img_size: ``(rows, cols)`` of every array placed in a batch.
        batch_size: number of pairs per batch.
        norm: when true, each array is min-max scaled after loading.
        n_chanels: size of the trailing channel axis; ``None`` means 1.
        shuffle: when true, the artist order is shuffled at construction and
            after every epoch.
    """

    def __init__(self, data, img_size, batch_size, norm, n_chanels, shuffle):
        self.data = data
        self.img_size = img_size
        self.batch_size = batch_size
        self.norm = norm
        self.n_chanels = 1 if n_chanels is None else n_chanels
        self.shuffle = shuffle
        self.artist_ids = list(self.data.keys())
        if self.shuffle:
            np.random.shuffle(self.artist_ids)

    def __len__(self):
        """Number of full batches; a trailing partial batch is dropped."""
        return len(self.artist_ids) // self.batch_size

    def on_epoch_end(self):
        """Reshuffle the artist order between epochs when shuffling is on."""
        if self.shuffle:
            np.random.shuffle(self.artist_ids)

    def load_img(self, path):
        """Load one 2-D array and return it as ``(rows, cols, n_chanels)`` float32.

        The array is optionally min-max scaled, zero-padded symmetrically along
        axis 1 up to ``img_size[1]`` when its shape differs from ``img_size``,
        and replicated along a new trailing channel axis.
        """
        img = np.load(path).astype("float32")
        if self.norm:
            img -= img.min()
            peak = img.max()
            # A constant array has peak 0 after the shift; dividing would fill
            # it with NaN, so leave it as zeros instead.
            if peak > 0:
                img /= peak
        if img.shape != self.img_size:
            wpad = self.img_size[1] - img.shape[1]
            wpad_l = wpad // 2
            wpad_r = wpad - wpad_l
            img = np.pad(
                img,
                pad_width=((0, 0), (wpad_l, wpad_r)),
                mode="constant",
                constant_values=0,
            )
        img = np.expand_dims(img, -1)
        if self.n_chanels != 1:
            # Replicate to exactly n_chanels copies so the result always fits
            # the batch buffers allocated in __getitem__.
            img = np.repeat(img, self.n_chanels, axis=-1)
        return img

    def make_pair(self, ix, pos_label):
        """Pick two file paths for the artist at position ``ix``.

        Args:
            ix: index into ``self.artist_ids``.
            pos_label: request a same-artist pair. Downgraded to ``False`` when
                the artist's ``"count"`` is below 2.

        Returns:
            ``(pos_label, (path1, path2))`` with the possibly updated flag.

        Raises:
            ValueError: a different-artist pair is needed but fewer than two
                artists are available.
        """
        artist_id = self.artist_ids[ix]
        own_paths = self.data[artist_id]["paths"]
        if self.data[artist_id]["count"] < 2:
            pos_label = False
        if pos_label:
            path1, path2 = rnd.sample(own_paths, 2)
        else:
            if len(self.artist_ids) < 2:
                # Without this guard the rejection loop below never terminates.
                raise ValueError(
                    "at least two artists are required to build a negative pair"
                )
            path1 = rnd.choice(own_paths)
            new_artist_id = artist_id
            while new_artist_id == artist_id:
                new_artist_id = rnd.choice(self.artist_ids)
            path2 = rnd.choice(self.data[new_artist_id]["paths"])
        return pos_label, (path1, path2)

    def _get_one(self, ix, pos_label):
        """Return ``((img1, img2), y)`` for one artist; ``y`` is 0 for same-artist pairs."""
        upd_pos_label, (path1, path2) = self.make_pair(ix=ix, pos_label=pos_label)
        img1 = self.load_img(path1)
        img2 = self.load_img(path2)
        y = 0 if upd_pos_label else 1
        return (img1, img2), y

    def __getitem__(self, batch_ix):
        """Assemble batch ``batch_ix`` as ``({"input1": X1, "input2": X2}, y)``."""
        batch_shape = (
            self.batch_size,
            self.img_size[0],
            self.img_size[1],
            self.n_chanels,
        )
        b_X1 = np.zeros(batch_shape, dtype=np.float32)
        b_X2 = np.zeros(batch_shape, dtype=np.float32)
        b_Y = np.zeros(self.batch_size, dtype=np.float32)
        first_ix = self.batch_size * batch_ix
        for i in range(self.batch_size):
            # Request a same-artist pair about half of the time.
            pos_label = np.random.random() > 0.5
            (b_X1[i], b_X2[i]), b_Y[i] = self._get_one(first_ix + i, pos_label)
        return {"input1": b_X1, "input2": b_X2}, b_Y

In [8]:
# Settings shared by both generators; only the data and the shuffling differ.
generator_options = dict(
    img_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    norm=NORM,
    n_chanels=N_CHANELS,
)
train_gen = DataGenerator(data=train_data, shuffle=True, **generator_options)
val_gen = DataGenerator(data=val_data, shuffle=False, **generator_options)

### balance between 1 and 0

In [8]:
# Mean label per batch. Labels are 1 for different-artist (negative) pairs and
# 0 for same-artist (positive) pairs, so the mean is the share of negatives.
tgs = []
for i in tqdm(range(len(train_gen))):
    _, batch_labels = train_gen[i]
    tgs.append(batch_labels.mean())
negative_share = np.mean(tgs)
print(
    f"{negative_share:.3f} of negative pairs, "
    f"{1 - negative_share:.3f} of positive pairs"
)

100% 480/480 [00:11<00:00, 40.35it/s]

0.496 of negative pairs, 0.504 of negative pairs





## Build model

In [9]:
def euclidean_distance(vects):
    """Row-wise Euclidean distance between two batches of vectors.

    Args:
        vects: pair ``(x, y)`` of tensors that can be subtracted element-wise.

    Returns:
        Tensor of distances with axis 1 reduced but kept as size 1. The squared
        sum is clamped from below at the Keras epsilon before the square root.
    """
    first, second = vects
    squared_diff = tf.math.square(first - second)
    squared_norm = tf.math.reduce_sum(squared_diff, axis=1, keepdims=True)
    floor = tf.keras.backend.epsilon()
    return tf.math.sqrt(tf.math.maximum(squared_norm, floor))

In [20]:
def construct_embedding_model(
    input,
    embedding_len=1024,
    n_blocks=4,
    kernel_size=(10, 3),
    activation_fn="relu",
    batch_norm=False,
):
    """Build the convolutional network that maps one input to an embedding.

    Args:
        input: Keras input tensor the network is built on.
        embedding_len: units of the final dense layer.
        n_blocks: number of Conv2D + AveragePooling2D blocks.
        kernel_size: kernel size used by every Conv2D layer.
        activation_fn: activation of the Conv2D layers and the dense layer.
        batch_norm: when true, a BatchNormalization layer is applied to the
            input before the first block.

    Returns:
        ``keras.Model`` named ``"embedding"``.
    """
    # Filter count of block i is 2 ** (2 * (i + 1)): 4, 16, 64, 256, ...
    depth_vector = 2 ** ((np.arange(n_blocks) + 1) * 2)

    features = keras.layers.BatchNormalization()(input) if batch_norm else input

    for block_ix in range(n_blocks):
        block_no = block_ix + 1
        features = keras.layers.Conv2D(
            depth_vector[block_ix],
            kernel_size,
            activation=activation_fn,
            name=f"Conv2D_{block_no}",
        )(features)
        features = keras.layers.AveragePooling2D(
            pool_size=(2, 2), name=f"avg_pool_{block_no}"
        )(features)

    flat = keras.layers.Flatten(name="flatten")(features)
    embedding = keras.layers.Dense(
        embedding_len, activation=activation_fn, name=f"dense_{embedding_len}"
    )(flat)
    return keras.Model(inputs=input, outputs=embedding, name="embedding")

In [21]:
def make_model(
    input_shape=(512, 81, 1),
    n_blocks=4,
    kernel_size=(10, 3),
    embedding_len=1024,
    activation_fn="relu",
    batch_norm=False,
):
    """Assemble the two-input siamese model around one shared embedding network.

    Both inputs (named ``"input1"`` and ``"input2"``) pass through the same
    embedding model; the Euclidean distance between the two embeddings feeds a
    single sigmoid unit.

    Args:
        input_shape: shape of one input sample, without the batch axis.
        n_blocks, kernel_size, embedding_len, activation_fn, batch_norm:
            forwarded unchanged to ``construct_embedding_model``.

    Returns:
        ``keras.Model`` taking ``[input1, input2]`` and returning one value per pair.
    """
    embedding_input = keras.layers.Input(input_shape)
    base_model = construct_embedding_model(
        input=embedding_input,
        embedding_len=embedding_len,
        n_blocks=n_blocks,
        kernel_size=kernel_size,
        activation_fn=activation_fn,
        batch_norm=batch_norm,
    )

    input_1 = keras.layers.Input(input_shape, name="input1")
    input_2 = keras.layers.Input(input_shape, name="input2")
    # The same model object is called twice, so both branches share weights.
    embeddings = [base_model(input_1), base_model(input_2)]

    distance = keras.layers.Lambda(euclidean_distance)(embeddings)
    output_layer = keras.layers.Dense(1, activation="sigmoid")(distance)
    return keras.Model(inputs=[input_1, input_2], outputs=output_layer)

In [22]:
# tf.keras.utils.plot_model(
#     make_model(
#         input_shape=(512, 81, 1),
#         n_blocks=4,
#         embedding_len=1024,
#         kernel_size=(10, 3),
#         activation_fn="relu",
#         batch_norm=False,
#     ),
#     to_file="mod.png",
#     show_shapes=True,
#     expand_nested=True,
# )

In [23]:
def loss(margin=1):
    """Create a contrastive loss function with the given margin.

    Args:
        margin: predictions at or above this value contribute nothing for
            samples labelled 1.

    Returns:
        ``contrastive_loss(y_true, y_pred)``: the mean of
        ``(1 - y_true) * y_pred**2 + y_true * max(margin - y_pred, 0)**2``.
    """

    def contrastive_loss(y_true, y_pred):
        # Label 0: penalise large predictions.
        zero_label_term = (1 - y_true) * tf.math.square(y_pred)
        # Label 1: penalise predictions that fall short of the margin.
        shortfall = tf.math.maximum(margin - (y_pred), 0)
        one_label_term = (y_true) * tf.math.square(shortfall)
        return tf.math.reduce_mean(zero_label_term + one_label_term)

    return contrastive_loss

In [24]:
def make_callbacks(
    path,
    monitor="val_loss",
    mode="min",
    reduce_patience=10,
    stop_patience=100,
    tensorboard_dir="/app/.tensorboard/constr_3/",
):
    """Create the list of Keras callbacks used for training.

    Args:
        path: directory for the best checkpoint (``model.h5``) and for the
            ``backup`` sub-directory used by ``BackupAndRestore``.
        monitor: metric watched by early stopping, checkpointing and the
            learning-rate schedule.
        mode: ``"min"`` or ``"max"``, the direction in which ``monitor`` improves.
        reduce_patience: epochs without improvement before the learning rate
            is multiplied by 0.9.
        stop_patience: epochs without improvement before training stops.
        tensorboard_dir: TensorBoard log directory. The default keeps the
            location that was previously hard-coded.

    Returns:
        List of callback instances in the order they are passed to ``fit``.
    """
    early_stopping = keras.callbacks.EarlyStopping(
        monitor=monitor,
        patience=stop_patience,
        restore_best_weights=True,
        verbose=1,
        mode=mode,
    )
    # Full model (not weights only), overwritten whenever `monitor` improves.
    checkpoint = keras.callbacks.ModelCheckpoint(
        os.path.join(path, "model.h5"),
        monitor=monitor,
        verbose=1,
        save_best_only=True,
        save_weights_only=False,
        mode=mode,
        save_freq="epoch",
    )
    reduce_lr = keras.callbacks.ReduceLROnPlateau(
        monitor=monitor,
        factor=0.9,
        patience=reduce_patience,
        verbose=1,
        mode=mode,
        min_delta=1e-4,
        min_lr=0.00000001,
    )
    tensorboard = keras.callbacks.TensorBoard(log_dir=tensorboard_dir, histogram_freq=0)
    callbacks = [
        early_stopping,
        checkpoint,
        reduce_lr,
        tensorboard,
        keras.callbacks.BackupAndRestore(os.path.join(path, "backup")),
        keras.callbacks.TerminateOnNaN(),
    ]
    return callbacks

In [29]:
# Architecture settings for this run, kept in one dict so they are easy to inspect.
model_params = {
    "input_shape": INPUT_SIZE,
    "n_blocks": 4,
    "embedding_len": 2048,
    "kernel_size": (10, 3),
    "activation_fn": "relu",
    "batch_norm": False,
}
model = make_model(**model_params)

In [None]:
# shutil.rmtree(mod_folder)
# shutil.rmtree('/app/.tensorboard/constr_3/')

In [31]:
# Output directory for this run's checkpoints and backup state.
mod_folder = "/app/_data/artist_data/models/test_arch/constr_3/"
# Optional clean start: uncomment to delete earlier checkpoints and logs.
# shutil.rmtree(mod_folder)
# shutil.rmtree('/app/.tensorboard/constr_3/')
callbacks = make_callbacks(
    path=mod_folder,
    monitor="val_loss",
    mode="min",
    reduce_patience=10,
    stop_patience=100,
)
optimizer = keras.optimizers.Adam(learning_rate=0.001)
tracked_metrics = [
    "accuracy",
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall(),
]
model.compile(loss=loss(margin=1), optimizer=optimizer, metrics=tracked_metrics)

In [32]:
# Train on the pair generators. Step counts equal the number of full batches of
# artists; ten worker processes fill a queue of up to ten prepared batches.
fit_options = dict(
    batch_size=BATCH_SIZE,
    epochs=250,
    verbose="auto",
    shuffle=True,
    steps_per_epoch=len(train_data) // BATCH_SIZE,
    validation_steps=len(val_data) // BATCH_SIZE,
    validation_batch_size=BATCH_SIZE,
    validation_freq=1,
    max_queue_size=10,
    workers=10,
    use_multiprocessing=True,
)
history = model.fit(
    x=train_gen,
    validation_data=val_gen,
    callbacks=callbacks,
    **fit_options,
)

Epoch 1/250
Epoch 1: val_loss improved from inf to 0.23333, saving model to /app/_data/artist_data/models/test_arch/constr_3/model.h5
Epoch 2/250
Epoch 2: val_loss did not improve from 0.23333
Epoch 3/250
Epoch 3: val_loss improved from 0.23333 to 0.20430, saving model to /app/_data/artist_data/models/test_arch/constr_3/model.h5
Epoch 4/250
Epoch 4: val_loss improved from 0.20430 to 0.19144, saving model to /app/_data/artist_data/models/test_arch/constr_3/model.h5
Epoch 5/250
Epoch 5: val_loss improved from 0.19144 to 0.18980, saving model to /app/_data/artist_data/models/test_arch/constr_3/model.h5
Epoch 6/250
Epoch 6: val_loss improved from 0.18980 to 0.17712, saving model to /app/_data/artist_data/models/test_arch/constr_3/model.h5
Epoch 7/250
Epoch 7: val_loss improved from 0.17712 to 0.16848, saving model to /app/_data/artist_data/models/test_arch/constr_3/model.h5
Epoch 8/250
Epoch 8: val_loss improved from 0.16848 to 0.16782, saving model to /app/_data/artist_data/models/test_ar

Process Keras_worker_ForkPoolWorker-2186:
Process Keras_worker_ForkPoolWorker-2189:
Process Keras_worker_ForkPoolWorker-2190:
Process Keras_worker_ForkPoolWorker-2187:
Process Keras_worker_ForkPoolWorker-2182:
Process Keras_worker_ForkPoolWorker-2181:
Process Keras_worker_ForkPoolWorker-2183:
Process Keras_worker_ForkPoolWorker-2184:
Process Keras_worker_ForkPoolWorker-2188:
Traceback (most recent call last):
Traceback (most recent call last):
Process Keras_worker_ForkPoolWorker-2185:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.8



  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()




  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)




  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 131, in worker
    put((job, i, result))




  File "/usr/lib/python3.8/multiprocessing/pool.py", line 131, in worker
    put((job, i, result))




  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)




  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()




  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)




  File "/usr/lib/python3.8/multiprocessing/pool.py", line 131, in worker
    put((job, i, result))




  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 367, in put
    with self._wlock:




  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)




  File "/usr/lib/python3.8/multiprocessing/queues.py", line 367, in put
    with self._wlock:
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 131, in worker
    put((job, i, result))




  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)




  File "/usr/lib/python3.8/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 367, in put
    with self._wlock:
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 131, in worker
    put((job, i, result))




  File "/usr/lib/python3.8/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 131, in worker
    put((job, i, result))
  File "/usr/lib/python3.8/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()




  File "/usr/lib/python3.8/multiprocessing/queues.py", line 368, in put
    self._writer.send_bytes(obj)
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/usr/local/lib/python3.8/dist-packages/keras/utils/data_utils.py", line 580, in get_index
    return _SHARED_SEQUENCES[uid][i]




  File "/usr/lib/python3.8/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 355, in get
    with self._rlock:




KeyboardInterrupt




  File "/usr/lib/python3.8/multiprocessing/queues.py", line 367, in put
    with self._wlock:
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 367, in put
    with self._wlock:




KeyboardInterrupt




  File "/usr/lib/python3.8/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/tmp/ipykernel_225476/3947053097.py", line 68, in __getitem__
    b_X2 = np.zeros(




  File "/usr/lib/python3.8/multiprocessing/queues.py", line 356, in get
    res = self._reader.recv_bytes()




  File "/usr/local/lib/python3.8/dist-packages/keras/utils/data_utils.py", line 580, in get_index
    return _SHARED_SEQUENCES[uid][i]
KeyboardInterrupt




  File "/usr/lib/python3.8/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()




  File "/usr/lib/python3.8/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/usr/lib/python3.8/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()




  File "/usr/lib/python3.8/multiprocessing/connection.py", line 405, in _send_bytes
    self._send(buf)




  File "/usr/lib/python3.8/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
KeyboardInterrupt




  File "/tmp/ipykernel_225476/3947053097.py", line 78, in __getitem__
    (b_X1[i], b_X2[i]), b_Y[i] = self._get_one(




KeyboardInterrupt
KeyboardInterrupt




KeyboardInterrupt




  File "/usr/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
    buf = self._recv(4)
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)




  File "/tmp/ipykernel_225476/3947053097.py", line 59, in _get_one
    img2 = self.load_img(path2)




KeyboardInterrupt
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)




  File "/tmp/ipykernel_225476/3947053097.py", line 23, in load_img
    img = np.load(path).astype("float32")




KeyboardInterrupt
  File "/usr/local/lib/python3.8/dist-packages/numpy/lib/npyio.py", line 413, in load
    return format.read_array(fid, allow_pickle=allow_pickle,




  File "/usr/local/lib/python3.8/dist-packages/numpy/lib/format.py", line 731, in read_array
    shape, fortran_order, dtype = _read_array_header(fp, version)




  File "/usr/local/lib/python3.8/dist-packages/numpy/lib/format.py", line 594, in _read_array_header
    header = _filter_header(header)
  File "/usr/local/lib/python3.8/dist-packages/numpy/lib/format.py", line 555, in _filter_header
    for token in tokenize.generate_tokens(StringIO(s).readline):




  File "/usr/lib/python3.8/tokenize.py", line 527, in _tokenize
    start, end = pseudomatch.span(1)




KeyboardInterrupt




KeyboardInterrupt: 

Process Keras_worker_ForkPoolWorker-2194:
Process Keras_worker_ForkPoolWorker-2191:
Process Keras_worker_ForkPoolWorker-2192:
Process Keras_worker_ForkPoolWorker-2200:
Process Keras_worker_ForkPoolWorker-2197:
Process Keras_worker_ForkPoolWorker-2193:
Process Keras_worker_ForkPoolWorker-2195:
Process Keras_worker_ForkPoolWorker-2196:
Process Keras_worker_ForkPoolWorker-2199:
Process Keras_worker_ForkPoolWorker-2198:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/li



KeyboardInterrupt




KeyboardInterrupt
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)




KeyboardInterrupt
KeyboardInterrupt




KeyboardInterrupt


In [33]:
def plot_history(history):
    """Plot every training metric, with its validation curve when one exists.

    Args:
        history: mapping from metric name to a sequence of per-epoch values,
            such as the ``history`` attribute of the object returned by
            ``model.fit``. Keys containing ``"val"`` and the key ``"lr"`` do
            not get a panel of their own.

    Panels are laid out two per row. A metric without a ``val_<name>`` entry is
    drawn alone instead of raising ``KeyError``; an empty mapping draws nothing.
    """
    train_keys = [key for key in history if "val" not in key and key != "lr"]
    if not train_keys:
        return
    n_rows = int(np.ceil(len(train_keys) / 2))
    plt.figure(figsize=(20, 5 * n_rows))
    plt.suptitle("Training history")
    for panel_no, label in enumerate(train_keys, start=1):
        plt.subplot(n_rows, 2, panel_no)
        plt.title(label)
        plt.plot(history[label], label=f"train_{label}")
        val_key = f"val_{label}"
        if val_key in history:
            plt.plot(history[val_key], label=val_key)
        plt.legend()
    plt.show()

In [34]:
# Needs `history`, which is bound only when the `model.fit(...)` cell finishes.
# In the recorded run that cell was interrupted, hence the NameError below.
plot_history(history.history)

NameError: name 'history' is not defined

## save

In [34]:
# NOTE(review): this rebinds `mod_folder` to ".../constr_1/", while the training
# cell wrote its checkpoints under ".../constr_3/" — confirm which run is saved here.
mod_folder = "/app/_data/artist_data/models/test_arch/constr_1/"

In [28]:
# Epoch with the lowest validation loss, and the validation accuracy at that
# epoch expressed in thousandths (used in the saved model's file name).
val_losses = history.history["val_loss"]
min_val_loss_ix = np.argmin(val_losses)
max_acc = int(np.round(history.history["val_accuracy"][min_val_loss_ix] * 1000))

In [29]:
max_acc

854

In [30]:
# Copy the history with plain Python floats so it can be written to JSON.
# A new dict is built so the History object kept by Keras is left untouched
# and re-running this cell always starts from the same input.
train_history = {
    metric: [float(value) for value in values]
    for metric, values in history.history.items()
}

In [31]:
# The file name carries the validation accuracy (in thousandths) computed above.
model_path = os.path.join(mod_folder, f"model_{max_acc}.h5")
model.save(model_path)

In [35]:
# NOTE(review): these architecture values are typed by hand rather than read
# from the `make_model(...)` call, which uses embedding_len=2048 and
# batch_norm=False — confirm they describe the model actually being saved.
model_config = {
    "eff": False,
    "input_shape": (512, 81, 1),
    "n_blocks": 4,
    "embedding_len": 1024,
    "kernel_size": (10, 3),
    "activation_fn": "relu",
    "batch_norm": True,
}
# "pos_label": 0 matches the generator, which labels same-artist pairs with 0.
config = {
    "loss": "constr",
    "pos_label": 0,
    "model": model_config,
    "history": train_history,
    "norm": NORM,
    "fold": FOLD,
}

In [36]:
# Store the run description next to the saved model.
config_path = os.path.join(mod_folder, "config.json")
with open(config_path, "w") as config_file:
    json.dump(config, config_file)