In [40]:
import glob
import json
import os
import random as rnd
import shutil
import sys

import keras.backend as K
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from IPython.core.interactiveshell import InteractiveShell
from sklearn.metrics.pairwise import euclidean_distances
from tensorflow import keras
from src.test_utils import (
    TestGenerator,
    choose_100,
    embNet,
    eval_submission,
    find_batch_size,
    get_emb_model,
    load_submission,
    pairwise_distances,
)
from tqdm import tqdm

from src.utils import euclidean_distance, loss
from _data.artist_data.ny_baseline import FeaturesLoader, NT_Xent, TestLoader, SimCLR, BasicNet, inference, get_ranked_list,eval_submission
InteractiveShell.ast_node_interactivity = "all"

In [7]:
root_dir = "/app/_data/artist_data/"
# mod_dirs = ["/app/_data/artist_data/models/test_arch/constr_7/", "/app/_data/artist_data/models/test_arch/constr_8/", "/app/_data/artist_data/models/test_arch/constr_9/"]
mod_dirs = ["/app/_data/artist_data/models/test_arch/tripl_6/"]

In [3]:
train = pd.read_csv(os.path.join(root_dir, "train.csv"))
test = pd.read_csv(os.path.join(root_dir, "test_meta.tsv"), sep="\t")
test["path"] = test["archive_features_path"].apply(
    lambda x: os.path.join(root_dir, "test_features", x)
)
train["subset"] = train["archive_features_path"].str.split("/").str[0].astype("int")

In [4]:
track2artist_map = train.set_index("trackid")["artistid"].to_dict()
track2subset_map = train.set_index("trackid")["subset"].to_dict()
artist2tracks_map = train.groupby("artistid").agg(list)["trackid"].to_dict()

## model

In [8]:
mod_paths = glob.glob(os.path.join(mod_dirs[0], '*_*.h5'))
mod_paths

['/app/_data/artist_data/models/test_arch/tripl_6/model_0.19850705564022064.h5']

In [9]:
model = keras.models.load_model(mod_paths[0], compile=False)
model = [x for x in model.layers if x.name == 'embedding'][0]

In [5]:
# embedding_nets = [embNet(path=x) for x in mod_dirs]
embedding_nets = [get_emb_model(path=x) for x in mod_dirs]

## feature vectors

In [66]:
val = train.query("subset in [2, 3]").reset_index(drop=True)
val_gen = TestGenerator(
    data=val,
    img_size=(512, 60),
    batch_size=1,
    norm=False,
    n_chanels=1,
)

In [67]:
prediction = m.predict(val_gen)



In [68]:
emb = {k: v for k,v in zip(val['trackid'].tolist(), np.squeeze(prediction))}

In [69]:
subm = get_ranked_list(embeds=emb, top_size=100, annoy_num_trees=256)

In [70]:
val_ndcg_encoder = eval_submission(submission = subm, gt_meta_info = val,top_size=100)
val_ndcg_encoder

100% 33320/33320 [00:10<00:00, 3174.52it/s]


0.19267174745578516

In [None]:
prediction = [embedding_nets[i].predict(val_gen) for i in range(len(embedding_nets))]



In [None]:
for model in embedding_nets:
    del model

In [None]:
tf.keras.backend.clear_session()

In [36]:
with tf.device("/CPU:0"):
    dists = [pairwise_distances(x) for x in prediction]

In [37]:
pred = np.mean(np.array(dists), 0)

In [12]:
# len_count = []
# for i in tqdm(train.index.tolist()):
#     nearest = np.argsort(dists[i])[1:101]
#     artist_id = train.loc[i]["artistid"]
#     ids = train.loc[nearest,"artistid"]
#     len_count.append(sum([1 for x in ids if x == artist_id]))
# plt.hist(len_count)
# plt.show();

In [24]:
prediction[2].shape

(33284, 1024)

In [26]:
pred = np.mean(np.array(prediction[1:]), 0)
with tf.device("/CPU:0"):
    dist = pairwise_distances(pred)

In [46]:
def choose_100(
    prediction, df, val=True, path_to_save=os.getcwd(), file_ix=1, n_samples=100
):
    def pairwise_distances_np(array):
        dists = (
            np.sum(np.square(array), axis=1, keepdims=True)
            + np.sum(np.square(np.transpose(array, (1, 0))), axis=0, keepdims=True)
            - 2.0 * np.matmul(array, np.transpose(array, (1, 0)))
        )
        return dists

    dists = pairwise_distances_np(prediction)
    neigh = {}
    with open(os.path.join(path_to_save, f"submission_{file_ix}"), "w") as f:
        for ix in tqdm(range(prediction.shape[0])):
            trackid = df.loc[ix, "trackid"]
            nearest_100 = np.argsort(dists[ix])[: n_samples + 1]
            tracks_100 = df.loc[nearest_100, "trackid"].tolist()
            neigh[trackid] = {"tracks": [x for x in tracks_100 if x != trackid]}
            if val:
                artist_100 = df.loc[nearest_100, "artistid"].tolist()
                neigh[trackid]["artists"] = artist_100
                neigh[trackid]["artistid"] = df.loc[ix, "artistid"]
            f.write(
                "{}\t{}\n".format(
                    trackid,
                    " ".join(list(map(str, tracks_100))),
                )
            )
    return neigh

In [71]:
neigh = choose_100(
    np.squeeze(prediction),
    df=val,
    val=True,
    path_to_save=os.getcwd(),file_ix=7
)

100% 33320/33320 [02:06<00:00, 264.08it/s]


## calculate score

In [72]:
result = load_submission(
    input_path=os.path.join(os.getcwd(), "submission_7"), max_top_size=100
)

it's better to mean predictions, not distanses?

In [73]:
from src.test_utils import eval_submission as ev_sub

In [74]:
r = ev_sub(train, result, 100)
r

{1: 0.0,
 2: 0.1971703313133137,
 3: 0.19347741106964236,
 4: 0.0,
 5: 0.0,
 6: 0.0,
 7: 0.0,
 8: 0.0,
 9: 0.0,
 0: 0.0}