In [1]:
import glob
import json
import os
import random
from argparse import ArgumentParser

import annoy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import StratifiedGroupKFold
from tqdm import tqdm

In [2]:
from _data.artist_data.compute_score import load_submission
from _data.artist_data.ny_baseline import (
    FeaturesLoader,
    NT_Xent,
    SimCLR,
    SimCLR_infer,
    TestLoader,
    TrainLoader,
    compute_dcg,
    eval_submission,
    get_ideal_dcg,
    get_ranked_list,
    inference,
    position_discounter,
    train_val_split,
    save_submission
)

In [3]:
from sandbox.ny_base_experiments import BasicNet

In [4]:
# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [5]:
# Dataset locations inside the container.
root_dir = "/app/_data/artist_data/"
test_dir = "/app/_data/artist_data/test_features/"
mod_dir = "/app/_data/artist_data/baseline/"

# Track metadata tables (tab-separated).
train = pd.read_csv(os.path.join(root_dir, "train_meta.tsv"), sep="\t")
test = pd.read_csv(os.path.join(root_dir, "test_meta.tsv"), sep="\t")

# Full path of every test track's feature archive.
test["path"] = [
    os.path.join(root_dir, "test_features", archive_name)
    for archive_name in test["archive_features_path"]
]

In [7]:
# Checkpoint directories of the exp_tanh_0 ... exp_tanh_6 runs.
# NOTE(review): glob returns matches in arbitrary order (see the listing below);
# wrap the call in sorted() if a stable model order is ever required.
mod_paths = glob.glob("/app/_data/artist_data/subm_mod/exp_tanh_[0-6]/")
mod_paths

['/app/_data/artist_data/subm_mod/exp_tanh_5/',
 '/app/_data/artist_data/subm_mod/exp_tanh_4/',
 '/app/_data/artist_data/subm_mod/exp_tanh_2/',
 '/app/_data/artist_data/subm_mod/exp_tanh_3/',
 '/app/_data/artist_data/subm_mod/exp_tanh_1/',
 '/app/_data/artist_data/subm_mod/exp_tanh_6/',
 '/app/_data/artist_data/subm_mod/exp_tanh_0/']

In [8]:
def get_ranked_list(embeds, top_size=100, annoy_num_trees=1024):
    """Rank, for every track, its nearest neighbours in embedding space.

    An Annoy index with the "angular" metric is built over all embeddings and
    then queried once per track.

    Args:
        embeds: mapping ``track_id -> embedding vector``; all vectors must have
            the same length.
        top_size: maximum number of neighbours returned per track.
        annoy_num_trees: number of trees passed to ``AnnoyIndex.build``.

    Returns:
        dict ``track_id -> list of neighbour track ids`` ordered from nearest
        to farthest. The query track is never part of its own list. An empty
        ``embeds`` yields an empty dict.
    """
    if not embeds:
        # Nothing to index; the Annoy index cannot be created without a vector.
        return dict()

    annoy_index = None
    annoy2id = []
    id2annoy = dict()
    for track_id, track_embed in tqdm(embeds.items()):
        id2annoy[track_id] = len(annoy2id)
        annoy2id.append(track_id)
        if annoy_index is None:
            # Dimensionality is taken from the first embedding seen.
            annoy_index = annoy.AnnoyIndex(len(track_embed), "angular")
        annoy_index.add_item(id2annoy[track_id], track_embed)
    annoy_index.build(annoy_num_trees, n_jobs=-1)

    ranked_list = dict()
    for track_id in tqdm(embeds.keys()):
        query = id2annoy[track_id]
        # Ask for one extra neighbour because the query normally matches itself.
        neighbours = annoy_index.get_nns_by_item(query, top_size + 1)
        # Drop the query wherever it appears (it is not guaranteed to come
        # first), then cut to top_size. Blindly skipping the first hit would
        # discard a real neighbour whenever the query is not ranked first.
        candidates = [item for item in neighbours if item != query][:top_size]
        ranked_list[track_id] = [annoy2id[candidate] for candidate in candidates]
    return ranked_list

In [9]:
def load_model(path, device):
    """Restore a trained SimCLR inference model from a run directory.

    Args:
        path: directory containing ``best_module.pt`` (state dict) and
            ``args.json`` (training configuration).
        device: device the weights are mapped to and the model is moved to.

    Returns:
        The model in eval mode on ``device``.
    """
    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only
    # host (and vice versa) instead of failing during deserialisation.
    weights = torch.load(os.path.join(path, "best_module.pt"), map_location=device)
    with open(os.path.join(path, "args.json"), "r") as f:
        cfg = json.load(f)
    # "n_chahels" (sic) is the key read from args.json; do not "fix" the
    # spelling here without also changing the files that were written.
    model = SimCLR_infer(
        encoder=BasicNet(cfg["n_chahels"], 128, 3), projection_dim=256
    ).to(device)
    model.load_state_dict(weights)
    model.eval()
    return model

In [10]:
# One restored model per checkpoint directory, in mod_paths order.
models = [load_model(path=p, device=device) for p in mod_paths]

In [11]:
# Feature loader over the test-set archives.
# NOTE(review): test=False is passed even though this is the test split - confirm
# against FeaturesLoader that this is the intended mode.
test_feature_loader = FeaturesLoader(
    features_dir_path="/app/_data/artist_data/test_features/",
    meta_info=test,
    device=device,
    test=False,
    crop_size=60,
)

In [12]:
# Batched iterator over the test features (256 tracks per batch).
# features_size=(512, 60) matches crop_size=60 used by the feature loader above.
test_loader = TestLoader(
    features_loader=test_feature_loader, batch_size=256, features_size=(512, 60)
)

In [13]:
# Embed the whole test set once per model; entry k belongs to models[k].
all_embeds = [inference(model, test_loader) for model in models]

100% 41377/41377 [00:48<00:00, 857.03it/s]
100% 41377/41377 [00:44<00:00, 934.90it/s]
100% 41377/41377 [00:44<00:00, 920.97it/s]
100% 41377/41377 [00:45<00:00, 906.71it/s]
100% 41377/41377 [00:44<00:00, 933.71it/s]
100% 41377/41377 [00:45<00:00, 919.05it/s]
100% 41377/41377 [00:45<00:00, 911.09it/s]


In [17]:
# Turn every model's embeddings into a 300-deep neighbour ranking
# (top_size=300, 128 annoy trees). Entry k of all_subm belongs to all_embeds[k].
# The per-model save at the end of the loop body is commented out.
all_subm = []
for n, embeds in enumerate(all_embeds):
    submission = get_ranked_list(embeds, 300, 128)
    all_subm.append(submission)
    # save_submission(
    #     submission, f"/app/_data/artist_data/ens_submissions/final_{n}.txt"
    # )

100% 41377/41377 [00:02<00:00, 15216.29it/s]
100% 41377/41377 [02:36<00:00, 264.61it/s]
100% 41377/41377 [00:02<00:00, 14381.26it/s]
100% 41377/41377 [02:36<00:00, 264.40it/s]
100% 41377/41377 [00:03<00:00, 12121.70it/s]
100% 41377/41377 [02:35<00:00, 265.65it/s]
100% 41377/41377 [00:02<00:00, 13889.83it/s]
100% 41377/41377 [02:37<00:00, 263.11it/s]
100% 41377/41377 [00:02<00:00, 15573.55it/s]
100% 41377/41377 [02:37<00:00, 262.59it/s]
100% 41377/41377 [00:02<00:00, 15919.08it/s]
100% 41377/41377 [02:38<00:00, 261.62it/s]
100% 41377/41377 [00:02<00:00, 15836.89it/s]
100% 41377/41377 [02:38<00:00, 261.24it/s]


In [None]:
# from _data.artist_data.compute_score import load_submission


In [18]:
# Ranking of an eighth model (exp_tanh_7), read from a previously saved
# submission file instead of being recomputed; limited to 300 candidates per track.
sub7 = load_submission(input_path='/app/_data/artist_data/subm_mod/exp_tanh_7/submission_prj_1000_2831.txt', max_top_size=300)

In [20]:
# sub7

In [21]:
# NOTE(review): not idempotent - re-running this cell appends sub7 again and
# changes the number of submissions seen by the ensemble cell below.
all_subm.append(sub7)

In [22]:
# Sanity check: number of rankings that go into the ensemble.
len(all_subm)

8

In [23]:
# Alias, not a copy: anything that mutates `submissions` also mutates `all_subm`.
submissions = all_subm

In [24]:
# Rank-aggregation ensemble.
#
# For every query track the top-RANK_DEPTH lists of all submissions are stacked
# into a (n_submissions, RANK_DEPTH) matrix. A candidate is kept only if it
# occurs at least MIN_VOTES times in that matrix; its positions (column
# indices) are then summarised in several ways and the candidates are re-ranked
# by those summaries.
RANK_DEPTH = 300  # neighbours read from each submission per query track
MIN_VOTES = 4  # minimum number of occurrences for a candidate to be kept
OUT_SIZE = 100  # length of every aggregated ranking
# Filler for rankings shorter than RANK_DEPTH; it is excluded from the
# candidates. Assumes real track ids are never negative - TODO confirm.
PAD_ID = -1

# Column layout of the per-candidate statistics table `arr`.
names = ["trackid", "count", "mean", "top4", "top5", "top6", "top7", "top8", "mid"]

# Only "top4", "top5", "top6" and "mid" are filled below; the remaining keys
# are kept so the structure of the dict is unchanged.
avg_submission = {
    "mean": {},
    "top4": {},
    "top5": {},
    "top6": {},
    "top7": {},
    "top8": {},
    "mid": {},
}

# (ranking name, tie-break column). np.lexsort treats its LAST key as the
# primary one, so the ranking column is passed last.
rank_specs = (("top4", "mid"), ("top5", "mid"), ("top6", "mid"), ("mid", "top6"))

n_submissions = len(submissions)
for trackid in tqdm(test["trackid"].tolist()):
    # Short rankings are padded in a fresh matrix; the submissions themselves
    # are left untouched so the cell can be re-run safely.
    all_tracks = np.full([n_submissions, RANK_DEPTH], PAD_ID, dtype=float)
    for i in range(n_submissions):
        row = submissions[i][trackid][:RANK_DEPTH]
        all_tracks[i, : len(row)] = row

    values, counts = np.unique(all_tracks, return_counts=True)
    values = values[(counts >= MIN_VOTES) & (values != PAD_ID)]

    arr = np.zeros([values.shape[0], len(names)])
    for v in range(values.shape[0]):
        # Sorted positions (0 = best) at which this candidate was ranked.
        rank = np.sort(np.argwhere(all_tracks == values[v])[:, 1])
        arr[v, 0] = values[v]  # trackid
        arr[v, 1] = rank.shape[0]  # count
        arr[v, 2] = rank.mean()  # mean
        arr[v, 3] = rank[:4].mean()  # top4
        arr[v, 4] = rank[:5].mean()  # top5
        arr[v, 5] = rank[:6].mean()  # top6
        arr[v, 6] = rank[:7].mean()  # top7
        arr[v, 7] = rank[:8].mean()  # top8
        arr[v, 8] = rank[1:-1].mean()  # mid: mean without the best and worst position

    for key, tie_break in rank_specs:
        order = np.lexsort(
            (arr[:, names.index(tie_break)], arr[:, names.index(key)])
        )
        avg_submission[key][trackid] = arr[order][:, 0][:OUT_SIZE].astype("int")

100% 41377/41377 [25:56<00:00, 26.59it/s]


In [25]:
def save_submission(submission, submission_path, submission_name=None, top=100):
    """Write a submission as ``<query id>\\t<space separated result ids>`` lines.

    Args:
        submission: mapping ``query_trackid -> sequence of result ids``.
        submission_path: output directory, or the complete output file path
            when ``submission_name`` is omitted.
        submission_name: file name inside ``submission_path``. Optional so that
            two-argument calls that pass a full path (as done at the end of
            this notebook) keep working.
        top: maximum number of result ids written per query.
    """
    if submission_name is None:
        target = submission_path
    else:
        target = os.path.join(submission_path, submission_name)
    with open(target, "w") as f:
        for query_trackid, result in submission.items():
            f.write("{}\t{}\n".format(query_trackid, " ".join(map(str, result[:top]))))

In [26]:
# Write one file per aggregated ranking that the ensemble cell fills in.
for n in ("top4", "top5", "top6", "mid"):
    ranking = avg_submission[n]
    file_name = f"{n}_final.txt"
    save_submission(ranking, "/app/_data/artist_data/ens_submissions/", file_name)

In [4]:
# Split the training metadata into train/validation parts with a fixed seed.
# NOTE(review): the meaning of the positional argument 7 is defined in
# ny_baseline.train_val_split - confirm there. `train` must already be loaded.
train_df, val_df = train_val_split(train, 7, seed=42)



In [5]:
# Feature loader over the validation part of the training set; same crop_size
# as the test-set loader so both go through the models identically.
val_floader = FeaturesLoader(
    features_dir_path="/app/_data/artist_data/train_features/",
    meta_info=val_df,
    test=False,
    device=device,
    crop_size=60,
)

In [6]:
# Batched iterator over the validation features (256 tracks per batch).
val_loader = TestLoader(
    features_loader=val_floader, batch_size=256, features_size=(512, 60)
)

In [18]:
# Checkpoint directories of the 512_256_*sh* runs (replaces the earlier mod_paths).
# NOTE(review): glob order is arbitrary; sort if a stable model order matters.
mod_paths = glob.glob("/app/_data/artist_data/subm_mod/512_256_*sh*/")

In [19]:
# Add the "trans2" run; load_model picks its encoder from "trans" in the path.
# NOTE(review): not idempotent - re-running this cell adds the path twice.
mod_paths.append("/app/_data/artist_data/subm_mod/512_256_7_trans2/")

In [20]:
# Show the final list of checkpoint directories used for the ensemble.
mod_paths

['/app/_data/artist_data/subm_mod/512_256_2_shuffle/',
 '/app/_data/artist_data/subm_mod/512_256_7_shuffle/',
 '/app/_data/artist_data/subm_mod/512_256_0_shuffle/',
 '/app/_data/artist_data/subm_mod/512_256_6_shuffle/',
 '/app/_data/artist_data/subm_mod/512_256_1_shuffle/',
 '/app/_data/artist_data/subm_mod/512_256_3_shuffle/',
 '/app/_data/artist_data/subm_mod/512_256_4_shuffle/',
 '/app/_data/artist_data/subm_mod/512_256_5_shuffle/',
 '/app/_data/artist_data/subm_mod/512_256_7_trans2/']

In [31]:
class BasicNet1D(nn.Module):
    """Four-layer 1-D convolutional encoder with one max-pooling stage.

    The input is expected to have 512 channels (the first convolution is
    created with ``in_channels=512``). The output is the last convolution's
    activation averaged over axis 2, i.e. one vector of
    ``output_features_size1`` values per sample.

    Args:
        output_features_size1: number of channels of every convolution output.
        kernel_size: kernel size shared by all convolutions (padding is 1).
    """

    def __init__(self, output_features_size1=512, kernel_size=3):
        super().__init__()
        width = output_features_size1
        self.output_features_size = width

        def same_width_conv():
            # width -> width convolution used for every layer after the first
            return nn.Conv1d(width, width, kernel_size=kernel_size, padding=1)

        # Layers are created in forward order so that parameter registration
        # (and random initialisation order) is conv_1, conv_2, conv_3, conv_4.
        self.conv_1 = nn.Conv1d(512, width, kernel_size=kernel_size, padding=1)
        self.conv_2 = same_width_conv()
        self.mp_1 = nn.MaxPool1d(2, 2)
        self.conv_3 = same_width_conv()
        self.conv_4 = same_width_conv()

    def forward(self, x):
        """Encode ``x`` and average the final feature map over axis 2."""
        for conv in (self.conv_1, self.conv_2):
            x = F.relu(conv(x))
        pooled = self.mp_1(x)
        hidden = F.relu(self.conv_3(pooled))
        # No activation after the last convolution.
        return self.conv_4(hidden).mean(axis=2)


class SimCLR_infer(nn.Module):
    """Encoder followed by a two-layer, bias-free projection head.

    Args:
        encoder: module exposing ``output_features_size``; its output feeds
            the projector.
        projection_dim: size of the vectors produced by the projector.
    """

    def __init__(self, encoder, projection_dim):
        super().__init__()
        self.encoder = encoder
        self.n_features = encoder.output_features_size
        self.projection_dim = projection_dim

        # Linear -> ReLU -> Linear; kept in a Sequential so the parameter
        # names stay projector.0.weight and projector.2.weight.
        head_layers = [
            nn.Linear(self.n_features, self.n_features, bias=False),
            nn.ReLU(),
            nn.Linear(self.n_features, self.projection_dim, bias=False),
        ]
        self.projector = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return the projected embedding of ``x``."""
        return self.projector(self.encoder(x))

In [41]:
def load_model(path, device):
    """Restore a trained SimCLR inference model from a run directory.

    The encoder architecture is chosen from the directory name: paths that
    contain ``"trans"`` use ``BasicNet``, every other path uses ``BasicNet1D``.

    Args:
        path: directory containing ``best_module.pt`` (state dict) and
            ``args.json`` (training configuration).
        device: device the weights are mapped to and the model is moved to.

    Returns:
        The model in eval mode on ``device``.
    """
    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only
    # host (and vice versa) instead of failing during deserialisation.
    weights = torch.load(os.path.join(path, "best_module.pt"), map_location=device)
    with open(os.path.join(path, "args.json"), "r") as f:
        cfg = json.load(f)
    # "n_chahels" (sic) is the key read from args.json; keep the spelling.
    if "trans" in path:
        encoder = BasicNet(cfg["n_chahels"], 128, 3)
    else:
        encoder = BasicNet1D(cfg["n_chahels"], 3)
    model = SimCLR_infer(encoder=encoder, projection_dim=256).to(device)
    model.load_state_dict(weights)
    model.eval()
    return model

In [42]:
# One restored model per checkpoint directory, in mod_paths order.
models = [load_model(path=p, device=device) for p in mod_paths]

In [57]:
def ensemble_inference(models, loader):
    """Collect, for every track, one embedding per model.

    Args:
        models: iterable of callables mapping a feature batch to a batch of
            embeddings.
        loader: re-iterable yielding ``(track_ids, track_features)`` batches;
            it is traversed once per model.

    Returns:
        dict ``track_id -> list of numpy arrays`` with one array per model, in
        the order of ``models``.
    """
    embeds = dict()
    for model in models:
        for tracks_ids, tracks_features in loader:
            # Inference only: no autograd bookkeeping for the forward pass or
            # the conversion to numpy.
            with torch.no_grad():
                tracks_embeds = model(tracks_features)
                for track_id, track_embed in zip(tracks_ids, tracks_embeds):
                    per_track = embeds.setdefault(track_id, [])
                    per_track.append(track_embed.cpu().numpy())
    return embeds

In [93]:
# Per-track list of embeddings (one per model) for the validation split.
embeds = ensemble_inference(models, val_loader)

100% 20864/20864 [00:18<00:00, 1143.06it/s]
100% 20864/20864 [00:18<00:00, 1155.71it/s]
100% 20864/20864 [00:18<00:00, 1154.77it/s]
100% 20864/20864 [00:18<00:00, 1151.53it/s]
100% 20864/20864 [00:18<00:00, 1134.47it/s]
100% 20864/20864 [00:18<00:00, 1129.67it/s]
100% 20864/20864 [00:18<00:00, 1144.42it/s]
100% 20864/20864 [00:18<00:00, 1130.57it/s]
100% 20864/20864 [00:18<00:00, 1144.28it/s]


In [98]:
# Fuse the per-model embeddings of each track into one vector by element-wise sum.
# NOTE(review): the models' outputs are summed as-is (no per-model normalisation),
# so a model with larger-norm embeddings weighs more - confirm this is intended.
arr_emb = {k: np.array(v).sum(0) for k,v in embeds.items()}

In [99]:
# Spot check: a fused embedding has the projection size (one vector per track).
arr_emb[65336].shape

(256,)

In [101]:
# Neighbour ranking on the fused validation embeddings (default arguments).
submission = get_ranked_list(arr_emb)

100% 20864/20864 [00:01<00:00, 15247.03it/s]
100% 20864/20864 [02:32<00:00, 136.73it/s]


In [102]:
# Score the validation ranking against the ground-truth metadata (top 100).
ndcg = eval_submission(submission, gt_meta_info=val_df, top_size=100)

100% 20864/20864 [00:07<00:00, 2711.01it/s]


In [103]:
# Validation score of the embedding-sum ensemble computed above.
ndcg

0.3834791599111589

In [77]:
# NOTE(review): this cell's execution count (77) is lower than that of the cells
# above, so the value shown below comes from an earlier run with a different
# `ndcg`; it is not reproduced by running the notebook top to bottom.
ndcg

0.395118474378242

## Test-set inference and submission

In [104]:
# Feature loader over the test-set archives.
# NOTE(review): test=False is passed even though this is the test split - confirm
# against FeaturesLoader that this is the intended mode.
test_feature_loader = FeaturesLoader(
    features_dir_path="/app/_data/artist_data/test_features/",
    meta_info=test,
    device=device,
    test=False,
    crop_size=60,
)

In [105]:
# Batched iterator over the test features (256 tracks per batch).
test_loader = TestLoader(
    features_loader=test_feature_loader, batch_size=256, features_size=(512, 60)
)

In [106]:
# Per-track list of embeddings (one per model) for the test set.
test_embeds = ensemble_inference(models, test_loader)

100% 41377/41377 [00:44<00:00, 920.97it/s]
100% 41377/41377 [00:45<00:00, 914.24it/s]
100% 41377/41377 [00:44<00:00, 937.89it/s] 
100% 41377/41377 [00:44<00:00, 928.18it/s]
100% 41377/41377 [00:44<00:00, 928.08it/s] 
100% 41377/41377 [00:44<00:00, 936.67it/s]
100% 41377/41377 [00:45<00:00, 917.77it/s] 
100% 41377/41377 [00:44<00:00, 937.61it/s] 
100% 41377/41377 [00:45<00:00, 909.47it/s]


In [110]:
# Fuse the per-model embeddings of each test track by element-wise mean.
# NOTE(review): the validation run above fuses with sum instead. With an angular
# metric and the same number of models per track the two differ only by a
# constant scale, so the rankings should agree - confirm, and use one of the two.
test_arr_emb = {k: np.array(v).mean(0) for k,v in test_embeds.items()}

In [111]:
# Final neighbour ranking on the fused test embeddings: top 100, 1024 annoy trees.
submission = get_ranked_list(test_arr_emb, 100, 1024)

100% 41377/41377 [00:02<00:00, 14155.49it/s]
100% 41377/41377 [06:02<00:00, 114.18it/s]


In [112]:
# Write the fused-embedding ranking. The save_submission defined earlier in this
# notebook takes the directory and the file name separately; the previous
# two-argument call raised TypeError under that definition.
# NOTE(review): the file is called "mod9_sum" although the test embeddings are
# fused with mean() above.
save_submission(
    submission, "/app/_data/artist_data/ens_submissions/", "mod9_sum.txt"
)