In [1]:
import os
import time

import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances_chunked
from sklearn.metrics.pairwise import euclidean_distances
from tqdm import tqdm

In [2]:
# Dataset layout: everything lives under root_dir.
root_dir = "/app/_data/artist_data/"
# Directory that the per-track feature files are loaded from further down.
file_dir = os.path.join(root_dir, "test_features")
# Tab-separated metadata table; later cells read its "trackid" and
# "archive_features_path" columns.
test_meta_path = os.path.join(root_dir, "test_meta.tsv")
test = pd.read_csv(test_meta_path, sep="\t")

In [3]:
def get_top_k(dist_chunk, start, top_size=100):
    """Reduce a chunk of pairwise distances to ranked nearest-neighbour indices.

    Takes ``(dist_chunk, start)`` positionally, so it can be passed as the
    ``reduce_func`` of ``pairwise_distances_chunked``.

    Parameters
    ----------
    dist_chunk : 2-D array-like
        One row per query item of the chunk; column ``j`` holds the distance
        from that query item to item ``j`` of the full collection.
    start : int
        Global index of the query item described by the first row.
    top_size : int, default 100
        Maximum number of neighbours to keep per query item.

    Returns
    -------
    list of list of int
        For each row, the global indices of the closest items in ascending
        distance order, with the query item itself removed.
    """
    result = []
    for chunk_item_indx, item_distances in enumerate(np.asarray(dist_chunk)):
        global_query_item_indx = start + chunk_item_indx
        # A stable sort keeps equal distances in ascending index order, which
        # is the same tie-breaking a stable list sort on the distance gives.
        ranked = np.argsort(item_distances, kind="stable")
        # Drop the query by index rather than by position: an exact duplicate
        # of the query also has distance 0 and may be ranked ahead of it.
        ranked = ranked[ranked != global_query_item_indx]
        result.append(ranked[:top_size].tolist())
    return result

In [4]:
# Collapse each track's feature array to one vector by averaging over axis 1.
# trackids[i] and embeds[i] always describe the same metadata row.
trackids = []
track_embeds = []
for trackid, relative_path in zip(test["trackid"], test["archive_features_path"]):
    track_features = np.load(os.path.join(file_dir, relative_path))
    track_embed = np.mean(track_features, axis=1)
    trackids.append(trackid)
    track_embeds.append(track_embed)
embeds = np.array(track_embeds)

In [5]:
start = time.time()

# Output format: "<query trackid>\t<space-separated neighbour trackids>\n".
line_template = "{}\t{}\n"

with open("submission", "w") as foutput:
    # Chunks arrive in row order, so a running counter recovers the global
    # index of the query item each ranked list belongs to.
    query_indx = 0
    ranked_chunks = pairwise_distances_chunked(
        embeds, metric="cosine", working_memory=100, reduce_func=get_top_k, n_jobs=16
    )
    for ranked_chunk in tqdm(ranked_chunks):
        for neighbour_indices in ranked_chunk:
            neighbour_ids = " ".join([str(trackids[i]) for i in neighbour_indices])
            foutput.write(line_template.format(trackids[query_indx], neighbour_ids))
            query_indx += 1

total_time = time.time() - start
print(f"It takes {(total_time/60):.2f} minutes to run this cell")

131it [23:27, 10.74s/it]

It takes 23.46 minutes to run this cell





In [6]:
# Wall-clock reference for this cell; it is read again after choose_100 has
# run to report the elapsed minutes.
start = time.time()


def choose_100(prediction, df, val=True, path_to_save=None, top_size=100):
    """Rank each track's nearest neighbours by Euclidean distance and save them.

    Writes one line per track to ``<path_to_save>/my_test_submission`` in the
    form ``"<trackid>\\t<space-separated neighbour trackids>\\n"`` and returns
    the same neighbour lists as a dict.

    Parameters
    ----------
    prediction : 2-D array
        One embedding per row; row ``i`` must correspond to the ``i``-th row
        of ``df`` (by position, not by index label).
    df : pandas.DataFrame
        Must have a ``"trackid"`` column, and an ``"artistid"`` column when
        ``val`` is true.
    val : bool, default True
        When true, also store the neighbours' artist ids under ``"artists"``.
    path_to_save : str or None, default None
        Directory for the output file; ``None`` means the current working
        directory.
    top_size : int, default 100
        Maximum number of neighbours kept per track.

    Returns
    -------
    dict
        ``{trackid: {"tracks": [...], "artists": [...]}}``; ``"artists"`` is
        present only when ``val`` is true and is aligned with ``"tracks"``.
    """
    if path_to_save is None:
        # os.path.join(None, ...) raises TypeError, so give the declared
        # default a usable meaning.
        path_to_save = os.getcwd()

    # NOTE: this is a dense all-pairs matrix over the rows of `prediction`,
    # so memory grows quadratically with the number of tracks.
    dists = euclidean_distances(prediction)

    # Positional lookups: `ix` below is a row number, not an index label, so
    # label-based access would break on a filtered or re-indexed frame.
    all_trackids = df["trackid"].to_numpy()
    all_artistids = df["artistid"].to_numpy() if val else None

    neigh = {}
    with open(os.path.join(path_to_save, "my_test_submission"), "w") as f:
        for ix in tqdm(range(prediction.shape[0])):
            trackid = all_trackids[ix]
            ranked = np.argsort(dists[ix], kind="stable")
            # Remove the query by row number instead of assuming it sorts
            # first: a duplicate embedding ties at distance 0 and could
            # otherwise push the query itself into the neighbour list.
            nearest = ranked[ranked != ix][:top_size]
            tracks = all_trackids[nearest].tolist()
            neigh[trackid] = {"tracks": tracks}
            if val:
                neigh[trackid]["artists"] = all_artistids[nearest].tolist()
            # The file and the returned dict are built from the same list so
            # they cannot disagree.
            f.write(
                "{}\t{}\n".format(
                    trackid,
                    " ".join(map(str, tracks)),
                )
            )
    return neigh


# Rank neighbours for the test embeddings; val=False skips the artist lookup.
# The result file is written into the current working directory.
neigh = choose_100(
    prediction=embeds,
    df=test,
    val=False,
    path_to_save=os.getcwd(),
)

# Report the elapsed wall-clock time since `start` was taken.
total_time = time.time() - start
print(f"It takes {(total_time/60):.2f} minutes to run this cell")

100% 41377/41377 [02:19<00:00, 296.75it/s]


It takes 2.68 minutes to run this cell
