In [1]:
import datatable as dt
# Variables that contains the file location
from files import *
from functions import *

In [2]:
# Datatype used to reduce memory required
DTYPE = np.float32

In [3]:
# string ids are inefficient, let's use integers and a lookup table
def getRelationIdsToNumbers(df):
    no_ids = len(df.index.values)
    return dict(zip(df.index.values, list(range(no_ids)))), np.arange(no_ids)

def change_id_to_keys(df, id_to_keys):
    df.set_index(np.asarray([id_to_keys[i] for i in df.index.values]), inplace=True)
    df.index.astype(np.int32, copy=False)
    df.index.name = "id"

In [4]:
# To save memory we converted the original indices to new indices with only numbers.
# The following function create those ids or if they already exists loads them
# The ids have to be similar in all dataframes to have consistency, 
# for that we use the info indexes as base for all dataframes

file_original_new_ids = "./data/relation_original_new_ids.csv"

info  = dt.fread(file_info_2).to_pandas().set_index('id')
id_to_key, indexes = getRelationIdsToNumbers(info)


if exists(file_original_new_ids):
    relation_ids = pd.read_csv(file_original_new_ids).set_index('original').astype(DTYPE)
else:
    
    relation_ids = pd.DataFrame(columns=['original', 'newId'])
    relation_ids['original'] = list(id_to_key.keys())
    relation_ids['newId'] = list(id_to_key.values())
    relation_ids.set_index('original', inplace=True)
    relation_ids = relation_ids.astype(DTYPE)
    relation_ids.to_csv(file_original_new_ids)

In [5]:
# Relation of original id, to new id
relation_ids.head(2)

Unnamed: 0_level_0,newId
original,Unnamed: 1_level_1
0009fFIM1eYThaPg,0.0
0010xmHR6UICBOYT,1.0


## Load datasets

Now after having a relation of original ids to new ids, we can load the datasets

In [6]:

blf_spectral         = dt.fread(file_blf_spectral, header=True).to_pandas().set_index('id').astype(DTYPE)

In [7]:
blf_spectral

Unnamed: 0_level_0,BLF_SPEC0000,BLF_SPEC0001,BLF_SPEC0002,BLF_SPEC0003,BLF_SPEC0004,BLF_SPEC0005,BLF_SPEC0006,BLF_SPEC0007,BLF_SPEC0008,BLF_SPEC0009,...,BLF_SPEC0970,BLF_SPEC0971,BLF_SPEC0972,BLF_SPEC0973,BLF_SPEC0974,BLF_SPEC0975,BLF_SPEC0976,BLF_SPEC0977,BLF_SPEC0978,BLF_SPEC0979
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0009fFIM1eYThaPg,-0.021460,-0.017230,-0.013651,-0.010841,-0.008861,-0.006210,-0.004352,-0.002953,-0.001022,0.001472,...,-0.014418,-0.012594,-0.010883,-0.009404,-0.007405,-0.004714,-0.001504,0.001562,0.002280,0.003981
0010xmHR6UICBOYT,-0.015971,-0.011813,-0.008262,-0.006075,-0.004016,-0.001755,-0.000327,0.000765,0.005019,0.007121,...,-0.037724,-0.036631,-0.033416,-0.025909,-0.019597,-0.014301,-0.008480,-0.005743,-0.002760,-0.001834
002Jyd0vN4HyCpqL,-0.031292,-0.027408,-0.024437,-0.022585,-0.020734,-0.019001,-0.016573,-0.014984,-0.012862,-0.010895,...,-0.033851,-0.033016,-0.032165,-0.031635,-0.031027,-0.030402,-0.029791,-0.029393,-0.028596,-0.027498
006TYKNjNxWjfKjy,-0.026226,-0.020437,-0.017091,-0.014997,-0.012571,-0.010764,-0.009016,-0.007655,-0.004926,-0.002946,...,-0.022048,-0.021373,-0.020522,-0.019261,-0.018786,-0.017677,-0.016329,-0.015060,-0.013350,-0.011999
007LIJOPQ4Sb98qV,-0.030571,-0.027113,-0.023412,-0.021092,-0.019181,-0.016823,-0.014973,-0.013623,-0.011772,-0.009829,...,-0.016440,-0.015750,-0.014590,-0.014135,-0.013179,-0.012625,-0.011895,-0.010558,-0.008714,-0.006631
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zzx8CWdM7qkxKQpC,-0.008507,-0.007632,-0.006980,-0.006499,-0.006107,-0.005328,-0.004778,-0.004196,-0.003391,-0.002375,...,-0.022113,-0.020828,-0.019674,-0.018529,-0.017750,-0.017105,-0.015854,-0.014755,-0.013779,-0.012756
zzyb5LvKJTWLVnrk,-0.013039,-0.008594,-0.005242,-0.003475,-0.000889,-0.000050,0.001697,0.003320,0.005534,0.007092,...,-0.021782,-0.018594,-0.017796,-0.015881,-0.014963,-0.013843,-0.010778,-0.009325,-0.006056,-0.004453
zzz0n04uuTUA7fNh,-0.019679,-0.013652,-0.010467,-0.007722,-0.006534,-0.003744,-0.002786,0.000261,0.001674,0.004614,...,-0.010651,-0.009944,-0.009250,-0.008561,-0.007822,-0.006463,-0.005218,-0.004602,-0.003833,-0.002630
zzznMjZAKnJJXQSj,-0.023609,-0.019298,-0.015960,-0.013821,-0.011858,-0.010505,-0.007782,-0.005596,-0.003299,-0.000732,...,-0.013456,-0.011406,-0.010718,-0.009863,-0.009205,-0.008743,-0.008279,-0.007691,-0.007015,-0.006289


In [8]:
datasets = {
#    "tfidf" : tf_idf,
#    "word2vec" : word2vec,
#    "bert" : bert,
#    "mfcc_bow" : mfcc_bow,
#    "mfcc_stats" : mfcc_stats,
#    "essentia" : essentia,
#    "blf_delta_spectral" : blf_deltaspectral,
#    "blf_correlation" : blf_correlation,
#    "blf_logfluc" : blf_logfluc,
    "blf_spectral" : blf_spectral,
#    "blf_spectral_contrast" : blf_spectralcontrast,
#    "blf_vardelta_spectral" : blf_vardeltaspectral,
#    "incp" : incp,
#    "vgg19" : vgg19,
#    "resnet" : resnet,
}

#### Change the original ids to the new ids of the loaded datasets

In [9]:
for df in datasets.keys():
    change_id_to_keys(datasets[df], id_to_key)

In [10]:
for df in datasets.keys():
    print(f"Feature {df}" , datasets[df].shape)

Feature blf_spectral (68641, 980)


# Function to compute distance and only retrieve the top ids

In [40]:
# Listening events of the music4all CIKM submission
user_core = 10
dataset_path = "/home/marta/jku/cikm/data/matching/binarized_listening_history_filtered.tsv"
cikm_dataset = dt.fread(dataset_path).to_pandas().drop(columns='timestamp').groupby('user')['item'].apply(list).reset_index(name='items')
cikm_dataset['items'] = cikm_dataset['items'].apply(lambda x: [id_to_key.get(item) for item in x if id_to_key.get(item) is not None])
user_core_mask = cikm_dataset['items'].apply(lambda x: len(x) >= user_core)
cikm_dataset = cikm_dataset[user_core_mask]

cikm_dataset['items']

0        [42, 85, 380, 434, 892, 1582, 1639, 1996, 2329...
1        [1255, 2118, 3534, 4423, 5536, 6173, 6656, 716...
2        [1280, 2338, 2918, 5955, 6404, 8146, 12544, 14...
3        [329, 490, 520, 623, 1310, 4128, 5199, 5506, 7...
4        [839, 2254, 2894, 2942, 3187, 3401, 4531, 5467...
                               ...                        
14113    [1249, 3323, 3958, 6404, 11812, 12840, 14389, ...
14114    [488, 960, 1255, 1690, 1950, 3495, 3606, 4416,...
14115    [3495, 3829, 4423, 5041, 6138, 6428, 7418, 809...
14116    [241, 574, 4501, 6128, 6201, 7127, 7649, 8298,...
14117    [653, 1244, 4032, 6305, 6787, 9273, 12492, 133...
Name: items, Length: 13884, dtype: object

In [41]:
import random

In [42]:
def split_list_train_val_set(
        list,
        training_ratio=0.6,
        validation_ratio=0.2,
        test_ratio=0.2,
        randomize=True,
):
    if training_ratio + validation_ratio + test_ratio != 1.:
        raise ValueError("Splits do not add up to 1.")
    if randomize:
        #use sample instead of random since the latter shuffles in place
        list = random.sample(list, len(list))
    total_interactions = len(list)
    training_set = list[:int(total_interactions * training_ratio)] #get first 80% of file list
    validation_set = list[int(total_interactions * training_ratio):int(total_interactions * (training_ratio+validation_ratio))]
    test_set = list[int(total_interactions * (training_ratio+validation_ratio)):]

    return training_set, validation_set, test_set

In [43]:
cikm_dataset[['train_items', 'val_items', 'test_items']] = pd.DataFrame(cikm_dataset['items'].apply(split_list_train_val_set).tolist(), index=cikm_dataset.index)

In [52]:
def recommend_top_average(data: pd.DataFrame, listened_items, top: int=100):
    """
    Given a list of ids (that have been listened to by a user)
    it aggregates the features of the corresponding items
    and then retrieves the most similar tracks to the aggregated version.
    :param data:
    :param listened_items:
    :param top:
    :return: recommended_tracks
    """
    # Calculate the average over listened items
    mean_feat = data.iloc[listened_items].mean()
    # Exclude the listened items from recommendations
    target_feat = data.loc[~data.index.isin(listened_items)]
    recommended_tracks = target_feat.dot(mean_feat).sort_values(ascending=False)[:top].index

    return recommended_tracks

In [55]:
from tqdm import tqdm
tqdm.pandas()

In [None]:
#cikm_dataset['train_items'] = \
cikm_dataset['train_items'].progress_apply(
    lambda x: recommend_top_average(
        data=blf_spectral,
        listened_items=x,
        top=100
))

  recommended_tracks = target_feat.dot(mean_feat).sort_values(ascending=False)[:top].index
  4%|▍         | 569/13884 [00:57<21:32, 10.30it/s]