In [1]:
import datatable as dt
# Variables that contains the file location
from files import *
from functions import *

import random
random.seed(42)

from tqdm import tqdm
tqdm.pandas()

In [2]:
# Single-precision floats: the feature tables loaded below are cast to this dtype
# to reduce the memory required.
# NOTE(review): `np` is not imported explicitly in this notebook; presumably it
# arrives through one of the star imports above -- confirm.
DTYPE = np.float32

In [3]:
# string ids are inefficient, let's use integers and a lookup table
def getRelationIdsToNumbers(df):
    """Build the lookup from the index labels of ``df`` to consecutive integers.

    :param df: DataFrame whose index holds the original ids
    :return: tuple ``(mapping, new_ids)``; ``mapping`` is a dict sending each index
        label to its position (0, 1, 2, ...) and ``new_ids`` is
        ``np.arange(len(df.index))``
    """
    original_ids = df.index.values
    lookup = {original: position for position, original in enumerate(original_ids)}
    return lookup, np.arange(len(original_ids))


def change_id_to_keys(df, id_to_keys):
    """Replace, in place, the index of ``df`` with the integer ids from ``id_to_keys``.

    :param df: DataFrame indexed by the original ids; it is modified in place
    :param id_to_keys: dict mapping every original id to its new integer id
    :return: None
    :raises KeyError: if an index label of ``df`` is missing from ``id_to_keys``
    """
    # Build the new ids directly as int32: ``Index.astype`` returns a new object,
    # so calling it without using the result (as before) never changed the dtype.
    new_ids = np.asarray([id_to_keys[i] for i in df.index.values], dtype=np.int32)
    df.set_index(new_ids, inplace=True)
    df.index.name = "id"


def split_list_train_val_set(
        list_,
        training_ratio=0.6,
        validation_ratio=0.2,
        test_ratio=0.2,
        randomize=True,
):
    """Split a list into consecutive training, validation and test parts.

    :param list_: sequence to split; it is never modified
    :param training_ratio: fraction of the elements that goes to the training part
    :param validation_ratio: fraction of the elements that goes to the validation part
    :param test_ratio: fraction for the test part; the test part receives every
        element left after the first two parts
    :param randomize: if True, split a shuffled copy of ``list_`` (drawn with the
        global ``random`` state) instead of the original order
    :return: tuple ``(training_set, validation_set, test_set)``
    :raises ValueError: if the three ratios do not add up to 1
    """
    # Ratios are floats, so compare with a tolerance: 0.7 + 0.2 + 0.1 is
    # 0.9999999999999999 and an exact comparison would reject it.
    if abs(training_ratio + validation_ratio + test_ratio - 1.) > 1e-9:
        raise ValueError("Splits do not add up to 1.")
    if randomize:
        # use sample instead of shuffle since the latter shuffles in place
        list_ = random.sample(list_, len(list_))
    total_interactions = len(list_)
    # The small epsilon keeps a product such as 8.999999999999998 (meant to be 9)
    # from being truncated to 8; genuinely fractional cut points still round down.
    train_end = int(total_interactions * training_ratio + 1e-9)
    val_end = int(total_interactions * (training_ratio + validation_ratio) + 1e-9)

    training_set = list_[:train_end]
    validation_set = list_[train_end:val_end]
    test_set = list_[val_end:]

    return training_set, validation_set, test_set


def recommend_top_average_numpy(data: np.ndarray, simfunction, listened_items, top: int = 100):
    """Recommend the rows of ``data`` most similar to the mean of the listened rows.

    :param data: 2-D feature array, one row per item
    :param simfunction: callable ``simfunction(data, mean_feat)`` returning one
        similarity per row of ``data`` (higher means more similar);
        ``mean_feat`` has shape ``(1, n_features)``
    :param listened_items: row positions of the items already listened to
    :param top: maximum number of recommendations to return
    :return: array with the row positions of the recommended items, most similar
        first; listened items are never included
    """
    mean_feat = np.expand_dims(data[listened_items, :].mean(axis=0), axis=0)

    ### Calculate similarities
    results = simfunction(data, mean_feat)

    ### Get Tops
    # Get the item positions instead of the similarities, best first
    ranking = np.argsort(results * -1, axis=0).flatten()
    # Drop the listened items from the ranking. Overwriting their similarity with -1
    # (as before) is not enough: unnormalised similarities can be lower than -1, in
    # which case listened items were ranked above unseen ones.
    ranking = ranking[~np.isin(ranking, listened_items)]

    return ranking[:top]


def recommend_top_average_pandas(data: pd.DataFrame, listened_items, top: int = 100):
    """
    Given a list of ids (that have been listened to by a user)
    it aggregates the features of the corresponding items
    and then retrieves the most similar tracks to the aggregated version.
    Similarity is the dot product between each remaining row and the mean row.
    :param data: feature table with one row per item, indexed by item id
    :param listened_items: index labels (ids) of the items already listened to
    :param top: maximum number of recommendations to return
    :return: recommended_tracks, the index labels of the most similar items that
        were not listened to, most similar first
    :raises KeyError: if a listened id is not in the index of ``data``
    """
    # Calculate the average over listened items. The ids are index labels, so select
    # with .loc: the exclusion below and the returned index are label-based too, and
    # positional .iloc only agreed with them when labels equalled row positions.
    mean_feat = data.loc[listened_items].mean()
    # Exclude the listened items from recommendations
    target_feat = data.loc[~data.index.isin(listened_items)]
    scores = target_feat.dot(mean_feat).sort_values(ascending=False)
    # Explicit positional slice: `[:top]` on an integer index is ambiguous
    recommended_tracks = scores.iloc[:top].index

    return recommended_tracks

In [4]:
# To save memory the original ids are replaced by new, purely numeric ids.
# The relation between both is created here, or loaded if its file already exists.
# The ids have to be identical in all dataframes to stay consistent,
# so the index of the info table is the single reference for every dataframe.

file_original_new_ids = "./data/relation_original_new_ids.csv"

info = dt.fread(file_info_2).to_pandas().set_index('id')
id_to_key, indexes = getRelationIdsToNumbers(info)

if not exists(file_original_new_ids):
    # First run: build the relation table from the lookup and store it on disk
    relation_ids = (
        pd.DataFrame({
            'original': list(id_to_key.keys()),
            'newId': list(id_to_key.values()),
        })
        .set_index('original')
        .astype(DTYPE)
    )
    relation_ids.to_csv(file_original_new_ids)
else:
    # Later runs: reuse the stored relation table
    relation_ids = pd.read_csv(file_original_new_ids).set_index('original').astype(DTYPE)

## Load datasets

Now that we have the relation between original ids and new ids, we can load the datasets.

In [5]:
# Read the BLF spectral feature file, index it by its 'id' column and cast the values to DTYPE
blf_spectral = (
    dt.fread(file_blf_spectral, header=True)
    .to_pandas()
    .set_index('id')
    .astype(DTYPE)
)

In [6]:
# Feature tables to process, keyed by feature name.
# Only blf_spectral is loaded in this notebook; the commented-out entries
# refer to tables that are not loaded here.
datasets = {
    #    "tfidf" : tf_idf,
    #    "word2vec" : word2vec,
    #    "bert" : bert,
    #    "mfcc_bow" : mfcc_bow,
    #    "mfcc_stats" : mfcc_stats,
    #    "essentia" : essentia,
    #    "blf_delta_spectral" : blf_deltaspectral,
    #    "blf_correlation" : blf_correlation,
    #    "blf_logfluc" : blf_logfluc,
    "blf_spectral": blf_spectral,
    #    "blf_spectral_contrast" : blf_spectralcontrast,
    #    "blf_vardelta_spectral" : blf_vardeltaspectral,
    #    "incp" : incp,
    #    "vgg19" : vgg19,
    #    "resnet" : resnet,
}

#### Replace the original ids of the loaded datasets with the new ids

In [7]:
# Re-index every feature table with the new integer ids (each table is modified in place)
for df, feature_table in datasets.items():
    change_id_to_keys(feature_table, id_to_key)

In [8]:
# Report the (rows, columns) shape of every feature table
for df, feature_table in datasets.items():
    print(f"Feature {df}", feature_table.shape)

Feature blf_spectral (68641, 980)


# Function to compute distance and only retrieve the top ids

In [9]:
# Listening events of the music4all CIKM submission
user_core = 10  # users with fewer known items than this are dropped
# NOTE(review): absolute, machine-specific path -- consider defining it with the
# other file locations so the notebook runs on another machine.
dataset_path = "/home/marta/jku/cikm/data/matching/binarized_listening_history_filtered.tsv"
# One row per user with the list of that user's items
cikm_dataset = (
    dt.fread(dataset_path)
    .to_pandas()
    .drop(columns='timestamp')
    .groupby('user')['item']
    .apply(list)
    .reset_index(name='items')
)
# Translate the original item ids to the new integer ids; items without a new id are dropped
cikm_dataset['items'] = cikm_dataset['items'].apply(
    lambda x: [id_to_key[item] for item in x if item in id_to_key])
user_core_mask = cikm_dataset['items'].apply(lambda x: len(x) >= user_core)
# Copy the filtered rows: columns are added to this frame later, and assigning
# to a slice of another frame is ambiguous in pandas
cikm_dataset = cikm_dataset[user_core_mask].copy()

cikm_dataset['items']

0        [42, 85, 380, 434, 892, 1582, 1639, 1996, 2329...
1        [1255, 2118, 3534, 4423, 5536, 6173, 6656, 716...
2        [1280, 2338, 2918, 5955, 6404, 8146, 12544, 14...
3        [329, 490, 520, 623, 1310, 4128, 5199, 5506, 7...
4        [839, 2254, 2894, 2942, 3187, 3401, 4531, 5467...
                               ...                        
14113    [1249, 3323, 3958, 6404, 11812, 12840, 14389, ...
14114    [488, 960, 1255, 1690, 1950, 3495, 3606, 4416,...
14115    [3495, 3829, 4423, 5041, 6138, 6428, 7418, 809...
14116    [241, 574, 4501, 6128, 6201, 7127, 7649, 8298,...
14117    [653, 1244, 4032, 6305, 6787, 9273, 12492, 133...
Name: items, Length: 13884, dtype: object

In [10]:
# Split every user's items into train / validation / test lists, one new column per part
cikm_dataset[['train_items', 'val_items', 'test_items']] = pd.DataFrame(
    [split_list_train_val_set(user_items) for user_items in cikm_dataset['items']],
    index=cikm_dataset.index,
)

In [13]:
%%time
len(set(recommend_top_average_numpy(
     blf_spectral.to_numpy(dtype=DTYPE),
     get_innerProduct_similarity,
     cikm_dataset['train_items'][0],
     top=100
)).intersection(set(recommend_top_average_numpy(
     blf_spectral.to_numpy(dtype=DTYPE),
     get_innerProduct_similarity,
     cikm_dataset['train_items'][1],
     top=100
))))

CPU times: user 45.5 ms, sys: 13 µs, total: 45.5 ms
Wall time: 44.1 ms


85

In [16]:
%%time
len(set(recommend_top_average_numpy(
     blf_spectral.to_numpy(dtype=DTYPE),
     get_innerProduct_similarity,
     cikm_dataset['train_items'][1],
     top=100
)).intersection(set(recommend_top_average_pandas(
        data=blf_spectral,
        listened_items=cikm_dataset['train_items'][1],
        top=100
))))

CPU times: user 90.9 ms, sys: 35 ms, total: 126 ms
Wall time: 124 ms


  recommended_tracks = target_feat.dot(mean_feat).sort_values(ascending=False)[:top].index


100

In [14]:
%%time
len(set(recommend_top_average_pandas(
        data=blf_spectral,
        listened_items=cikm_dataset['train_items'][0],
        top=100
)).intersection(set(recommend_top_average_pandas(
        data=blf_spectral,
        listened_items=cikm_dataset['train_items'][1],
        top=100
))))

CPU times: user 137 ms, sys: 62.8 ms, total: 199 ms
Wall time: 201 ms


  recommended_tracks = target_feat.dot(mean_feat).sort_values(ascending=False)[:top].index


85

In [15]:
# Convert the feature table to a numpy array once, so the per-user calls below
# do not have to convert the DataFrame on every call
blf_spectral_numpy = blf_spectral.to_numpy()

In [None]:
# Top-100 inner-product recommendations for every user, computed from the
# user's train items (progress_apply shows a tqdm progress bar)
cikm_dataset['rec_items_inner'] = cikm_dataset['train_items'].progress_apply(
    lambda train_items: recommend_top_average_numpy(
        blf_spectral_numpy,
        get_innerProduct_similarity,
        train_items,
        top=100,
    )
)

 55%|█████▍    | 7592/13884 [02:21<01:55, 54.36it/s]

In [None]:
# Store the table as a tab-separated file, without the row index
cikm_dataset.to_csv('onion_recommendations.tsv', index=False, sep='\t')

In [None]:
# Top-100 cosine-similarity recommendations for every user, computed from the
# user's train items. They get their own column: writing them to 'rec_items_inner'
# (as before) overwrote the inner-product recommendations computed earlier.
cikm_dataset['rec_items_cosine'] = cikm_dataset['train_items'].progress_apply(
    lambda x: recommend_top_average_numpy(
        data=blf_spectral_numpy,
        listened_items=x,
        simfunction=get_cosine_similarity,
        top=100,
))

In [None]:
# Write the table again (tab-separated, no row index) so the file also holds
# the recommendations computed in the previous cell
cikm_dataset.to_csv('onion_recommendations.tsv', index=False, sep='\t')