In [None]:
# datatable provides the fast file reader (dt.fread) used to load the tables
import datatable as dt

# Variables that contain the file locations
from files import *
from functions import *

In [None]:
# feature file to load below; `file_blf_vardeltaspectral` is not defined in this
# notebook (presumably provided by the star-import of `files` — TODO confirm)
file = file_blf_vardeltaspectral

In [None]:
# read the feature table with datatable, then hand it over to pandas
df = dt.fread(file)
df[dt.float64] = dt.float32  # downcast float64 columns to float32 (compress)
# the 'id' column becomes the row index of the resulting pandas frame
df = df.to_pandas().set_index('id')

In [None]:
# string ids are inefficient, so every id is replaced by its integer row position:
#   id_to_key[i]  -> original id stored at row i
#   key_to_id[id] -> integer row position of that id
#   indices       -> all integer row positions, 0 .. n-1
id_to_key = df.index.values
key_to_id = {key: position for position, key in enumerate(id_to_key)}
indices = np.arange(len(id_to_key))

# Similarity Calculation

In [None]:
def get_cosine_similarity(arr_a, arr_b):
    """
    Pairwise cosine similarity between the rows of two 2 dimensional matrices.

    :param arr_a: matrix with one vector per row
    :param arr_b: matrix with one vector per row, same row length as arr_a
    :return: matrix whose entry [i, j] is the cosine similarity of arr_a[i] and arr_b[j];
             pairs whose norm product is not positive keep the raw dot product
    """
    lengths_a = np.linalg.norm(arr_a, axis=-1)[:, None]
    lengths_b = np.linalg.norm(arr_b, axis=-1)[:, None]
    # outer product of the row norms: one scale factor per (row of a, row of b) pair
    scale = lengths_a * lengths_b.T
    similarity = arr_a @ arr_b.T
    # divide in place to avoid a second full-size matrix; entries with a
    # non-positive scale are skipped and therefore keep their dot product
    np.divide(similarity, scale, out=similarity, where=scale > 0)
    return similarity

In [None]:
def compute_in_batches_distance(arr: np.array, sim_function, batches: int = 1):
    """
    Build the full pairwise similarity matrix of ``arr`` one column block at a time.

    :param arr: full data array, one item per row
    :param sim_function: similarity function; called as ``sim_function(arr, chunk)`` with two
                         2 dimensional matrices, its result fills the columns of that chunk
    :param batches: number of row chunks ``arr`` is split into, to lower the RAM usage
    :return: float32 matrix of shape (len(arr), len(arr))
    """
    n_rows = len(arr)
    similarities = np.zeros((n_rows, n_rows), dtype=np.float32)
    col_start = 0
    for chunk in tqdm(np.array_split(arr, batches, axis=0)):
        col_stop = col_start + chunk.shape[0]
        # all rows against the current chunk -> one vertical stripe of the matrix
        similarities[:, col_start:col_stop] = sim_function(arr, chunk)
        col_start = col_stop
    return similarities

In [None]:
def compute_in_batches_top_ids(results: np.array, idx_values: np.array, top: int = -1, batches: int = 1):
    """
    Rank the columns of a similarity matrix for every row, most similar first.

    :param results: a 2 dimensional similarity matrix (one row per query, one column per candidate)
    :param idx_values: the indices, one entry per column of ``results``
    :param top: how many ids should get retrieved per row; a negative value, or a value
                larger than the number of columns, retrieves all of them
    :param batches: split results into chunks of rows for less RAM usage
    :return: int32 array of shape (number of rows, top); row i holds the entries of
             ``idx_values`` ordered by descending similarity to row i
    """
    splits = np.array_split(results, batches, axis=0)
    # `top` counts columns (candidates), not rows, so the limit is the column count
    n_cols = splits[0].shape[1]
    if top < 0 or top > n_cols:
        top = n_cols
    ids = np.zeros((len(results), top), dtype=np.int32)
    y = 0
    for b in tqdm(splits):
        # negate so that the ascending argsort yields a descending similarity order;
        # cut to `top` before the id lookup to avoid a full-width temporary
        order = np.argsort(b * -1, axis=1)[:, :top]
        ids[y:y + b.shape[0], :] = idx_values[order]
        y += b.shape[0]
    return ids

In [None]:
# sanity check: (number of rows, number of columns) of the loaded feature table
df.shape

In [None]:
# pairwise cosine similarity of all rows of the feature table, built in 100 chunks
result = compute_in_batches_distance(
    df.to_numpy(),
    sim_function=get_cosine_similarity,
    batches=100,
)

In [None]:
# optionally save the entire matrix
# np.save("example_similarity_matrix.npy", result)

In [None]:
# result = np.load("example_similarity_matrix.npy")

In [None]:
# normalize (zero mean, roughly unit std), if used for late fusion;
# both steps work in place so that no second full-size matrix is allocated
result -= result.mean()

# np.std needs a temporary as large as its input, which would kill the RAM on the
# full matrix, so the std is estimated from every 64th row and column only
np.divide(result, result[::64, ::64].std(), out=result)

In [None]:
# check of the normalization: the mean was subtracted in place, so this should be close to 0
result.mean()

In [None]:
# std of a different sub-sample (offset 3, every 128th row/column) than the one used for
# scaling; a value near 1 suggests the sub-sampled std estimate was representative
result[3::128, 3::128].std()

In [None]:
# exclude self-matches from the ranking: the diagonal holds the similarity of every
# item with itself. If the matrix was normalized, 0 is its mean rather than its
# minimum, so a 0 diagonal could still get picked; -inf always sorts last
np.fill_diagonal(result, -np.inf)

In [None]:
# rank all candidates for every row of the similarity matrix (most similar first);
# `top` is left at its default, so the complete ranking is kept
top_ids = compute_in_batches_top_ids(results=result, idx_values=indices, batches=100)

In [None]:
# persist the ranking so that it can be reloaded after a kernel restart
np.save(file="top_ids.npy", arr=top_ids)

# Checkpoint
You can restart the kernel here if your machine has less than 64 GB of memory.
Make sure to run the first few cells again afterwards.

In [None]:
# after a kernel restart `top_ids` no longer exists: reload it from the saved file
if "top_ids" not in globals():
    top_ids = np.load("top_ids.npy")

In [None]:
# load the genres table and use its 'id' column as the row index
genres = dt.fread(file_genres_2).to_pandas().set_index('id')

In [None]:
# wrap the id matrix in a DataFrame: one row of ranked ids per item
top_ids_df = pd.DataFrame(top_ids)

In [None]:
# translate the string ids of the genre table into the integer ids of the lookup
# table; an id that is missing from key_to_id raises a KeyError
genres_index = np.asarray(list(map(key_to_id.__getitem__, genres.index.values)))
genres = genres.set_index(genres_index)

In [None]:
# and evaluate our results
# NOTE(review): `getMetrics` is not defined in this notebook (presumably provided by the
# star-import of `functions`); the meaning of the argument 100 is not visible here — TODO confirm
getMetrics(top_ids_df, 100, genres)

# Correlation
We only calculated one top-ids result here, so this is only a dummy example: every ranking is correlated with itself.

In [None]:
# a bare `import scipy` does not guarantee that the `stats` subpackage is loaded
# (older SciPy releases raise AttributeError on `scipy.stats`), so import it explicitly
import scipy.stats

# Kendall's tau per row between two rankings; both arguments are the same ranking
# here because only one top-ids result was calculated (dummy example)
correlation = np.zeros((len(top_ids_df),))
for i in tqdm(indices):
    ranking = top_ids_df.loc[i].values  # look the row up once instead of twice
    correlation[i] = scipy.stats.kendalltau(ranking, ranking)[0]

In [None]:
# average Kendall tau over all rows
correlation.mean()

Additional hints and research results
* Instead of string ids, use integers to reduce memory usage of top ids
* Memory allocation overhead can be reduced inside the similarity function
* Norms could be cached
* The top id calculation could be done "in place": split the results, delete the original results, then process the splits one by one, deleting each old split once its top ids are calculated