In [None]:
import pandas as pd
import datatable as dt

# Variables that contain the file locations
from files import *
from functions import *

In [None]:
# Re-run this cell after editing functions.py to pick up the changes without
# restarting the kernel.
import importlib
import functions  # import the module object itself, so that it can be reloaded

importlib.reload(functions)
# reload() only refreshes the module object. Names that were copied into the
# notebook namespace by `from functions import *` keep pointing at the old
# definitions, so they have to be imported again after the reload.
from functions import *

In [None]:
# dtype used when reading the feature tables and for the batched similarity
# results (half precision - presumably chosen to keep RAM usage down).
DATA_TYPE = np.float16

# Data
Load the Data

In [None]:
# Table read from file_genres_2, indexed by its 'id' column.
genres = dt.fread(file_genres_2).to_pandas().set_index('id')

In [None]:
# blf_logfluc gets its own loader: the column names are rebuilt by hand instead
# of being taken from the parsed header as-is (presumably the header row of this
# file is malformed - TODO confirm against the raw file).
blf_logfluc = dt.fread(file_blf_logfluc)
# Down-cast every float64 column to float32.
blf_logfluc[dt.float64] = dt.float32
# New header: 'id' followed by the parsed names from the third one onwards ...
new_cols = ('id',) + tuple(blf_logfluc.names[2:])
# ... which is one name short of the parsed frame, so the last column is dropped
# before the names are assigned.
del blf_logfluc[:, -1]
blf_logfluc.names = new_cols
blf_logfluc = blf_logfluc.to_pandas().set_index('id')

In [None]:
# Feature sources for the late fusion. Entries are either a path (loaded later
# with pandas) or an already loaded DataFrame.
files = [
    # Lyrics
    file_tfidf_2,
    file_word2vec_2,
    file_bert_2,
    # Audio
    file_essentia,
    file_blf_correlation,
    file_blf_deltaspectral,
    file_blf_spectral,
    file_blf_spectralcontrast,
    file_blf_vardeltaspectral,
    # blf_logfluc is passed as the frame prepared by its dedicated loader cell.
    # The raw path file_blf_logfluc is intentionally NOT listed as well:
    # having both made this feature enter the fusion twice.
    blf_logfluc,
    file_mfcc_bow,
    file_mfcc_stats,
    # Video
    file_incp,
    file_resnet,
    file_vgg19,
]

In [None]:
import csv


def _read_feature_tsv(path):
    """Read one tab-separated feature file with every non-'id' column as DATA_TYPE.

    The header line is read on its own first so that an explicit dtype can be
    given per column; the first column becomes the index.
    """
    with open(path, "r") as handle:
        header = next(csv.reader(handle, delimiter="\t"))
    column_dtypes = {column: DATA_TYPE for column in header if column != "id"}
    return pd.read_csv(path, dtype=column_dtypes, index_col=0, delimiter="\t")


# pandas is used here instead of datatable to get access to float16.
# Entries of `files` that are not paths are taken over unchanged.
data = [
    _read_feature_tsv(source) if isinstance(source, str) else source
    for source in tqdm(files, desc="Loading data")
]

In [None]:
# Readable label per feature: path entries have "./../task2/id_" and ".tsv"
# removed, every non-path entry is labelled "blf_logfluc".
names = [
    source.replace("./../task2/id_", "").replace(".tsv", "") if isinstance(source, str) else "blf_logfluc"
    for source in files
]

In [None]:
# Quick sanity check: label and (rows, columns) of every loaded feature table.
for label, frame in zip(names, data):
    print(label, frame.shape)

# Late Fusion

In [None]:
def compute_in_batches(arr_a: np.ndarray, arr_b: np.ndarray, simfunction, batches: int = 1):
    """Apply ``simfunction`` to ``arr_a`` and successive row chunks of ``arr_b``.

    Parameters
    ----------
    arr_a : np.ndarray
        Full data array, passed unchanged to every call.
    arr_b : np.ndarray
        Data array that is split along axis 0 (``arr_a`` itself or just some rows).
    simfunction : callable
        Called as ``simfunction(arr_a, chunk)``. Expected to return a 2-D array
        whose columns belong to the rows of ``chunk``, because the partial
        results are joined along axis 1.
    batches : int
        Number of chunks ``arr_b`` is split into (``np.array_split``), to bound
        the size of each intermediate result.

    Returns
    -------
    np.ndarray
        The partial results cast to ``DATA_TYPE`` and concatenated along axis 1.
    """
    chunks = np.array_split(arr_b, batches, axis=0)
    # Cast each partial result right away so only DATA_TYPE blocks are kept around.
    partial_results = [simfunction(arr_a, chunk).astype(DATA_TYPE) for chunk in tqdm(chunks)]
    return np.concatenate(partial_results, axis=1, dtype=DATA_TYPE)

In [None]:
def compute_in_batches_topIds(results: np.ndarray, idx_values: np.ndarray, top: int = 100, batches: int = 1):
    """Return, for every row of ``results``, the ids of its ``top`` largest entries.

    Parameters
    ----------
    results : np.ndarray
        2-D score matrix; each row is ranked independently, highest score first.
    idx_values : np.ndarray
        Ids belonging to the columns of ``results``.
    top : int
        Number of ids kept per row.
    batches : int
        Number of row chunks that are sorted one after the other.

    Returns
    -------
    np.ndarray
        Array with one row per row of ``results`` and ``top`` ids per row.

    NOTE(review): nothing here removes a row's own column, so with a
    self-similarity matrix the query itself is presumably ranked first -
    confirm that the metric functions account for that.
    """
    row_chunks = np.array_split(results, batches, axis=0)
    top_per_chunk = []
    for chunk in tqdm(row_chunks):
        # Negate to sort descending; cut to `top` before the id lookup so the
        # full-width id matrix is never materialised.
        order = np.argsort(chunk * -1, axis=1)[:, :top]
        top_per_chunk.append(idx_values[order])
    return np.concatenate(top_per_chunk, axis=0)

In [None]:
# Unweighted late fusion over the full data set: every feature contributes its
# cosine-similarity matrix, scaled to unit Frobenius norm.
# The accumulator is float32 (as in the final-processing cell): the norm-scaled
# entries are tiny and would be quantised heavily in float16.
accumulated_similarity = np.zeros((len(data[0]), len(data[0])), dtype=np.float32)

# for every data
for d in tqdm(data, desc="Processing"):
    file = d.to_numpy().astype(np.float32)
    result = compute_in_batches(file, file, simfunction=get_cosine_similarity, batches=100)
    # compute_in_batches hands back DATA_TYPE; go to float32 before taking the
    # norm so the sum of squares cannot overflow, and zero out NaN entries.
    result = np.nan_to_num(result.astype(np.float32), copy=False)
    n = np.linalg.norm(result)
    # Divide the matrix by its norm (the norm itself is only a scalar and must
    # not replace the matrix); an all-zero matrix contributes nothing.
    if n != 0:
        accumulated_similarity += result / n

# Find optimal feature similarity weights

In [None]:
# Fixed seed so that the sampled rows - and every weight derived from them -
# are the same after a kernel restart.
SUBSET_SEED = 42
SUBSET_SIZE = 1024 * 4

# Row positions sampled without replacement; capped at the number of rows so
# that a smaller data set does not make the draw fail.
subset = np.random.default_rng(SUBSET_SEED).choice(
    len(data[0]), min(SUBSET_SIZE, len(data[0])), replace=False
)
subset.shape

In [None]:
# Ids of the sampled rows, taken from the index of the first feature table.
# NOTE(review): `subset` holds row positions that are applied to every table,
# which assumes all tables share the row order of data[0] - confirm.
subset_ids = data[0].index.values[subset]

similarities = []
# Deliberately not called `functions`: that name is the imported module.
similarity_functions = [get_cosine_similarity, get_jaccard_similarity]

# One unit-norm similarity matrix per (feature table, similarity function) pair.
for d in tqdm(data, desc="Processing"):
    file = d.to_numpy()[subset, :].astype(np.float32)
    for f in similarity_functions:
        # copy=False is what the former positional `0` meant (the second
        # positional parameter of nan_to_num is `copy`); NaN becomes 0.0.
        result = np.nan_to_num(f(file, file), copy=False)
        n = np.linalg.norm(result)
        # All-zero matrices are skipped, so len(similarities) can be smaller
        # than len(data) * len(similarity_functions).
        if n != 0:
            result = result / n
            similarities.append(result)

print(f"{len(similarities)} similarity matrices")

In [None]:
def evaluate(similarities, weights):
    """Fuse similarity matrices with the given weights and score the ranking.

    Parameters
    ----------
    similarities : sequence of np.ndarray
        Square similarity matrices computed on the rows selected by ``subset``.
    weights : sequence of float
        One weight per matrix (paired via ``zip``). Ignored when exactly one
        matrix is passed: that matrix is then ranked as it is.

    Returns
    -------
    dict
        MAP, MRR and NDCG at cut-offs 10 and 100 of the top-100 ranking derived
        from the fused matrix, evaluated against ``genres``.
    """
    if len(similarities) == 1:
        accumulated_similarity = similarities[0]
    else:
        # Accumulate in float32, like the final full-data run. A float16
        # accumulator rounds near-equal scores to the same value, which turns
        # real rank differences into ties.
        accumulated_similarity = np.zeros((len(subset), len(subset)), dtype=np.float32)
        for sim, weight in zip(similarities, weights):
            accumulated_similarity += (sim * weight)

    # Top-100 ids per sampled row, ranked in 10 row batches.
    top_ids = pd.DataFrame(compute_in_batches_topIds(accumulated_similarity, subset_ids, 100, 10), subset_ids)

    return {
        "MAP@10": meanAveragePrecision(top_ids, 10, genres),
        "MAP@100": meanAveragePrecision(top_ids, 100, genres),
        "MRR@10": meanReciprocalRank(top_ids, 10, genres),
        "MRR@100": meanReciprocalRank(top_ids, 100, genres),
        # ndcgMean returns a sequence; its second element is reported.
        "NDCG@10": ndcgMean(top_ids, 10, genres)[1],
        "NDCG@100": ndcgMean(top_ids, 100, genres)[1],
    }

In [None]:
# Baseline: every matrix is weighted with zero, so the fused scores carry no
# similarity information.
baseline_weights = np.zeros(len(similarities))
print("Baseline", evaluate(similarities, baseline_weights))

In [None]:
# Fair: every matrix gets the same weight of one.
fair_weights = np.ones(len(similarities))
print("Fair", evaluate(similarities, fair_weights))

In [None]:
# Weight of a similarity matrix = sum of all metric values it reaches when it is
# evaluated on its own (NaN metric values count as 0).
weights = np.array(
    [
        np.nan_to_num(np.asarray(list(evaluate([single], [1]).values())), copy=False).sum()
        for single in similarities
    ],
    dtype=np.float64,
)
print(weights)

In [None]:
# Fusion with the raw per-matrix metric sums as weights.
performance_scores = evaluate(similarities, weights)
print("Performance", performance_scores)

In [None]:
# Min-max scale the weights to [0, 1]; the weakest matrix ends up with weight 0.
weight_range = weights.max() - weights.min()
if weight_range > 0:
    norm_weights = (weights - weights.min()) / weight_range
else:
    # All weights identical (or only one): 0 / 0 would turn every weight into
    # NaN, so fall back to equal weighting.
    norm_weights = np.ones_like(weights)
print(norm_weights)
print("Normalized Performance", evaluate(similarities, norm_weights))

# Final Processing
The min-max normalized weights ("Normalized Performance" above) seem to be the best approach, so the final run on the full data set uses them.

In [None]:
final_ids = data[0].index.values
final_accumulated_similarity = np.zeros((len(final_ids), len(final_ids)), dtype=np.float32)
# Deliberately not called `functions`: that name is the imported module.
similarity_functions = [get_cosine_similarity, get_jaccard_similarity]

# Weighted fusion on the full data set.
# `i` walks through norm_weights and only advances for matrices with a non-zero
# norm, mirroring how `similarities` was filled on the subset.
# NOTE(review): this pairing is positional - if a matrix is all-zero on the
# subset but not on the full data, every following weight is shifted by one.
i = 0
for d in tqdm(data, desc="Processing"):
    file = d.to_numpy().astype(np.float32)
    for f in similarity_functions:
        # copy=False is what the former positional `0` meant (the second
        # positional parameter of nan_to_num is `copy`); NaN becomes 0.0.
        result = np.nan_to_num(f(file, file), copy=False)
        n = np.linalg.norm(result)
        if n != 0:
            result = result / n
            final_accumulated_similarity += (result * norm_weights[i])
            i += 1

In [None]:
# The 100 highest scoring ids for every row of the fused matrix, ranked in
# 100 row batches and indexed by the id of the row.
ranked_ids = compute_in_batches_topIds(final_accumulated_similarity, final_ids, 100, 100)
top_ids = pd.DataFrame(ranked_ids, final_ids)

# Same metric set as in evaluate(), this time for the full data set.
results = {}
results["MAP@10"] = meanAveragePrecision(top_ids, 10, genres)
results["MAP@100"] = meanAveragePrecision(top_ids, 100, genres)
results["MRR@10"] = meanReciprocalRank(top_ids, 10, genres)
results["MRR@100"] = meanReciprocalRank(top_ids, 100, genres)
results["NDCG@10"] = ndcgMean(top_ids, 10, genres)[1]
results["NDCG@100"] = ndcgMean(top_ids, 100, genres)[1]

In [None]:
# Show the metric dictionary of the final run as the cell output.
results

In [None]:
# Write the ranking to disk: one row per id, with the id moved from the index
# into a regular leading column by reset_index().
export_frame = pd.DataFrame(top_ids, index=final_ids).reset_index()
dt.Frame(export_frame).to_csv('./top_ids_late_fusion.csv')