In [None]:
import numpy as np
from scipy.stats import kendalltau
from tqdm import tqdm

from src import data
from src.data import get_features, get_similarity_for
from src.evaluation import get_metrics, evaluate_ranking
from src.fusion import late_fusion, early_fusion
from src.similarity_functions import (
    get_jaccard_similarity,
    get_cosine_similarity,
    get_overlap_similarity,
)
from src.utils import compute_top_ids_directly

In [None]:
# Toggle for the pairwise fusion sweep; the experiment-setup cell raises
# NotImplementedError when this is False.
TEST_ALL_FUSIONS = True

# Override the DATA_SIZE setting on the src.data module.
# NOTE(review): presumably caps how many items are used — confirm in src.data.
data.DATA_SIZE = 1024

In [None]:
# List of (name, thunk) pairs; each thunk is called later to produce the
# ranking for that experiment.
experiments = []

# Single-feature
for feature in data.features.keys():
    for suffix, similarity in (
        ("jaccard", get_jaccard_similarity),
        ("cosine", get_cosine_similarity),
    ):
        experiments.append(
            (
                f"{feature}_{suffix}",
                # Bind the loop variables as default arguments: a plain closure
                # would look `feature` up at call time, after the loop has
                # finished, so every thunk would evaluate the last feature.
                lambda feature=feature, similarity=similarity: compute_top_ids_directly(
                    get_features(feature), similarity
                ),
            )
        )


def add_set(lyrics_selection, audio_selection, video_selection, sim_function):
    """Add early and late fusion experiments for all combinations of given sets.

    One feature name is drawn from each of the three selections. For every
    resulting triple, an ``early_...`` and a ``late_...`` entry is appended to
    the module-level ``experiments`` list as a ``(name, thunk)`` pair.

    Args:
        lyrics_selection: Iterable of feature names for the first slot.
        audio_selection: Iterable of feature names for the second slot.
        video_selection: Iterable of feature names for the third slot.
        sim_function: Similarity function forwarded to ``early_fusion`` and
            ``late_fusion``. It also selects the name suffix: ``jaccard``,
            ``cosine``, or the function's ``__name__`` for anything else.
    """
    # The suffix depends only on sim_function, so compute it once.
    if sim_function == get_jaccard_similarity:
        sim_function_prefix = "jaccard"
    elif sim_function == get_cosine_similarity:
        sim_function_prefix = "cosine"
    else:
        sim_function_prefix = sim_function.__name__

    for lyrics in lyrics_selection:
        for audio in audio_selection:
            for video in video_selection:
                combo = f"{lyrics}_{audio}_{video}_{sim_function_prefix}"

                # Default arguments freeze the current triple. A plain closure
                # would read lyrics/audio/video when the thunk is finally
                # called, i.e. after the loops ended, and every thunk would
                # run the last combination.
                experiments.append(
                    (
                        f"early_{combo}",
                        lambda lyrics=lyrics, audio=audio, video=video: early_fusion(
                            [lyrics, audio, video], sim_function
                        ),
                    )
                )
                experiments.append(
                    (
                        f"late_{combo}",
                        lambda lyrics=lyrics, audio=audio, video=video: late_fusion(
                            [lyrics, audio, video], sim_function
                        ),
                    )
                )


# Only the full fusion sweep is supported; any other setting aborts the cell.
if not TEST_ALL_FUSIONS:
    raise NotImplementedError()

# Each entry holds the positional arguments for one add_set call:
# (first-slot features, second-slot features, third-slot features, similarity).
# NOTE(review): the first list looks like audio descriptors and the second like
# lyrics embeddings, yet add_set names those parameters lyrics_selection and
# audio_selection respectively — confirm the intended slot order.
fusion_sets = [
    (
        ["blf_spectral", "blf_logfluc", "mfcc_bow"],
        ["bert", "tfidf"],
        ["incp", "resnet"],
        get_cosine_similarity,
    ),
    (
        ["essentia", "blf_logfluc", "mfcc_stats"],
        ["bert", "tfidf"],
        ["vgg19", "resnet"],
        get_jaccard_similarity,
    ),
]
for fusion_args in fusion_sets:
    add_set(*fusion_args)

# Large fusions: every feature in data.features fused at once, for each
# fusion strategy and each similarity function.
for fusion_label, fusion in (("early", early_fusion), ("late", late_fusion)):
    for sim_label, similarity in (
        ("cosine", get_cosine_similarity),
        ("jaccard", get_jaccard_similarity),
    ):
        experiments.append(
            (
                f"{fusion_label}_all_{sim_label}",
                # The feature list is still built at call time; only the
                # strategy and similarity are pinned per iteration.
                lambda fusion=fusion, similarity=similarity: fusion(
                    list(data.features.keys()), similarity
                ),
            )
        )

print(f"Planned {len(experiments)}")

In [None]:
# Similarity function used to derive the relevance data that experiments are scored against.
relevance_similarity = get_overlap_similarity
# Relevance is built from the "genre_matrix" data with the function chosen above.
# NOTE(review): presumably a pairwise item-by-item matrix — confirm in src.data.
relevance = get_similarity_for("genre_matrix", relevance_similarity)

In [None]:
# Maps experiment name -> {"<METRIC>@<cutoff>": value} for cutoffs 10 and 100.
metrics = {}
for name, experiment in tqdm(experiments, "Evaluating experiments"):
    # Run the experiment thunk once and score its result at both cutoffs.
    top_ids = experiment()

    scores = {}
    for cutoff in (10, 100):
        at_cutoff = get_metrics(top_ids, relevance, cutoff)
        for metric_name in ("MAP", "MRR", "NDCG"):
            scores[f"{metric_name}@{cutoff}"] = at_cutoff[metric_name]

    metrics[name] = scores

In [None]:
# Number of top-scoring experiments kept in `best` for the correlation comparison.
RESULTS = 3

In [None]:
# Rank all experiments by NDCG@100 (highest first) and keep the top RESULTS
# as (name, score) pairs.
ndcg_by_experiment = [(name, metrics[name]["NDCG@100"]) for name, _ in experiments]
ndcg_by_experiment.sort(key=lambda entry: entry[1], reverse=True)
best = ndcg_by_experiment[:RESULTS]

In [None]:
# Display the selected (name, NDCG@100) pairs as the cell output.
best

In [None]:
# Name -> thunk lookup built from the (name, thunk) pairs; if a name occurs
# more than once, the last pair wins.
experiment_lookup = dict(experiments)

In [None]:
# Run each selected experiment once up front so its ranking is reused for
# every pair instead of being recomputed inside the pairwise loop.
# Assumes experiments are deterministic and evaluate_ranking does not mutate
# its inputs — TODO confirm.
rankings = {name: experiment_lookup[name]() for name, _ in best}

# correlations[i0, i1] holds the evaluate_ranking score between the rankings of
# best[i0] and best[i1], using Kendall's tau as the per-ranking comparison.
correlations = np.zeros((len(best), len(best)))
with tqdm(total=len(best) ** 2, desc="Calculating correlations") as t:
    for i0, (name0, _) in enumerate(best):
        for i1, (name1, _) in enumerate(best):
            if name0 != name1:
                correlations[i0, i1] = evaluate_ranking(
                    rankings[name0], rankings[name1], lambda y0, y1: kendalltau(y0, y1)[0]
                )
            # Advance the bar for skipped (identical-name) pairs too, so it
            # reaches the declared total.
            t.update()
# Identical-name pairs were skipped above; set the diagonal to 1.
np.fill_diagonal(correlations, 1)

In [None]:
import matplotlib.pyplot as plt

labels = [name for name, _ in best]
size = len(labels)

fig, ax = plt.subplots()
ax.imshow(correlations)
ax.axis("off")

# cell_text[row][col] mirrors correlations[row, col], so the table's row and
# column labels line up with the heatmap instead of showing the transpose.
cell_text = [[f"{correlations[row, col]:.3f}" for col in range(size)] for row in range(size)]
ax.table(cellText=cell_text, rowLabels=labels, colLabels=labels);