# Notebook with a basic example of how to run the code for diversification
## Assumes availability of "repro" folder including its contents (pre-computed matrices), that are available from our OSF repository (link in README)

In [1]:
import json
import os
import sys
import random

import numpy as np

%load_ext autoreload
%autoreload 2


# Adjust the path so that we can import packages from src folder (sibling to examples)
sys.path.append("..")

K = 10
SEED = 42

from src import metrics, diversification

In [2]:
random.seed(SEED)
np.random.seed(SEED)

## Path setting, adjust accordingly

In [3]:
# Adjust accordingly
dataset = "goodbooks"
path_to_repro = "./repro"
path_to_data = os.path.join(path_to_repro, dataset)

os.listdir(path_to_data)

['book_data_small.json',
 'distance_matrix_ease.npy',
 'distance_matrix_ease.zip',
 'distance_matrix_genres.npy',
 'distance_matrix_genres.zip',
 'distance_matrix_rating.npy',
 'distance_matrix_rating.zip',
 'distance_matrix_tags.npy',
 'distance_matrix_tags.zip',
 'distance_matrix_text.npy',
 'distance_matrix_text.zip',
 'item_index_to_id.npy',
 'item_index_to_id.pckl',
 'item_item.npy',
 'item_item.zip',
 'rating_matrix.npy',
 'rating_matrix.zip',
 'text_features.npy',
 'text_features.zip']

## Data loading

In [4]:
# Load the item_item matrix for EASE^R algorithm
item_item = np.load(os.path.join(path_to_data, "item_item.npy"))

1. CF-raw-ILD corresponded to intra_list_diversity over distance_matrix_rating.npy (available in OSF), see notes above on CF-ILD
2. CB-plot-ILD corresponded to intra_list_diversity over CLIP extracted features from item plots, see notes above on CB-ILD
3. MD-Genres-BinDiv corresponded to binomial_diversity, see notes above
4. MD-Genres-ILD corresponded to intra_list_diversity evaluated over distance_matrix_genres.npy (available in OSF)
5. CF-latent-ILD corresponded to intra_list_diversity evaluated over distance_matrix_ease.npy (available in OSF)
6. MD-Tags-ILD corresponded to intra_list_diversity evaluated over distance_matrix_tags.npy (available in OSF)

In [5]:
available_metrics = {
    "CF-raw-ILD" : "distance_matrix_rating.npy",
    "CB-plot-ILD": "distance_matrix_text.npy",
    "MD-Genres-BinDiv": None,
    "MD-Genres-ILD": "distance_matrix_genres.npy",
    "CF-latent-ILD": "distance_matrix_ease.npy",
    "MD-Tags-ILD": "distance_matrix_tags.npy"
}

# These are the metrics that were used for diversification
available_diversification_metrics = {
    "CF-raw-ILD", "CB-plot-ILD", "MD-Genres-BinDiv"
}

# Change this to metric you want to diversify w.r.t.
selected_metric = "CF-raw-ILD"

assert selected_metric in available_diversification_metrics

In [6]:
# Prepare the diversity function
if selected_metric == "MD-Genres-BinDiv":
    rating_matrix = np.load(os.path.join(path_to_data, "rating_matrix.npy"))

    if dataset == "movielens":
        data_small_path = os.path.join(path_to_data, "movie_data_small.json")
    elif dataset == "goodbooks":
        data_small_path = os.path.join(path_to_data, "book_data_small.json")
    else:
        assert False, f"Unknown dataset: {dataset}"
    
    with open(data_small_path, "r") as f:
        data_small = json.load(f)
        data_small = {int(k) : v for k,v in data_small.items()}
    
    # Prepare genre data needed for BIN-DIV
    all_genres = set()
    for m_id, x in data_small.items():
        if x and "genres" in x:
            all_genres.update(x["genres"])

    item_index_to_id = np.load(os.path.join(path_to_data, "item_index_to_id.npy"))
            
    all_categories = all_genres
    item_to_genres = {item_id : x["genres"] if x is not None and "genres" in x else [] for item_id, x in data_small.items()}
    get_item_categories = lambda item_idx: item_to_genres[item_index_to_id[item_idx]]
    
    diversity_func = metrics.binomial_diversity(all_categories, get_item_categories, rating_matrix, 0.0, dataset)
    all_items = np.arange(rating_matrix.shape[1])
else:
    assert available_metrics[selected_metric] is not None
    distance_matrix = np.load(os.path.join(path_to_data, available_metrics[selected_metric]))
    diversity_func = metrics.intra_list_diversity(distance_matrix)
    all_items = np.arange(distance_matrix.shape[0])

In [7]:
## NOTE: if you are using binomial diversity, then this might be extremely slow (~minutes) for the first time, before full-cache is built
K = 10
diversity_func(np.random.choice(all_items, K))

0.9956775325932297

In [8]:
ease = diversification.EASER_pretrained(all_items)
ease.load(os.path.join(path_to_data, "item_item.npy"))

<src.diversification.EASER_pretrained at 0x26d2e0c3b20>

In [9]:
# Simulate that user selected 20 random items during elicitation
elicitation_selected = np.random.choice(all_items, 20)
rel_scores, user_vector, ease_pred = ease.predict_with_score(elicitation_selected, elicitation_selected, K)

In [10]:
user_vector.sum() # The vector has 1 at positions corresponding to items selected during elicitation

20.0

In [11]:
ease_pred # Recommendation generated by ease

[3402, 3379, 13, 3, 2595, 4283, 482, 28, 131, 58]

In [12]:
rel_scores # For each item, the estimated relevance

array([-0.28242746,  0.06394224, -0.00872109, ..., -0.00350679,
       -0.00344402, -0.00733405], dtype=float32)

In [13]:
diversity_func(ease_pred)

0.9230546686641876

In [14]:
# Now run the diversification

def relevance_func(top_k):
    return rel_scores[top_k].sum()

print(f"Running diversification w.r.t. {selected_metric}")
print(f"Initial relevance-only recommendation: {np.array(ease_pred)}")
print("")

for alpha in [0.0, 0.5, 1.0]:
    diversified_top_k = diversification.diversify(K, rel_scores, alpha, all_items, relevance_func, diversity_func, rating_row=user_vector, filter_out_items=elicitation_selected, n_items_subset=500)
    print(f"alpha={alpha}, gives: {diversified_top_k}")
    print(f"\twhich has relevance={relevance_func(diversified_top_k)}, and diversity: {diversity_func(diversified_top_k)}")

Running diversification w.r.t. CF-raw-ILD
Initial relevance-only recommendation: [3402 3379   13    3 2595 4283  482   28  131   58]

alpha=0.0, gives: [3402 3379   13    3 2595 4283  482   28  131   58]
	which has relevance=4.317315578460693, and diversity: 0.9230546686641876
alpha=0.5, gives: [3402 4283 1232 2554 1743 2852 5174 2616 1038 3362]
	which has relevance=2.5135202407836914, and diversity: 0.9979832343737881
alpha=1.0, gives: [3402 4283 4205 2554 6529 5277 4418 5913 4904 4537]
	which has relevance=1.4175206422805786, and diversity: 1.0
