# **IMPORT LIBS**

## Import

In [None]:
import numpy as np
from openrec.tf1.legacy import ImplicitModelTrainer
from openrec.tf1.legacy.utils.evaluators import ImplicitEvalManager
from openrec.tf1.legacy.utils import ImplicitDataset
from openrec.tf1.legacy.recommenders import CML, BPR, PMF
from openrec.tf1.legacy.utils.evaluators import AUC
from openrec.tf1.legacy.utils.samplers import PairwiseSampler
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import math
import os
import pickle


# **GENERATE THE DATASET**

## Init

In [None]:
# Fix the global NumPy seed so every random draw made later in the notebook is reproducible
seed = 2384795
np.random.seed(seed=seed)

# Folder that receives every generated artifact.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
output_name = "./generated_data/"
os.makedirs(output_name, exist_ok=True)

## Load the dataset

In [None]:
# Folder holding the raw CSV files and the name of the training interaction matrix
folder_name = "./original_files/"
file_path = "big_matrix.csv"

# Read the training interactions into a DataFrame
training_csv = folder_name + file_path
df_train = pd.read_csv(training_csv)

# Preview the first rows
df_train.head(10)

## Convert to implicit

"We treat items with a watch_ratio greater than or equal to 2 as relevant, and others as irrelevant, as suggested by KuaiRec."

In [None]:
# Relevance threshold suggested on the dataset webpage
POSITIVE_THRESHOLD = 2.0

# Binarise the feedback: 1 when watch_ratio is greater than or equal to the threshold, 0 otherwise.
# The comparison is inclusive to match the rule quoted above (">= 2 is relevant").
df_train['ImplicitRating'] = np.where(df_train['watch_ratio'] >= POSITIVE_THRESHOLD, 1, 0)

# Visualize
df_train.head(10)

## Check the number of users and items in the training set

"The training set contains 12,530,806 ratings given by 7,176 users against 10,728 videos through natural interactions."

In [None]:
# Smallest and largest user id present in the training interactions
train_user_column = df_train["user_id"]
min_user, max_user = train_user_column.min(), train_user_column.max()

# Smallest and largest video (item) id present in the training interactions
train_video_column = df_train["video_id"]
min_item, max_item = train_video_column.min(), train_video_column.max()

# Show the largest item id and the largest user id
max_item, max_user

# **GET UNBIASED TESTSET**

## Load the unbiased testset

In [None]:
# Path of the test interaction matrix inside the raw-data folder
file_path = os.path.join(folder_name, "small_matrix.csv")

# Read the test interactions into their own DataFrame
df_test = pd.read_csv(file_path)

## Convert to implicit

In [None]:
# Binarise the test feedback: 1 when watch_ratio is greater than or equal to the threshold, 0 otherwise.
# The comparison is inclusive, matching the rule quoted earlier in the notebook
# ("watch_ratio greater than or equal to 2" is relevant).
df_test['ImplicitRating'] = np.where(df_test['watch_ratio'] >= POSITIVE_THRESHOLD, 1, 0)

# Visualize
df_test.head(10)

## Check the number of users and items in the test set

"The testing set is collected by asking a subset of 1,411 users to rate 3,327 randomly selected videos."

In [None]:
# Number of distinct users and distinct videos appearing in the test set
n_test_users = df_test["user_id"].nunique(dropna=False)
n_test_items = df_test["video_id"].nunique(dropna=False)
n_test_users, n_test_items

## Shape the unbiased test set

In [None]:
# Split the test interactions by relevance, keeping only the [user_id, video_id] pair of each row:
# rows with ImplicitRating == 1 form the positive set, rows with ImplicitRating == 0 the negative set
pair_columns = ["user_id", "video_id"]
is_relevant = df_test["ImplicitRating"] == 1
is_irrelevant = df_test["ImplicitRating"] == 0
unbiased_pos_test_set = df_test.loc[is_relevant, pair_columns].to_numpy()
unbiased_neg_test_set = df_test.loc[is_irrelevant, pair_columns].to_numpy()

## Save unbiased test set

In [None]:
# Positive and negative pairs are written to two separate files

# Wrap each array of pairs in a DataFrame whose columns are named user_id / item_id
unbiased_pair_names = ["user_id", "item_id"]
unbiased_pos_test_set_df = pd.DataFrame(unbiased_pos_test_set, columns=unbiased_pair_names)
unbiased_neg_test_set_df = pd.DataFrame(unbiased_neg_test_set, columns=unbiased_pair_names)

# Convert to record arrays (one named field per column, no index field)
structured_data_pos_test_set_unbiased = unbiased_pos_test_set_df.to_records(index=False)
structured_data_neg_test_set_unbiased = unbiased_neg_test_set_df.to_records(index=False)

# Write both arrays into the output folder
unbiased_pos_file = output_name + "unbiased-test_arr_pos.npy"
unbiased_neg_file = output_name + "unbiased-test_arr_neg.npy"
np.save(unbiased_pos_file, structured_data_pos_test_set_unbiased)
np.save(unbiased_neg_file, structured_data_neg_test_set_unbiased)

# **GET BIASED TESTSET**

## Extract the biased test set and shape it

"We additionally held out a biased testing set (biased-testing) from the training set by randomly sampling 30% songs for each user."

In [None]:
# Precompute, for each user, the set of videos with a relevant rating
user_positive_ratings = df_train[df_train["ImplicitRating"] == 1].groupby("user_id")["video_id"].apply(set)

# Positional row numbers of every user's interactions, plus the video id of every row.
# With these, each iteration only inspects the user's own rows instead of masking
# the whole training frame once per user.
rows_by_user = df_train.groupby("user_id").indices
train_video_ids = df_train["video_id"].to_numpy()
rating_column = df_train.columns.get_loc("ImplicitRating")

# Initialize the range of indexes for the items (shuffled in place once per user)
items_ids = np.arange(min_item, max_item + 1)

# Number of items sampled for each user.
# NOTE(review): the original comment calls this "30% of the items", but 3576 is one third
# of the 10,728 videos quoted above — confirm the intended ratio.
SONGS_FOR_BIASED_TEST = 3576

# Init empty
pos_test_set = []
neg_test_set = []
held_out_rows = []

# Extract the biased test set
for user_id in range(min_user, max_user + 1):

    # Sample SONGS_FOR_BIASED_TEST items: shuffle, then take the tail
    np.random.shuffle(items_ids)
    test_items = set(items_ids[-SONGS_FOR_BIASED_TEST:])

    # Sampled items that the user rated as relevant
    pos_ids = user_positive_ratings.get(user_id, set()) & test_items

    # Remember which training rows hold those positives; they are zeroed after the loop.
    # Deferring is safe because user_positive_ratings was computed before any mutation.
    user_rows = rows_by_user.get(user_id)
    if pos_ids and user_rows is not None:
        is_held_out = np.isin(train_video_ids[user_rows], list(pos_ids))
        held_out_rows.append(user_rows[is_held_out])

    # Append items (loop variable renamed so it no longer shadows the builtin `id`)
    for item_id in test_items:
        if item_id in pos_ids:
            pos_test_set.append([user_id, item_id])
        else:
            neg_test_set.append([user_id, item_id])

# Set the held-out positives to 0 in the training set (extract), in a single assignment
if held_out_rows:
    df_train.iloc[np.concatenate(held_out_rows), rating_column] = 0

# Get np arrays
pos_test_set = np.array(pos_test_set)
neg_test_set = np.array(neg_test_set)

## Save the biased test set

In [None]:
# Positive and negative pairs are written to two separate files

# Wrap each array of pairs in a DataFrame whose columns are named user_id / item_id
biased_pair_names = ["user_id", "item_id"]
pos_test_set_df = pd.DataFrame(pos_test_set, columns=biased_pair_names)
neg_test_set_df = pd.DataFrame(neg_test_set, columns=biased_pair_names)

# Convert to record arrays (one named field per column, no index field)
structured_data_pos_test_set = pos_test_set_df.to_records(index=False)
structured_data_neg_test_set = neg_test_set_df.to_records(index=False)

# Write both arrays into the output folder
biased_pos_file = output_name + "biased-test_arr_pos.npy"
biased_neg_file = output_name + "biased-test_arr_neg.npy"
np.save(biased_pos_file, structured_data_pos_test_set)
np.save(biased_neg_file, structured_data_neg_test_set)

# **STORE TRAINSET**

## Filter positive couples (user, item)

In [None]:
# Only take the couples (user, item) with relevant rating.
# The two id columns are selected explicitly, so the result does not depend on which
# other columns the CSV happens to contain (a hard-coded drop list fails on a missing
# column and leaks any unexpected extra column into the saved array).
new_df = df_train.loc[df_train['ImplicitRating'] != 0, ['user_id', 'video_id']]

# Define a dictionary for renaming columns
rename_dict = {
    'video_id': 'item_id'
}

# Rename the columns
new_df = new_df.rename(columns=rename_dict)

# Convert the DataFrame to a structured array with fields user_id and item_id
train_data = new_df.to_records(index=False)

## Save the training set

In [None]:
# Write the structured training array into the output folder
training_file = output_name + "training_arr.npy"
np.save(training_file, train_data)

# **MODEL CHOICE**

In [None]:
# The setup below reuses the code provided by the authors of the paper

raw_data = dict()
raw_data['train_data'] = np.load(output_name + "training_arr.npy")
# NOTE(review): hard-coded bounds; presumably they must exceed the largest user / item id — confirm
raw_data['max_user'] = 7177
raw_data['max_item'] = 10729
batch_size = 8000
test_batch_size = 1000
display_itr = 1000

train_dataset = ImplicitDataset(raw_data['train_data'], raw_data['max_user'], raw_data['max_item'], name='Train')

# Model to train and the naming scheme of its output files
MODEL_CLASS = CML
MODEL_PREFIX = "cml"
DATASET_NAME = "KuaiRec"
OUTPUT_FOLDER = output_name
OUTPUT_PATH = OUTPUT_FOLDER + MODEL_PREFIX + "-" + DATASET_NAME + "/"
OUTPUT_PREFIX = str(OUTPUT_PATH) + str(MODEL_PREFIX) + "-" + str(DATASET_NAME)

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race
os.makedirs(OUTPUT_PATH, exist_ok=True)


# **TRAIN THE MODEL**

In [None]:
# Start from a fresh TensorFlow graph so previously built embeddings are not reused
import tensorflow as tf
tf.compat.v1.reset_default_graph()

# Build the recommender, the pairwise sampler and the trainer that ties them together
model = MODEL_CLASS(
    batch_size=batch_size,
    max_user=train_dataset.max_user(),
    max_item=train_dataset.max_item(),
    dim_embed=50,
    l2_reg=0.001,
    opt='Adam',
    sess_config=None,
)
sampler = PairwiseSampler(
    batch_size=batch_size,
    dataset=train_dataset,
    num_process=4,
)
model_trainer = ImplicitModelTrainer(
    batch_size=batch_size,
    test_batch_size=test_batch_size,
    train_dataset=train_dataset,
    model=model,
    sampler=sampler,
    eval_save_prefix=OUTPUT_PATH + DATASET_NAME,
    item_serving_size=500,
)
auc_evaluator = AUC()

# Run the training iterations, reporting every display_itr iterations
model_trainer.train(num_itr=10001, display_itr=display_itr)

# Store the trained model in the output folder
model.save(OUTPUT_PATH, None)

# Release the model object
del model

# **DEFINING FUNCTIONS**

In [None]:
def eq(infilename, infilename_neg, trainfilename, gamma=-1.0, K=1):
    """Propensity-weighted AUC and Recall@K computed from two saved evaluation pickles.

    Every evaluated positive item is weighted by ``1 / Ni ** ((gamma + 1) / 2)``, where ``Ni``
    is the number of occurrences of the item in the training array. With the default
    ``gamma=-1.0`` the exponent is 0, so every weight equals 1.

    Args:
        infilename: pickle with the evaluation of the positive test items; the entries
            ``"num_negatives"``, ``"users"``, ``"user_items"`` and ``"results"`` are read.
        infilename_neg: pickle with ``"user_items"`` and ``"results"`` for the negative test items.
        trainfilename: ``.npy`` structured array with an ``item_id`` field.
        gamma: parameter of the weight exponent.
        K: a positive item is a hit when fewer than ``K`` candidates score strictly higher.

    Returns:
        dict with keys ``"auc"`` and ``"recall"``, averaged over the users that have at least
        one positive item occurring in the training array.

    Raises:
        ZeroDivisionError: if no user has a positive item occurring in the training array.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only pass files written by this notebook.
    # The context managers close the files even when loading fails.
    with open(infilename, 'rb') as infile:
        P = pickle.load(infile)
    with open(infilename_neg, 'rb') as infile_neg:
        P_neg = pickle.load(infile_neg)
    NUM_NEGATIVES = P["num_negatives"]

    # Merge P and P_neg into one candidate list per user: negatives first, then positives.
    # The first NUM_NEGATIVES entries of every list are skipped
    # (presumably the negatives sampled by the trainer — confirm).
    pos_items = dict()
    pos_scores = dict()
    for theuser in P["users"]:
        neg_items = list(P_neg["user_items"][theuser][NUM_NEGATIVES:])
        neg_scores = list(P_neg["results"][theuser][NUM_NEGATIVES:])
        pos_items[theuser] = list(P["user_items"][theuser][NUM_NEGATIVES:])
        pos_scores[theuser] = list(P["results"][theuser][NUM_NEGATIVES:])
        P["user_items"][theuser] = neg_items + pos_items[theuser]
        P["results"][theuser] = neg_scores + pos_scores[theuser]

    # Compute frequencies of items in the training set
    Ni = dict()
    trainset = np.load(trainfilename)
    for i in trainset['item_id']:
        Ni[i] = Ni.get(i, 0) + 1
    del trainset

    # Count #users with at least one positive item of non-zero frequency
    nonzero_user_count = sum(
        1 for theuser in P["users"]
        if any(pos_item in Ni for pos_item in pos_items[theuser])
    )

    # Zui: number of candidates scoring strictly higher than each positive item
    Zui = dict()
    for theuser in P["users"]:
        all_scores = np.array(P["results"][theuser])
        for pos_item, pos_score in zip(pos_items[theuser], pos_scores[theuser]):
            Zui[(theuser, pos_item)] = float(np.sum(all_scores > pos_score))

    # Calculate per-user scores
    sum_user_auc = 0.0
    sum_user_recall = 0.0
    for theuser in P["users"]:
        numerator_auc = 0.0
        numerator_recall = 0.0
        denominator = 0.0
        num_candidates = len(P["user_items"][theuser])

        for theitem in pos_items[theuser]:
            # Items never seen in training have no propensity estimate
            if theitem not in Ni:
                continue
            pui = np.power(Ni[theitem], (gamma + 1) / 2.0)

            numerator_auc += (1 - Zui[(theuser, theitem)] / num_candidates) / pui

            if Zui[(theuser, theitem)] < K:
                numerator_recall += 1.0 / pui
            denominator += 1 / pui

        if denominator > 0:
            sum_user_auc += numerator_auc / denominator
            sum_user_recall += numerator_recall / denominator

    # Return
    return {
        "auc"       : sum_user_auc / nonzero_user_count,
        "recall"    : sum_user_recall / nonzero_user_count
    }

In [None]:
def aoa(infilename, infilename_neg, trainfilename, K=1):
    """Unweighted AUC and Recall@K computed from two saved evaluation pickles.

    Every evaluated positive item that occurs in the training array contributes equally;
    positive items that never occur in the training array are skipped.

    Args:
        infilename: pickle with the evaluation of the positive test items; the entries
            ``"num_negatives"``, ``"users"``, ``"user_items"`` and ``"results"`` are read.
        infilename_neg: pickle with ``"user_items"`` and ``"results"`` for the negative test items.
        trainfilename: ``.npy`` structured array with an ``item_id`` field.
        K: a positive item is a hit when fewer than ``K`` candidates score strictly higher.

    Returns:
        dict with keys ``"auc"`` and ``"recall"``, averaged over the users that have at least
        one positive item occurring in the training array.

    Raises:
        ZeroDivisionError: if no user has a positive item occurring in the training array.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only pass files written by this notebook.
    # The context managers close the files even when loading fails.
    with open(infilename, 'rb') as infile:
        P = pickle.load(infile)
    with open(infilename_neg, 'rb') as infile_neg:
        P_neg = pickle.load(infile_neg)
    NUM_NEGATIVES = P["num_negatives"]

    # Merge P and P_neg into one candidate list per user: negatives first, then positives.
    # The first NUM_NEGATIVES entries of every list are skipped
    # (presumably the negatives sampled by the trainer — confirm).
    pos_items = dict()
    pos_scores = dict()
    for theuser in P["users"]:
        neg_items = list(P_neg["user_items"][theuser][NUM_NEGATIVES:])
        neg_scores = list(P_neg["results"][theuser][NUM_NEGATIVES:])
        pos_items[theuser] = list(P["user_items"][theuser][NUM_NEGATIVES:])
        pos_scores[theuser] = list(P["results"][theuser][NUM_NEGATIVES:])
        P["user_items"][theuser] = neg_items + pos_items[theuser]
        P["results"][theuser] = neg_scores + pos_scores[theuser]

    # Compute frequencies of items in the training set
    Ni = dict()
    trainset = np.load(trainfilename)
    for i in trainset['item_id']:
        Ni[i] = Ni.get(i, 0) + 1
    del trainset

    # Count #users with at least one positive item of non-zero frequency
    nonzero_user_count = sum(
        1 for theuser in P["users"]
        if any(pos_item in Ni for pos_item in pos_items[theuser])
    )

    # Zui: number of candidates scoring strictly higher than each positive item
    Zui = dict()
    for theuser in P["users"]:
        all_scores = np.array(P["results"][theuser])
        for pos_item, pos_score in zip(pos_items[theuser], pos_scores[theuser]):
            Zui[(theuser, pos_item)] = float(np.sum(all_scores > pos_score))

    # Calculate per-user scores
    sum_user_auc = 0.0
    sum_user_recall = 0.0
    for theuser in P["users"]:
        numerator_auc = 0.0
        numerator_recall = 0.0
        denominator = 0.0
        num_candidates = len(P["user_items"][theuser])

        for theitem in pos_items[theuser]:
            # Items never seen in training are skipped
            if theitem not in Ni:
                continue
            numerator_auc += (1 - Zui[(theuser, theitem)] / num_candidates)
            # Recall@K (see note 6 of the paper)
            if Zui[(theuser, theitem)] < K:
                numerator_recall += 1.0
            denominator += 1

        if denominator > 0:
            sum_user_auc += numerator_auc / denominator
            sum_user_recall += numerator_recall / denominator

    # Return
    return {
        "auc"       : sum_user_auc / nonzero_user_count,
        "recall"    : sum_user_recall / nonzero_user_count
    }

In [None]:
def stratified(infilename, infilename_neg, trainfilename, gamma=1.0, K=30, partition=10, delta=0.1):
    """Stratified propensity-weighted AUC and Recall@K, with bias and concentration terms.

    The propensity of an item is ``Ni ** ((gamma + 1) / 2)`` (``Ni`` = occurrences in the
    training array). Items are sorted by decreasing propensity and bucketed with
    ``partition + 1`` linearly spaced edges between the largest and the smallest propensity;
    every item of a bucket receives the weight ``w`` = mean of ``1 / propensity`` in that bucket.

    Args:
        infilename: pickle with the evaluation of the positive test items; the entries
            ``"num_negatives"``, ``"users"``, ``"user_items"`` and ``"results"`` are read.
        infilename_neg: pickle with ``"user_items"`` and ``"results"`` for the negative test items.
        trainfilename: ``.npy`` structured array with ``user_id`` and ``item_id`` fields.
        gamma: parameter of the propensity exponent.
        K: a positive item is a hit when fewer than ``K`` candidates score strictly higher.
        partition: number of propensity buckets.
        delta: parameter of the concentration term (used as ``log(2 / delta)``).

    Returns:
        dict with keys ``"auc"``, ``"recall"``, ``"bias"`` (sum over items of
        ``|propensity * w - 1|`` times the number of users) and ``"concentration"`` (sum over
        training users of ``sqrt(2 * log(2/delta) * sum(w**2)) + 7 * max(w) * log(2/delta)``).

    Raises:
        IndexError: if no positive item occurs in the training array.
        ZeroDivisionError: if no user has a positive item occurring in the training array.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only pass files written by this notebook.
    with open(infilename, 'rb') as infile:
        P = pickle.load(infile)
    with open(infilename_neg, 'rb') as infile_neg:
        P_neg = pickle.load(infile_neg)
    NUM_NEGATIVES = P["num_negatives"]

    # Merge P and P_neg into one candidate list per user: negatives first, then positives.
    # The first NUM_NEGATIVES entries of every list are skipped
    # (presumably the negatives sampled by the trainer — confirm).
    pos_items = dict()
    pos_scores = dict()
    for theuser in P["users"]:
        neg_items = list(P_neg["user_items"][theuser][NUM_NEGATIVES:])
        neg_scores = list(P_neg["results"][theuser][NUM_NEGATIVES:])
        pos_items[theuser] = list(P["user_items"][theuser][NUM_NEGATIVES:])
        pos_scores[theuser] = list(P["results"][theuser][NUM_NEGATIVES:])
        P["user_items"][theuser] = neg_items + pos_items[theuser]
        P["results"][theuser] = neg_scores + pos_scores[theuser]

    # Compute frequencies of items in the training set.
    # The array is kept alive because the concentration term below iterates over it again;
    # deleting it here made the function fail on that later access.
    Ni = dict()
    trainset = np.load(trainfilename)
    for i in trainset['item_id']:
        Ni[i] = Ni.get(i, 0) + 1

    # Zui: number of candidates scoring strictly higher than each positive item
    Zui = dict()
    for theuser in P["users"]:
        all_scores = np.array(P["results"][theuser])
        for pos_item, pos_score in zip(pos_items[theuser], pos_scores[theuser]):
            Zui[(theuser, pos_item)] = float(np.sum(all_scores > pos_score))

    pui = dict()
    w = dict()

    # Compute dictionary of propensity scores (one entry per positive item seen in training)
    for theuser in P["users"]:
        for theitem in pos_items[theuser]:
            if theitem not in Ni:
                continue
            if theitem in pui:
                continue
            pui[theitem] = np.power(Ni[theitem], (gamma + 1) / 2.0)

    # Take the list of items (not tuples) in pui sorted by decreasing value
    items_sorted_by_value = sorted(pui, key=pui.get, reverse=True)

    # Bucket edges: linear spacing from the largest to the smallest propensity
    linspace = np.linspace(pui[items_sorted_by_value[0]], pui[items_sorted_by_value[-1]], partition+1)

    # Compute dictionary w: every item gets the average of 1/pui over the bucket it belongs to.
    # An empty bucket temporarily writes 0 for the next item; that value is overwritten
    # when the item is consumed by its own bucket.
    i = 0
    j = 0
    while i < len(items_sorted_by_value):

        avg = 0
        start = i
        end = i

        while i < len(items_sorted_by_value) and pui[items_sorted_by_value[i]] >= linspace[j+1]:
            avg += 1.0 / pui[items_sorted_by_value[i]]
            end = i
            i += 1
        avg = avg / (end - start + 1)

        for k in range(start, end+1):
            w[items_sorted_by_value[k]] = avg

        j += 1

    # Bias term: sum of |pui * w - 1| over the items, multiplied by the number of users
    bias = 0.0
    for k in pui.keys():
        bias += abs(pui[k] * w[k] - 1)
    bias *= len(P["users"])

    # Concentration term: per training user, the sum of squares of w over the user's items ...
    concentrations = {}
    max_w = max(w.values())
    for user, item in zip(trainset['user_id'], trainset['item_id']):
        if item in w:
            concentrations[user] = concentrations.get(user, 0) + w[item] ** 2
    del trainset
    # ... turned into the bound, then summed over the users
    for user in concentrations:
        concentrations[user] = math.sqrt(concentrations[user] * 2 * math.log(2/delta)) + max_w * 7 * math.log(2/delta)
    concentration = sum(concentrations.values())

    # Calculate per-user scores
    nonzero_user_count = 0
    sum_user_auc = 0.0
    sum_user_recall = 0.0
    for theuser in P["users"]:
        numerator_auc = 0.0
        numerator_recall = 0.0
        denominator = 0.0
        num_candidates = len(P["user_items"][theuser])
        for theitem in pos_items[theuser]:
            # Skip items with null frequency
            if theitem not in Ni:
                continue
            numerator_auc += (1 - Zui[(theuser, theitem)] / num_candidates) * w[theitem]
            if Zui[(theuser, theitem)] < K:
                numerator_recall += 1.0 * w[theitem]
            # The normaliser uses the raw inverse propensity, not the bucket weight
            denominator += 1 / pui[theitem]

        # If there was at least one item for the user, count the user and sum the results
        if denominator > 0:
            nonzero_user_count += 1
            sum_user_auc += numerator_auc / denominator
            sum_user_recall += numerator_recall / denominator

    # Return
    return {
        "auc"       : sum_user_auc / nonzero_user_count,
        "recall"    : sum_user_recall / nonzero_user_count,
        "bias"      : bias,
        "concentration" : concentration
    }

In [None]:
def stratified_logspace(infilename, infilename_neg, trainfilename, gamma=1.0, K=30, partition=10, delta=0.1):
    """Variant of the stratified estimator whose propensity buckets are log-spaced.

    The propensity of an item is ``Ni ** ((gamma + 1) / 2)`` (``Ni`` = occurrences in the
    training array). Items are sorted by decreasing propensity and bucketed with
    ``partition + 1`` geometrically spaced edges between the largest and the smallest
    propensity; every item of a bucket receives the weight ``w`` = mean of ``1 / propensity``
    in that bucket.

    Args:
        infilename: pickle with the evaluation of the positive test items; the entries
            ``"num_negatives"``, ``"users"``, ``"user_items"`` and ``"results"`` are read.
        infilename_neg: pickle with ``"user_items"`` and ``"results"`` for the negative test items.
        trainfilename: ``.npy`` structured array with ``user_id`` and ``item_id`` fields.
        gamma: parameter of the propensity exponent.
        K: a positive item is a hit when fewer than ``K`` candidates score strictly higher.
        partition: number of propensity buckets.
        delta: parameter of the concentration term (used as ``log(2 / delta)``).

    Returns:
        dict with keys ``"auc"``, ``"recall"``, ``"bias"`` (sum over items of
        ``|propensity * w - 1|`` times the number of users) and ``"concentration"`` (sum over
        training users of ``sqrt(2 * log(2/delta) * sum(w**2)) + 7 * max(w) * log(2/delta)``).

    Raises:
        IndexError: if no positive item occurs in the training array.
        ZeroDivisionError: if no user has a positive item occurring in the training array.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only pass files written by this notebook.
    with open(infilename, 'rb') as infile:
        P = pickle.load(infile)
    with open(infilename_neg, 'rb') as infile_neg:
        P_neg = pickle.load(infile_neg)
    NUM_NEGATIVES = P["num_negatives"]

    # Merge P and P_neg into one candidate list per user: negatives first, then positives.
    # The first NUM_NEGATIVES entries of every list are skipped
    # (presumably the negatives sampled by the trainer — confirm).
    pos_items = dict()
    pos_scores = dict()
    for theuser in P["users"]:
        neg_items = list(P_neg["user_items"][theuser][NUM_NEGATIVES:])
        neg_scores = list(P_neg["results"][theuser][NUM_NEGATIVES:])
        pos_items[theuser] = list(P["user_items"][theuser][NUM_NEGATIVES:])
        pos_scores[theuser] = list(P["results"][theuser][NUM_NEGATIVES:])
        P["user_items"][theuser] = neg_items + pos_items[theuser]
        P["results"][theuser] = neg_scores + pos_scores[theuser]

    # Compute frequencies of items in the training set.
    # The array is kept alive because the concentration term below iterates over it again;
    # deleting it here made the function fail on that later access.
    Ni = dict()
    trainset = np.load(trainfilename)
    for i in trainset['item_id']:
        Ni[i] = Ni.get(i, 0) + 1

    # Zui: number of candidates scoring strictly higher than each positive item
    Zui = dict()
    for theuser in P["users"]:
        all_scores = np.array(P["results"][theuser])
        for pos_item, pos_score in zip(pos_items[theuser], pos_scores[theuser]):
            Zui[(theuser, pos_item)] = float(np.sum(all_scores > pos_score))

    pui = dict()
    w = dict()

    # Compute dictionary of propensity scores (one entry per positive item seen in training)
    for theuser in P["users"]:
        for theitem in pos_items[theuser]:
            if theitem not in Ni:
                continue
            if theitem in pui:
                continue
            pui[theitem] = np.power(Ni[theitem], (gamma + 1) / 2.0)

    # Take the list of items (not tuples) in pui sorted by decreasing value
    items_sorted_by_value = sorted(pui, key=pui.get, reverse=True)

    # Bucket edges: geometric (log-scale) spacing from the largest to the smallest propensity.
    # np.logspace(a, b) would instead return 10**a .. 10**b, i.e. edges far above every propensity.
    largest_pui = pui[items_sorted_by_value[0]]
    smallest_pui = pui[items_sorted_by_value[-1]]
    logspace = np.geomspace(largest_pui, smallest_pui, partition+1)
    # Pin the last edge so rounding can never leave the least popular items outside every bucket
    logspace[-1] = smallest_pui

    # Compute dictionary w: every item gets the average of 1/pui over the bucket it belongs to.
    # An empty bucket temporarily writes 0 for the next item; that value is overwritten
    # when the item is consumed by its own bucket.
    i = 0
    j = 0
    while i < len(items_sorted_by_value):

        avg = 0
        start = i
        end = i

        while i < len(items_sorted_by_value) and pui[items_sorted_by_value[i]] >= logspace[j+1]:
            avg += 1.0 / pui[items_sorted_by_value[i]]
            end = i
            i += 1

        # Open question from the original author: is the plain average still the right
        # choice when the buckets are log-spaced?
        avg = avg / (end - start + 1)

        for k in range(start, end+1):
            w[items_sorted_by_value[k]] = avg

        j += 1

    # The bias and concentration terms need the complete w, so they are computed once,
    # after the bucketing loop (not inside it).

    # Bias term: sum of |pui * w - 1| over the items, multiplied by the number of users
    bias = 0.0
    for k in pui.keys():
        bias += abs(pui[k] * w[k] - 1)
    bias *= len(P["users"])

    # Concentration term: per training user, the sum of squares of w over the user's items ...
    concentrations = {}
    max_w = max(w.values())
    for user, item in zip(trainset['user_id'], trainset['item_id']):
        if item in w:
            concentrations[user] = concentrations.get(user, 0) + w[item] ** 2
    del trainset
    # ... turned into the bound, then summed over the users
    for user in concentrations:
        concentrations[user] = math.sqrt(concentrations[user] * 2 * math.log(2/delta)) + max_w * 7 * math.log(2/delta)
    concentration = sum(concentrations.values())

    # Compute score with AUC and compute Recall
    nonzero_user_count = 0
    sum_user_auc = 0.0
    sum_user_recall = 0.0
    for theuser in P["users"]:
        numerator_auc = 0.0
        numerator_recall = 0.0
        denominator = 0.0
        num_candidates = len(P["user_items"][theuser])
        for theitem in pos_items[theuser]:
            # Skip items with null frequency
            if theitem not in Ni:
                continue
            numerator_auc += (1 - Zui[(theuser, theitem)] / num_candidates) * w[theitem]
            if Zui[(theuser, theitem)] < K:
                numerator_recall += 1.0 * w[theitem]
            # The normaliser uses the raw inverse propensity, not the bucket weight
            denominator += 1 / pui[theitem]

        # If there was at least one item for the user, count the user and sum the results
        if denominator > 0:
            nonzero_user_count += 1
            sum_user_auc += numerator_auc / denominator
            sum_user_recall += numerator_recall / denominator

    # Return
    return {
        "auc"       : sum_user_auc / nonzero_user_count,
        "recall"    : sum_user_recall / nonzero_user_count,
        "bias"      : bias,
        "concentration" : concentration
    }

In [None]:
# This version buckets by the number of items used for evaluation (equal-sized rank buckets),
# not by the propensity values
def stratified_2(infilename, infilename_neg, trainfilename, gamma=1.0, K=30, partition=10, delta=0.1):
    """Variant of the stratified estimator whose buckets are defined on item ranks.

    The propensity of an item is ``Ni ** ((gamma + 1) / 2)`` (``Ni`` = occurrences in the
    training array). Items are sorted by decreasing propensity and their positions
    ``0 .. n`` are split by ``partition + 1`` linearly spaced edges; every item of a bucket
    receives the weight ``w`` = mean of ``1 / propensity`` in that bucket.

    Args:
        infilename: pickle with the evaluation of the positive test items; the entries
            ``"num_negatives"``, ``"users"``, ``"user_items"`` and ``"results"`` are read.
        infilename_neg: pickle with ``"user_items"`` and ``"results"`` for the negative test items.
        trainfilename: ``.npy`` structured array with ``user_id`` and ``item_id`` fields.
        gamma: parameter of the propensity exponent.
        K: a positive item is a hit when fewer than ``K`` candidates score strictly higher.
        partition: number of rank buckets.
        delta: parameter of the concentration term (used as ``log(2 / delta)``).

    Returns:
        dict with keys ``"auc"``, ``"recall"``, ``"bias"`` (sum over items of
        ``|propensity * w - 1|`` times the number of users) and ``"concentration"`` (sum over
        training users of ``sqrt(2 * log(2/delta) * sum(w**2)) + 7 * max(w) * log(2/delta)``).

    Raises:
        ValueError: if no positive item occurs in the training array (``max`` of an empty ``w``).
        ZeroDivisionError: if no user has a positive item occurring in the training array.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only pass files written by this notebook.
    with open(infilename, 'rb') as infile:
        P = pickle.load(infile)
    with open(infilename_neg, 'rb') as infile_neg:
        P_neg = pickle.load(infile_neg)
    NUM_NEGATIVES = P["num_negatives"]

    # Merge P and P_neg into one candidate list per user: negatives first, then positives.
    # The first NUM_NEGATIVES entries of every list are skipped
    # (presumably the negatives sampled by the trainer — confirm).
    pos_items = dict()
    pos_scores = dict()
    for theuser in P["users"]:
        neg_items = list(P_neg["user_items"][theuser][NUM_NEGATIVES:])
        neg_scores = list(P_neg["results"][theuser][NUM_NEGATIVES:])
        pos_items[theuser] = list(P["user_items"][theuser][NUM_NEGATIVES:])
        pos_scores[theuser] = list(P["results"][theuser][NUM_NEGATIVES:])
        P["user_items"][theuser] = neg_items + pos_items[theuser]
        P["results"][theuser] = neg_scores + pos_scores[theuser]

    # Compute frequencies of items in the training set.
    # The array is kept alive because the concentration term below iterates over it again;
    # deleting it here made the function fail on that later access.
    Ni = dict()
    trainset = np.load(trainfilename)
    for i in trainset['item_id']:
        Ni[i] = Ni.get(i, 0) + 1

    # Zui: number of candidates scoring strictly higher than each positive item
    Zui = dict()
    for theuser in P["users"]:
        all_scores = np.array(P["results"][theuser])
        for pos_item, pos_score in zip(pos_items[theuser], pos_scores[theuser]):
            Zui[(theuser, pos_item)] = float(np.sum(all_scores > pos_score))

    pui = dict()
    w = dict()

    # Compute dictionary of propensity scores (one entry per positive item seen in training)
    for theuser in P["users"]:
        for theitem in pos_items[theuser]:
            if theitem not in Ni:
                continue
            if theitem in pui:
                continue
            pui[theitem] = np.power(Ni[theitem], (gamma + 1) / 2.0)

    # Take the list of items (not tuples) in pui sorted by decreasing value
    items_sorted_by_value = sorted(pui, key=pui.get, reverse=True)

    # Bucket edges over the positions 0 .. len(items_sorted_by_value)
    linspace = np.linspace(0, len(items_sorted_by_value), partition+1)

    # Compute dictionary w: every item gets the average of 1/pui over the bucket it belongs to.
    # An empty bucket temporarily writes 0 for the next item; that value is overwritten
    # when the item is consumed by its own bucket.
    i = 0
    j = 0
    while i < len(items_sorted_by_value):

        avg = 0
        start = i
        end = i

        while i < len(items_sorted_by_value) and i < linspace[j+1]:
            avg += 1.0 / pui[items_sorted_by_value[i]]
            end = i
            i += 1

        avg = avg / (end - start + 1)

        for k in range(start, end+1):
            w[items_sorted_by_value[k]] = avg

        j += 1

    # Bias term: sum of |pui * w - 1| over the items, multiplied by the number of users
    bias = 0.0
    for k in pui.keys():
        bias += abs(pui[k] * w[k] - 1)
    bias *= len(P["users"])

    # Concentration term: per training user, the sum of squares of w over the user's items ...
    concentrations = {}
    max_w = max(w.values())
    for user, item in zip(trainset['user_id'], trainset['item_id']):
        if item in w:
            concentrations[user] = concentrations.get(user, 0) + w[item] ** 2
    del trainset
    # ... turned into the bound, then summed over the users
    for user in concentrations:
        concentrations[user] = math.sqrt(concentrations[user] * 2 * math.log(2/delta)) + max_w * 7 * math.log(2/delta)
    concentration = sum(concentrations.values())

    # Compute score with AUC and compute Recall
    nonzero_user_count = 0
    sum_user_auc = 0.0
    sum_user_recall = 0.0
    for theuser in P["users"]:
        numerator_auc = 0.0
        numerator_recall = 0.0
        denominator = 0.0
        num_candidates = len(P["user_items"][theuser])
        for theitem in pos_items[theuser]:
            # Skip items with null frequency
            if theitem not in Ni:
                continue
            numerator_auc += (1 - Zui[(theuser, theitem)] / num_candidates) * w[theitem]
            if Zui[(theuser, theitem)] < K:
                numerator_recall += 1.0 * w[theitem]
            # The normaliser uses the raw inverse propensity, not the bucket weight
            denominator += 1 / pui[theitem]

        # If there was at least one item for the user, count the user and sum the results
        if denominator > 0:
            nonzero_user_count += 1
            sum_user_auc += numerator_auc / denominator
            sum_user_recall += numerator_recall / denominator

    return {
        "auc"       : sum_user_auc / nonzero_user_count,
        "recall"    : sum_user_recall / nonzero_user_count,
        "bias"      : bias,
        "concentration" : concentration
    }

# **EVALUATION**

In [None]:
# Reload every array written by the generation steps, together with the dataset bounds
raw_data = {
    'train_data': np.load(output_name + "training_arr.npy"),
    'test_data_pos_biased': np.load(output_name + "biased-test_arr_pos.npy"),
    'test_data_neg_biased': np.load(output_name + "biased-test_arr_neg.npy"),
    'test_data_pos_unbiased': np.load(output_name + "unbiased-test_arr_pos.npy"),
    'test_data_neg_unbiased': np.load(output_name + "unbiased-test_arr_neg.npy"),
    'max_user': 7177,
    'max_item': 10729,
}
batch_size = 8000
test_batch_size = 1000
display_itr = 1000

# Wrap each array in an ImplicitDataset that shares the same user / item bounds
user_bound = raw_data['max_user']
item_bound = raw_data['max_item']
train_dataset = ImplicitDataset(raw_data['train_data'], user_bound, item_bound, name='Train')
test_dataset_pos_biased = ImplicitDataset(raw_data['test_data_pos_biased'], user_bound, item_bound)
test_dataset_neg_biased = ImplicitDataset(raw_data['test_data_neg_biased'], user_bound, item_bound)
test_dataset_pos_unbiased = ImplicitDataset(raw_data['test_data_pos_unbiased'], user_bound, item_bound)
test_dataset_neg_unbiased = ImplicitDataset(raw_data['test_data_neg_unbiased'], user_bound, item_bound)

In [None]:
# Start from a fresh TensorFlow graph so previously built embeddings are not reused
import tensorflow as tf
tf.compat.v1.reset_default_graph()

# Rebuild the recommender, the sampler and the trainer with the training-time settings
model = MODEL_CLASS(
    batch_size=batch_size,
    max_user=train_dataset.max_user(),
    max_item=train_dataset.max_item(),
    dim_embed=50,
    l2_reg=0.001,
    opt='Adam',
    sess_config=None,
)
sampler = PairwiseSampler(
    batch_size=batch_size,
    dataset=train_dataset,
    num_process=4,
)
model_trainer = ImplicitModelTrainer(
    batch_size=batch_size,
    test_batch_size=test_batch_size,
    train_dataset=train_dataset,
    model=model,
    sampler=sampler,
    eval_save_prefix=OUTPUT_PATH + DATASET_NAME,
    item_serving_size=500,
)
auc_evaluator = AUC()

# Restore the weights saved after training
model.load(OUTPUT_PATH)

# Configure the trainer for evaluation: attach the AUC evaluator, exclude the known
# interactions and draw the negatives with a fixed seed
model_trainer._eval_manager = ImplicitEvalManager(evaluators=[auc_evaluator])
# NOTE(review): the original author was unsure whether 200 is still the right number of negatives
model_trainer._num_negatives = 200
excluded_datasets = [train_dataset, test_dataset_pos_biased, test_dataset_neg_biased]
model_trainer._exclude_positives(excluded_datasets)
model_trainer._sample_negatives(seed=10)

## Biased Evaluation

In [None]:
# Evaluate the positive split of the biased test set.
# The save prefix is set first; presumably `_evaluate_partial` writes
# "<prefix>_evaluate_partial.pickle", which the metric cells below read — confirm against openrec.
model_trainer._eval_save_prefix = OUTPUT_PREFIX + "-test-pos-biased"
model_trainer._evaluate_partial(test_dataset_pos_biased)

# Same for the negative split of the biased test set
model_trainer._eval_save_prefix = OUTPUT_PREFIX +  "-test-neg-biased"
model_trainer._evaluate_partial(test_dataset_neg_biased)

## Unbiased Evaluation

In [None]:
# Evaluate the positive split of the unbiased test set.
# The save prefix is set first; presumably `_evaluate_partial` writes
# "<prefix>_evaluate_partial.pickle", which the metric cells below read — confirm against openrec.
model_trainer._eval_save_prefix = OUTPUT_PREFIX + "-test-pos-unbiased"
model_trainer._evaluate_partial(test_dataset_pos_unbiased)

# Same for the negative split of the unbiased test set
model_trainer._eval_save_prefix = OUTPUT_PREFIX +  "-test-neg-unbiased"
model_trainer._evaluate_partial(test_dataset_neg_unbiased)

## Calculate Metrics

Compute AOA and unbiased evaluator metrics with biased testset.

In [None]:
# Inputs shared by every estimator on the biased test set
biased_pos_pickle = OUTPUT_PREFIX + "-test-pos-biased_evaluate_partial.pickle"
biased_neg_pickle = OUTPUT_PREFIX + "-test-neg-biased_evaluate_partial.pickle"
training_array_path = output_name + "training_arr.npy"

biased_results = {}

# AOA estimator
biased_results["AOA"] = aoa(biased_pos_pickle, biased_neg_pickle, training_array_path, K=10)

# Unbiased estimator, once per gamma (labels drop the decimal point: 1.5 -> "UB_15")
for ub_label, ub_gamma in (("UB_15", 1.5), ("UB_2", 2), ("UB_25", 2.5), ("UB_3", 3)):
    biased_results[ub_label] = eq(biased_pos_pickle, biased_neg_pickle, training_array_path, gamma=ub_gamma, K=10)

Compute AOA and unbiased evaluator metrics with unbiased testset.

In [None]:
# Inputs shared by every estimator on the unbiased test set
unbiased_pos_pickle = OUTPUT_PREFIX + "-test-pos-unbiased_evaluate_partial.pickle"
unbiased_neg_pickle = OUTPUT_PREFIX + "-test-neg-unbiased_evaluate_partial.pickle"
training_array_path = output_name + "training_arr.npy"

unbiased_results = {}

# AOA estimator
unbiased_results["AOA"] = aoa(unbiased_pos_pickle, unbiased_neg_pickle, training_array_path, K=10)

# Unbiased estimator, once per gamma (labels drop the decimal point: 1.5 -> "UB_15")
for ub_label, ub_gamma in (("UB_15", 1.5), ("UB_2", 2), ("UB_25", 2.5), ("UB_3", 3)):
    unbiased_results[ub_label] = eq(unbiased_pos_pickle, unbiased_neg_pickle, training_array_path, gamma=ub_gamma, K=10)

Randomly draw 500 distinct candidate partition values from the integers 1 to `max_item`.

In [None]:
# Upper end of the range the candidates are drawn from
num_items = max_item

# How many candidate partition values to draw
n_p = 500

# Candidates are the integers 1..num_items; pick n_p of them without repetition
nums = np.arange(1, num_items + 1)
partitions = np.random.choice(a=nums, size=n_p, replace=False)

# Visualize
partitions

Find the partition that minimizes the sum of bias and concentration, added up over the biased and the unbiased test sets.

In [None]:
# For every candidate partition, run the stratified estimator on both the biased
# and the unbiased test set, and keep the results of the partition whose summed
# bias + concentration (over both test sets) is the smallest.

# Value of gamma to use for minimization
# NOTE(review): the key below strips "." from str(gamma) and the later cells store
# gamma=1.5 under "STRATIFIED_15", so this may have been meant to be 1.5 — confirm.
gamma = 15

# Key under which the best results are stored (e.g. "STRATIFIED_15")
key = "STRATIFIED_" + str(gamma).replace(".","")

# Initialize results
unbiased_results[key] = dict()
biased_results[key] = dict()

# Fallback value; it is only kept if `partitions` is empty
best_partition = np.random.choice(nums, 1)[0]

# Lowest summed bias + concentration seen so far (None until the first partition is scored)
best_score = None

# For each partition
for p in tqdm(partitions):
    # Compute the results for both the unbiased and the biased test set
    temp_unbiased = stratified(OUTPUT_PREFIX+"-test-pos-unbiased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-unbiased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=gamma, K=10, partition=p)
    temp_biased = stratified(OUTPUT_PREFIX+"-test-pos-biased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-biased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=gamma, K=10, partition=p)

    # Objective: bias + concentration, summed over both test sets
    score = (
        temp_unbiased['bias'] + temp_unbiased['concentration']
        + temp_biased['bias'] + temp_biased['concentration']
    )

    # The first partition is always accepted; afterwards only strict improvements are.
    # Both result dicts and the partition are replaced together so they always
    # describe the same partition.
    if best_score is None or score < best_score:
        best_score = score
        unbiased_results[key] = temp_unbiased
        biased_results[key] = temp_biased
        best_partition = p

So, for the chosen value of gamma, the best partition is...

In [None]:
# Display the partition value selected by the search loop above
best_partition

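Compute the stratified metrics on both the unbiased and the biased test sets, using the best partition found above, for each value of gamma (1.5, 2, 2.5, 3).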

In [None]:
# Inputs for the stratified estimator on each test set
unbiased_pos_pickle = OUTPUT_PREFIX + "-test-pos-unbiased_evaluate_partial.pickle"
unbiased_neg_pickle = OUTPUT_PREFIX + "-test-neg-unbiased_evaluate_partial.pickle"
biased_pos_pickle = OUTPUT_PREFIX + "-test-pos-biased_evaluate_partial.pickle"
biased_neg_pickle = OUTPUT_PREFIX + "-test-neg-biased_evaluate_partial.pickle"
training_array_path = output_name + "training_arr.npy"

# One unbiased + one biased run per gamma, all with the selected partition
# (labels drop the decimal point: 1.5 -> "STRATIFIED_15")
for strat_label, strat_gamma in (
    ("STRATIFIED_15", 1.5),
    ("STRATIFIED_2", 2),
    ("STRATIFIED_25", 2.5),
    ("STRATIFIED_3", 3),
):
    unbiased_results[strat_label] = stratified(unbiased_pos_pickle, unbiased_neg_pickle, training_array_path, gamma=strat_gamma, K=10, partition=best_partition)
    biased_results[strat_label] = stratified(biased_pos_pickle, biased_neg_pickle, training_array_path, gamma=strat_gamma, K=10, partition=best_partition)


This version uses the linspace of items instead of linspace of propensities to make the partition.

In [None]:
# Inputs for the second stratified estimator on each test set
unbiased_pos_pickle = OUTPUT_PREFIX + "-test-pos-unbiased_evaluate_partial.pickle"
unbiased_neg_pickle = OUTPUT_PREFIX + "-test-neg-unbiased_evaluate_partial.pickle"
biased_pos_pickle = OUTPUT_PREFIX + "-test-pos-biased_evaluate_partial.pickle"
biased_neg_pickle = OUTPUT_PREFIX + "-test-neg-biased_evaluate_partial.pickle"
training_array_path = output_name + "training_arr.npy"

# One unbiased + one biased run per gamma, all with the selected partition
# (labels drop the decimal point: 1.5 -> "STRATIFIED_v2_15")
for strat_label, strat_gamma in (
    ("STRATIFIED_v2_15", 1.5),
    ("STRATIFIED_v2_2", 2),
    ("STRATIFIED_v2_25", 2.5),
    ("STRATIFIED_v2_3", 3),
):
    unbiased_results[strat_label] = stratified_2(unbiased_pos_pickle, unbiased_neg_pickle, training_array_path, gamma=strat_gamma, K=10, partition=best_partition)
    biased_results[strat_label] = stratified_2(biased_pos_pickle, biased_neg_pickle, training_array_path, gamma=strat_gamma, K=10, partition=best_partition)

Prepare table for results.

In [None]:
# Fixed table layout: `rows` x `columns`
# (the original notes hint at len(list(value.keys())) for rows and
# len(list(biased_results.items())) for columns)
rows, columns = 2, 13

# Zero-filled table to be populated by the next cell
results_array = np.zeros(shape=(rows, columns))

Fill the table with the MAE results.

In [None]:
# Build the table of absolute differences between the biased and the unbiased
# estimates: one row per metric, one column per estimator.

# Estimators (table columns), in insertion order
list_biased_res = list(biased_results.keys())

# Metrics (table rows): the metrics of the first estimator that every estimator
# reports for both test sets. Estimators do not all return the same keys (the
# stratified ones also carry 'bias' and 'concentration'), so indexing rows by each
# estimator's own key list can run past the end of a fixed-size table.
metric_values = [
    metric
    for metric in (biased_results[list_biased_res[0]].keys() if list_biased_res else [])
    if all(
        metric in biased_results[name] and metric in unbiased_results[name]
        for name in list_biased_res
    )
]

# Size the table from the data instead of relying on hard-coded dimensions
results_array = np.zeros((len(metric_values), len(list_biased_res)))

# Same values as the table, keyed as mae_results[estimator][metric]
mae_results = dict()

for col, name in enumerate(list_biased_res):
    mae_results[name] = dict()
    for row, metric in enumerate(metric_values):
        # Absolute error between the biased and the unbiased estimate
        error = abs(biased_results[name][metric] - unbiased_results[name][metric])
        results_array[row, col] = error
        mae_results[name][metric] = error

# Make it a DataFrame, with the metric name as the first column
mae_df = pd.DataFrame(columns=list_biased_res, data=results_array)
mae_df.insert(0, "metric", metric_values)

# **RESULTS**

In [None]:
# Display the first rows of the absolute-error table (one row per metric)
mae_df.head()