# **IMPORT LIBS**

## Import

In [2]:
import numpy as np
from openrec.tf1.legacy import ImplicitModelTrainer
from openrec.tf1.legacy.utils.evaluators import ImplicitEvalManager
from openrec.tf1.legacy.utils import ImplicitDataset
from openrec.tf1.legacy.recommenders import CML, BPR, PMF
from openrec.tf1.legacy.utils.evaluators import AUC
from openrec.tf1.legacy.utils.samplers import PairwiseSampler
from tqdm.notebook import tqdm
import numpy as np
import math
import pandas as pd
import os
import pickle


# **GENERATE THE DATASET**

## Init

In [3]:
# Fix the global NumPy seed so every random draw in this notebook is reproducible
seed = 2384795
np.random.seed(seed=seed)

# Folder that holds the input arrays and receives every evaluation artefact.
# exist_ok=True makes the cell idempotent and avoids the check-then-create race.
output_name = "./generated_data/"
os.makedirs(output_name, exist_ok=True)

# **MODEL CHOICE**

In [4]:
# Experiment configuration. The settings are taken as-is from the code released
# by the authors of the paper.

# Training interactions plus the hard-coded user/item id-space sizes
raw_data = dict()
raw_data['train_data'] = np.load(output_name + "training_arr.npy")
raw_data['max_user'] = 15401
raw_data['max_item'] = 1001
batch_size = 8000
test_batch_size = 1000
display_itr = 1000

train_dataset = ImplicitDataset(raw_data['train_data'], raw_data['max_user'], raw_data['max_item'], name='Train')

# Recommender to evaluate and the naming scheme of everything saved for it
MODEL_CLASS = CML
MODEL_PREFIX = "cml"
DATASET_NAME = "yahoo"
OUTPUT_FOLDER = output_name
# The trailing "/" is kept on purpose: this exact string is later passed to model.load()
OUTPUT_PATH = OUTPUT_FOLDER + MODEL_PREFIX + "-" + DATASET_NAME + "/"
# Common prefix of the evaluation pickles, e.g. ./generated_data/cml-yahoo/cml-yahoo
OUTPUT_PREFIX = OUTPUT_PATH + MODEL_PREFIX + "-" + DATASET_NAME

# exist_ok=True makes the cell idempotent and avoids the check-then-create race
os.makedirs(OUTPUT_PATH, exist_ok=True)


# **EVALUATION**

In [5]:
# Read every split from the output folder and record the fixed id-space sizes
raw_data = {
    'train_data': np.load(output_name + "training_arr.npy"),
    'test_data_pos_biased': np.load(output_name + "biased-test_arr_pos.npy"),
    'test_data_neg_biased': np.load(output_name + "biased-test_arr_neg.npy"),
    'test_data_pos_unbiased': np.load(output_name + "unbiased-test_arr_pos.npy"),
    'test_data_neg_unbiased': np.load(output_name + "unbiased-test_arr_neg.npy"),
    'max_user': 15401,
    'max_item': 1001,
}
batch_size = 8000
test_batch_size = 1000
display_itr = 1000

# Wrap each split in an ImplicitDataset sharing the same user/item bounds
train_dataset = ImplicitDataset(
    raw_data['train_data'],
    raw_data['max_user'],
    raw_data['max_item'],
    name='Train',
)
test_dataset_pos_biased = ImplicitDataset(
    raw_data['test_data_pos_biased'],
    raw_data['max_user'],
    raw_data['max_item'],
)
test_dataset_neg_biased = ImplicitDataset(
    raw_data['test_data_neg_biased'],
    raw_data['max_user'],
    raw_data['max_item'],
)
test_dataset_pos_unbiased = ImplicitDataset(
    raw_data['test_data_pos_unbiased'],
    raw_data['max_user'],
    raw_data['max_item'],
)
test_dataset_neg_unbiased = ImplicitDataset(
    raw_data['test_data_neg_unbiased'],
    raw_data['max_user'],
    raw_data['max_item'],
)

In [6]:
# Prevent tensorflow from using cached embeddings: start from an empty default
# graph so that re-running this cell does not reuse a previously built one.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
import tensorflow as tf
tf.compat.v1.reset_default_graph()

# Define the model (the class chosen in the MODEL CHOICE cell), the pairwise
# sampler and the trainer. The hyper-parameters must match the ones used when
# the checkpoint was written -- TODO confirm against the training notebook.
model = MODEL_CLASS(batch_size=batch_size, max_user=train_dataset.max_user(), max_item=train_dataset.max_item(), dim_embed=50, l2_reg=0.001, opt='Adam', sess_config=None)
sampler = PairwiseSampler(batch_size=batch_size, dataset=train_dataset, num_process=4)
model_trainer = ImplicitModelTrainer(batch_size=batch_size, test_batch_size=test_batch_size, train_dataset=train_dataset, model=model, sampler=sampler, eval_save_prefix=OUTPUT_PATH + DATASET_NAME, item_serving_size=500)
auc_evaluator = AUC()

# Load model: restore the saved parameters from OUTPUT_PATH (no training happens here)
model.load(OUTPUT_PATH)

# Set parameters. The trainer is only used as an evaluation harness, so its
# private attributes are configured by hand instead of calling its train loop.
# Only AUC is computed by the evaluation manager.
model_trainer._eval_manager = ImplicitEvalManager(evaluators=[auc_evaluator])
# Presumably the number of negatives sampled per user -- confirm in openrec.
model_trainer._num_negatives = 200
# NOTE(review): only the training set and the *biased* test sets are passed
# here; the unbiased test sets are not -- confirm this is intended.
model_trainer._exclude_positives([train_dataset, test_dataset_pos_biased, test_dataset_neg_biased])
# Fixed seed so the sampled negatives are the same on every run
model_trainer._sample_negatives(seed=10)


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
keep_dims is deprecated, use keepdims instead

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where





2024-06-10 19:58:55.226790: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
2024-06-10 19:58:55.275329: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2419200000 Hz
2024-06-10 19:58:55.280478: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x5654e0851660 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2024-06-10 19:58:55.280536: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version



INFO:tensorflow:Restoring parameters from ./generated_data/cml-yahoo/
[Subsampling negative items]


                                                     

## Biased Evaluation

In [7]:
# Score the positive items of the biased test set. The save prefix is switched
# first; the metric functions below read "<prefix>_evaluate_partial.pickle",
# presumably written by _evaluate_partial -- confirm in openrec.
model_trainer._eval_save_prefix = OUTPUT_PREFIX + "-test-pos-biased"
model_trainer._evaluate_partial(test_dataset_pos_biased)

# Same for the negative items of the biased test set. This call is the last
# expression of the cell, so its return value is what the cell displays.
model_trainer._eval_save_prefix = OUTPUT_PREFIX +  "-test-neg-biased"
model_trainer._evaluate_partial(test_dataset_neg_biased)

100%|██████████| 2070/2070 [00:01<00:00, 1053.50it/s]
100%|██████████| 2296/2296 [00:44<00:00, 52.12it/s]


{'AUC': [0.49967796610169485,
  0.5082203389830507,
  0.5008221476510067,
  0.5002181208053691,
  0.49372053872053867,
  0.4746610169491526,
  0.518506711409396,
  0.5479560810810811,
  0.49776845637583883,
  0.5143265993265993,
  0.5179152542372881,
  0.4952364864864865,
  0.48668341708542723,
  0.4925333333333333,
  0.5158159722222222,
  0.4890939597315436,
  0.5106521739130435,
  0.4566836734693877,
  0.5097315436241611,
  0.5299494949494948,
  0.5154180602006688,
  0.5342123287671233,
  0.506989966555184,
  0.4591610738255033,
  0.4782608695652174,
  0.47272887323943663,
  0.48060137457044677,
  0.4904013377926421,
  0.4699829931972789,
  0.5305050505050505,
  0.4823154362416108,
  0.48531986531986526,
  0.48807291666666663,
  0.5047826086956522,
  0.46753355704697985,
  0.49755033557046985,
  0.5465384615384616,
  0.4745238095238095,
  0.49320469798657707,
  0.4888006756756757,
  0.4819932432432432,
  0.43794827586206897,
  0.5505972696245733,
  0.49168350168350167,
  0.4831543624

## Unbiased Evaluation

In [8]:
# Score the positive items of the unbiased test set. The save prefix is switched
# first; the metric functions below read "<prefix>_evaluate_partial.pickle",
# presumably written by _evaluate_partial -- confirm in openrec.
model_trainer._eval_save_prefix = OUTPUT_PREFIX + "-test-pos-unbiased"
model_trainer._evaluate_partial(test_dataset_pos_unbiased)

# Same for the negative items of the unbiased test set. This call is the last
# expression of the cell, so its return value is what the cell displays.
model_trainer._eval_save_prefix = OUTPUT_PREFIX +  "-test-neg-unbiased"
model_trainer._evaluate_partial(test_dataset_neg_unbiased)

100%|██████████| 2296/2296 [00:01<00:00, 1305.78it/s]
100%|██████████| 2296/2296 [00:03<00:00, 754.67it/s]


{'AUC': [0.5816666666666667,
  0.525,
  0.46749999999999997,
  0.5138888888888888,
  0.43555555555555553,
  0.5342857142857143,
  0.37166666666666676,
  0.5528571428571428,
  0.41874999999999996,
  0.4511111111111111,
  0.3425,
  0.47285714285714286,
  0.5835714285714287,
  0.403125,
  0.54,
  0.5227777777777778,
  0.47611111111111115,
  0.516875,
  0.49357142857142866,
  0.5322222222222223,
  0.51625,
  0.5199999999999999,
  0.3577777777777778,
  0.489375,
  0.6537499999999999,
  0.294375,
  0.59,
  0.583125,
  0.411875,
  0.6044444444444443,
  0.2621428571428571,
  0.48888888888888893,
  0.45999999999999996,
  0.4927777777777778,
  0.45500000000000007,
  0.6407142857142858,
  0.678125,
  0.4683333333333333,
  0.5394444444444445,
  0.3585714285714286,
  0.54375,
  0.41125,
  0.661875,
  0.655,
  0.41111111111111115,
  0.491875,
  0.6221428571428572,
  0.3499999999999999,
  0.5194444444444444,
  0.47888888888888886,
  0.5294444444444445,
  0.485,
  0.37999999999999995,
  0.440555555555

# **DEFINING FUNCTIONS**

In [9]:
def eq(infilename, infilename_neg, trainfilename, gamma=-1.0, K=1):
    """Popularity-debiased AUC and Recall@K computed from two evaluation pickles.

    Every positive test item is weighted by ``1 / p`` with
    ``p = N_i ** ((gamma + 1) / 2)``, where ``N_i`` is the number of rows of the
    training array whose ``item_id`` equals that item. Positive items that never
    occur in the training array are ignored. Per-user scores are self-normalised
    (divided by the user's sum of weights) and averaged over the users that have
    at least one positive item seen in training.

    Args:
        infilename: pickle holding the scores of the positive test items. Must
            provide the keys "users", "user_items", "results", "num_negatives".
        infilename_neg: pickle holding the scores of the negative test items
            (keys "user_items" and "results").
        trainfilename: ``.npy`` structured array with an ``item_id`` field.
        gamma: popularity exponent; the default -1.0 gives ``p == 1`` for every
            item, i.e. no re-weighting.
        K: a positive item counts as a hit when fewer than ``K`` candidates
            score strictly higher than it.

    Returns:
        dict with "auc" and "recall" (floats) and the placeholder strings stored
        under "bias" and "concentration".

    Raises:
        ZeroDivisionError: no user has a positive item that occurs in training.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only pass files
    # written by this pipeline.
    with open(infilename, 'rb') as infile:
        P = pickle.load(infile)
    with open(infilename_neg, 'rb') as infile_neg:
        P_neg = pickle.load(infile_neg)
    NUM_NEGATIVES = P["num_negatives"]

    # Rebuild each user's candidate list as [negative test items] + [positive
    # test items]. The first NUM_NEGATIVES entries of both files are dropped
    # (presumably the sampled negatives -- confirm against the evaluator).
    pos_start = dict()  # user -> (offset of first positive item, offset of first positive score)
    for theuser in P["users"]:
        neg_items = list(P_neg["user_items"][theuser][NUM_NEGATIVES:])
        neg_scores = list(P_neg["results"][theuser][NUM_NEGATIVES:])
        P["user_items"][theuser] = neg_items + list(P["user_items"][theuser][NUM_NEGATIVES:])
        P["results"][theuser] = neg_scores + list(P["results"][theuser][NUM_NEGATIVES:])
        pos_start[theuser] = (len(neg_items), len(neg_scores))

    # Frequency of every item in the training set
    trainset = np.load(trainfilename)
    item_ids, item_counts = np.unique(trainset['item_id'], return_counts=True)
    Ni = dict(zip(item_ids.tolist(), item_counts.tolist()))
    del trainset

    # Zui[(user, item)] = number of candidates scored strictly above the positive item
    Zui = dict()
    for theuser in P["users"]:
        all_scores = np.array(P["results"][theuser])
        item_offset, score_offset = pos_start[theuser]
        pos_items = P["user_items"][theuser][item_offset:]
        pos_scores = P["results"][theuser][score_offset:]
        for pos_item, pos_score in zip(pos_items, pos_scores):
            Zui[(theuser, pos_item)] = float(np.sum(all_scores > pos_score))

    # Per-user self-normalised scores
    exponent = (gamma + 1) / 2.0
    nonzero_user_count = 0
    sum_user_auc = 0.0
    sum_user_recall = 0.0
    for theuser in P["users"]:
        candidates = P["user_items"][theuser]
        n_candidates = len(candidates)
        numerator_auc = 0.0
        numerator_recall = 0.0
        denominator = 0.0
        has_trained_item = False

        for theitem in candidates[pos_start[theuser][0]:]:
            if theitem not in Ni:
                continue
            has_trained_item = True
            pui = np.power(Ni[theitem], exponent)
            rank = Zui[(theuser, theitem)]

            numerator_auc += (1 - rank / n_candidates) / pui
            if rank < K:
                numerator_recall += 1.0 / pui
            denominator += 1 / pui

        # A user counts as soon as one of its positive items was seen in training
        if has_trained_item:
            nonzero_user_count += 1
        if denominator > 0:
            sum_user_auc += numerator_auc / denominator
            sum_user_recall += numerator_recall / denominator

    # Return
    return {
        "auc"       : sum_user_auc / nonzero_user_count,
        "recall"    : sum_user_recall / nonzero_user_count,
        "bias"      : "Nope",
        "concentration" : "Nope"
    }

In [10]:
def aoa(infilename, infilename_neg, trainfilename, K=1):
    """Unweighted AUC and Recall@K (the "AOA" baseline) from two evaluation pickles.

    Every positive test item that occurs in the training array contributes with
    weight 1; positive items never seen in training are ignored. Per-user scores
    are averaged over the user's counted items, then over the users that have at
    least one such item.

    Args:
        infilename: pickle holding the scores of the positive test items. Must
            provide the keys "users", "user_items", "results", "num_negatives".
        infilename_neg: pickle holding the scores of the negative test items
            (keys "user_items" and "results").
        trainfilename: ``.npy`` structured array with an ``item_id`` field.
        K: a positive item counts as a hit when fewer than ``K`` candidates
            score strictly higher than it.

    Returns:
        dict with "auc" and "recall" (floats) and the placeholder strings stored
        under "bias" and "concentration".

    Raises:
        ZeroDivisionError: no user has a positive item that occurs in training.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only pass files
    # written by this pipeline.
    with open(infilename, 'rb') as infile:
        P = pickle.load(infile)
    with open(infilename_neg, 'rb') as infile_neg:
        P_neg = pickle.load(infile_neg)
    NUM_NEGATIVES = P["num_negatives"]

    # Rebuild each user's candidate list as [negative test items] + [positive
    # test items]. The first NUM_NEGATIVES entries of both files are dropped
    # (presumably the sampled negatives -- confirm against the evaluator).
    pos_start = dict()  # user -> (offset of first positive item, offset of first positive score)
    for theuser in P["users"]:
        neg_items = list(P_neg["user_items"][theuser][NUM_NEGATIVES:])
        neg_scores = list(P_neg["results"][theuser][NUM_NEGATIVES:])
        P["user_items"][theuser] = neg_items + list(P["user_items"][theuser][NUM_NEGATIVES:])
        P["results"][theuser] = neg_scores + list(P["results"][theuser][NUM_NEGATIVES:])
        pos_start[theuser] = (len(neg_items), len(neg_scores))

    # Frequency of every item in the training set (only membership is used here)
    trainset = np.load(trainfilename)
    item_ids, item_counts = np.unique(trainset['item_id'], return_counts=True)
    Ni = dict(zip(item_ids.tolist(), item_counts.tolist()))
    del trainset

    # Zui[(user, item)] = number of candidates scored strictly above the positive item
    Zui = dict()
    for theuser in P["users"]:
        all_scores = np.array(P["results"][theuser])
        item_offset, score_offset = pos_start[theuser]
        pos_items = P["user_items"][theuser][item_offset:]
        pos_scores = P["results"][theuser][score_offset:]
        for pos_item, pos_score in zip(pos_items, pos_scores):
            Zui[(theuser, pos_item)] = float(np.sum(all_scores > pos_score))

    # Per-user scores
    nonzero_user_count = 0
    sum_user_auc = 0.0
    sum_user_recall = 0.0
    for theuser in P["users"]:
        candidates = P["user_items"][theuser]
        n_candidates = len(candidates)
        numerator_auc = 0.0
        numerator_recall = 0.0
        denominator = 0.0

        for theitem in candidates[pos_start[theuser][0]:]:
            if theitem not in Ni:
                continue
            rank = Zui[(theuser, theitem)]
            numerator_auc += (1 - rank / n_candidates)
            # Recall@K hit (original note: recall at 30, see note 6 of the paper)
            if rank < K:
                numerator_recall += 1.0
            denominator += 1

        if denominator > 0:
            nonzero_user_count += 1
            sum_user_auc += numerator_auc / denominator
            sum_user_recall += numerator_recall / denominator

    # Return
    return {
        "auc"       : sum_user_auc / nonzero_user_count,
        "recall"    : sum_user_recall / nonzero_user_count,
        "bias"      : "To be compute",
        "concentration" : "Nope"
    }

In [11]:
def stratified(infilename, infilename_neg, trainfilename, gamma=1.0, K=30, partition=10, delta=0.1):
    """Stratified AUC / Recall@K with strata of equal width in propensity.

    The propensity of an item is ``p = N_i ** ((gamma + 1) / 2)`` (``N_i`` =
    number of training rows with that ``item_id``). The range [min p, max p] of
    the evaluated items is cut into ``partition`` equal-width strata, and every
    item receives the weight ``w`` = mean of ``1 / p`` over its stratum.

    Args:
        infilename: pickle holding the scores of the positive test items. Must
            provide the keys "users", "user_items", "results", "num_negatives".
        infilename_neg: pickle holding the scores of the negative test items
            (keys "user_items" and "results").
        trainfilename: ``.npy`` structured array with ``user_id`` and
            ``item_id`` fields.
        gamma: popularity exponent of the propensity.
        K: a positive item counts as a hit when fewer than ``K`` candidates
            score strictly higher than it.
        partition: number of strata; must be >= 1.
        delta: value used in the ``log(2 / delta)`` factor of the concentration term.

    Returns:
        dict with
        "auc", "recall": per-user weighted scores averaged over scored users;
        "bias": ``len(users) * sum_i |p_i * w_i - 1|`` over the evaluated items;
        "concentration": sum over training users of
        ``sqrt(2 * log(2/delta) * sum w_i**2) + 7 * max(w) * log(2/delta)``, the
        inner sum running over the user's training rows whose item has a weight.

    Raises:
        IndexError: no positive test item occurs in the training set.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only pass files
    # written by this pipeline.
    with open(infilename, 'rb') as infile:
        P = pickle.load(infile)
    with open(infilename_neg, 'rb') as infile_neg:
        P_neg = pickle.load(infile_neg)
    NUM_NEGATIVES = P["num_negatives"]

    # Rebuild each user's candidate list as [negative test items] + [positive
    # test items]. The first NUM_NEGATIVES entries of both files are dropped
    # (presumably the sampled negatives -- confirm against the evaluator).
    pos_start = dict()  # user -> (offset of first positive item, offset of first positive score)
    for theuser in P["users"]:
        neg_items = list(P_neg["user_items"][theuser][NUM_NEGATIVES:])
        neg_scores = list(P_neg["results"][theuser][NUM_NEGATIVES:])
        P["user_items"][theuser] = neg_items + list(P["user_items"][theuser][NUM_NEGATIVES:])
        P["results"][theuser] = neg_scores + list(P["results"][theuser][NUM_NEGATIVES:])
        pos_start[theuser] = (len(neg_items), len(neg_scores))

    # Frequency of every item in the training set. trainset is kept alive
    # because the concentration term below iterates over it again.
    trainset = np.load(trainfilename)
    item_ids, item_counts = np.unique(trainset['item_id'], return_counts=True)
    Ni = dict(zip(item_ids.tolist(), item_counts.tolist()))

    # Zui[(user, item)] = number of candidates scored strictly above the positive item
    Zui = dict()
    for theuser in P["users"]:
        all_scores = np.array(P["results"][theuser])
        item_offset, score_offset = pos_start[theuser]
        pos_items = P["user_items"][theuser][item_offset:]
        pos_scores = P["results"][theuser][score_offset:]
        for pos_item, pos_score in zip(pos_items, pos_scores):
            Zui[(theuser, pos_item)] = float(np.sum(all_scores > pos_score))

    # Propensity of every evaluated item that was seen in training
    exponent = (gamma + 1) / 2.0
    pui = dict()
    for theuser in P["users"]:
        for theitem in P["user_items"][theuser][pos_start[theuser][0]:]:
            if theitem in Ni and theitem not in pui:
                pui[theitem] = np.power(Ni[theitem], exponent)

    # Items ordered from the largest to the smallest propensity
    items_sorted_by_value = sorted(pui, key=pui.get, reverse=True)
    n_items = len(items_sorted_by_value)

    # Descending stratum boundaries: partition + 1 equally spaced points from
    # the largest to the smallest propensity
    edges = np.linspace(pui[items_sorted_by_value[0]], pui[items_sorted_by_value[-1]], partition + 1)

    # w[item] = mean of 1 / p over the stratum the item falls into. Empty strata
    # are skipped instead of writing a temporary zero weight.
    w = dict()
    cursor = 0
    for lower_edge in edges[1:]:
        start = cursor
        inv_sum = 0
        while cursor < n_items and pui[items_sorted_by_value[cursor]] >= lower_edge:
            inv_sum += 1.0 / pui[items_sorted_by_value[cursor]]
            cursor += 1
        if cursor > start:
            stratum_weight = inv_sum / (cursor - start)
            for k in range(start, cursor):
                w[items_sorted_by_value[k]] = stratum_weight

    # Bias term: sum of |p * w - 1| over the items, multiplied by the number of users
    bias = 0.0
    for k in pui:
        bias += abs(pui[k] * w[k] - 1)
    bias *= len(P["users"])

    # Concentration term: per training user, sum of squared weights of the
    # user's training rows, plugged into the formula and summed over users
    log_term = math.log(2 / delta)
    max_w = max(w.values())
    squared_weight_sums = {}
    for user, item in zip(trainset['user_id'].tolist(), trainset['item_id'].tolist()):
        if item in w:
            squared_weight_sums[user] = squared_weight_sums.get(user, 0) + w[item] ** 2
    concentration = sum(
        math.sqrt(sq_sum * 2 * log_term) + max_w * 7 * log_term
        for sq_sum in squared_weight_sums.values()
    )

    # Per-user scores
    nonzero_user_count = 0
    sum_user_auc = 0.0
    sum_user_recall = 0.0
    for theuser in P["users"]:
        candidates = P["user_items"][theuser]
        n_candidates = len(candidates)
        numerator_auc = 0.0
        numerator_recall = 0.0
        denominator = 0.0
        for theitem in candidates[pos_start[theuser][0]:]:
            # Skip items with null frequency
            if theitem not in Ni:
                continue
            rank = Zui[(theuser, theitem)]
            numerator_auc += (1 - rank / n_candidates) * w[theitem]
            # TODO(review): the original author marked this line ("spetta"),
            # presumably as a reminder to revisit it.
            if rank < K:
                numerator_recall += 1.0 * w[theitem]
            # NOTE(review): the numerators use the stratum weight w while the
            # normaliser uses 1 / p -- confirm this asymmetry is intended.
            denominator += 1 / pui[theitem]

        # Count the user only if at least one of its items was scored
        if denominator > 0:
            nonzero_user_count += 1
            sum_user_auc += numerator_auc / denominator
            sum_user_recall += numerator_recall / denominator

    # Return
    return {
        "auc"       : sum_user_auc / nonzero_user_count,
        "recall"    : sum_user_recall / nonzero_user_count,
        "bias"      : bias,
        "concentration" : concentration
    }

In [12]:
def stratified_logspace(infilename, infilename_neg, trainfilename, gamma=1.0, K=30, partition=10, delta=0.1):
    """Stratified AUC / Recall@K with strata of equal width in log-propensity.

    Same estimator as ``stratified`` except that the stratum boundaries are
    geometrically spaced between the largest and the smallest propensity
    ``p = N_i ** ((gamma + 1) / 2)`` of the evaluated items. Every item receives
    the weight ``w`` = mean of ``1 / p`` over its stratum.

    Fixes over the previous version, which could not run to completion:
    * the boundaries were built with ``np.logspace(p_max, p_min)``, which treats
      the propensities as exponents (``10 ** p``); ``np.geomspace`` is used now;
    * the training array was deleted before the concentration term read it;
    * the bias / concentration code was nested inside the binning loop and ran
      before all weights existed.

    Args:
        infilename: pickle holding the scores of the positive test items. Must
            provide the keys "users", "user_items", "results", "num_negatives".
        infilename_neg: pickle holding the scores of the negative test items
            (keys "user_items" and "results").
        trainfilename: ``.npy`` structured array with ``user_id`` and
            ``item_id`` fields.
        gamma: popularity exponent of the propensity.
        K: a positive item counts as a hit when fewer than ``K`` candidates
            score strictly higher than it.
        partition: number of strata; must be >= 1.
        delta: value used in the ``log(2 / delta)`` factor of the concentration term.

    Returns:
        dict with
        "auc", "recall": per-user weighted scores averaged over scored users;
        "bias": ``len(users) * sum_i |p_i * w_i - 1|`` over the evaluated items;
        "concentration": sum over training users of
        ``sqrt(2 * log(2/delta) * sum w_i**2) + 7 * max(w) * log(2/delta)``, the
        inner sum running over the user's training rows whose item has a weight.

    Raises:
        IndexError: no positive test item occurs in the training set.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only pass files
    # written by this pipeline.
    with open(infilename, 'rb') as infile:
        P = pickle.load(infile)
    with open(infilename_neg, 'rb') as infile_neg:
        P_neg = pickle.load(infile_neg)
    NUM_NEGATIVES = P["num_negatives"]

    # Rebuild each user's candidate list as [negative test items] + [positive
    # test items]. The first NUM_NEGATIVES entries of both files are dropped
    # (presumably the sampled negatives -- confirm against the evaluator).
    pos_start = dict()  # user -> (offset of first positive item, offset of first positive score)
    for theuser in P["users"]:
        neg_items = list(P_neg["user_items"][theuser][NUM_NEGATIVES:])
        neg_scores = list(P_neg["results"][theuser][NUM_NEGATIVES:])
        P["user_items"][theuser] = neg_items + list(P["user_items"][theuser][NUM_NEGATIVES:])
        P["results"][theuser] = neg_scores + list(P["results"][theuser][NUM_NEGATIVES:])
        pos_start[theuser] = (len(neg_items), len(neg_scores))

    # Frequency of every item in the training set. trainset must stay alive:
    # the concentration term below iterates over it again.
    trainset = np.load(trainfilename)
    item_ids, item_counts = np.unique(trainset['item_id'], return_counts=True)
    Ni = dict(zip(item_ids.tolist(), item_counts.tolist()))

    # Zui[(user, item)] = number of candidates scored strictly above the positive item
    Zui = dict()
    for theuser in P["users"]:
        all_scores = np.array(P["results"][theuser])
        item_offset, score_offset = pos_start[theuser]
        pos_items = P["user_items"][theuser][item_offset:]
        pos_scores = P["results"][theuser][score_offset:]
        for pos_item, pos_score in zip(pos_items, pos_scores):
            Zui[(theuser, pos_item)] = float(np.sum(all_scores > pos_score))

    # Propensity of every evaluated item that was seen in training
    exponent = (gamma + 1) / 2.0
    pui = dict()
    for theuser in P["users"]:
        for theitem in P["user_items"][theuser][pos_start[theuser][0]:]:
            if theitem in Ni and theitem not in pui:
                pui[theitem] = np.power(Ni[theitem], exponent)

    # Items ordered from the largest to the smallest propensity
    items_sorted_by_value = sorted(pui, key=pui.get, reverse=True)
    n_items = len(items_sorted_by_value)
    p_max = pui[items_sorted_by_value[0]]
    p_min = pui[items_sorted_by_value[-1]]

    # Descending, geometrically spaced stratum boundaries. Propensities are
    # powers of counts >= 1, hence strictly positive as geomspace requires.
    edges = np.geomspace(p_max, p_min, partition + 1)
    # Pin the end points so rounding can never leave the extreme items outside
    # every stratum.
    edges[0] = p_max
    edges[-1] = p_min

    # w[item] = mean of 1 / p over the stratum the item falls into.
    # Open question kept from the original notes: is the plain average still
    # the best choice when the strata are split in log space?
    w = dict()
    cursor = 0
    for lower_edge in edges[1:]:
        start = cursor
        inv_sum = 0
        while cursor < n_items and pui[items_sorted_by_value[cursor]] >= lower_edge:
            inv_sum += 1.0 / pui[items_sorted_by_value[cursor]]
            cursor += 1
        if cursor > start:
            stratum_weight = inv_sum / (cursor - start)
            for k in range(start, cursor):
                w[items_sorted_by_value[k]] = stratum_weight

    # Bias term: sum of |p * w - 1| over the items, multiplied by the number of users
    bias = 0.0
    for k in pui:
        bias += abs(pui[k] * w[k] - 1)
    bias *= len(P["users"])

    # Concentration term: per training user, sum of squared weights of the
    # user's training rows, plugged into the formula and summed over users
    log_term = math.log(2 / delta)
    max_w = max(w.values())
    squared_weight_sums = {}
    for user, item in zip(trainset['user_id'].tolist(), trainset['item_id'].tolist()):
        if item in w:
            squared_weight_sums[user] = squared_weight_sums.get(user, 0) + w[item] ** 2
    concentration = sum(
        math.sqrt(sq_sum * 2 * log_term) + max_w * 7 * log_term
        for sq_sum in squared_weight_sums.values()
    )

    # Per-user scores
    nonzero_user_count = 0
    sum_user_auc = 0.0
    sum_user_recall = 0.0
    for theuser in P["users"]:
        candidates = P["user_items"][theuser]
        n_candidates = len(candidates)
        numerator_auc = 0.0
        numerator_recall = 0.0
        denominator = 0.0
        for theitem in candidates[pos_start[theuser][0]:]:
            # Skip items with null frequency
            if theitem not in Ni:
                continue
            rank = Zui[(theuser, theitem)]
            numerator_auc += (1 - rank / n_candidates) * w[theitem]
            # TODO(review): the original author marked this line ("spetta"),
            # presumably as a reminder to revisit it.
            if rank < K:
                numerator_recall += 1.0 * w[theitem]
            # NOTE(review): the numerators use the stratum weight w while the
            # normaliser uses 1 / p -- confirm this asymmetry is intended.
            denominator += 1 / pui[theitem]

        # Count the user only if at least one of its items was scored
        if denominator > 0:
            nonzero_user_count += 1
            sum_user_auc += numerator_auc / denominator
            sum_user_recall += numerator_recall / denominator

    # Return
    return {
        "auc"       : sum_user_auc / nonzero_user_count,
        "recall"    : sum_user_recall / nonzero_user_count,
        "bias"      : bias,
        "concentration" : concentration
    }

In [13]:
# This version stratifies by rank: the boundaries are a linspace over the number
# of items used for evaluation, not over the propensity values
def stratified_2(infilename, infilename_neg, trainfilename, gamma=1.0, K=30, partition=10, delta=0.1):
    """Stratified AUC / Recall@K with strata holding (roughly) equal item counts.

    The evaluated items are sorted by decreasing propensity
    ``p = N_i ** ((gamma + 1) / 2)`` and the positions ``0 .. n_items`` are cut
    into ``partition`` equal-width ranges; every item receives the weight
    ``w`` = mean of ``1 / p`` over the items of its range.

    Args:
        infilename: pickle holding the scores of the positive test items. Must
            provide the keys "users", "user_items", "results", "num_negatives".
        infilename_neg: pickle holding the scores of the negative test items
            (keys "user_items" and "results").
        trainfilename: ``.npy`` structured array with ``user_id`` and
            ``item_id`` fields.
        gamma: popularity exponent of the propensity.
        K: a positive item counts as a hit when fewer than ``K`` candidates
            score strictly higher than it.
        partition: number of strata; must be >= 1.
        delta: value used in the ``log(2 / delta)`` factor of the concentration term.

    Returns:
        dict with
        "auc", "recall": per-user weighted scores averaged over scored users;
        "bias": ``len(users) * sum_i |p_i * w_i - 1|`` over the evaluated items;
        "concentration": sum over training users of
        ``sqrt(2 * log(2/delta) * sum w_i**2) + 7 * max(w) * log(2/delta)``, the
        inner sum running over the user's training rows whose item has a weight.

    Raises:
        ValueError: no positive test item occurs in the training set (raised by
            ``max`` on the empty weight table).
    """
    # NOTE(security): pickle.load can execute arbitrary code; only pass files
    # written by this pipeline.
    with open(infilename, 'rb') as infile:
        P = pickle.load(infile)
    with open(infilename_neg, 'rb') as infile_neg:
        P_neg = pickle.load(infile_neg)
    NUM_NEGATIVES = P["num_negatives"]

    # Rebuild each user's candidate list as [negative test items] + [positive
    # test items]. The first NUM_NEGATIVES entries of both files are dropped
    # (presumably the sampled negatives -- confirm against the evaluator).
    pos_start = dict()  # user -> (offset of first positive item, offset of first positive score)
    for theuser in P["users"]:
        neg_items = list(P_neg["user_items"][theuser][NUM_NEGATIVES:])
        neg_scores = list(P_neg["results"][theuser][NUM_NEGATIVES:])
        P["user_items"][theuser] = neg_items + list(P["user_items"][theuser][NUM_NEGATIVES:])
        P["results"][theuser] = neg_scores + list(P["results"][theuser][NUM_NEGATIVES:])
        pos_start[theuser] = (len(neg_items), len(neg_scores))

    # Frequency of every item in the training set. trainset is kept alive
    # because the concentration term below iterates over it again.
    trainset = np.load(trainfilename)
    item_ids, item_counts = np.unique(trainset['item_id'], return_counts=True)
    Ni = dict(zip(item_ids.tolist(), item_counts.tolist()))

    # Zui[(user, item)] = number of candidates scored strictly above the positive item
    Zui = dict()
    for theuser in P["users"]:
        all_scores = np.array(P["results"][theuser])
        item_offset, score_offset = pos_start[theuser]
        pos_items = P["user_items"][theuser][item_offset:]
        pos_scores = P["results"][theuser][score_offset:]
        for pos_item, pos_score in zip(pos_items, pos_scores):
            Zui[(theuser, pos_item)] = float(np.sum(all_scores > pos_score))

    # Propensity of every evaluated item that was seen in training
    exponent = (gamma + 1) / 2.0
    pui = dict()
    for theuser in P["users"]:
        for theitem in P["user_items"][theuser][pos_start[theuser][0]:]:
            if theitem in Ni and theitem not in pui:
                pui[theitem] = np.power(Ni[theitem], exponent)

    # Items ordered from the largest to the smallest propensity
    items_sorted_by_value = sorted(pui, key=pui.get, reverse=True)
    n_items = len(items_sorted_by_value)

    # Rank boundaries: partition + 1 equally spaced points from 0 to n_items
    edges = np.linspace(0, n_items, partition + 1)

    # w[item] = mean of 1 / p over the rank range the item falls into. Empty
    # ranges (more strata than items) are skipped instead of writing a
    # temporary zero weight.
    w = dict()
    cursor = 0
    for upper_rank in edges[1:]:
        start = cursor
        inv_sum = 0
        while cursor < n_items and cursor < upper_rank:
            inv_sum += 1.0 / pui[items_sorted_by_value[cursor]]
            cursor += 1
        if cursor > start:
            stratum_weight = inv_sum / (cursor - start)
            for k in range(start, cursor):
                w[items_sorted_by_value[k]] = stratum_weight

    # Bias term: sum of |p * w - 1| over the items, multiplied by the number of users
    bias = 0.0
    for k in pui:
        bias += abs(pui[k] * w[k] - 1)
    bias *= len(P["users"])

    # Concentration term: per training user, sum of squared weights of the
    # user's training rows, plugged into the formula and summed over users
    log_term = math.log(2 / delta)
    max_w = max(w.values())
    squared_weight_sums = {}
    for user, item in zip(trainset['user_id'].tolist(), trainset['item_id'].tolist()):
        if item in w:
            squared_weight_sums[user] = squared_weight_sums.get(user, 0) + w[item] ** 2
    concentration = sum(
        math.sqrt(sq_sum * 2 * log_term) + max_w * 7 * log_term
        for sq_sum in squared_weight_sums.values()
    )

    # Per-user scores
    nonzero_user_count = 0
    sum_user_auc = 0.0
    sum_user_recall = 0.0
    for theuser in P["users"]:
        candidates = P["user_items"][theuser]
        n_candidates = len(candidates)
        numerator_auc = 0.0
        numerator_recall = 0.0
        denominator = 0.0
        for theitem in candidates[pos_start[theuser][0]:]:
            # Skip items with null frequency
            if theitem not in Ni:
                continue
            rank = Zui[(theuser, theitem)]
            numerator_auc += (1 - rank / n_candidates) * w[theitem]
            # TODO(review): the original author marked this line ("spetta"),
            # presumably as a reminder to revisit it.
            if rank < K:
                numerator_recall += 1.0 * w[theitem]
            # NOTE(review): the numerators use the stratum weight w while the
            # normaliser uses 1 / p -- confirm this asymmetry is intended.
            denominator += 1 / pui[theitem]

        # Count the user only if at least one of its items was scored
        if denominator > 0:
            nonzero_user_count += 1
            sum_user_auc += numerator_auc / denominator
            sum_user_recall += numerator_recall / denominator

    return {
        "auc"       : sum_user_auc / nonzero_user_count,
        "recall"    : sum_user_recall / nonzero_user_count,
        "bias"      : bias,
        "concentration" : concentration
    }

# **COMPUTE RESULTS**

Compute AOA and unbiased evaluator metrics with biased testset.

In [14]:
biased_results = dict()

# Positive pickle, negative pickle and training array shared by every estimator
# on the biased test set
_biased_files = (
    OUTPUT_PREFIX + "-test-pos-biased_evaluate_partial.pickle",
    OUTPUT_PREFIX + "-test-neg-biased_evaluate_partial.pickle",
    output_name + "training_arr.npy",
)

# Unweighted baseline first, then the debiased estimator for each gamma.
# (The stratified estimator is evaluated separately in the partition search.)
biased_results["AOA"] = aoa(*_biased_files, K=30)
for _label, _gamma in (("UB_15", 1.5), ("UB_2", 2), ("UB_25", 2.5), ("UB_3", 3)):
    biased_results[_label] = eq(*_biased_files, gamma=_gamma, K=30)

Compute AOA and unbiased evaluator metrics with unbiased testset.

In [15]:
unbiased_results = dict()

# Positive pickle, negative pickle and training array shared by every estimator
# on the unbiased test set
_unbiased_files = (
    OUTPUT_PREFIX + "-test-pos-unbiased_evaluate_partial.pickle",
    OUTPUT_PREFIX + "-test-neg-unbiased_evaluate_partial.pickle",
    output_name + "training_arr.npy",
)

# Unweighted baseline first, then the debiased estimator for each gamma.
# (The stratified estimator is evaluated separately in the partition search.)
unbiased_results["AOA"] = aoa(*_unbiased_files, K=1)
for _label, _gamma in (("UB_15", 1.5), ("UB_2", 2), ("UB_25", 2.5), ("UB_3", 3)):
    unbiased_results[_label] = eq(*_unbiased_files, gamma=_gamma, K=1)

Get partitions.

In [16]:
# Candidate stratum counts: every integer from 1 up to the number of items
num_items = raw_data['max_item']
nums = np.arange(1, num_items + 1)

# Draw n_p distinct candidates (uses the global NumPy RNG seeded in the Init cell)
n_p = 300
partitions = np.random.choice(nums, size=n_p, replace=False)

# Visualize
partitions

array([ 742,  977,  141,  707,  751,   24,  273,  959,  193,  200, 1001,
        206,  783,  662,  295,  384,  228,  975,  340,  122,  146,  740,
        818,  747,  346,  852,  849,  579,   20,  284,  197,  727,  910,
        332,  251,  599,  529,  463,  192,  873,  502,  309,  731,   11,
        366,  581,  108,  400,  294,  524,  486,  265,  530,  445,  883,
        476,  168,  672,   38,  666,  984,  848,    7,  874,  356,    2,
        490,  545,  955,  365,  189,  701,  993,   99,  823,  885,  128,
        832,  680,  804,  654,  329,  521,  746,  691,  318,   55,  936,
        715,  429,  768,   40,   51,  506,  458,   18,  244,  648,  409,
        548,  694,  567,  730,  411,  864,  172,  272,   15,  227,  427,
        606,  917,  809,  646,  515,  838,  651,  511,  266,  420,  861,
         62,   46,  290,  522,  372,   93,   19,  773,  793,  302,  886,
         26,  705,  781,  395,  456,  958,  785,  483,  334,  644,  536,
        937,  156,  577,  297,  704,  561,  617,  8

Compute the partition which minimizes the sum of bias and concentration, added up over the biased and unbiased test sets.

In [17]:
# Search the candidate partitions: for each one, run `stratified` on the
# unbiased and biased test sets, and keep the pair of result dicts whose total
# 'bias' + 'concentration' (summed over both test sets) is the smallest.

# Value of gamma to use for minimization
gamma = 1.5
key = "STRATIFIED_" + str(gamma).replace(".","")

# The input files are the same for every candidate, so build the paths once
unbiased_pos_path = OUTPUT_PREFIX+"-test-pos-unbiased_evaluate_partial.pickle"
unbiased_neg_path = OUTPUT_PREFIX+"-test-neg-unbiased_evaluate_partial.pickle"
biased_pos_path = OUTPUT_PREFIX+"-test-pos-biased_evaluate_partial.pickle"
biased_neg_path = OUTPUT_PREFIX+"-test-neg-biased_evaluate_partial.pickle"
training_path = output_name+"training_arr.npy"

# Initialize results
unbiased_results[key] = dict()
biased_results[key] = dict()
# NOTE(review): this random placeholder is overwritten by the first candidate
# whenever `partitions` is non-empty. The draw is kept so that the global NumPy
# random stream is consumed exactly as before.
best_partition = np.random.choice(nums, 1)[0]
# Objective value of the stored results; None until a candidate was evaluated
best_score = None

# For each partition
for p in tqdm(partitions):
    # Evaluate the candidate on the unbiased (K=1) and biased (K=30) test sets
    temp_unbiased = stratified(unbiased_pos_path, unbiased_neg_path, training_path, gamma=gamma, K=1, partition=p)
    temp_biased = stratified(biased_pos_path, biased_neg_path, training_path, gamma=gamma, K=30, partition=p)
    # Objective: bias + concentration over both test sets. It is computed once
    # per candidate and cached for the incumbent, so both sides of the
    # comparison are summed in the same order.
    temp_score = (temp_unbiased['bias'] + temp_unbiased['concentration']
                  + temp_biased['bias'] + temp_biased['concentration'])
    # The first candidate is always accepted; later ones only if strictly better
    if best_score is None or temp_score < best_score:
        print("Now I have partition ", p, "-> Bias is now: ", temp_unbiased['bias'], " and concentration is now: ", temp_unbiased['concentration'], " for unbiased and ", temp_biased['bias'], " and ", temp_biased['concentration'], " for biased.") 
        unbiased_results[key] = temp_unbiased
        biased_results[key] = temp_biased
        best_partition = p
        best_score = temp_score

  0%|          | 0/300 [00:00<?, ?it/s]

Now I have partition  742 -> Bias is now:  86769.13386926711  and concentration is now:  29597.936320479897  for unbiased and  61158.51359095633  and  28333.82152569018  for biased.
Now I have partition  977 -> Bias is now:  60431.176879551356  and concentration is now:  29597.1994832529  for unbiased and  37798.534062019135  and  28333.111189114257  for biased.


So, for the chosen value of gamma, the best partition is...

In [18]:
# Display the partition kept by the search loop (the last candidate that
# improved the objective)
best_partition

977

Compute the stratified metrics on both the biased and unbiased test sets, using the best partition found above.

In [19]:
# Run `stratified` with the selected partition for every gamma, on the unbiased
# test set (K=1) and on the biased test set (K=30). Results are stored under
# "STRATIFIED_<gamma without the dot>" in the matching results dict.
strat_unbiased_pos = OUTPUT_PREFIX + "-test-pos-unbiased_evaluate_partial.pickle"
strat_unbiased_neg = OUTPUT_PREFIX + "-test-neg-unbiased_evaluate_partial.pickle"
strat_biased_pos = OUTPUT_PREFIX + "-test-pos-biased_evaluate_partial.pickle"
strat_biased_neg = OUTPUT_PREFIX + "-test-neg-biased_evaluate_partial.pickle"
strat_training_file = output_name + "training_arr.npy"

for strat_suffix, strat_gamma in (("15", 1.5), ("2", 2), ("25", 2.5), ("3", 3)):
    strat_key = "STRATIFIED_" + strat_suffix
    # Unbiased first, then biased: same call order as one pair per gamma
    unbiased_results[strat_key] = stratified(strat_unbiased_pos, strat_unbiased_neg, strat_training_file, gamma=strat_gamma, K=1, partition=best_partition)
    biased_results[strat_key] = stratified(strat_biased_pos, strat_biased_neg, strat_training_file, gamma=strat_gamma, K=30, partition=best_partition)



This version (`stratified_2`) builds the partition from a linspace over the items instead of a linspace over the propensities.

In [20]:
# Run `stratified_2` with the selected partition for every gamma, on the
# unbiased test set (K=1) and on the biased test set (K=30). Results are stored
# under "STRATIFIED_v2_<gamma without the dot>" in the matching results dict.
v2_unbiased_pos = OUTPUT_PREFIX + "-test-pos-unbiased_evaluate_partial.pickle"
v2_unbiased_neg = OUTPUT_PREFIX + "-test-neg-unbiased_evaluate_partial.pickle"
v2_biased_pos = OUTPUT_PREFIX + "-test-pos-biased_evaluate_partial.pickle"
v2_biased_neg = OUTPUT_PREFIX + "-test-neg-biased_evaluate_partial.pickle"
v2_training_file = output_name + "training_arr.npy"

for v2_suffix, v2_gamma in (("15", 1.5), ("2", 2), ("25", 2.5), ("3", 3)):
    v2_key = "STRATIFIED_v2_" + v2_suffix
    # Unbiased first, then biased: same call order as one pair per gamma
    unbiased_results[v2_key] = stratified_2(v2_unbiased_pos, v2_unbiased_neg, v2_training_file, gamma=v2_gamma, K=1, partition=best_partition)
    biased_results[v2_key] = stratified_2(v2_biased_pos, v2_biased_neg, v2_training_file, gamma=v2_gamma, K=30, partition=best_partition)

Prepare table for results.

In [21]:
# Hard-coded table dimensions.
# NOTE(review): the original hints suggest rows ~ len(list(value.keys())) and
# columns ~ len(list(biased_results.items())) - confirm before deriving them.
rows, columns = 4, 13

# Zero-filled placeholder for the results table
results_array = np.zeros(shape=(rows, columns))

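Fill the table with the absolute error between the biased and unbiased results for each metric, followed by the raw bias and concentration values of both test sets.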

In [22]:
# Keeps the results table readable
def round_if_number(value):
    """Round `value` to 5 decimal places when it is an int or float.

    Anything else (for example a placeholder string) is returned unchanged.
    """
    return round(value, 5) if isinstance(value, (int, float)) else value

# Column order of the table: one column per entry of biased_results
list_biased_res = list(biased_results.keys())

# Metric names are read from the first entry; 'bias' and 'concentration' are
# reported as-is further down instead of as an absolute difference
metrics = list(biased_results[list_biased_res[0]].keys())
metrics_for_mae = [name for name in metrics if name not in ('bias', 'concentration')]
num_metrics_for_mae = len(metrics_for_mae)

# Object dtype because some cells hold strings rather than numbers.
# Layout: one row per metric, then 4 extra rows (2 bias + 2 concentration).
results_array = np.empty((num_metrics_for_mae + 4, len(biased_results)), dtype=object)

for i, key in enumerate(list_biased_res):
    biased_entry = biased_results[key]
    unbiased_entry = unbiased_results[key]

    # Absolute difference between the biased and unbiased value of each metric
    for j, metric_name in enumerate(metrics_for_mae):
        results_array[j, i] = round_if_number(abs(biased_entry[metric_name] - unbiased_entry[metric_name]))

    # Raw bias / concentration values, written cell by cell so that NumPy never
    # coerces mixed string/number values to a common type
    extra_values = (
        biased_entry['bias'],
        biased_entry['concentration'],
        unbiased_entry['bias'],
        unbiased_entry['concentration'],
    )
    for offset, raw_value in enumerate(extra_values):
        results_array[num_metrics_for_mae + offset, i] = round_if_number(raw_value)

# Row labels, in the same order as the rows of results_array
metric_values = metrics_for_mae + ['biased_bias', 'biased_concentration', 'unbiased_bias', 'unbiased_concentration']

# Wrap in a DataFrame and put the row labels in the first column
mae_df = pd.DataFrame(results_array, columns=list_biased_res)
mae_df.insert(0, "metric", metric_values)

# **RESULTS**

In [23]:
# Show up to the first 10 rows of the results table
mae_df.head(10)

Unnamed: 0,metric,AOA,UB_15,UB_2,UB_25,UB_3,STRATIFIED_15,STRATIFIED_2,STRATIFIED_25,STRATIFIED_3,STRATIFIED_v2_15,STRATIFIED_v2_2,STRATIFIED_v2_25,STRATIFIED_v2_3
0,auc,0.15324,0.1279,0.12466,0.12218,0.1203,0.12485,0.10516,0.04351,0.24531,0.1279,0.12466,0.12218,0.1203
1,recall,0.37667,0.25649,0.24488,0.2363,0.22989,0.25618,0.24952,0.2421,0.21967,0.25649,0.24488,0.2363,0.22989
2,biased_bias,To be compute,Nope,Nope,Nope,Nope,37798.53406,257576.20112,897026.38874,2914954.6877,0.0,0.0,0.0,0.0
3,biased_concentration,Nope,Nope,Nope,Nope,Nope,28333.11119,14507.48042,7027.49807,3828.31383,48114.64423,47931.59355,47825.96046,47762.24962
4,unbiased_bias,To be compute,Nope,Nope,Nope,Nope,60431.17688,325108.41124,1226209.15539,4108874.27145,0.0,0.0,0.0,0.0
5,unbiased_concentration,Nope,Nope,Nope,Nope,Nope,29597.19948,16750.75416,8395.61329,4734.34158,48125.01414,47945.8319,47842.37303,47779.84369
