# **IMPORT LIBS**

In [1]:
import numpy as np
from openrec.tf1.legacy import ImplicitModelTrainer
from openrec.tf1.legacy.utils.evaluators import ImplicitEvalManager
from openrec.tf1.legacy.utils import ImplicitDataset
from openrec.tf1.legacy.recommenders import CML, BPR, PMF
from openrec.tf1.legacy.utils.evaluators import AUC
from openrec.tf1.legacy.utils.samplers import PairwiseSampler
import numpy as np
import pandas as pd
import scipy.sparse as sps
import os
import pickle
import random


# **GENERATE THE DATASET**

In [2]:
seed = 2384795
np.random.seed(seed=seed)

output_name = f"./generated_data/"
folder_name = f"./original_files/"

if os.path.exists(output_name) == False:
    os.makedirs(output_name)

## Load the dataset

In [3]:
file_path = 'big_matrix.csv'
# Load the training set into a DataFrame
df_train = pd.read_csv(folder_name+file_path) 
df_train.head(10)

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio
0,0,3649,13838,10867,2020-07-05 00:08:23.438,20200705,1593879000.0,1.273397
1,0,9598,13665,10984,2020-07-05 00:13:41.297,20200705,1593879000.0,1.244082
2,0,5262,851,7908,2020-07-05 00:16:06.687,20200705,1593879000.0,0.107613
3,0,1963,862,9590,2020-07-05 00:20:26.792,20200705,1593880000.0,0.089885
4,0,8234,858,11000,2020-07-05 00:43:05.128,20200705,1593881000.0,0.078
5,0,8228,13484,8576,2020-07-05 01:00:25.5,20200705,1593882000.0,1.572295
6,0,6789,2327,13267,2020-07-05 03:28:02.32,20200705,1593891000.0,0.175398
7,0,6812,23731,10728,2020-07-05 22:22:11.813,20200705,1593959000.0,2.212062
8,0,183,796,6100,2020-07-06 00:14:06.245,20200706,1593966000.0,0.130492
9,0,169,13735,9767,2020-07-06 00:14:48.8,20200706,1593966000.0,1.406266


## Convert to implicit

"We treat items with a watch_ratio greater than or equal to 2 as relevant, and others as irrelevant, as suggested by KuaiRec."

In [4]:
POSITIVE_THRESHOLD = 2.0 # Suggested on dataset webpage
df_train['ImplicitRating'] = np.where(df_train['watch_ratio'] > POSITIVE_THRESHOLD, 1, 0)

df_train.head(10)

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio,ImplicitRating
0,0,3649,13838,10867,2020-07-05 00:08:23.438,20200705,1593879000.0,1.273397,0
1,0,9598,13665,10984,2020-07-05 00:13:41.297,20200705,1593879000.0,1.244082,0
2,0,5262,851,7908,2020-07-05 00:16:06.687,20200705,1593879000.0,0.107613,0
3,0,1963,862,9590,2020-07-05 00:20:26.792,20200705,1593880000.0,0.089885,0
4,0,8234,858,11000,2020-07-05 00:43:05.128,20200705,1593881000.0,0.078,0
5,0,8228,13484,8576,2020-07-05 01:00:25.5,20200705,1593882000.0,1.572295,0
6,0,6789,2327,13267,2020-07-05 03:28:02.32,20200705,1593891000.0,0.175398,0
7,0,6812,23731,10728,2020-07-05 22:22:11.813,20200705,1593959000.0,2.212062,1
8,0,183,796,6100,2020-07-06 00:14:06.245,20200706,1593966000.0,0.130492,0
9,0,169,13735,9767,2020-07-06 00:14:48.8,20200706,1593966000.0,1.406266,0


## Check the number of users and items in the training set

"The training set contains 12,530,806 ratings given by 7,176 users against 10,728 videos through natural interactions."

In [5]:
min_user = df_train["user_id"].min()
max_user = df_train["user_id"].max()

min_item = df_train["video_id"].min()
max_item = df_train["video_id"].max()

max_item, max_user

(10727, 7175)

# **GET UNBIASED TESTSET**

## Load the unbiased testset and convert it to implicit

In [6]:
file_path = folder_name + 'small_matrix.csv'
df_test = pd.read_csv(file_path)  # sep='\t' for tab-separated values
df_test['ImplicitRating'] = np.where(df_test['watch_ratio'] > POSITIVE_THRESHOLD, 1, 0)
df_test.head(10)

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio,ImplicitRating
0,14,148,4381,6067,2020-07-05 05:27:48.378,20200705.0,1593898000.0,0.722103,0
1,14,183,11635,6100,2020-07-05 05:28:00.057,20200705.0,1593898000.0,1.907377,0
2,14,3649,22422,10867,2020-07-05 05:29:09.479,20200705.0,1593898000.0,2.063311,1
3,14,5262,4479,7908,2020-07-05 05:30:43.285,20200705.0,1593898000.0,0.566388,0
4,14,8234,4602,11000,2020-07-05 05:35:43.459,20200705.0,1593899000.0,0.418364,0
5,14,6789,8607,13267,2020-07-05 05:36:00.773,20200705.0,1593899000.0,0.648753,0
6,14,1963,8613,9590,2020-07-05 05:36:47.741,20200705.0,1593899000.0,0.898123,0
7,14,175,11640,46514,2020-07-05 05:49:27.965,20200705.0,1593899000.0,0.250247,0
8,14,1973,4572,7400,2020-07-05 05:49:41.762,20200705.0,1593899000.0,0.617838,0
9,14,171,8518,5217,2020-07-05 05:57:26.581,20200705.0,1593900000.0,1.632739,0


## Check the number of users and items in the training set

"The testing set is collected by asking a subset of 1,411 users to rate 3,327 randomly selected songs."

In [7]:
df_test['user_id'].unique().shape[0] , df_test["video_id"].unique().shape[0], df_test.shape[0] / (df_test['user_id'].unique().shape[0] * df_test["video_id"].unique().shape[0])

(1411, 3327, 0.9962024941648523)

## Shape the unbiased test set

In [8]:
# From the dataframe, for each row where ImplicitRating is 1, append [userID, itemID] to unbiased_pos_test_set
# and for each row where ImplicitRating is 0, append [userID, itemID] to unbiased_neg_test_set

unbiased_pos_test_set = df_test[df_test["ImplicitRating"] == 1][["user_id", "video_id"]].values
unbiased_neg_test_set = df_test[df_test["ImplicitRating"] == 0][["user_id", "video_id"]].values

## Save unbiased test set

In [9]:
unbiased_pos_test_set_df = pd.DataFrame(unbiased_pos_test_set)
unbiased_neg_test_set_df = pd.DataFrame(unbiased_neg_test_set)

unbiased_pos_test_set_df.columns = ["user_id","item_id"]
unbiased_neg_test_set_df.columns = ["user_id","item_id"]

structured_data_pos_test_set_unbiased = unbiased_pos_test_set_df.to_records(index=False)
structured_data_neg_test_set_unbiased = unbiased_neg_test_set_df.to_records(index=False)

np.save(output_name + "unbiased-test_arr_pos.npy", structured_data_pos_test_set_unbiased)
np.save(output_name + "unbiased-test_arr_neg.npy", structured_data_neg_test_set_unbiased)

# **GET BIASED TESTSET**

In [10]:
df_train.head(10)

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio,ImplicitRating
0,0,3649,13838,10867,2020-07-05 00:08:23.438,20200705,1593879000.0,1.273397,0
1,0,9598,13665,10984,2020-07-05 00:13:41.297,20200705,1593879000.0,1.244082,0
2,0,5262,851,7908,2020-07-05 00:16:06.687,20200705,1593879000.0,0.107613,0
3,0,1963,862,9590,2020-07-05 00:20:26.792,20200705,1593880000.0,0.089885,0
4,0,8234,858,11000,2020-07-05 00:43:05.128,20200705,1593881000.0,0.078,0
5,0,8228,13484,8576,2020-07-05 01:00:25.5,20200705,1593882000.0,1.572295,0
6,0,6789,2327,13267,2020-07-05 03:28:02.32,20200705,1593891000.0,0.175398,0
7,0,6812,23731,10728,2020-07-05 22:22:11.813,20200705,1593959000.0,2.212062,1
8,0,183,796,6100,2020-07-06 00:14:06.245,20200706,1593966000.0,0.130492,0
9,0,169,13735,9767,2020-07-06 00:14:48.8,20200706,1593966000.0,1.406266,0


In [11]:
df_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12530806 entries, 0 to 12530805
Data columns (total 9 columns):
 #   Column          Dtype  
---  ------          -----  
 0   user_id         int64  
 1   video_id        int64  
 2   play_duration   int64  
 3   video_duration  int64  
 4   time            object 
 5   date            int64  
 6   timestamp       float64
 7   watch_ratio     float64
 8   ImplicitRating  int64  
dtypes: float64(2), int64(6), object(1)
memory usage: 860.4+ MB


## Extract the biased test set and shape it

"We additionally held out a biased testing set (biased-testing) from the training set by randomly sampling 30% songs for each user."

In [12]:
# Precompute, for each user, the list of songs with a relevant rating
user_positive_ratings = df_train[df_train["ImplicitRating"] == 1].groupby("user_id")["video_id"].apply(set)

# Initialize the range of indexes for the items
items_ids = np.arange(min_item, max_item + 1)
# Set the number of songs for each user

#Using 3576, that is the 30% of the items in the biased set, to achieve a similar ratio with the Yahoo's dataset
SONGS_FOR_BIASED_TEST = 3576 

pos_test_set = []
neg_test_set = []

for user_id in range(min_user, max_user + 1):
    np.random.shuffle(items_ids)
    test_items = set(items_ids[-SONGS_FOR_BIASED_TEST:])
    pos_ids = user_positive_ratings.get(user_id, set()) & test_items

    #set those to 0 so that they will no longer be used in training set
    df_train.loc[(df_train['video_id'].isin(pos_ids)) & (df_train['user_id'] == user_id), 'ImplicitRating'] = 0

    for id in test_items:
        if id in pos_ids:
            pos_test_set.append([user_id, id])
        else:
            neg_test_set.append([user_id, id])

pos_test_set = np.array(pos_test_set)
neg_test_set = np.array(neg_test_set)

## Save the biased test set

In [13]:
pos_test_set_df = pd.DataFrame(pos_test_set)
neg_test_set_df = pd.DataFrame(neg_test_set)

pos_test_set_df.columns = ["user_id","item_id"]
neg_test_set_df.columns = ["user_id","item_id"]

structured_data_pos_test_set = pos_test_set_df.to_records(index=False)
structured_data_neg_test_set = neg_test_set_df.to_records(index=False)

np.save(output_name + "biased-test_arr_pos.npy", structured_data_pos_test_set)
np.save(output_name + "biased-test_arr_neg.npy", structured_data_neg_test_set)

## **STORE TRAINSET**

## Take couples user-item filtering out the irrelevant ones

In [14]:
# Only take the couples (user, item) with relevant rating
new_df = df_train[df_train['ImplicitRating'] != 0]
new_df = new_df.drop(columns=['watch_ratio', 'ImplicitRating','play_duration','video_duration','time','date','timestamp'])

# Define a dictionary for renaming columns
rename_dict = {
    'user_id': 'user_id',
    'video_id': 'item_id'
}

# Rename the columns
new_df = new_df.rename(columns=rename_dict)

# Convert the DataFrame to a structured array
structured_data = new_df.to_records(index=False) 

In [15]:
train_data = structured_data

## Save the training set

In [16]:
np.save(output_name + "training_arr.npy", train_data)

# **MODEL CHOICE**

In [17]:
raw_data = dict()
raw_data['train_data'] = np.load(output_name + "training_arr.npy")
raw_data['max_user'] = 7177
raw_data['max_item'] = 10729
batch_size = 8000
test_batch_size = 1000
display_itr = 1000

train_dataset = ImplicitDataset(raw_data['train_data'], raw_data['max_user'], raw_data['max_item'], name='Train')

MODEL_CLASS = CML
MODEL_PREFIX = "cml"
DATASET_NAME = "KuaiRec"
OUTPUT_FOLDER = output_name
OUTPUT_PATH = OUTPUT_FOLDER + MODEL_PREFIX + "-" + DATASET_NAME + "/"
OUTPUT_PREFIX = str(OUTPUT_PATH) + str(MODEL_PREFIX) + "-" + str(DATASET_NAME)


if os.path.exists(OUTPUT_PATH) == False:
    os.makedirs(OUTPUT_PATH)


# **TRAIN THE MODEL**

In [18]:
# Avoid tensorflow using cached embeddings
import tensorflow as tf
tf.compat.v1.reset_default_graph()


model = MODEL_CLASS(batch_size=batch_size, max_user=train_dataset.max_user(), max_item=train_dataset.max_item(), 
    dim_embed=50, l2_reg=0.001, opt='Adam', sess_config=None)
sampler = PairwiseSampler(batch_size=batch_size, dataset=train_dataset, num_process=4)
model_trainer = ImplicitModelTrainer(batch_size=batch_size, test_batch_size=test_batch_size,
                                     train_dataset=train_dataset, model=model, sampler=sampler,
                                     eval_save_prefix=OUTPUT_PATH + DATASET_NAME,
                                     item_serving_size=500)
auc_evaluator = AUC()


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
keep_dims is deprecated, use keepdims instead

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where






2024-05-06 18:35:41.850505: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
2024-05-06 18:35:41.856843: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 4192015000 Hz
2024-05-06 18:35:41.857314: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x558017e1f530 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2024-05-06 18:35:41.857330: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version


In [19]:
model_trainer.train(num_itr=10001, display_itr=display_itr)

== Start training with FULL evaluation ==
[Itr 100] Finished
[Itr 200] Finished
[Itr 300] Finished
[Itr 400] Finished
[Itr 500] Finished
[Itr 600] Finished
[Itr 700] Finished
[Itr 800] Finished
[Itr 900] Finished
[Itr 1000] Finished
INFO:tensorflow:./generated_data/cml-KuaiRec/KuaiRec-1000 is not in all_model_checkpoint_paths. Manually adding it.
[Itr 1000] loss: 3062.815696
[Itr 1100] Finished
[Itr 1200] Finished
[Itr 1300] Finished
[Itr 1400] Finished
[Itr 1500] Finished
[Itr 1600] Finished
[Itr 1700] Finished
[Itr 1800] Finished
[Itr 1900] Finished
[Itr 2000] Finished
INFO:tensorflow:./generated_data/cml-KuaiRec/KuaiRec-2000 is not in all_model_checkpoint_paths. Manually adding it.
[Itr 2000] loss: 1642.643703
[Itr 2100] Finished
[Itr 2200] Finished
[Itr 2300] Finished
[Itr 2400] Finished
[Itr 2500] Finished
[Itr 2600] Finished
[Itr 2700] Finished
[Itr 2800] Finished
[Itr 2900] Finished
[Itr 3000] Finished
INFO:tensorflow:./generated_data/cml-KuaiRec/KuaiRec-3000 is not in all_model

[Itr 9100] Finished
[Itr 9200] Finished
[Itr 9300] Finished
[Itr 9400] Finished
[Itr 9500] Finished
[Itr 9600] Finished
[Itr 9700] Finished
[Itr 9800] Finished
[Itr 9900] Finished
[Itr 10000] Finished
INFO:tensorflow:./generated_data/cml-KuaiRec/KuaiRec-10000 is not in all_model_checkpoint_paths. Manually adding it.
[Itr 10000] loss: 1194.621136


In [20]:
model.save(OUTPUT_PATH,None)

INFO:tensorflow:./generated_data/cml-KuaiRec/ is not in all_model_checkpoint_paths. Manually adding it.


In [21]:
del model

# **DEFINING FUNCTIONS**

In [22]:
def eq(infilename, infilename_neg, trainfilename, gamma=-1.0, K=1):
    infile = open(infilename, 'rb')
    infile_neg = open(infilename_neg, 'rb')
    P = pickle.load(infile)
    infile.close()
    P_neg = pickle.load(infile_neg)
    infile_neg.close()
    NUM_NEGATIVES = P["num_negatives"]
    #
    for theuser in P["users"]:
        neg_items = list(P_neg["user_items"][theuser][NUM_NEGATIVES:])
        neg_scores = list(P_neg["results"][theuser][NUM_NEGATIVES:])
        P["user_items"][theuser] = list(neg_items) + list(P["user_items"][theuser][NUM_NEGATIVES:])
        P["results"][theuser] = list(neg_scores) + list(P["results"][theuser][NUM_NEGATIVES:])
    #
    Zui = dict()
    Ni = dict()
    # fill in dictionary Ni
    trainset = np.load(trainfilename)
    for i in trainset['item_id']:
        if i in Ni:
            Ni[i] += 1
        else:
            Ni[i] = 1
    del trainset

    # count #users with non-zero item frequencies
    nonzero_user_count = 0
    for theuser in P["users"]:
        pos_items = P["user_items"][theuser][len(P_neg["user_items"][theuser][NUM_NEGATIVES:]):]
        for pos_item in pos_items:
            if pos_item in Ni:
                nonzero_user_count += 1
                break
    # fill in dictionary Zui
    for theuser in P["users"]:
        all_scores = np.array(P["results"][theuser])
        pos_items = P["user_items"][theuser][len(P_neg["user_items"][theuser][NUM_NEGATIVES:]):]
        pos_scores = P["results"][theuser][len(P_neg["results"][theuser][NUM_NEGATIVES:]):]
        for i, pos_item in enumerate(pos_items):
            pos_score = pos_scores[i]
            Zui[(theuser, pos_item)] = float(np.sum(all_scores > pos_score))
    # calculate per-user scores
    sum_user_auc = 0.0
    sum_user_recall = 0.0

    for theuser in P["users"]:
        numerator_auc = 0.0
        numerator_recall = 0.0
        denominator = 0.0
        for theitem in P["user_items"][theuser][len(P_neg["user_items"][theuser][NUM_NEGATIVES:]):]:
            if theitem not in Ni:
                continue
            pui = np.power(Ni[theitem], (gamma + 1) / 2.0)

            numerator_auc += (1 - Zui[(theuser, theitem)] / len(P["user_items"][theuser])) / pui
            # Calcolo il Recall a 1, vedi nota 6 paper
            if Zui[(theuser, theitem)] < K:
                numerator_recall += 1.0 / pui
            denominator += 1 / pui
                

        if denominator > 0:
            sum_user_auc += numerator_auc / denominator
            sum_user_recall += numerator_recall / denominator 

    return {
        "auc"       : sum_user_auc / nonzero_user_count,
        "recall"    : sum_user_recall / nonzero_user_count
    }

In [23]:
def aoa(infilename, infilename_neg, trainfilename, K=1):
    infile = open(infilename, 'rb')
    infile_neg = open(infilename_neg, 'rb')
    P = pickle.load(infile)
    infile.close()
    P_neg = pickle.load(infile_neg)
    infile_neg.close()
    NUM_NEGATIVES = P["num_negatives"]
    #
    for theuser in P["users"]:
        neg_items = list(P_neg["user_items"][theuser][NUM_NEGATIVES:])
        neg_scores = list(P_neg["results"][theuser][NUM_NEGATIVES:])
        P["user_items"][theuser] = list(neg_items) + list(P["user_items"][theuser][NUM_NEGATIVES:])
        P["results"][theuser] = list(neg_scores) + list(P["results"][theuser][NUM_NEGATIVES:])
    #
    Zui = dict()
    Ni = dict()
    # fill in dictionary Ni
    trainset = np.load(trainfilename)
    for i in trainset['item_id']:
        if i in Ni:
            Ni[i] += 1
        else:
            Ni[i] = 1
    del trainset
    # count #users with non-zero item frequencies
    nonzero_user_count = 0
    for theuser in P["users"]:
        pos_items = P["user_items"][theuser][len(P_neg["user_items"][theuser][NUM_NEGATIVES:]):]
        for pos_item in pos_items:
            if pos_item in Ni:
                nonzero_user_count += 1
                break
    # fill in dictionary Zui
    for theuser in P["users"]:
        all_scores = np.array(P["results"][theuser])
        pos_items = P["user_items"][theuser][len(P_neg["user_items"][theuser][NUM_NEGATIVES:]):]
        pos_scores = P["results"][theuser][len(P_neg["results"][theuser][NUM_NEGATIVES:]):]
        for i, pos_item in enumerate(pos_items):
            pos_score = pos_scores[i]
            Zui[(theuser, pos_item)] = float(np.sum(all_scores > pos_score))
    # calculate per-user scores
    sum_user_auc = 0.0
    sum_user_recall = 0.0
    for theuser in P["users"]:
        numerator_auc = 0.0
        numerator_recall = 0.0
        denominator = 0.0

        for theitem in P["user_items"][theuser][len(P_neg["user_items"][theuser][NUM_NEGATIVES:]):]:
            if theitem not in Ni:
                continue
            numerator_auc += (1 - Zui[(theuser, theitem)] / len(P["user_items"][theuser]))
            # Calcolo il Recall a 30, vedi nota 6 paper
            if Zui[(theuser, theitem)] < K:
                numerator_recall += 1.0
            denominator += 1 

        if denominator > 0:
            sum_user_auc += numerator_auc / denominator
            sum_user_recall += numerator_recall / denominator

    return {
        "auc"       : sum_user_auc / nonzero_user_count,
        "recall"    : sum_user_recall / nonzero_user_count
    }

In [24]:
def stratified(infilename, infilename_neg, trainfilename, gamma=1.0, K=30, partition=10):

    # Read pickles
    infile = open(infilename, 'rb')
    infile_neg = open(infilename_neg, 'rb')
    P = pickle.load(infile)
    infile.close()
    P_neg = pickle.load(infile_neg)
    infile_neg.close()
    NUM_NEGATIVES = P["num_negatives"]

    # Merge P and P_neg
    for theuser in P["users"]:
        neg_items = list(P_neg["user_items"][theuser][NUM_NEGATIVES:])
        neg_scores = list(P_neg["results"][theuser][NUM_NEGATIVES:])
        P["user_items"][theuser] = list(neg_items) + list(P["user_items"][theuser][NUM_NEGATIVES:])
        P["results"][theuser] = list(neg_scores) + list(P["results"][theuser][NUM_NEGATIVES:])

    Zui = dict()
    Ni = dict()

    # Compute frequencies of items in the training set
    trainset = np.load(trainfilename)
    for i in trainset['item_id']:
        if i in Ni:
            Ni[i] += 1
        else:
            Ni[i] = 1
    del trainset

    # Compute recommendations for each user
    for theuser in P["users"]:
        all_scores = np.array(P["results"][theuser])
        pos_items = P["user_items"][theuser][len(P_neg["user_items"][theuser][NUM_NEGATIVES:]):]
        pos_scores = P["results"][theuser][len(P_neg["results"][theuser][NUM_NEGATIVES:]):]
        for i, pos_item in enumerate(pos_items):
            pos_score = pos_scores[i]
            Zui[(theuser, pos_item)] = float(np.sum(all_scores > pos_score))

    pui = dict()
    w = dict()

    # Compute dictionary of propensity scores
    for theuser in P["users"]:
        for theitem in P["user_items"][theuser][len(P_neg["user_items"][theuser][NUM_NEGATIVES:]):]:
            if theitem not in Ni:
                continue
            if theitem in pui:
                continue
            pui[theitem] = np.power(Ni[theitem], (gamma + 1) / 2.0)

    # Take the list of items (not tuples) in pui sorted by value
    items_sorted_by_value = sorted(pui, key=pui.get, reverse=True)

    # Compute linspace between the pui[0] and pui[-1] 

    # Maybe try to split the logspace instead of the linspace?
    # logspace = np.logspace(pui[items_sorted_by_value[0]], pui[items_sorted_by_value[-1]], partition+1)

    linspace = np.linspace(pui[items_sorted_by_value[0]], pui[items_sorted_by_value[-1]], partition+1)
   
    # Compute dictionary w, that is, for each item, assigns the average of the puis in the partition it belongs to
    i=0
    j = 0
    while i < len(items_sorted_by_value):
                            
        avg = 0
        start = i
        end = i
    
        while i < len(items_sorted_by_value) and pui[items_sorted_by_value[i]] >= linspace[j+1]:
            avg += 1.0 / pui[items_sorted_by_value[i]]
            end = i
            i += 1
        
        # Is the average the only good choice? even with the log space split?
        avg = avg / (end - start + 1)

        for k in range(start, end+1):
            w[items_sorted_by_value[k]] = avg


        j += 1

    nonzero_user_count = 0
    sum_user_auc = 0.0
    sum_user_recall = 0.0

    # Compute score with AUC and compute Recall
    for theuser in P["users"]:
        numerator_auc = 0.0
        numerator_recall = 0.0
        denominator = 0.0
        for theitem in P["user_items"][theuser][len(P_neg["user_items"][theuser][NUM_NEGATIVES:]):]:
            # Skip items with null frequency
            if  theitem not in Ni:
                continue
            # Add things to be summed for each item
            numerator_auc += (1 - Zui[(theuser, theitem)] / len(P["user_items"][theuser])) * w[theitem]
            # Add things for recall
            if Zui[(theuser, theitem)] < K:
                numerator_recall += 1.0 * w[theitem] # spetta
            # Increment denominator that the sum must be divided by 
            denominator += 1 / pui[theitem]


        # If there was at least one item for the user, count the user and sum the results
        if denominator > 0:
            nonzero_user_count += 1
            sum_user_auc += numerator_auc / denominator
            sum_user_recall += numerator_recall / denominator 

    return {
        "auc"       : sum_user_auc / nonzero_user_count, 
        "recall"    : sum_user_recall / nonzero_user_count
    }

In [25]:
def stratified_logspace(infilename, infilename_neg, trainfilename, gamma=1.0, K=30, partition=10):

    # Read pickles
    infile = open(infilename, 'rb')
    infile_neg = open(infilename_neg, 'rb')
    P = pickle.load(infile)
    infile.close()
    P_neg = pickle.load(infile_neg)
    infile_neg.close()
    NUM_NEGATIVES = P["num_negatives"]

    # Merge P and P_neg
    for theuser in P["users"]:
        neg_items = list(P_neg["user_items"][theuser][NUM_NEGATIVES:])
        neg_scores = list(P_neg["results"][theuser][NUM_NEGATIVES:])
        P["user_items"][theuser] = list(neg_items) + list(P["user_items"][theuser][NUM_NEGATIVES:])
        P["results"][theuser] = list(neg_scores) + list(P["results"][theuser][NUM_NEGATIVES:])

    Zui = dict()
    Ni = dict()

    # Compute frequencies of items in the training set
    trainset = np.load(trainfilename)
    for i in trainset['item_id']:
        if i in Ni:
            Ni[i] += 1
        else:
            Ni[i] = 1
    del trainset

    # Compute recommendations for each user
    for theuser in P["users"]:
        all_scores = np.array(P["results"][theuser])
        pos_items = P["user_items"][theuser][len(P_neg["user_items"][theuser][NUM_NEGATIVES:]):]
        pos_scores = P["results"][theuser][len(P_neg["results"][theuser][NUM_NEGATIVES:]):]
        for i, pos_item in enumerate(pos_items):
            pos_score = pos_scores[i]
            Zui[(theuser, pos_item)] = float(np.sum(all_scores > pos_score))

    pui = dict()
    w = dict()

    # Compute dictionary of propensity scores
    for theuser in P["users"]:
        for theitem in P["user_items"][theuser][len(P_neg["user_items"][theuser][NUM_NEGATIVES:]):]:
            if theitem not in Ni:
                continue
            if theitem in pui:
                continue
            pui[theitem] = np.power(Ni[theitem], (gamma + 1) / 2.0)

    # Take the list of items (not tuples) in pui sorted by value
    items_sorted_by_value = sorted(pui, key=pui.get, reverse=True)

    # Compute linspace between the pui[0] and pui[-1] 

    # Maybe try to split the logspace instead of the linspace?
    logspace = np.logspace(pui[items_sorted_by_value[0]], pui[items_sorted_by_value[-1]], partition+1)
   
    # Compute dictionary w, that is, for each item, assigns the average of the puis in the partition it belongs to
    i=0
    j = 0
    while i < len(items_sorted_by_value):
                            
        avg = 0
        start = i
        end = i
    
        while i < len(items_sorted_by_value) and pui[items_sorted_by_value[i]] >= logspace[j+1]:
            avg += 1.0 / pui[items_sorted_by_value[i]]
            end = i
            i += 1
        
        # Is the average the only good choice? even with the log space split?
        avg = avg / (end - start + 1)

        for k in range(start, end+1):
            w[items_sorted_by_value[k]] = avg


        j += 1

    nonzero_user_count = 0
    sum_user_auc = 0.0
    sum_user_recall = 0.0

    # Compute score with AUC and compute Recall
    for theuser in P["users"]:
        numerator_auc = 0.0
        numerator_recall = 0.0
        denominator = 0.0
        for theitem in P["user_items"][theuser][len(P_neg["user_items"][theuser][NUM_NEGATIVES:]):]:
            # Skip items with null frequency
            if  theitem not in Ni:
                continue
            # Add things to be summed for each item
            numerator_auc += (1 - Zui[(theuser, theitem)] / len(P["user_items"][theuser])) * w[theitem]
            # Add things for recall
            if Zui[(theuser, theitem)] < K:
                numerator_recall += 1.0 * w[theitem] # spetta
            # Increment denominator that the sum must be divided by 
            denominator += 1 / pui[theitem]


        # If there was at least one item for the user, count the user and sum the results
        if denominator > 0:
            nonzero_user_count += 1
            sum_user_auc += numerator_auc / denominator
            sum_user_recall += numerator_recall / denominator 

    return {
        "auc"       : sum_user_auc / nonzero_user_count, 
        "recall"    : sum_user_recall / nonzero_user_count
    }

In [26]:
# This version uses the linspace of the number of number of items used for evaluation, not of the propensities

def stratified_2(infilename, infilename_neg, trainfilename, gamma=1.0, K=30, partition=10):

    # Read pickles
    infile = open(infilename, 'rb')
    infile_neg = open(infilename_neg, 'rb')
    P = pickle.load(infile)
    infile.close()
    P_neg = pickle.load(infile_neg)
    infile_neg.close()
    NUM_NEGATIVES = P["num_negatives"]

    # Merge P and P_neg
    for theuser in P["users"]:
        neg_items = list(P_neg["user_items"][theuser][NUM_NEGATIVES:])
        neg_scores = list(P_neg["results"][theuser][NUM_NEGATIVES:])
        P["user_items"][theuser] = list(neg_items) + list(P["user_items"][theuser][NUM_NEGATIVES:])
        P["results"][theuser] = list(neg_scores) + list(P["results"][theuser][NUM_NEGATIVES:])

    Zui = dict()
    Ni = dict()

    # Compute frequencies of items in the training set
    trainset = np.load(trainfilename)
    for i in trainset['item_id']:
        if i in Ni:
            Ni[i] += 1
        else:
            Ni[i] = 1
    del trainset

    # Compute recommendations for each user
    for theuser in P["users"]:
        all_scores = np.array(P["results"][theuser])
        pos_items = P["user_items"][theuser][len(P_neg["user_items"][theuser][NUM_NEGATIVES:]):]
        pos_scores = P["results"][theuser][len(P_neg["results"][theuser][NUM_NEGATIVES:]):]
        for i, pos_item in enumerate(pos_items):
            pos_score = pos_scores[i]
            Zui[(theuser, pos_item)] = float(np.sum(all_scores > pos_score))

    pui = dict()
    w = dict()

    # Compute dictionary of propensity scores
    for theuser in P["users"]:
        for theitem in P["user_items"][theuser][len(P_neg["user_items"][theuser][NUM_NEGATIVES:]):]:
            if theitem not in Ni:
                continue
            if theitem in pui:
                continue
            pui[theitem] = np.power(Ni[theitem], (gamma + 1) / 2.0)

    # Take the list of items (not tuples) in pui sorted by value
    items_sorted_by_value = sorted(pui, key=pui.get, reverse=True)

    # Compute linspace between the 0 to len(item_sorted...)
    linspace = np.linspace(0, len(items_sorted_by_value), partition+1)
   
    # Compute dictionary w, that is, for each item, assigns the average of the puis in the partition it belongs to
    i=0
    j = 0
    while i < len(items_sorted_by_value):
                            
        avg = 0
        start = i
        end = i
    
        while i < len(items_sorted_by_value) and i < linspace[j+1]:
            avg += 1.0 / pui[items_sorted_by_value[i]]
            end = i
            i += 1
        
        avg = avg / (end - start + 1)

        for k in range(start, end+1):
            w[items_sorted_by_value[k]] = avg


        j += 1

    nonzero_user_count = 0
    sum_user_auc = 0.0
    sum_user_recall = 0.0

    # Compute score with AUC and compute Recall
    for theuser in P["users"]:
        numerator_auc = 0.0
        numerator_recall = 0.0
        denominator = 0.0
        for theitem in P["user_items"][theuser][len(P_neg["user_items"][theuser][NUM_NEGATIVES:]):]:
            # Skip items with null frequency
            if  theitem not in Ni:
                continue
            # Add things to be summed for each item
            numerator_auc += (1 - Zui[(theuser, theitem)] / len(P["user_items"][theuser])) * w[theitem]
            # Add things for recall
            if Zui[(theuser, theitem)] < K:
                numerator_recall += 1.0 * w[theitem] # spetta
            # Increment denominator that the sum must be divided by 
            denominator += 1 / pui[theitem]


        # If there was at least one item for the user, count the user and sum the results
        if denominator > 0:
            nonzero_user_count += 1
            sum_user_auc += numerator_auc / denominator
            sum_user_recall += numerator_recall / denominator 

    return {
        "auc"       : sum_user_auc / nonzero_user_count, 
        "recall"    : sum_user_recall / nonzero_user_count
    }

# **EVALUATION**

In [27]:
auc_results = []
recall_results = []

In [28]:
raw_data = dict()
raw_data['train_data'] = np.load(output_name + "training_arr.npy")
raw_data['test_data_pos_biased'] = np.load(output_name + "biased-test_arr_pos.npy")
raw_data['test_data_neg_biased'] = np.load(output_name + "biased-test_arr_neg.npy")
raw_data['test_data_pos_unbiased'] = np.load(output_name + "unbiased-test_arr_pos.npy")
raw_data['test_data_neg_unbiased'] = np.load(output_name + "unbiased-test_arr_neg.npy")
raw_data['max_user'] = 7177
raw_data['max_item'] = 10729
batch_size = 8000
test_batch_size = 1000
display_itr = 1000

train_dataset = ImplicitDataset(raw_data['train_data'], raw_data['max_user'], raw_data['max_item'], name='Train')
test_dataset_pos_biased = ImplicitDataset(raw_data['test_data_pos_biased'], raw_data['max_user'], raw_data['max_item'])
test_dataset_neg_biased = ImplicitDataset(raw_data['test_data_neg_biased'], raw_data['max_user'], raw_data['max_item'])
test_dataset_pos_unbiased = ImplicitDataset(raw_data['test_data_pos_unbiased'], raw_data['max_user'], raw_data['max_item'])
test_dataset_neg_unbiased = ImplicitDataset(raw_data['test_data_neg_unbiased'], raw_data['max_user'], raw_data['max_item'])

In [29]:
#Code to avoid tf using cached embeddings
import tensorflow as tf
tf.compat.v1.reset_default_graph()

model = MODEL_CLASS(batch_size=batch_size, max_user=train_dataset.max_user(), max_item=train_dataset.max_item(),
    dim_embed=50, l2_reg=0.001, opt='Adam', sess_config=None)
sampler = PairwiseSampler(batch_size=batch_size, dataset=train_dataset, num_process=4)
model_trainer = ImplicitModelTrainer(batch_size=batch_size, test_batch_size=test_batch_size,
                                     train_dataset=train_dataset, model=model, sampler=sampler,
                                     eval_save_prefix=OUTPUT_PATH + DATASET_NAME,
                                     item_serving_size=500)
auc_evaluator = AUC()

model.load(OUTPUT_PATH)

INFO:tensorflow:Restoring parameters from ./generated_data/cml-KuaiRec/


In [30]:
model_trainer._eval_manager = ImplicitEvalManager(evaluators=[auc_evaluator])
model_trainer._num_negatives = 200 # Had to increment it, original 200 now?
model_trainer._exclude_positives([train_dataset, test_dataset_pos_biased, test_dataset_neg_biased])
model_trainer._sample_negatives(seed=10)

[Subsampling negative items]


                                                     

## Biased Evaluation

In [31]:
model_trainer._eval_save_prefix = OUTPUT_PREFIX + "-test-pos-biased"
model_trainer._evaluate_partial(test_dataset_pos_biased)

model_trainer._eval_save_prefix = OUTPUT_PREFIX +  "-test-neg-biased"
model_trainer._evaluate_partial(test_dataset_neg_biased)

100%|██████████| 7157/7157 [00:07<00:00, 929.44it/s]
100%|██████████| 7176/7176 [14:44<00:00,  8.11it/s]


{'AUC': [0.48961394948335246,
  0.49236624928856004,
  0.5162584175084175,
  0.5027956225127914,
  0.5271145745577085,
  0.5065462753950338,
  0.5100141322781232,
  0.48219830028328614,
  0.49902894884766724,
  0.5124756167527252,
  0.5154997184684684,
  0.5084057353776263,
  0.4995945945945946,
  0.5004532415375789,
  0.5026338659938323,
  0.4845082433200682,
  0.4958070866141733,
  0.4937532693984307,
  0.4850842459983151,
  0.4725997191011236,
  0.5373773957158963,
  0.49247406784412673,
  0.5022363584959004,
  0.5051277372262774,
  0.505828659562535,
  0.500458767238953,
  0.48639194139194136,
  0.4926590653153154,
  0.4960225007120478,
  0.48711277761753674,
  0.4866112207499295,
  0.4641259931895573,
  0.5184520034592102,
  0.4925372504919876,
  0.4936882453151618,
  0.5018566591422121,
  0.49874964956546125,
  0.4889414729767572,
  0.5016084611016468,
  0.47496624472573845,
  0.506484440706476,
  0.4875196850393701,
  0.4955734206033011,
  0.510803264604811,
  0.5117566807313643

## Unbiased Evaluation

In [32]:
model_trainer._eval_save_prefix = OUTPUT_PREFIX + "-test-pos-unbiased"
model_trainer._evaluate_partial(test_dataset_pos_unbiased)

model_trainer._eval_save_prefix = OUTPUT_PREFIX +  "-test-neg-unbiased"
model_trainer._evaluate_partial(test_dataset_neg_unbiased)

100%|██████████| 1411/1411 [00:06<00:00, 230.99it/s]
100%|██████████| 1411/1411 [02:19<00:00, 10.10it/s]


{'AUC': [0.5990070019096116,
  0.5000336083104185,
  0.5228904538341157,
  0.6319064124783362,
  0.626116878767582,
  0.5999292960344297,
  0.5904657351962741,
  0.5557896350812636,
  0.5858969276511398,
  0.6148610651670525,
  0.5255388813096863,
  0.5976587990281153,
  0.5290530303030303,
  0.5769832141746969,
  0.5810282503037667,
  0.5539259373079287,
  0.5715790273556232,
  0.6049727272727273,
  0.5735583563535912,
  0.5703200502039536,
  0.558796889295517,
  0.5672933743169398,
  0.602921195652174,
  0.5842520522955306,
  0.538129418997848,
  0.622120101943294,
  0.5883373712901272,
  0.5652935779816514,
  0.5935637480798771,
  0.611801330798479,
  0.5745145330859617,
  0.6437943989071038,
  0.6064132434089516,
  0.5942280923758304,
  0.6318401015228426,
  0.5869348894348895,
  0.5877775819527671,
  0.6534220997857363,
  0.5708310376492194,
  0.6923435137586936,
  0.5949918327344005,
  0.5824754601226994,
  0.5764557356608478,
  0.6341209563994374,
  0.5509882280049567,
  0.57556

## Calculate Metrics

In [33]:
biased_results = dict()

# biased_results["STRATIFIED_15"] = stratified(OUTPUT_PREFIX+"-test-pos-biased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-biased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=1.5, K=30, partition=100)
biased_results["AOA"] = aoa(OUTPUT_PREFIX+"-test-pos-biased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-biased_evaluate_partial.pickle", output_name+"training_arr.npy", K=10)
biased_results["UB_15"] = eq(OUTPUT_PREFIX+"-test-pos-biased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-biased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=1.5, K=10)
biased_results["UB_2"] =  eq(OUTPUT_PREFIX+"-test-pos-biased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-biased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=2, K=10)
biased_results["UB_25"] =  eq(OUTPUT_PREFIX+"-test-pos-biased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-biased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=2.5, K=10)
biased_results["UB_3"] =  eq(OUTPUT_PREFIX+"-test-pos-biased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-biased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=3, K=10)

In [34]:
unbiased_results = dict()

# unbiased_results["STRATIFIED_15"] = stratified(OUTPUT_PREFIX+"-test-pos-unbiased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-unbiased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=1.5, K=1, partition=100)
unbiased_results["AOA"] = aoa(OUTPUT_PREFIX+"-test-pos-unbiased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-unbiased_evaluate_partial.pickle", output_name+"training_arr.npy", K=10)
unbiased_results["UB_15"] = eq(OUTPUT_PREFIX+"-test-pos-unbiased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-unbiased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=1.5, K=10)
unbiased_results["UB_2"] =  eq(OUTPUT_PREFIX+"-test-pos-unbiased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-unbiased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=2, K=10)
unbiased_results["UB_25"] =  eq(OUTPUT_PREFIX+"-test-pos-unbiased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-unbiased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=2.5, K=10)
unbiased_results["UB_3"] =  eq(OUTPUT_PREFIX+"-test-pos-unbiased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-unbiased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=3, K=10)

In [35]:
from tqdm.notebook import tqdm

In [36]:
num_items = max_item

In [37]:
nums = np.arange(1, num_items+1)

partitions = np.random.choice(nums, 500, replace=False)
partitions

array([ 9793,  4121,  9688,  9295,  9370,  2772,  9591,  8120,  8225,
        9275,   653,  3018,  8222,  4000,  8220,  9556, 10601,  6794,
        4426,  9371,  6376,  3458,  7109,  8951,  7845,   934,  2483,
        7307,  8536,  2496,  2372,  6176,  2341,  6602,  7643,  1801,
        5711,  3895,  7766,  7637,  9875,  7615,  6727, 10661,   828,
        3670,  1257,  9532,  2689,  5256, 10444,  6569,  5723,  9663,
        3726,   638,  3325,  8420,  1212,  8729,  7693,  3187,   412,
        5554,  2945,  2210,  8813,  7607,   342,  2810,  7641,  2103,
        5338,  7735,  5511, 10368,  1469,  9335,  5384,  2346,  7150,
        8852,  2511,  9668,  2098,  5706,  4495,  6338,  7725,  5550,
        2119,  7552, 10373,  4977,  4541,  5806,  8538,  2266,  2425,
        5961,  9643,  4712,  4217,  4403,  4903,  4565,  2419,   390,
        1351,  3184,   985,  5006,  8417,  2600,  9379,  3589,  1766,
       10400,  6849,  7337, 10296,  6924,  5229,  4476,  3372,  4922,
        8996,  6326,

In [38]:
# Compute biased and unbiased results with stratified for values of partition in (1,2*len(sorted_items))
# and store biased and unbiased results such that abs(biased_results[key]['auc'] - unbiased_results[key]['auc']) + abs(biased_results[key]['recall'] - unbiased_results[key]['recall']) is minimized

#This is the gamma used to compute the best partition
gamma = 15

key = "STRATIFIED_" + str(gamma).replace(".","")

unbiased_results[key] = dict()
biased_results[key] = dict()
best_partition = np.random.choice(nums, 1)[0]

#for p in tqdm(range(1, 2*num_items)):
for p in tqdm(partitions):
    temp_unbiased = stratified(OUTPUT_PREFIX+"-test-pos-unbiased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-unbiased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=gamma, K=10, partition=p)
    temp_biased = stratified(OUTPUT_PREFIX+"-test-pos-biased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-biased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=gamma, K=10, partition=p)
    if not unbiased_results[key]:
        unbiased_results[key] = temp_unbiased
    if not biased_results[key]:
        biased_results[key] = temp_biased
    elif abs(temp_biased['auc'] - temp_unbiased['auc']) + abs(temp_unbiased['recall'] - temp_biased['recall']) < abs(biased_results[key]['auc'] - unbiased_results[key]['auc']) + abs(biased_results[key]['recall'] - unbiased_results[key]['recall']):
        biased_results[key]['auc'] = temp_biased['auc']
        biased_results[key]['recall'] = temp_biased['recall']
        unbiased_results[key]['auc'] = temp_unbiased['auc']
        unbiased_results[key]['recall'] = temp_unbiased['recall']
        best_partition = p


  0%|          | 0/500 [00:00<?, ?it/s]

In [39]:
best_partition

3179

In [40]:
unbiased_results["STRATIFIED_15"] = stratified(OUTPUT_PREFIX+"-test-pos-unbiased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-unbiased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=1.5, K=10, partition=best_partition)
biased_results["STRATIFIED_15"] = stratified(OUTPUT_PREFIX+"-test-pos-biased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-biased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=1.5, K=10, partition=best_partition)

unbiased_results["STRATIFIED_2"] = stratified(OUTPUT_PREFIX+"-test-pos-unbiased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-unbiased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=2, K=10, partition=best_partition)
biased_results["STRATIFIED_2"] = stratified(OUTPUT_PREFIX+"-test-pos-biased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-biased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=2, K=10, partition=best_partition)

unbiased_results["STRATIFIED_25"] = stratified(OUTPUT_PREFIX+"-test-pos-unbiased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-unbiased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=2.5, K=10, partition=best_partition)
biased_results["STRATIFIED_25"] = stratified(OUTPUT_PREFIX+"-test-pos-biased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-biased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=2.5, K=10, partition=best_partition)

unbiased_results["STRATIFIED_3"] = stratified(OUTPUT_PREFIX+"-test-pos-unbiased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-unbiased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=3, K=10, partition=best_partition)
biased_results["STRATIFIED_3"] = stratified(OUTPUT_PREFIX+"-test-pos-biased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-biased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=3, K=10, partition=best_partition)


In [41]:
unbiased_results["STRATIFIED_v2_15"] = stratified_2(OUTPUT_PREFIX+"-test-pos-unbiased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-unbiased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=1.5, K=10, partition=best_partition)
biased_results["STRATIFIED_v2_15"] = stratified_2(OUTPUT_PREFIX+"-test-pos-biased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-biased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=1.5, K=10, partition=best_partition)

unbiased_results["STRATIFIED_v2_2"] = stratified_2(OUTPUT_PREFIX+"-test-pos-unbiased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-unbiased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=2, K=10, partition=best_partition)
biased_results["STRATIFIED_v2_2"] = stratified_2(OUTPUT_PREFIX+"-test-pos-biased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-biased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=2, K=10, partition=best_partition)

unbiased_results["STRATIFIED_v2_25"] = stratified_2(OUTPUT_PREFIX+"-test-pos-unbiased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-unbiased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=2.5, K=10, partition=best_partition)
biased_results["STRATIFIED_v2_25"] = stratified_2(OUTPUT_PREFIX+"-test-pos-biased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-biased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=2.5, K=10, partition=best_partition)

unbiased_results["STRATIFIED_v2_3"] = stratified_2(OUTPUT_PREFIX+"-test-pos-unbiased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-unbiased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=3, K=10, partition=best_partition)
biased_results["STRATIFIED_v2_3"] = stratified_2(OUTPUT_PREFIX+"-test-pos-biased_evaluate_partial.pickle", OUTPUT_PREFIX+"-test-neg-biased_evaluate_partial.pickle", output_name+"training_arr.npy", gamma=3, K=10, partition=best_partition)

In [42]:
key, value = random.choice(list(biased_results.items()))
rows = 2#len(list(value.keys()))
columns = 13#len(list(biased_results.items()))
results_array = np.zeros((rows,columns))

In [43]:
mae_results = dict()

list_biased_res = list(biased_results.keys())

for i in range(len(list_biased_res)):
    key = list_biased_res[i]

    for j in range(len(list(biased_results[key].keys()))):
        key_2 = list(biased_results[key].keys())[j]

        results_array[j][i] = abs(biased_results[key][key_2] - unbiased_results[key][key_2])


In [44]:
mae_df = pd.DataFrame(columns=list(biased_results.keys()), data=results_array)

In [45]:
metric_values = list(biased_results[list(biased_results.keys())[0]].keys())
mae_df.insert(0, "metric", metric_values)


# **RESULTS**

In [46]:
mae_df.head()

Unnamed: 0,metric,AOA,UB_15,UB_2,UB_25,UB_3,STRATIFIED_15,STRATIFIED_2,STRATIFIED_25,STRATIFIED_3,STRATIFIED_v2_15,STRATIFIED_v2_2,STRATIFIED_v2_25,STRATIFIED_v2_3
0,auc,0.158301,0.296241,0.313291,0.322564,0.327105,0.363518,0.850604,3.681992,13.350223,0.296238,0.313261,0.322522,0.327064
1,recall,0.016254,0.009118,0.008212,0.007366,0.006642,0.009118,0.008211,0.007365,0.006643,0.009184,0.008264,0.00741,0.006682
