# **SETUP**

## Import

In [1]:
from __future__ import division

import numpy as np
import pickle
import os
from openrec.tf1.legacy import ImplicitModelTrainer
from openrec.tf1.legacy.utils import ImplicitDataset
from openrec.tf1.legacy.utils.evaluators import ImplicitEvalManager
from openrec.tf1.legacy.recommenders import CML, BPR
from openrec.tf1.legacy.utils.evaluators import AUC
from openrec.tf1.legacy.utils.samplers import PairwiseSampler

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
seed = 76424236
np.random.seed(seed=seed)

# Directory used for the notebook's data files and saved outputs.
folder_name = "./generated_data/"

# exist_ok=True makes this idempotent and avoids the check-then-create race
# of testing os.path.exists() before calling makedirs().
os.makedirs(folder_name, exist_ok=True)

# **FUNCTION DEFINITION**

In [2]:
def calc_metrics(infilename, trainfilename, gamma=0.2, K=10):
    """Compute popularity-weighted AUC and Recall@K from saved evaluation scores.

    Parameters
    ----------
    infilename : str
        Path to a pickle file containing a dict with the keys
        ``"num_negatives"``, ``"users"``, ``"user_items"`` and ``"results"``.
        For every user ``u`` in ``"users"``, ``"user_items"[u]`` and
        ``"results"[u]`` are index-aligned sequences of candidate items and
        their scores; the entries from index ``num_negatives`` onward are
        counted as that user's positives.
    trainfilename : str
        Path readable by ``np.load`` whose ``'item_id'`` entries are counted
        to obtain a per-item frequency ``Ni``.
    gamma : float, default 0.2
        Exponent parameter; each positive item is weighted by
        ``1 / Ni ** ((gamma + 1) / 2)``.
    K : int, default 10
        A positive counts towards recall when fewer than ``K`` candidates
        score strictly higher than it.

    Returns
    -------
    dict
        ``{"auc": float, "recall": float}``, each averaged over the users that
        have at least one positive item present in the training frequencies.
        Both values are NaN when there is no such user.
    """
    # Only the trailing TAIL_SIZE candidates (and scores) of each user are ranked.
    TAIL_SIZE = 300

    def _positive_tail(seq, count):
        # ``seq[-count:]`` returns the WHOLE sequence when count == 0, which
        # would turn every candidate into a positive. Slicing from an explicit
        # non-negative start yields an empty list in that case instead.
        return seq[max(len(seq) - count, 0):]

    # NOTE(security): pickle.load can execute arbitrary code; only use this on
    # files produced by a trusted evaluation run.
    with open(infilename, 'rb') as infile:
        P = pickle.load(infile)

    num_negatives = P["num_negatives"]
    users = P["users"]

    # Per-user candidate items / scores (truncated) and number of positives.
    # The loaded dict itself is left untouched.
    candidates = dict()
    scores = dict()
    num_pos = dict()
    for theuser in users:
        num_pos[theuser] = len(P["user_items"][theuser][num_negatives:])
        candidates[theuser] = list(P["user_items"][theuser])[-TAIL_SIZE:]
        scores[theuser] = list(P["results"][theuser])[-TAIL_SIZE:]

    # Ni: how many times each item id occurs in the training file.
    trainset = np.load(trainfilename)
    item_ids, item_counts = np.unique(np.asarray(trainset['item_id']), return_counts=True)
    Ni = dict(zip(item_ids.tolist(), item_counts.tolist()))
    del trainset

    sum_user_auc = 0.0
    sum_user_recall = 0.0
    nonzero_user_count = 0  # users with at least one positive found in Ni

    for theuser in users:
        all_scores = np.array(scores[theuser])
        n_candidates = len(candidates[theuser])
        pos_items = _positive_tail(candidates[theuser], num_pos[theuser])
        pos_scores = _positive_tail(scores[theuser], num_pos[theuser])

        # Zui: number of candidates scored strictly higher than each positive.
        Zui = dict()
        for idx, pos_item in enumerate(pos_items):
            Zui[pos_item] = float(np.sum(all_scores > pos_scores[idx]))

        numerator_auc = 0.0
        numerator_recall = 0.0
        denominator = 0.0
        has_known_positive = False
        for theitem in pos_items:
            if theitem not in Ni:
                # Items absent from the training file have no frequency weight.
                continue
            has_known_positive = True
            pui = np.power(Ni[theitem], (gamma + 1) / 2.0)
            numerator_auc += (1 - Zui[theitem] / n_candidates) / pui
            if Zui[theitem] < K:
                numerator_recall += 1.0 / pui
            denominator += 1 / pui

        if has_known_positive:
            nonzero_user_count += 1
        if denominator > 0:
            sum_user_auc += numerator_auc / denominator
            sum_user_recall += numerator_recall / denominator

    if nonzero_user_count == 0:
        # No user contributes: the averages are undefined rather than an error.
        return {
            "auc"       : float("nan"),
            "recall"    : float("nan")
        }

    return {
        "auc"       : sum_user_auc / nonzero_user_count,
        "recall"    : sum_user_recall / nonzero_user_count
    }


# **VALIDATE MODEL**

## Load training and validation sets

In [4]:
# Interaction arrays read from folder_name, plus the user/item bounds that are
# handed to ImplicitDataset below.
raw_data = {
    'train_data': np.load(folder_name + "training_arr.npy"),
    'val_data': np.load(folder_name + "validation_arr.npy"),
    'max_user': 7177,
    'max_item': 10729,
}

# Batch sizes for training / evaluation and the display interval.
batch_size, test_batch_size, display_itr = 8000, 1000, 1000

# Wrap both splits in ImplicitDataset objects that share the same bounds.
train_dataset, val_dataset = (
    ImplicitDataset(raw_data[split_key], raw_data['max_user'], raw_data['max_item'], name=split_name)
    for split_key, split_name in (('train_data', 'Train'), ('val_data', 'Val'))
)

## Define model

In [7]:
# Reset TensorFlow's default graph before building the model, so the model is
# not affected by embeddings cached from an earlier run of this cell.
import tensorflow as tf
tf.compat.v1.reset_default_graph()

# CML recommender sized to the training dataset.
cml_model = CML(
    batch_size=batch_size,
    max_user=train_dataset.max_user(),
    max_item=train_dataset.max_item(),
    dim_embed=50,
    l2_reg=0.001,
    opt='Adam',
    sess_config=None,
)

# Pairwise sampler drawing batches from the training dataset with 4 processes.
sampler = PairwiseSampler(
    batch_size=batch_size,
    dataset=train_dataset,
    num_process=4,
)

# Trainer that ties together the dataset, model and sampler.
model_trainer = ImplicitModelTrainer(
    batch_size=batch_size,
    test_batch_size=test_batch_size,
    train_dataset=train_dataset,
    model=cml_model,
    sampler=sampler,
    eval_save_prefix=folder_name+"KuaiRec",
    item_serving_size=500,
)

auc_evaluator = AUC()

## Validate

In [8]:
# Restore the model parameters previously saved under folder_name.
cml_model.load(folder_name+"cml-KuaiRec")

# NOTE(review): the steps below drive the trainer through its private
# attributes/methods (leading underscore), so they depend on OpenRec internals
# and on being executed in exactly this order.
# Evaluate with the AUC evaluator only.
model_trainer._eval_manager = ImplicitEvalManager(evaluators=[auc_evaluator])
# Number of negatives the trainer is told to use per user.
model_trainer._num_negatives = 300
# Presumably registers the train/val interactions so they are not drawn as
# negatives — TODO confirm against the OpenRec implementation.
model_trainer._exclude_positives([train_dataset, val_dataset])
# Fixed seed passed to the negative sampling step.
model_trainer._sample_negatives(seed=10)
# Override the prefix given at construction time ("KuaiRec") for this validation run.
model_trainer._eval_save_prefix = folder_name+"cml-KuaiRec-val"
# Run the evaluation on the validation dataset.
model_trainer._evaluate_partial(val_dataset)

INFO:tensorflow:Restoring parameters from ./generated_data/cml-KuaiRec
[Subsampling negative items]


100%|██████████| 7088/7088 [00:06<00:00, 1150.71it/s]


{'AUC': [0.9611111111111111,
  0.888888888888889,
  0.9217241379310344,
  0.9884848484848486,
  0.9480952380952382,
  0.9427272727272726,
  0.8956989247311827,
  0.8851282051282051,
  0.9123148148148148,
  0.940952380952381,
  0.9732142857142858,
  0.9132432432432432,
  0.9230645161290323,
  0.9062564102564102,
  0.9096899224806202,
  0.8891111111111112,
  0.9492727272727273,
  0.8830476190476192,
  0.99,
  0.8816666666666666,
  0.8972839506172838,
  0.8896153846153846,
  0.8754761904761904,
  0.9226984126984128,
  0.8831944444444444,
  0.9382828282828282,
  0.873391812865497,
  0.9582051282051283,
  0.9672222222222221,
  0.9916666666666667,
  0.8796491228070176,
  0.9301010101010102,
  0.8741111111111111,
  0.8815315315315315,
  0.8610884353741497,
  0.9329268292682925,
  0.942795698924731,
  0.9408888888888889,
  0.8848039215686274,
  0.9564814814814814,
  0.8878260869565218,
  0.9094444444444445,
  0.9195061728395063,
  0.9206190476190478,
  0.9355882352941176,
  0.8333333333333334,