In [1]:
import os, sys

# Remember the directory the kernel started in. The `os.chdir` below moves the
# working directory, so deriving paths from the *current* cwd on a re-run of
# this cell would climb two more levels and register a wrong path.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.path.abspath('')

dir2 = NOTEBOOK_DIR              # directory the kernel was started in
dir1 = os.path.dirname(dir2)     # its parent, made importable below
if dir1 not in sys.path:
    sys.path.append(dir1)

# Two levels above the start directory (same target as the relative '../..'
# on the first run, but stable when the cell is executed again).
os.chdir(os.path.dirname(dir1))

In [2]:
from pathlib import Path

from hydra import compose, initialize

import omegaconf
import pandas as pd
import torch 
import numpy as np 

from src.models.recbole import RecboleBench
from src.preprocessing import ClassicDataset
from src.utils.logging import get_logger
from src.utils.processing import data_split
from src.utils.metrics import run_all_metrics, coverage

# Logger for this notebook, obtained from the project helper and named with
# the current `__name__`.
logger = get_logger(name=__name__)

# Load IPython's autoreload extension; mode 2 re-imports changed modules before
# each cell execution, so edits to the project sources are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

2023-11-02 10:27:33.037251: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-02 10:27:33.271502: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Import the required RecBole packages
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.utils import init_seed
from recbole.quick_start import run_recbole

2023-11-02 10:29:34,141	INFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [7]:
# Compose the project configuration named `config` inside a Hydra
# `initialize` context that points at the `config` directory.
with initialize(
    version_base=None,
    config_path='../../config',
):
    cfg = compose(config_name='config')

In [8]:
# Pull the sections used throughout the notebook out of the composed config:
# the dataset section, the library name and the RecBole model section.
cfg_data = cfg['dataset']
cfg_library = cfg['library']
cfg_model = cfg_library['recbole_model']
recbole_name: str = cfg_library['name']

In [9]:
# Instantiate the project dataset wrapper and run its preparation step with
# the dataset section of the config. The following cell reads
# `ds.prepared_data`, which is presumably populated here — confirm in
# ClassicDataset.prepare.
ds = ClassicDataset()
ds.prepare(cfg_data)

In [10]:
# Split the prepared data using the dataset and model config sections and
# request the RecBole output format. The return value is not stored here;
# presumably the split is persisted for RecBole to load — confirm in data_split.
data_split(
    ds.prepared_data,
    cfg_data,
    cfg_model,
    return_format='recbole',
)

In [12]:
# Convert the `recbole_params` OmegaConf node into plain Python containers so
# that entries can be added and the result handed to RecBole as `config_dict`.
# `resolve` is left at its default, so interpolations are not resolved here.
parameter_dict = omegaconf.OmegaConf.to_container(cfg_model["recbole_params"])

In [13]:
# Tell RecBole where the data lives and which dataset to load.
parameter_dict.update(
    data_path=os.path.join("data", "tmp"),
    dataset=cfg_data["name"],
)

In [14]:
# Assemble the RecBole configuration for the chosen model and dataset. No
# config files are used; every override comes from `parameter_dict`.
config = Config(
    model=cfg_model["name"],
    dataset=cfg_data["name"],
    config_file_list=None,
    config_dict=parameter_dict,
)

In [36]:
# Display the final configuration dictionary held by the RecBole `Config`
# object, to check which settings are actually in effect.
config.final_config_dict

{'gpu_id': '0',
 'worker': 0,
 'use_gpu': True,
 'seed': 2020,
 'state': 'INFO',
 'reproducibility': True,
 'data_path': 'data/tmp/foursquare',
 'checkpoint_dir': 'saved',
 'show_progress': True,
 'save_dataset': False,
 'dataset_save_path': None,
 'save_dataloaders': False,
 'dataloaders_save_path': None,
 'log_wandb': False,
 'wandb_project': 'recbole',
 'shuffle': True,
 'epochs': 300,
 'train_batch_size': 4096,
 'learner': 'adam',
 'learning_rate': 0.001,
 'train_neg_sample_args': {'distribution': 'uniform',
  'sample_num': 1,
  'alpha': 1.0,
  'dynamic': False,
  'candidate_num': 0},
 'eval_step': 1,
 'stopping_step': 10,
 'clip_grad_norm': None,
 'weight_decay': 0.0,
 'loss_decimal_place': 4,
 'require_pow': False,
 'enable_amp': False,
 'enable_scaler': False,
 'transform': None,
 'eval_args': {'split': {'RS': [0.8, 0.1, 0.1]},
  'group_by': 'user',
  'order': 'RO',
  'mode': 'full'},
 'repeatable': False,
 'metrics': ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision'],
 'topk': [10],

In [11]:
# Build the RecBole dataset described by `config`.
dataset = create_dataset(config)

In [12]:
# Derive the train / validation / test objects from the dataset; they are
# passed to the model's fit and evaluation calls below.
train_set, valid_set, test_set = data_preparation(config, dataset)

In [13]:
# Folder for this dataset / library / model combination, built from pathlib
# components instead of a hand-joined '/' string.
model_folder = Path(
    'preproc_data', cfg_data['name'], recbole_name, cfg_model['name']
)

In [14]:
# Create the benchmark model wrapper; only the training loader is supplied.
model = RecboleBench.initialize_with_params(train_loader=train_set)

In [15]:
# Fit on the training split; the validation split is passed as the second
# argument (presumably for early stopping / model selection — confirm in
# RecboleBench.fit).
model.fit(train_set, valid_set)

In [16]:
# Input for `run_all_metrics` in the next cell (presumably the ranks of the
# relevant test items — confirm in RecboleBench.get_relevant_ranks).
ranks = model.get_relevant_ranks(test_set)
# Recommendations with k=100, consumed by `coverage` in the next cell.
top_100_items = model.recommend_k(test_set, 100)

  uid_series = torch.tensor(uid_series)


In [17]:
# Cut-offs at which every metric is reported. Defined once so that the metric
# computation, the coverage computation and the table index cannot drift apart.
top_ks = (5, 10, 20, 100)

# `ranks` and `top_100_items` were produced by the previous cell.
metrics = run_all_metrics(ranks, list(top_ks))

# One coverage value per cut-off, in the same order as `top_ks`.
coverage_metrics = [
    coverage(top_100_items, test_set.dataset.item_num, k)
    for k in top_ks
]

# One row per cut-off, one column per ranking metric.
metrics_df = pd.DataFrame(
    metrics,
    index=list(top_ks),
    columns=(
        'Precision@k', 'Recall@K', 'MAP@K', 'nDCG@k', 'MRR@k', 'HitRate@k'
    ),
)
metrics_df['Coverage@K'] = coverage_metrics

# Timings describe the model run as a whole, so the same value is written to
# every row.
metrics_df['Time_fit'] = model.learning_time
metrics_df['Time_predict'] = model.predict_time


  0%|          | 0/4 [00:00<?, ?it/s]

In [18]:
# Train and evaluate the same model with RecBole's own quick-start pipeline,
# to obtain reference numbers for the comparison below.
results = run_recbole(
    model=config['model'],
    dataset=config['dataset'],
    config_dict=parameter_dict,
)

18 Aug 11:05    INFO  ['/home/recsys/miniconda/envs/rec_env/lib/python3.9/site-packages/ipykernel_launcher.py', '--ip=127.0.0.1', '--stdin=9003', '--control=9001', '--hb=9000', '--Session.signature_scheme="hmac-sha256"', '--Session.key=b"f316cac7-a938-41dd-afb1-d08341fb2867"', '--shell=9002', '--transport="tcp"', '--iopub=9004', '--f=/home/recsys/.local/share/jupyter/runtime/kernel-v2-260Bva0VIzzcC3H.json']


18 Aug 11:05    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = data/tmp/kuairec_small
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 300
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [0.8, 0.1, 0.1]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}
repeatable = False
metrics = ['Precision', 'Recall', 'MAP', 'NDCG', 'MRR', 'HIT', 'ItemCoverage']
topk = [10]
valid_metric = MRR@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters

### Compare RecBole's built-in training and evaluation with our pipeline

In [19]:
# Metrics computed by our pipeline, one row per cut-off k.
metrics_df

Unnamed: 0,Precision@k,Recall@K,MAP@K,nDCG@k,MRR@k,HitRate@k,Coverage@K,Time_fit,Time_predict
5,0.050461,0.004295,0.025977,0.050854,0.10489,0.212615,0.758076,42.909438,5.446651
10,0.048335,0.008301,0.017647,0.04923,0.123425,0.354359,0.854639,42.909438,5.446651
20,0.050023,0.017173,0.012447,0.05014,0.136934,0.54713,0.883505,42.909438,5.446651
100,0.143452,0.142302,0.018876,0.108498,0.147298,0.931254,0.932302,42.909438,5.446651


In [20]:
# Test metrics reported by RecBole's own run; compare with the k=10 row of
# `metrics_df`.
results['test_result']

OrderedDict([('precision@10', 0.0483),
             ('recall@10', 0.0083),
             ('map@10', 0.0176),
             ('ndcg@10', 0.0492),
             ('mrr@10', 0.1234),
             ('hit@10', 0.3544),
             ('itemcoverage@10', 0.8546)])

### After training, let's do a final fit on the trainval set

In [21]:
# Point RecBole at the 'trainval', 'val' and 'test' benchmark files, so the
# first split returned by the next `data_preparation` call is the combined
# train+validation data (presumably written earlier by `data_split` — confirm).
config['benchmark_filename'] = ['trainval', 'val', 'test']

In [22]:
# Rebuild the dataset so that the changed `benchmark_filename` takes effect.
dataset = create_dataset(config)

In [23]:
# Same call as before, but the first returned split is now bound to
# `trainval_set`; `valid_set` and `test_set` are re-bound as well.
trainval_set, valid_set, test_set = data_preparation(config, dataset)

18 Aug 11:09    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
18 Aug 11:09    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [0.8, 0.1, 0.1]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}]


In [24]:
# Fresh model wrapper for the final fit, initialised from the trainval loader.
model = RecboleBench.initialize_with_params(train_loader=trainval_set)

In [25]:
# Final fit on the trainval data; unlike the earlier fit, no validation set is
# passed.
model.fit(trainval_set)

18 Aug 11:10    INFO  epoch 0 training [time: 39.63s, train loss: 0.0000]
18 Aug 11:10    INFO  Saving current: saved/EASE-Aug-18-2023_11-10-14.pth


In [26]:
# Cut-offs at which every metric is reported. Defined once so that the metric
# computation, the coverage computation and the table index cannot drift apart.
top_ks = (5, 10, 20, 100)

# Evaluate the model fitted on the trainval data against the test loader.
ranks = model.get_relevant_ranks(test_set)
# k=100 recommendations: enough for the largest cut-off in `top_ks`.
top_100_items = model.recommend_k(test_set, 100)
metrics = run_all_metrics(ranks, list(top_ks))

# One coverage value per cut-off, in the same order as `top_ks`.
coverage_metrics = [
    coverage(top_100_items, test_set.dataset.item_num, k)
    for k in top_ks
]

# One row per cut-off, one column per ranking metric.
metrics_df = pd.DataFrame(
    metrics,
    index=list(top_ks),
    columns=(
        'Precision@k', 'Recall@K', 'MAP@K', 'nDCG@k', 'MRR@k', 'HitRate@k'
    ),
)
metrics_df['Coverage@K'] = coverage_metrics

# Timings describe the model run as a whole, so the same value is written to
# every row.
metrics_df['Time_fit'] = model.learning_time
metrics_df['Time_predict'] = model.predict_time
metrics_df

  uid_series = torch.tensor(uid_series)


  0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,Precision@k,Recall@K,MAP@K,nDCG@k,MRR@k,HitRate@k,Coverage@K,Time_fit,Time_predict
5,0.112261,0.010068,0.059608,0.111979,0.218545,0.42523,0.807216,39.943185,5.849546
10,0.10815,0.019145,0.04371,0.109343,0.243567,0.612332,0.898625,39.943185,5.849546
20,0.108394,0.038374,0.032961,0.108986,0.255596,0.781715,0.920962,39.943185,5.849546
100,0.239964,0.238163,0.047626,0.191478,0.261505,0.97803,0.941924,39.943185,5.849546
