In [1]:
# !pip install recbole 

In [2]:
# pip install ray

In [3]:
# !pip install kmeans_pytorch

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import dill

import warnings
warnings.filterwarnings('ignore')

import logging
from logging import getLogger
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.sequential_recommender import GRU4Rec, Caser
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger
from recbole.quick_start import run_recbole

import time

In [29]:
RANDOM_SEED = 42
K_RECOS = 10

DATA_PATH = "data/kion_train"
RESULTS_PATH = "results/hw5"

# Data

In [30]:
interactions_df = pd.read_csv(f'{DATA_PATH}/interactions.csv', parse_dates=["last_watch_dt"])

interactions_df.rename(
    columns={
        "total_dur": "weight", 
        "last_watch_dt": "datetime"
    }, 
    inplace=True) 

users_df = pd.read_csv(f'{DATA_PATH}/users.csv')

items_df = pd.read_csv(f'{DATA_PATH}/items.csv')
items_df = items_df.rename(columns = {'id' : 'item_id'})

In [7]:
interactions_df['t_dat'] = pd.to_datetime(interactions_df['datetime'], format="%Y-%m-%d")
interactions_df['timestamp'] = interactions_df.t_dat.values.astype(np.int64) // 10 ** 9

In [8]:
df = interactions_df[['user_id', 'item_id', 'timestamp']].rename(
    columns={'user_id': 'user_id:token', 'item_id': 'item_id:token', 'timestamp': 'timestamp:float'})

In [9]:
!mkdir recbox_data

mkdir: recbox_data: File exists


In [10]:
df.to_csv('recbox_data/recbox_data.inter', index=False, sep='\t')

# Models

In [3]:
import logging
from logging import getLogger
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.sequential_recommender import GRU4Rec, Caser
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger
from recbole.quick_start import run_recbole

In [12]:
parameter_dict = {
    'data_path': '',
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'item_id',
    'TIME_FIELD': 'timestamp',
    'device': 'CPU',
    'user_inter_num_interval': "[40,inf)",
    'item_inter_num_interval': "[40,inf)",
    'load_col': {'inter': ['user_id', 'item_id', 'timestamp']},
    'neg_sampling': None,
    'epochs': 10,
    'eval_args': {
        'split': {'RS': [9, 0, 1]},
        'group_by': 'user',
        'order': 'TO',
        'mode': 'full'}
}
config = Config(model='MultiVAE', dataset='recbox_data', config_dict=parameter_dict)

# init random seed
init_seed(config['seed'], config['reproducibility'])

# logger initialization
init_logger(config)
logger = getLogger()
# Create handlers
c_handler = logging.StreamHandler()
c_handler.setLevel(logging.INFO)
logger.addHandler(c_handler)

# write config info into log
# logger.info(config)

In [13]:
dataset = create_dataset(config)
logger.info(dataset)

16 Mar 15:46    INFO  recbox_data
The number of users: 13355
Average actions of users: 63.815710648494836
The number of items: 3294
Average actions of items: 258.78985727300335
The number of inters: 852195
The sparsity of the dataset: 98.06281322904924%
Remain Fields: ['user_id', 'item_id', 'timestamp']
recbox_data
The number of users: 13355
Average actions of users: 63.815710648494836
The number of items: 3294
Average actions of items: 258.78985727300335
The number of inters: 852195
The sparsity of the dataset: 98.06281322904924%
Remain Fields: ['user_id', 'item_id', 'timestamp']


In [14]:
train_data, valid_data, test_data = data_preparation(config, dataset)

16 Mar 15:46    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
[Training]: train_batch_size = [2048] train_neg_sample_args: [{'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
16 Mar 15:46    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [9, 0, 1]}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}]
[Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [9, 0, 1]}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}]


In [15]:
%%time
model_list = ['MultiVAE', 'ENMF', 'RecVAE'] # 'MultiVAE', 'CDAE', 'ENMF', 'RecVAE', 'NNCF', 'RaCT'] 


for model_name in model_list:
    print(f"running {model_name}...")
    start = time.time()
    result = run_recbole(model=model_name, dataset = 'recbox_data',config_dict = parameter_dict)
    t = time.time() - start
    print(f"It took {t/60:.2f} mins")
    print(result)

running MultiVAE...


16 Mar 15:46    INFO  ['/Users/elizaveta/opt/anaconda3/lib/python3.8/site-packages/ipykernel_launcher.py', '-f', '/Users/elizaveta/Library/Jupyter/runtime/kernel-2038ac91-400b-4a4c-8966-f3f6c6acb06c.json']
['/Users/elizaveta/opt/anaconda3/lib/python3.8/site-packages/ipykernel_launcher.py', '-f', '/Users/elizaveta/Library/Jupyter/runtime/kernel-2038ac91-400b-4a4c-8966-f3f6c6acb06c.json']
16 Mar 15:46    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 10
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_

It took 1.87 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.084), ('mrr@10', 0.1695), ('ndcg@10', 0.0825), ('hit@10', 0.3503), ('precision@10', 0.0467)])}
running ENMF...


16 Mar 15:48    INFO  ['/Users/elizaveta/opt/anaconda3/lib/python3.8/site-packages/ipykernel_launcher.py', '-f', '/Users/elizaveta/Library/Jupyter/runtime/kernel-2038ac91-400b-4a4c-8966-f3f6c6acb06c.json']
['/Users/elizaveta/opt/anaconda3/lib/python3.8/site-packages/ipykernel_launcher.py', '-f', '/Users/elizaveta/Library/Jupyter/runtime/kernel-2038ac91-400b-4a4c-8966-f3f6c6acb06c.json']
16 Mar 15:48    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 10
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_

It took 1.05 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0054), ('mrr@10', 0.0098), ('ndcg@10', 0.0047), ('hit@10', 0.0341), ('precision@10', 0.0035)])}
running RecVAE...


16 Mar 15:49    INFO  ['/Users/elizaveta/opt/anaconda3/lib/python3.8/site-packages/ipykernel_launcher.py', '-f', '/Users/elizaveta/Library/Jupyter/runtime/kernel-2038ac91-400b-4a4c-8966-f3f6c6acb06c.json']
['/Users/elizaveta/opt/anaconda3/lib/python3.8/site-packages/ipykernel_launcher.py', '-f', '/Users/elizaveta/Library/Jupyter/runtime/kernel-2038ac91-400b-4a4c-8966-f3f6c6acb06c.json']
16 Mar 15:49    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 10
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_

It took 7.74 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0849), ('mrr@10', 0.1697), ('ndcg@10', 0.0828), ('hit@10', 0.3532), ('precision@10', 0.047)])}
CPU times: user 12min 8s, sys: 46.2 s, total: 12min 54s
Wall time: 10min 40s


In [None]:
result = run_recbole(model='MultiVAE', dataset = 'recbox_data',config_dict = parameter_dict )

In [16]:
%%time
model_list_2 = ['NNCF', 'RaCT', 'CDAE'] 

for model_name in model_list_2:
    print(f"running {model_name}...")
    start = time.time()
    result = run_recbole(model=model_name, dataset = 'recbox_data',config_dict = parameter_dict)
    t = time.time() - start
    print(f"It took {t/60:.2f} mins")
    print(result)

running NNCF...


16 Mar 16:01    INFO  ['/Users/elizaveta/opt/anaconda3/lib/python3.8/site-packages/ipykernel_launcher.py', '-f', '/Users/elizaveta/Library/Jupyter/runtime/kernel-2038ac91-400b-4a4c-8966-f3f6c6acb06c.json']
['/Users/elizaveta/opt/anaconda3/lib/python3.8/site-packages/ipykernel_launcher.py', '-f', '/Users/elizaveta/Library/Jupyter/runtime/kernel-2038ac91-400b-4a4c-8966-f3f6c6acb06c.json']
16 Mar 16:01    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 10
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_

It took 93.58 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0845), ('mrr@10', 0.1687), ('ndcg@10', 0.0819), ('hit@10', 0.3568), ('precision@10', 0.0471)])}
running RaCT...


16 Mar 17:35    INFO  ['/Users/elizaveta/opt/anaconda3/lib/python3.8/site-packages/ipykernel_launcher.py', '-f', '/Users/elizaveta/Library/Jupyter/runtime/kernel-2038ac91-400b-4a4c-8966-f3f6c6acb06c.json']
['/Users/elizaveta/opt/anaconda3/lib/python3.8/site-packages/ipykernel_launcher.py', '-f', '/Users/elizaveta/Library/Jupyter/runtime/kernel-2038ac91-400b-4a4c-8966-f3f6c6acb06c.json']
16 Mar 17:35    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 10
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_

It took 20.32 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.084), ('mrr@10', 0.1502), ('ndcg@10', 0.0759), ('hit@10', 0.3603), ('precision@10', 0.0456)])}
running CDAE...


16 Mar 17:55    INFO  ['/Users/elizaveta/opt/anaconda3/lib/python3.8/site-packages/ipykernel_launcher.py', '-f', '/Users/elizaveta/Library/Jupyter/runtime/kernel-2038ac91-400b-4a4c-8966-f3f6c6acb06c.json']
['/Users/elizaveta/opt/anaconda3/lib/python3.8/site-packages/ipykernel_launcher.py', '-f', '/Users/elizaveta/Library/Jupyter/runtime/kernel-2038ac91-400b-4a4c-8966-f3f6c6acb06c.json']
16 Mar 17:55    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 10
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_

It took 0.78 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0634), ('mrr@10', 0.1439), ('ndcg@10', 0.0659), ('hit@10', 0.2723), ('precision@10', 0.0344)])}
CPU times: user 2h 6min 18s, sys: 1min 50s, total: 2h 8min 9s
Wall time: 1h 54min 41s


# Prepare offline predictions

In [4]:
import torch

from recbole.quick_start.quick_start import load_data_and_model
from recbole.utils.case_study import full_sort_topk

In [10]:
def recommend_to_user(user_id, dataset, model):
    if user_id in dataset.field2token_id[dataset.uid_field]:
        model.eval()
        with torch.no_grad():
            uid_series = dataset.token2id(dataset.uid_field, [user_id])
            index = np.isin(dataset[dataset.uid_field].numpy(), uid_series)
            new_inter = dataset[index]
            new_inter = new_inter.to(config["device"])
            new_scores = model.full_sort_predict(new_inter)
            new_scores = new_scores.view(-1, test_data.dataset.item_num)
            new_scores[:, 0] = -np.inf
            recommended_item_indices = torch.topk(new_scores, 10).indices[0].tolist()
            recos = dataset.id2token(dataset.iid_field, [recommended_item_indices]).tolist()[0]
        return [int(rec) for rec in recos]
    return []

In [5]:
config, model, dataset, train_data, valid_data, test_data = load_data_and_model(
    model_file='saved/RecVAE-Mar-16-2024_15-49-47.pth',
)

16 Mar 19:27    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 10
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [9, 0, 1]}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
topk = [10]
valid_metric = MRR@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separa

In [13]:
%%time
recos = {}
users = dataset.field2token_id[dataset.uid_field]
for user_id in users:
    if user_id != '[PAD]':
        recos[int(user_id)] = recommend_to_user(user_id, dataset, model)
    else:
        pass

CPU times: user 1h 10min 20s, sys: 10.3 s, total: 1h 10min 30s
Wall time: 1h 11min 12s


In [14]:
recos

{176549: [13865, 9728, 12173, 7626, 14317, 11310, 1785, 10440, 13018, 5693],
 699317: [7571, 3182, 16166, 13545, 13243, 7582, 11985, 13915, 16270, 4718],
 1016458: [9728, 10440, 13865, 7102, 7829, 4457, 3734, 14431, 1418, 10464],
 896751: [14703, 1844, 13865, 9728, 10440, 12995, 14741, 657, 142, 9169],
 141674: [7571, 3182, 14317, 13018, 9728, 13243, 13865, 7582, 13915, 4696],
 205710: [7626, 7571, 13018, 12173, 14317, 11310, 9728, 16166, 13865, 10942],
 163922: [7571, 3182, 3734, 16166, 13018, 13915, 16270, 9728, 15297, 7582],
 787115: [9728, 1844, 10440, 14703, 142, 13865, 657, 15297, 9169, 14741],
 73446: [9728, 3734, 10440, 15297, 13865, 4151, 7571, 4880, 2657, 13018],
 750995: [10440, 9728, 13865, 142, 14703, 12995, 1844, 4880, 8636, 9996],
 1072552: [9728, 1785, 2237, 5693, 1287, 4457, 11754, 13865, 15464, 10440],
 94135: [12173, 1287, 1785, 9728, 5693, 3095, 11754, 2237, 7626, 7210],
 120361: [15297, 4151, 142, 11778, 10440, 4880, 3734, 1844, 9728, 3935],
 858651: [13243, 13545,

In [16]:
import dill

with open(f"{RESULTS_PATH}/RecVAE.dill", "wb") as f:
    dill.dump(recos, f)