In [1]:
!pip install recbole ray > None

In [2]:
import numpy as np
import pandas as pd
import dill

import warnings
warnings.filterwarnings('ignore')

import logging
from logging import getLogger
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.sequential_recommender import GRU4Rec, Caser
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger
from recbole.quick_start import run_recbole

import time

In [3]:
# NOTE(review): repeats the warnings setup already done in the import cell above;
# this cell looks redundant and can probably be removed.
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Mount Google Drive at /content/gdrive (Colab-only) so files stored there are reachable.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
%cd gdrive/MyDrive/recsys/

/content/gdrive/MyDrive/recsys


In [6]:
# Notebook-wide constants. SEED equals the 'seed' value and K_RECOS the 'topk'
# value used by the RecBole configuration cells further down.
SEED = 2022
K_RECOS = 10

## Get data

In [7]:
def load_data(data_dir='kion_train'):
  """Load the interactions, users and items tables from CSV files.

  Args:
    data_dir: Folder containing `interactions.csv`, `users.csv` and
      `items.csv`. Defaults to `'kion_train'`, the location that was
      previously hard-coded, so existing `load_data()` calls are unaffected.

  Returns:
    Tuple `(interactions, users, items)` of DataFrames. In `interactions`
    the column `last_watch_dt` is renamed to `datetime` and parsed with the
    `%Y-%m-%d` format, `total_dur` is renamed to `weight`, and a `timestamp`
    column holds whole seconds since 1970-01-01.
  """
  # `rename` returns a new frame, so no in-place mutation is needed.
  interactions = pd.read_csv(f'{data_dir}/interactions.csv').rename(
    columns={
        'last_watch_dt': 'datetime',
        'total_dur': 'weight',
    },
  )
  users = pd.read_csv(f'{data_dir}/users.csv')
  items = pd.read_csv(f'{data_dir}/items.csv')

  interactions['datetime'] = pd.to_datetime(interactions['datetime'], format="%Y-%m-%d")
  # Compute epoch seconds arithmetically instead of casting the raw int64
  # representation and dividing by 10**9: the cast is only correct when the
  # column has nanosecond resolution, whereas this form works for any resolution.
  # Missing dates yield NaN here rather than a meaningless huge negative integer.
  interactions['timestamp'] = (
    (interactions['datetime'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
  )

  return interactions, users, items

In [8]:
!mkdir recbox_data

mkdir: cannot create directory ‘recbox_data’: File exists


In [9]:
# Read the three source tables into memory using `load_data` defined above.
interactions, users, items = load_data()

In [10]:
# Map each source column to the `name:type` header used in the RecBole
# interaction file (`token` for the ids, `float` for the timestamp).
recbole_columns = {
    'user_id': 'user_id:token',
    'item_id': 'item_id:token',
    'timestamp': 'timestamp:float',
}

# Keep only the mapped columns, rename them, and write a tab-separated file.
df = interactions[list(recbole_columns)].rename(columns=recbole_columns)
df.to_csv('recbox_data/recbox_data.inter', sep='\t', index=False)

In [None]:
# RecBole settings used for the model search.
parameter_dict = {
    'data_path': '',
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'item_id',
    'TIME_FIELD': 'timestamp',
    # NOTE(review): RecBole's documented GPU switches are `use_gpu` / `gpu_id`;
    # confirm that this key has any effect.
    'device': 'GPU',
    # Keep only users and items with at least 40 interactions.
    'user_inter_num_interval': "[40,inf)",
    'item_inter_num_interval': "[40,inf)",
    'load_col': {'inter': ['user_id', 'item_id', 'timestamp']},
    'neg_sampling': None,
    'epochs': 30,
    'eval_args': {
        'split': {'RS': [8, 1, 1]},
        'group_by': 'user',
        'order': 'TO',
        'mode': 'full',
    },
    'metrics': ['MAP', 'NDCG', 'Recall'],
    # Derive the cut-off and seed from the notebook constants instead of
    # repeating the literals 10 and 2022 (the resulting values are unchanged).
    'valid_metric': f'MAP@{K_RECOS}',
    'topk': K_RECOS,
    'seed': SEED,
    'eval_step': 12,
}
config = Config(model='MultiVAE', dataset='recbox_data', config_dict=parameter_dict)

init_seed(config['seed'], config['reproducibility'])

init_logger(config)
logger = getLogger()
# Attach the console handler only once. Without the guard, every re-run of this
# cell stacks another handler on the root logger and each record is printed
# one more time.
CONSOLE_HANDLER_NAME = 'notebook_console'
if not any(h.get_name() == CONSOLE_HANDLER_NAME for h in logger.handlers):
    c_handler = logging.StreamHandler()
    c_handler.setLevel(logging.INFO)
    c_handler.set_name(CONSOLE_HANDLER_NAME)
    logger.addHandler(c_handler)

logger.info(config)

In [None]:
# Build the RecBole dataset described by `config` and log its summary.
dataset = create_dataset(config)
logger.info(dataset)

In [None]:
# Produce the train / validation / test data for `dataset`.
# NOTE(review): `run_recbole` in the search cell is called with the dataset name and
# `config_dict` only, so these three objects are not passed to it.
train_data, valid_data, test_data = data_preparation(config, dataset)

## Trying to find the best model

In [None]:
%%time
model_list = ['MultiVAE', 'CDAE', 'ENMF', 'RecVAE', 'NNCF', 'RaCT'] 

for model_name in model_list:
    print(f"running {model_name}...")
    start = time.time()
    result = run_recbole(model=model_name, dataset='recbox_data', config_dict=parameter_dict)
    t = time.time() - start
    print(f"It took {t / 60:.2f} mins")
    print(result)

running MultiVAE...


Train     0: 100%|███████████████████████████| 7/7 [00:02<00:00,  2.58it/s, GPU RAM: 0.34 G/14.76 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00, 15.41it/s, GPU RAM: 0.34 G/14.76 G]
Train     2: 100%|███████████████████████████| 7/7 [00:00<00:00, 16.82it/s, GPU RAM: 0.34 G/14.76 G]
Train     3: 100%|███████████████████████████| 7/7 [00:00<00:00, 16.95it/s, GPU RAM: 0.34 G/14.76 G]
Train     4: 100%|███████████████████████████| 7/7 [00:00<00:00, 16.24it/s, GPU RAM: 0.34 G/14.76 G]
Train     5: 100%|███████████████████████████| 7/7 [00:00<00:00, 17.10it/s, GPU RAM: 0.34 G/14.76 G]
Train     6: 100%|███████████████████████████| 7/7 [00:00<00:00, 15.21it/s, GPU RAM: 0.34 G/14.76 G]
Train     7: 100%|███████████████████████████| 7/7 [00:00<00:00, 15.73it/s, GPU RAM: 0.34 G/14.76 G]
Train     8: 100%|███████████████████████████| 7/7 [00:00<00:00, 16.49it/s, GPU RAM: 0.34 G/14.76 G]
Train     9: 100%|███████████████████████████| 7/7 [00:00<00:00, 16.86it/s, GPU RAM: 0.34 G

It took 14.09 mins
{'best_valid_score': 0.0364, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('map@10', 0.0364), ('ndcg@10', 0.0776), ('recall@10', 0.0878)]), 'test_result': OrderedDict([('map@10', 0.0337), ('ndcg@10', 0.0703), ('recall@10', 0.0765)])}
running CDAE...


Train     0: 100%|███████████████████████████| 7/7 [00:00<00:00, 20.34it/s, GPU RAM: 0.37 G/14.76 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00, 21.30it/s, GPU RAM: 0.37 G/14.76 G]
Train     2: 100%|███████████████████████████| 7/7 [00:00<00:00, 21.33it/s, GPU RAM: 0.37 G/14.76 G]
Train     3: 100%|███████████████████████████| 7/7 [00:00<00:00, 21.83it/s, GPU RAM: 0.37 G/14.76 G]
Train     4: 100%|███████████████████████████| 7/7 [00:00<00:00, 20.22it/s, GPU RAM: 0.37 G/14.76 G]
Train     5: 100%|███████████████████████████| 7/7 [00:00<00:00, 22.54it/s, GPU RAM: 0.37 G/14.76 G]
Train     6: 100%|███████████████████████████| 7/7 [00:00<00:00, 22.76it/s, GPU RAM: 0.37 G/14.76 G]
Train     7: 100%|███████████████████████████| 7/7 [00:00<00:00, 22.03it/s, GPU RAM: 0.37 G/14.76 G]
Train     8: 100%|███████████████████████████| 7/7 [00:00<00:00, 22.18it/s, GPU RAM: 0.37 G/14.76 G]
Train     9: 100%|███████████████████████████| 7/7 [00:00<00:00, 21.76it/s, GPU RAM: 0.37 G

It took 18.60 mins
{'best_valid_score': 0.0329, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('map@10', 0.0329), ('ndcg@10', 0.0645), ('recall@10', 0.0633)]), 'test_result': OrderedDict([('map@10', 0.0335), ('ndcg@10', 0.0653), ('recall@10', 0.0625)])}
running ENMF...


Train     0: 100%|███████████████████████████| 7/7 [00:00<00:00,  9.02it/s, GPU RAM: 1.57 G/14.76 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00, 10.34it/s, GPU RAM: 1.57 G/14.76 G]
Train     2: 100%|███████████████████████████| 7/7 [00:00<00:00, 10.78it/s, GPU RAM: 1.57 G/14.76 G]
Train     3: 100%|███████████████████████████| 7/7 [00:00<00:00, 11.15it/s, GPU RAM: 1.57 G/14.76 G]
Train     4: 100%|███████████████████████████| 7/7 [00:00<00:00, 11.42it/s, GPU RAM: 1.57 G/14.76 G]
Train     5: 100%|███████████████████████████| 7/7 [00:00<00:00, 11.28it/s, GPU RAM: 1.57 G/14.76 G]
Train     6: 100%|███████████████████████████| 7/7 [00:00<00:00, 11.48it/s, GPU RAM: 1.57 G/14.76 G]
Train     7: 100%|███████████████████████████| 7/7 [00:00<00:00, 11.44it/s, GPU RAM: 1.57 G/14.76 G]
Train     8: 100%|███████████████████████████| 7/7 [00:00<00:00, 11.34it/s, GPU RAM: 1.57 G/14.76 G]
Train     9: 100%|███████████████████████████| 7/7 [00:00<00:00, 11.03it/s, GPU RAM: 1.57 G

It took 21.36 mins
{'best_valid_score': 0.0177, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('map@10', 0.0177), ('ndcg@10', 0.0414), ('recall@10', 0.0488)]), 'test_result': OrderedDict([('map@10', 0.0174), ('ndcg@10', 0.0408), ('recall@10', 0.0476)])}
running RecVAE...


Train     0: 100%|███████████████████████████| 7/7 [00:00<00:00,  7.40it/s, GPU RAM: 1.59 G/14.76 G]
Train     0: 100%|███████████████████████████| 7/7 [00:00<00:00,  7.62it/s, GPU RAM: 1.59 G/14.76 G]
Train     0: 100%|███████████████████████████| 7/7 [00:00<00:00,  7.78it/s, GPU RAM: 1.59 G/14.76 G]
Train     0: 100%|███████████████████████████| 7/7 [00:00<00:00,  7.77it/s, GPU RAM: 1.59 G/14.76 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00,  7.66it/s, GPU RAM: 1.59 G/14.76 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00,  7.79it/s, GPU RAM: 1.59 G/14.76 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00,  7.80it/s, GPU RAM: 1.59 G/14.76 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00,  8.01it/s, GPU RAM: 1.59 G/14.76 G]
Train     2: 100%|███████████████████████████| 7/7 [00:00<00:00,  7.73it/s, GPU RAM: 1.59 G/14.76 G]
Train     2: 100%|███████████████████████████| 7/7 [00:00<00:00,  7.70it/s, GPU RAM: 1.59 G

It took 26.40 mins
{'best_valid_score': 0.0391, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('map@10', 0.0391), ('ndcg@10', 0.0829), ('recall@10', 0.0923)]), 'test_result': OrderedDict([('map@10', 0.0359), ('ndcg@10', 0.0749), ('recall@10', 0.0806)])}
running NNCF...


Train     0: 100%|███████████████████████| 677/677 [01:09<00:00,  9.70it/s, GPU RAM: 1.59 G/14.76 G]
Train     1: 100%|███████████████████████| 677/677 [01:04<00:00, 10.48it/s, GPU RAM: 1.59 G/14.76 G]
Train     2: 100%|███████████████████████| 677/677 [01:12<00:00,  9.28it/s, GPU RAM: 1.59 G/14.76 G]
Train     3: 100%|███████████████████████| 677/677 [01:08<00:00,  9.93it/s, GPU RAM: 1.59 G/14.76 G]
Train     4: 100%|███████████████████████| 677/677 [01:04<00:00, 10.52it/s, GPU RAM: 1.59 G/14.76 G]
Train     5: 100%|███████████████████████| 677/677 [01:06<00:00, 10.19it/s, GPU RAM: 1.59 G/14.76 G]
Train     6: 100%|███████████████████████| 677/677 [01:12<00:00,  9.28it/s, GPU RAM: 1.59 G/14.76 G]
Train     7: 100%|███████████████████████| 677/677 [01:15<00:00,  8.95it/s, GPU RAM: 1.59 G/14.76 G]
Train     8: 100%|███████████████████████| 677/677 [01:13<00:00,  9.16it/s, GPU RAM: 1.59 G/14.76 G]
Train     9: 100%|███████████████████████| 677/677 [01:14<00:00,  9.04it/s, GPU RAM: 1.59 G

It took 64.75 mins
{'best_valid_score': 0.037, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('map@10', 0.037), ('ndcg@10', 0.0767), ('recall@10', 0.0814)]), 'test_result': OrderedDict([('map@10', 0.0363), ('ndcg@10', 0.0741), ('recall@10', 0.0761)])}
running RaCT...


Train     0: 100%|███████████████████████████| 7/7 [00:00<00:00, 12.60it/s, GPU RAM: 1.59 G/14.76 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00, 13.16it/s, GPU RAM: 1.59 G/14.76 G]
Train     2: 100%|███████████████████████████| 7/7 [00:00<00:00, 12.89it/s, GPU RAM: 1.59 G/14.76 G]
Train     3: 100%|███████████████████████████| 7/7 [00:00<00:00, 12.63it/s, GPU RAM: 1.59 G/14.76 G]
Train     4: 100%|███████████████████████████| 7/7 [00:00<00:00, 12.68it/s, GPU RAM: 1.59 G/14.76 G]
Train     5: 100%|███████████████████████████| 7/7 [00:00<00:00, 12.89it/s, GPU RAM: 1.59 G/14.76 G]
Train     6: 100%|███████████████████████████| 7/7 [00:00<00:00, 12.60it/s, GPU RAM: 1.59 G/14.76 G]
Train     7: 100%|███████████████████████████| 7/7 [00:00<00:00, 12.67it/s, GPU RAM: 1.59 G/14.76 G]
Train     8: 100%|███████████████████████████| 7/7 [00:00<00:00, 12.99it/s, GPU RAM: 1.59 G/14.76 G]
Train     9: 100%|███████████████████████████| 7/7 [00:00<00:00, 12.75it/s, GPU RAM: 1.59 G

It took 11.43 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('map@10', 0.029), ('ndcg@10', 0.0623), ('recall@10', 0.0687)])}
running DGCF...


Train     0: 100%|███████████████████████| 339/339 [06:30<00:00,  1.15s/it, GPU RAM: 4.40 G/14.76 G]
Train     1: 100%|███████████████████████| 339/339 [06:30<00:00,  1.15s/it, GPU RAM: 4.40 G/14.76 G]
Train     2: 100%|███████████████████████| 339/339 [06:30<00:00,  1.15s/it, GPU RAM: 4.40 G/14.76 G]
Train     3: 100%|███████████████████████| 339/339 [06:29<00:00,  1.15s/it, GPU RAM: 4.40 G/14.76 G]
Train     4: 100%|███████████████████████| 339/339 [06:30<00:00,  1.15s/it, GPU RAM: 4.40 G/14.76 G]
Train     5: 100%|███████████████████████| 339/339 [06:29<00:00,  1.15s/it, GPU RAM: 4.40 G/14.76 G]
Train     6: 100%|███████████████████████| 339/339 [06:31<00:00,  1.15s/it, GPU RAM: 4.40 G/14.76 G]
Train     7: 100%|███████████████████████| 339/339 [06:29<00:00,  1.15s/it, GPU RAM: 4.40 G/14.76 G]


The best model by validation MAP@10 is **RecVAE** (0.0391).

## Refit the best model

In [14]:
# RecBole settings for refitting RecVAE. Compared with the search settings, it
# trains for 60 epochs and evaluates every 20 epochs.
parameter_dict = {
    'data_path': '',
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'item_id',
    'TIME_FIELD': 'timestamp',
    # NOTE(review): RecBole's documented GPU switches are `use_gpu` / `gpu_id`;
    # confirm that this key has any effect.
    'device': 'GPU',
    # Keep only users and items with at least 40 interactions.
    'user_inter_num_interval': "[40,inf)",
    'item_inter_num_interval': "[40,inf)",
    'load_col': {'inter': ['user_id', 'item_id', 'timestamp']},
    'neg_sampling': None,
    'epochs': 60,
    'eval_args': {
        'split': {'RS': [8, 1, 1]},
        'group_by': 'user',
        'order': 'TO',
        'mode': 'full',
    },
    'metrics': ['MAP', 'NDCG', 'Recall'],
    # Derive the cut-off and seed from the notebook constants instead of
    # repeating the literals 10 and 2022 (the resulting values are unchanged).
    'valid_metric': f'MAP@{K_RECOS}',
    'topk': K_RECOS,
    'seed': SEED,
    'eval_step': 20,
}
config = Config(model='RecVAE', dataset='recbox_data', config_dict=parameter_dict)

init_seed(config['seed'], config['reproducibility'])

init_logger(config)
logger = getLogger()
# Attach the console handler only once. Without the guard, every re-run of this
# cell stacks another handler on the root logger and each record is printed
# one more time.
CONSOLE_HANDLER_NAME = 'notebook_console'
if not any(h.get_name() == CONSOLE_HANDLER_NAME for h in logger.handlers):
    c_handler = logging.StreamHandler()
    c_handler.setLevel(logging.INFO)
    c_handler.set_name(CONSOLE_HANDLER_NAME)
    logger.addHandler(c_handler)

logger.info(config)

In [15]:
# Build the RecBole dataset described by the refit `config` and log its summary.
dataset = create_dataset(config)
logger.info(dataset)

In [16]:
# Produce the train / validation / test data for `dataset`.
# NOTE(review): the `run_recbole` call that follows takes the dataset name and
# `config_dict` only, so these three objects are not passed to it.
train_data, valid_data, test_data = data_preparation(config, dataset)

In [17]:
# Train RecVAE with the refit configuration and keep the returned metrics dict.
result = run_recbole(
    model='RecVAE',
    dataset='recbox_data',
    config_dict=parameter_dict,
)

Train     0: 100%|████████████████████████████████████████████████████| 7/7 [00:10<00:00,  1.48s/it]
Train     0: 100%|████████████████████████████████████████████████████| 7/7 [00:10<00:00,  1.46s/it]
Train     0: 100%|████████████████████████████████████████████████████| 7/7 [00:10<00:00,  1.49s/it]
Train     0: 100%|████████████████████████████████████████████████████| 7/7 [00:09<00:00,  1.40s/it]
Train     1: 100%|████████████████████████████████████████████████████| 7/7 [00:12<00:00,  1.75s/it]
Train     1: 100%|████████████████████████████████████████████████████| 7/7 [00:10<00:00,  1.43s/it]
Train     1: 100%|████████████████████████████████████████████████████| 7/7 [00:09<00:00,  1.43s/it]
Train     1: 100%|████████████████████████████████████████████████████| 7/7 [00:09<00:00,  1.39s/it]
Train     2: 100%|████████████████████████████████████████████████████| 7/7 [00:10<00:00,  1.45s/it]
Train     2: 100%|████████████████████████████████████████████████████| 7/7 [00:10<00:00,  

In [18]:
# Display the validation and test metrics returned by the refit run.
result

{'best_valid_score': 0.0395,
 'valid_score_bigger': True,
 'best_valid_result': OrderedDict([('map@10', 0.0395),
              ('ndcg@10', 0.0833),
              ('recall@10', 0.0921)]),
 'test_result': OrderedDict([('map@10', 0.0353),
              ('ndcg@10', 0.0738),
              ('recall@10', 0.0792)])}