In [1]:
import os, sys
dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)
if not dir1 in sys.path:
    sys.path.append(dir1)
os.chdir('../..')

In [2]:
from pathlib import Path

from hydra import compose, initialize
from omegaconf import DictConfig, OmegaConf
import omegaconf
import pandas as pd
import torch 
import numpy as np 
np.float = float

from src.models.recbole import RecboleBench
from src.preprocessing import ClassicDataset
from src.utils.logging import get_logger
from src.utils.processing import data_split, train_test_split, pandas_to_recbole
from src.utils.metrics import run_all_metrics, coverage

logger = get_logger(name=__name__)

%load_ext autoreload
%autoreload 2

2023-12-26 14:19:06.864649: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-26 14:19:06.911475: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Import sasrec Reqired Packages
import pandas as pd 
import pickle
from sasrec.util import filter_k_core, SASRecDataSet, load_model
from sasrec.model import SASREC
from sasrec.sampler import WarpSampler
import multiprocessing

In [4]:
with initialize(config_path='../../config', version_base=None):
    cfg = compose(config_name='config')

In [5]:
cfg_data = cfg['dataset']
cfg_model = cfg['library']

In [6]:
def load_or_split_data(cfg_data, dataset_folder, dataset):
        if (
            dataset_folder.joinpath("train.parquet").exists() and
            dataset_folder.joinpath("test.parquet").exists()
        ):
            data_train = pd.read_parquet(dataset_folder.joinpath("train.parquet"))
            data_test = pd.read_parquet(dataset_folder.joinpath("test.parquet"))
        else:
            data_train, data_test = train_test_split(
                dataset.prepared_data,
                test_size=cfg_data["splitting"]["test_size"],
                splitting_type=cfg_data["splitting"]["strategy"],
            )
            data_train.to_parquet(dataset_folder.joinpath("train.parquet"))
            data_test.to_parquet(dataset_folder.joinpath("test.parquet"))
        return data_train, data_test

In [7]:
dataset_folder = Path(os.path.join("preproc_data", cfg_data["name"]))
dataset = ClassicDataset()
dataset.prepare(cfg_data)
data_train, data_test = load_or_split_data(cfg_data, dataset_folder, dataset)

In [8]:
df = data_train.rename({'userId':'userID','movieId':'itemID','timestamp':'time'},axis=1)\
       .sort_values(by=['userID','time'])\
       .drop(['rating','time', 'implicit_rating'],axis=1)\
       .reset_index(drop=True)

df.to_csv('sasrec_data.txt', sep="\t", header=False, index=False)

In [9]:
# load data

data = SASRecDataSet('sasrec_data.txt')
data.split() # train, val, test split
              # the last interactions of each user is used for test
              # the last but one will be used for validation
              # others will be used for train

In [10]:
# make model and warmsampler for batch training

max_len = 100
hidden_units = 128
batch_size = 2048

model = SASREC(
    item_num=data.itemnum,
    seq_max_len=max_len,
    num_blocks=2,
    embedding_dim=hidden_units,
    attention_dim=hidden_units,
    attention_num_heads=2,
    dropout_rate=0.2,
    conv_dims = [hidden_units, hidden_units],
    l2_reg=0.00001
)

sampler = WarpSampler(data.user_train, data.usernum, data.itemnum, batch_size=batch_size, maxlen=max_len, n_workers=multiprocessing.cpu_count())


2023-12-26 14:19:11.299770: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 40285 MB memory:  -> device: 0, name: Quadro RTX 8000, pci bus id: 0000:86:00.0, compute capability: 7.5
2023-12-26 14:19:11.300523: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 46123 MB memory:  -> device: 1, name: Quadro RTX 8000, pci bus id: 0000:af:00.0, compute capability: 7.5
2023-12-26 14:19:11.301116: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 10399 MB memory:  -> device: 2, name: NVIDIA GeForce RTX 3060, pci bus id: 0000:3b:00.0, compute capability: 8.6
2023-12-26 14:19:11.301633: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:3 with 10399 MB memory:  -> device: 3, name: NVIDIA GeForce RTX 3060, pci bus id: 00

In [11]:
path = ''

In [12]:
# train model

model.train(
          data,
          sampler,
          num_epochs=3, 
          batch_size=batch_size, 
          lr=0.001, 
          val_epoch=1,
          val_target_user_n=1000, 
          target_item_n=-1,
          auto_save=True,
          path = path,
          exp_name='exp_example',
        )

epoch 1 / 3 -----------------------------


  0%|                                            | 0/2 [00:00<?, ?b/s]

In [8]:
pandas_to_recbole(dataset=data_train, dataset_name=cfg_data["name"], split_name="train")
pandas_to_recbole(dataset=data_test, dataset_name=cfg_data["name"], split_name="test")

In [9]:
parameter_dict = OmegaConf.to_container(cfg_model["recbole_params"], resolve=True)
parameter_dict.update(dict(
    data_path = os.path.join("data", "tmp"),
    dataset = cfg_data["name"],
))
        
config = Config(
    model=cfg_model["name"],
    dataset=cfg_data["name"],
    config_file_list=None,
    config_dict=parameter_dict,
)

init_seed(config["seed"], config["reproducibility"])

dataset = create_dataset(config)
train_set, _, test_set = data_preparation(config, dataset)

model_folder = dataset_folder.joinpath(recbole_name, cfg_model["name"])

In [10]:
model = RecboleBench.initialize_with_params(
                    train_loader=train_set
                )

In [11]:
model.fit(train_set)

[1;35mTrain     0[0m: 100%|██████████████████████| 305/305 [00:01<00:00, 244.75it/s, [1;33mGPU RAM: 0.00 G/79.15 G[0m][0m


In [25]:
top_100_items = model.recommend_k(test_set, k=100)

  uid_series = torch.tensor(uid_series)
Generating top_100 recommendations: 8 batch [00:09,  1.22s/ batch]


In [15]:
from src.utils.processing import save_results, train_test_split, pandas_to_sparse, pandas_to_recbole
from src.utils.metrics import run_all_metrics, coverage, novelty, diversity

In [16]:
shape = (
            data_train["userId"].max() + 1,
            data_train["itemId"].max() + 1,
        )
train_interactions, _ = pandas_to_sparse(
            data_train, weighted=True, shape=shape, sparse_type="coo"
        )


In [None]:
train_interactions.shape

(70606, 58449)

In [None]:
data_test.itemId.nunique()

31465

In [29]:
top_100_items


array([[47644,   111, 53345, ..., 54509,   454,  3183],
       [ 1448, 48708,  6451, ..., 10549, 40776,  3122],
       [   30,  6700,  5626, ..., 16750, 54564,  8311],
       ...,
       [56039, 52737, 28349, ...,  5382,  8307, 29204],
       [14550, 30078, 46733, ..., 55327, 57019, 12864],
       [36411, 13491, 37755, ..., 54331,  3012, 34035]])

In [42]:
len(test_items_set)

68084

In [None]:
test_items_set = set()
[test_items_set.update(set(x)) for x in test_set.uid2positive_item if x is not None]

In [53]:
data_test.itemId.unique().max()

58448

In [60]:
np.max(np.concatenate(recbole_test_lists))

58449

In [72]:
t = [58449 in x for x in recbole_test_lists if x is not None]
[i for i, x in enumerate(t) if x]

[23228]

In [89]:
data_train

Unnamed: 0,userId,itemId,timestamp,rating,implicit_rating
0,0,0,1.514765e+09,0.80,1
1409,605,636,1.514765e+09,0.31,1
1410,606,637,1.514765e+09,1.00,1
1411,607,638,1.514765e+09,1.00,1
1414,608,639,1.514765e+09,0.54,1
...,...,...,...,...,...
1370386,318,23153,1.572307e+09,0.40,1
1370379,66872,15687,1.572307e+09,1.00,1
1370362,52011,16928,1.572307e+09,0.98,1
1370380,7941,15687,1.572307e+09,1.00,1


In [95]:
data_test.userId.min()

1

In [101]:
data_train[data_train.userId == 1]

Unnamed: 0,userId,itemId,timestamp,rating,implicit_rating
3,1,1,1514765000.0,1.0,1
178169,1,40322,1522282000.0,0.75,1
234557,1,7001,1524701000.0,1.0,1
677289,1,7637,1543450000.0,0.39,1
751411,1,2760,1546474000.0,0.54,1
1212693,1,17947,1565741000.0,0.88,1
1214939,1,49756,1565827000.0,0.88,1


In [123]:
data_test.userId.nunique()

30127

In [130]:
data_test.userId.unique()

array([25658,  5348, 64081, ..., 47080, 66717, 27452])

In [138]:
[i for i, x in enumerate(data_test.groupby('userId').aggregate('itemId').count() == 73) if x]

[246]

In [146]:
user_item_counts = data_test.groupby('userId')['itemId'].nunique()

# Find the userId with 73 itemId
user_with_73_items = user_item_counts[user_item_counts == 73].index.tolist()

print("UserIds with 73 unique itemId:", user_with_73_items)

UserIds with 73 unique itemId: [607]


In [154]:
realid = set(data_test[data_test.userId == 607].itemId.to_list())

In [161]:
np.sort(data_test[data_test.userId == 607].itemId.to_list())

array([  672,   757,  1119,  3249,  3900,  4208,  4564,  5351,  5687,
        8744,  9729, 10025, 12084, 12735, 13037, 13223, 14520, 14822,
       14873, 16794, 17750, 17777, 18058, 18617, 18710, 19353, 19385,
       19420, 21026, 23154, 23193, 23339, 23715, 24372, 25441, 25529,
       25910, 26467, 28393, 29628, 29695, 31135, 31996, 33614, 35642,
       36218, 37220, 37369, 40639, 40803, 42159, 42571, 42603, 43817,
       44605, 46060, 46103, 46593, 46862, 47843, 49055, 49150, 49161,
       50574, 52205, 52494, 55819, 56221, 57033, 57603, 57614, 58030,
       58081])

In [183]:
dataset.id2token(field=dataset.iid_field, ids=top_100_items).astype(int)

array([37369, 31135, 57614,  3249, 16794, 29695, 52205, 57603, 56221,
       43817, 17750, 33614, 40803, 19385, 14822, 24372, 37220, 55819,
       18710,  1119, 12084, 23193, 36218, 29628, 42159, 13223, 52494,
       58030, 40639, 47843, 23154, 21026, 49150, 17777, 46060, 44605,
       10025,  5687,  9729, 23715, 25441, 46862, 42603, 35642, 50574,
       57033, 23339, 49161, 13037, 42571, 18058, 18617,  8744, 25910,
       46593,  4208, 28393, 14873, 58081, 49055, 19420, 12735,   672,
        5351,   757,  3900,  4564, 19353, 26467, 31996, 25529, 14520,
       46103])

In [26]:
recbole_test_lists = np.array(
            [
                tensor.tolist()
                for tensor in test_set.uid2positive_item
                if tensor is not None
            ],
            dtype=object,
        )
metrics = run_all_metrics(top_100_items, recbole_test_lists, [5, 10, 20, 100])
coverage_metrics, novelty_metrics, diversity_metrics = [], [], []
top_100_items = dataset.id2token(dataset.iid_field, top_100_items).astype(int)
for k in (5, 10, 20, 100):
    coverage_metrics.append(
        coverage(top_100_items, train_interactions.shape[1], k)
    )
    novelty_metrics.append(novelty(top_100_items, train_interactions.tocsc(), k))
    diversity_metrics.append(diversity(top_100_items, train_interactions.tocsc(), k))



metrics_df = pd.DataFrame(
    metrics,
    index=[5, 10, 20, 100],
    columns=(
        "Precision@k",
        "Recall@K",
        "MAP@K",
        "nDCG@k",
        "MRR@k",
        "HitRate@k",
    ),
)
metrics_df["Coverage@K"] = coverage_metrics
# metrics_df["Novelty@K"] = novelty_metrics
# metrics_df["Diversity@k"] = diversity_metrics

metrics_df["Time_fit"] = model.learning_time
metrics_df["Time_predict"] = model.predict_time

Diversity calculation:  15%|█▌        | 4545/30127 [00:14<01:10, 365.25it/s]

In [187]:
top_100_items

array([[47644,   111, 53345, ..., 54509,   454,  3183],
       [ 1448, 48708,  6451, ..., 10549, 40776,  3122],
       [   30,  6700,  5626, ..., 16750, 54564,  8311],
       ...,
       [56039, 52737, 28349, ...,  5382,  8307, 29204],
       [14550, 30078, 46733, ..., 55327, 57019, 12864],
       [36411, 13491, 37755, ..., 54331,  3012, 34035]])

In [12]:
from src.utils.processing import pandas_to_aggregate
def calculate_metrics(top_100_items, data_test, implicit):
        metrics = run_all_metrics(top_100_items, data_test, [5, 10, 20, 100])
        coverage_metrics = []
        for k in (5, 10, 20, 100):
            coverage_metrics.append(coverage(top_100_items, data_test.shape[0], k))

        metrics_df = pd.DataFrame(metrics, index=[5, 10, 20, 100], columns=(
            'Precision@k', 'Recall@K', 'MAP@K', 'nDCG@k', 'MRR@k', 'HitRate@k'
        ))
        metrics_df['Coverage@K'] = coverage_metrics

        metrics_df['Time_fit'] = implicit.learning_time
        metrics_df['Time_predict'] = implicit.predict_time

        return metrics_df

In [13]:
tst = np.array([tensor.tolist() for tensor in test_set.uid2positive_item if tensor is not None], dtype=object)

In [14]:
all_user_list = torch.arange(1, test_set.dataset.user_num)
test_user_list = test_set.uid_list.clone().detach()


In [15]:
test_user_list.shape

torch.Size([4716])

In [16]:
batch = all_user_list[4096 : 4096 * 2]

# Check which users are in the test set
is_in_test = np.isin(batch.numpy(), test_user_list.numpy())

In [17]:
len(batch[is_in_test])

0

In [18]:
top_100_items = model.recommend_k(test_set, k=100)


Generating top_100 recommendations: 0 batch [00:00, ? batch/s]

  uid_series = torch.tensor(uid_series)
Generating top_100 recommendations: 2 batch [00:15,  7.69s/ batch]


In [19]:
metrics_df = calculate_metrics(top_100_items, tst, model)

In [20]:
len(np.unique(top_100_items[:, :10])) / int(test_set._dataset.item_num)

0.1246248392168072

In [114]:
parameter_dict['metrics'] = ['Precision', 'Recall', 'MAP', 'NDCG', 'MRR', 'HIT', 'ItemCoverage']

In [115]:
# Run recbole
results = run_recbole(model=config['model'], dataset=config['dataset'], config_dict=parameter_dict)

14 Dec 10:42    INFO  ['/home/user/conda/envs/recsys/lib/python3.9/site-packages/ipykernel_launcher.py', '--f=/home/jovyan/.local/share/jupyter/runtime/kernel-v2-409310WQGT2PpAxXdm.json']
14 Dec 10:42    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = data/tmp/mts_library
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 300
train_batch_size = 4096
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [0.8, 0.1, 0.1]}, 'order': 'RO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}
repeatable = False


### Compare recbole buildin training and evaluation and our pipeline

In [45]:
print(metrics_df.loc[10])

Precision@k     0.007335
Recall@K        0.006605
MAP@K           0.003264
nDCG@k          0.005560
MRR@k           0.008028
HitRate@k       0.019617
Coverage@K      0.614313
Time_fit        1.671156
Time_predict    9.935722
Name: 10, dtype: float64


In [116]:
results['test_result']

OrderedDict([('precision@10', 0.0031),
             ('recall@10', 0.0066),
             ('map@10', 0.0033),
             ('ndcg@10', 0.0056),
             ('mrr@10', 0.008),
             ('hit@10', 0.0196),
             ('itemcoverage@10', 0.6143)])

### After training lets make final fit on trainval set

In [25]:
config['benchmark_filename'] = ['trainval', 'val', 'test']

In [None]:
dataset = create_dataset(config)

In [None]:
trainval_set, valid_set, test_set = data_preparation(config, dataset)

18 Aug 11:09    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
18 Aug 11:09    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [0.8, 0.1, 0.1]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}]


In [None]:
model = RecboleBench.initialize_with_params(
                    train_loader=trainval_set)

In [None]:
model.fit(trainval_set)

18 Aug 11:10    INFO  epoch 0 training [time: 39.63s, train loss: 0.0000]
18 Aug 11:10    INFO  Saving current: saved/EASE-Aug-18-2023_11-10-14.pth


In [None]:
ranks = model.get_relevant_ranks(test_set)
top_100_items = model.recommend_k(test_set, 100)
metrics = run_all_metrics(ranks, [5, 10, 20, 100])
coverage_metrics = []
for k in (5, 10, 20, 100):
    coverage_metrics.append(coverage(
        top_100_items,
        test_set.dataset.item_num,
        k
    ))

metrics_df = pd.DataFrame(metrics, index=[5, 10, 20, 100], columns=(
    'Precision@k', 'Recall@K', 'MAP@K', 'nDCG@k', 'MRR@k', 'HitRate@k'
))
metrics_df['Coverage@K'] = coverage_metrics

metrics_df['Time_fit'] = model.learning_time
metrics_df['Time_predict'] = model.predict_time
metrics_df

  uid_series = torch.tensor(uid_series)


  0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,Precision@k,Recall@K,MAP@K,nDCG@k,MRR@k,HitRate@k,Coverage@K,Time_fit,Time_predict
5,0.112261,0.010068,0.059608,0.111979,0.218545,0.42523,0.807216,39.943185,5.849546
10,0.10815,0.019145,0.04371,0.109343,0.243567,0.612332,0.898625,39.943185,5.849546
20,0.108394,0.038374,0.032961,0.108986,0.255596,0.781715,0.920962,39.943185,5.849546
100,0.239964,0.238163,0.047626,0.191478,0.261505,0.97803,0.941924,39.943185,5.849546
