In [1]:
import pandas as pd
import numpy as np
import os

### Load datasets

In [2]:
# Training interactions, loaded from a pickled pandas object.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
train_sessions = pd.read_pickle("data/02_train_sessions.pkl")

In [3]:
# Held-out test interactions, loaded from a pickled pandas object.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
test_sessions = pd.read_pickle("data/02_test_sessions.pkl")

In [4]:
# Column names of the session frames; they are also handed to HGRU4Rec
# and used by the evaluation loop further down.
SESSION_KEY = "sessionid"  # session identifier column
USER_KEY = "visitorid"     # user identifier column
ITEM_KEY = "itemid"        # item identifier column
TIME_KEY = "timestamp"     # event time column

In [5]:
# Optional -> Remove all test sessions with length = 1
# A one-event session has no following event, so the evaluation below has
# nothing to predict for it.
session_lengths = test_sessions.groupby(SESSION_KEY).size()
# Series.isin replaces np.in1d, which is deprecated in NumPy 2.0.
test_sessions = test_sessions[test_sessions[SESSION_KEY].isin(session_lengths[session_lengths > 1].index)]

### Install requirements

In [6]:
# Install dependencies for HGRU4Rec
!pip install Theano

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting Theano
  Downloading Theano-1.0.5.tar.gz (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m121.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: Theano
  Building wheel for Theano (setup.py) ... [?25ldone
[?25h  Created wheel for Theano: filename=Theano-1.0.5-py3-none-any.whl size=2668111 sha256=72d0bc5de4870885f6a79f0f0991e026abe5d9033d3233a641818f36202fb56f
  Stored in directory: /tmp/pip-ephem-wheel-cache-rpfzwejk/wheels/84/cb/19/235b5b10d89b4621f685112f8762681570a9fa14dc1ce904d9
Successfully built Theano
Installing collected packages: Theano
Successfully installed Theano-1.0.5
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [7]:
# Download HGRU4Rec source
# Fetches the model and the metric implementations from the session-rec repository
# into ./ext (--create-dirs lets curl create the directory when it is missing).
# NOTE(review): both URLs track the `master` branch, so the downloaded code can change
# between runs - consider pinning a commit. Without --fail an HTTP error page would be
# saved as the .py file and only surface later as an import error.
!curl --create-dirs -o ext/hgru4rec.py https://raw.githubusercontent.com/rn5l/session-rec/master/algorithms/hgru4rec/hgru4rec.py
!curl --create-dirs -o ext/accuracy_multiple.py https://raw.githubusercontent.com/rn5l/session-rec/master/evaluation/metrics/accuracy_multiple.py

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 56006  100 56006    0     0   594k      0 --:--:-- --:--:-- --:--:--  594k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 19401  100 19401    0     0   242k      0 --:--:-- --:--:-- --:--:--  242k


In [8]:
from ext.hgru4rec import HGRU4Rec
from ext.accuracy_multiple import Precision, Recall, MAP, NDCG

### Train HGRU4Rec model
The HGRU4Rec implementation is taken from the session-rec project: https://github.com/rn5l/session-rec/blob/master/algorithms/hgru4rec/hgru4rec.py

In [9]:
# Setup env variables for Theano
# NOTE(review): these are set after `ext.hgru4rec` has been imported in the previous
# cell. If that module imports Theano at import time, THEANO_FLAGS may be read too
# late to select the device - confirm, and consider running this cell before the import.
os.environ.update({
    "THEANO_FLAGS": "device=cuda",
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "NUMEXPR_MAX_THREADS": "8",
    "OMP_NUM_THREADS": "8",
})

In [10]:
# Define Hyperparameter
# The tuned values are the ones used in
# https://github.com/rn5l/session-rec/blob/master/conf/save/retailrocket/session_aware/single/exp/single_retailrocket_hgru4rec.yml
# {'final_act': 'linear', 'dropout_p_hidden_usr': 0.4, 'dropout_p_hidden_ses': 0.3, 'dropout_p_init': 0.4,
# 'momentum': 0.3, 'learning_rate': 0.06, 'user_propagation_mode': 'all', 'batch_size': 5}
model = HGRU4Rec(
    # Column names of the session frames
    session_key=SESSION_KEY,
    item_key=ITEM_KEY,
    time_key=TIME_KEY,
    user_key=USER_KEY,
    # Values taken from the configuration referenced above
    final_act='linear',
    dropout_p_hidden_usr=0.4,
    dropout_p_hidden_ses=0.3,
    dropout_p_init=0.4,
    momentum=0.3,
    learning_rate=0.06,
    user_propagation_mode='all',
    batch_size=5,
    # Remaining arguments (not part of the referenced configuration)
    session_layers=100,
    user_layers=100,
    n_epochs=10,
    adapt='adagrad',
    decay=0.9,
    grad_cap=0,
    sigma=0,
    init_as_normal=False,
    reset_after_session=True,
    loss='top1',
    hidden_act='tanh',
    train_random_order=False,
    lmbd=0.0,
    n_sample=0,
    sample_alpha=0.75,
    seed=42,
)

In [11]:
# Train on the training sessions; the test sessions are passed as second positional argument.
# NOTE(review): confirm how HGRU4Rec.fit uses its second argument - if it drives validation
# or early stopping, the test set influences training.
model.fit(train_sessions, test_sessions)

2022-03-31 13:55:08,619: ext.hgru4rec: INFO: Epoch 0 - train cost: 0.9128
2022-03-31 13:57:05,356: ext.hgru4rec: INFO: Epoch 1 - train cost: 0.8093
2022-03-31 13:59:02,137: ext.hgru4rec: INFO: Epoch 2 - train cost: 0.7119
2022-03-31 14:00:58,802: ext.hgru4rec: INFO: Epoch 3 - train cost: 0.6569
2022-03-31 14:02:55,883: ext.hgru4rec: INFO: Epoch 4 - train cost: 0.6233
2022-03-31 14:04:52,337: ext.hgru4rec: INFO: Epoch 5 - train cost: 0.6041
2022-03-31 14:06:48,510: ext.hgru4rec: INFO: Epoch 6 - train cost: 0.5925
2022-03-31 14:08:45,790: ext.hgru4rec: INFO: Epoch 7 - train cost: 0.5839
2022-03-31 14:10:42,574: ext.hgru4rec: INFO: Epoch 8 - train cost: 0.5789
2022-03-31 14:12:38,979: ext.hgru4rec: INFO: Epoch 9 - train cost: 0.5744


### Evaluate model

In [12]:
# Report how many events and distinct sessions the test set contains
eval_actions = test_sessions.shape[0]
eval_sessions = test_sessions[SESSION_KEY].unique().size
print('Evaluation of ', eval_actions, ' actions in ', eval_sessions, ' sessions')

Evaluation of  36023  actions in  8761  sessions


In [13]:
# Candidate items handed to model.predict_next during evaluation:
# every distinct item that occurs in the training data.
items_to_predict = train_sessions[ITEM_KEY].unique()

In [14]:
# The training history of every user that appears in the test set is replayed
# before that user's test sessions, to bootstrap the state of the user RNN.
test_users = test_sessions[USER_KEY].unique()
train_data = train_sessions.loc[lambda frame: frame[USER_KEY].isin(test_users)].copy()
# Work on a copy so that test_sessions itself stays untouched
test_data = test_sessions.copy()

In [15]:
# Concatenate training and test sessions
# `in_eval` marks the rows that are scored: training history is replayed through the
# model but never reported to the metrics.
train_data['in_eval'] = False
# Build the combined frame from test_sessions rather than from test_data itself, so that
# re-running this cell neither flags the already appended training rows as evaluation
# rows nor appends the training rows a second time.
test_data = pd.concat([train_data, test_sessions.assign(in_eval=True)])

In [16]:
# Order the rows by user, session and time and renumber them 0..n-1;
# the evaluation loop addresses rows by these consecutive positions.
test_data = (
    test_data
    .sort_values([USER_KEY, SESSION_KEY, TIME_KEY])
    .reset_index(drop=True)
)

In [17]:
# Preview the first rows of the combined, sorted evaluation frame
test_data.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,sessionid,in_eval
0,1435607175,75,view,257575,,98,False
1,1435607242,75,view,257575,,98,False
2,1435609434,75,view,257575,,99,False
3,1435609596,75,view,257575,,99,False
4,1435609771,75,view,257575,,99,False


#### Evaluate each session
See also https://github.com/rn5l/session-rec/blob/master/evaluation/evaluation_user_based.py

In [18]:
def evaluate_sessions(model, metrics):
    """Replay the sessions of the global ``test_data`` through ``model`` and score them.

    Relies on notebook globals: ``test_data`` (sorted by user, session and time,
    with a fresh 0..n-1 index and a boolean ``in_eval`` column), ``items_to_predict``
    and the ``*_KEY`` column names.

    For every session, each event except the last one is passed to
    ``model.predict_next``. This is done for the training-history rows as well
    (they bootstrap the user state), but only rows flagged ``in_eval`` are reported
    to the metrics, together with the remaining items of the session.

    A session with a single event has no following event to predict, so no
    prediction is made for it and the loop moves on to the next session.

    Args:
        model: object exposing ``predict_next(session_id, item_id, user_id, items,
            timestamp=...)`` that returns a pandas Series of scores; the Series is
            NaN-filled and sorted in place.
        metrics: list of metric objects providing ``reset()`` and ``result()`` plus
            either ``add_multiple(...)`` or ``add(...)``.

    Returns:
        list: ``m.result()`` of every metric, in the order given.
    """
    # Reset metrics
    for m in metrics:
        m.reset()

    # One size per (user, session) group. groupby sorts its keys, which matches the
    # row order of test_data as long as the frame is sorted by user and session.
    session_sizes = test_data.groupby([USER_KEY, SESSION_KEY]).size().to_numpy()
    session_ends = session_sizes.cumsum()
    session_starts = session_ends - session_sizes

    # Positional views of the columns read inside the loop
    item_ids = test_data[ITEM_KEY].to_numpy()
    session_ids = test_data[SESSION_KEY].to_numpy()
    user_ids = test_data[USER_KEY].to_numpy()
    timestamps = test_data[TIME_KEY].to_numpy()
    in_eval = test_data['in_eval'].to_numpy()
    n_actions = len(test_data)

    count = 0  # number of rows handled so far (including the skipped last rows)
    for start, end in zip(session_starts.tolist(), session_ends.tolist()):
        # pos: row of the current event; position: its index within the session.
        # The last event of a session is never used as a query.
        for position, pos in enumerate(range(start, end - 1)):
            # Log output
            if count % 1000 == 0:
                print('eval process: ', count, ' of ', n_actions, ' actions: ', (count / n_actions * 100.0), '%')

            # Get current values
            current_item = item_ids[pos]
            current_session = session_ids[pos]
            current_user = user_ids[pos]
            ts = timestamps[pos]
            rest = item_ids[pos + 1:end]  # remaining items of the current session

            # Make predictions
            preds = model.predict_next(current_session, current_item, current_user, items_to_predict, timestamp=ts)

            # Replace NANs with 0 and sort
            preds[np.isnan(preds)] = 0
            preds.sort_values(ascending=False, inplace=True)

            # Add preds to metric
            if in_eval[pos]:
                for m in metrics:
                    if hasattr(m, 'add_multiple'):
                        m.add_multiple(preds, rest, for_item=current_item, session=current_session, position=position)
                    elif hasattr(m, 'add'):
                        m.add(preds, rest[0], for_item=current_item, session=current_session, position=position)

            count += 1

        # Account for the last item of the session (which we do not make a prediction for)
        count += 1

    # Collect metrics results
    return [m.result() for m in metrics]

In [19]:
# Score the model with four metrics, each instantiated with the argument 10
metrics = [metric_cls(10) for metric_cls in (Precision, Recall, MAP, NDCG)]
result = evaluate_sessions(model, metrics)

eval process:  0  of  291143  actions:  0.0 %
eval process:  1000  of  291143  actions:  0.3434738255771219 %
eval process:  2000  of  291143  actions:  0.6869476511542438 %
eval process:  3000  of  291143  actions:  1.0304214767313655 %
eval process:  4000  of  291143  actions:  1.3738953023084877 %
eval process:  6000  of  291143  actions:  2.060842953462731 %
eval process:  7000  of  291143  actions:  2.404316779039853 %
eval process:  8000  of  291143  actions:  2.7477906046169753 %
eval process:  9000  of  291143  actions:  3.091264430194097 %
eval process:  11000  of  291143  actions:  3.778212081348341 %
eval process:  13000  of  291143  actions:  4.465159732502585 %
eval process:  14000  of  291143  actions:  4.808633558079706 %
eval process:  15000  of  291143  actions:  5.152107383656829 %
eval process:  17000  of  291143  actions:  5.839055034811072 %
eval process:  18000  of  291143  actions:  6.182528860388194 %
eval process:  22000  of  291143  actions:  7.556424162696682

In [20]:
# Show what each metric's result() returned, in the order of `metrics`
result

[('Precision@10: ', 0.06637077250384146),
 ('Recall@10: ', 0.27442641179515614),
 ('MAP@10: ', 0.030426090308221545),
 ('NDCG@10: ', 0.26166046376079183)]