In [1]:
import numpy as np
import pandas as pd

from dataset import UserSessionItemDataset
from evaluation import ndcg
from models import EASE, AbsEASE
from pipelines import hyperparameter_selection, run_test

# Fixed seed so runs are repeatable. This seeds NumPy's legacy global RNG
# (the np.random.* functions) only; code that uses its own Generator or another
# library's RNG is not affected by it.
SEED = 12345
np.random.seed(SEED)

In [2]:
# Directory holding the input data, relative to the notebook's working directory.
DATA_FOLDER = "./data/"

# Load the pre-processed MovieLens ratings.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which usually
# means the CSV was written together with its index - confirm whether it is needed.
ratings_csv = f"{DATA_FOLDER}ratings_processed_MovieLens.csv"
df = pd.read_csv(ratings_csv)

# Preview the first rows.
df.head()

Unnamed: 0,userId,item_id,rating,timestamp,datetime,sessionId
0,3,356,1.0,1439472199,2015-08-13 13:23:19,1
1,3,593,1.0,1439472203,2015-08-13 13:23:23,1
2,3,1,1.0,1439472215,2015-08-13 13:23:35,1
3,3,480,-1.0,1439472219,2015-08-13 13:23:39,1
4,3,2571,1.0,1439472221,2015-08-13 13:23:41,1


In [14]:
# Keep only the columns needed for the experiments.
# NOTE(review): the preview of the freshly loaded CSV lists `userId` and
# `sessionId` but neither `user_session_id` nor `split`, and the execution
# counts are non-sequential, so these columns were presumably created in a cell
# that is no longer part of the notebook. Fail early with an explicit message.
COLUMNS = ["user_session_id", "item_id", "rating", "split"]
missing_columns = [column for column in COLUMNS if column not in df.columns]
if missing_columns:
    raise KeyError(
        f"df is missing required columns {missing_columns}; "
        "create them before running this cell"
    )

# .copy() makes the subset an independent frame, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
# Note: this rebinds `df`. Re-create the raw frame before re-running this cell,
# otherwise the mappings below are rebuilt from already-encoded indices and the
# original ids are lost.
df = df[COLUMNS].copy()

# Create the user-session and item encodings: every distinct id gets an integer
# index in order of first appearance; the *_idx_to_id dicts are the inverses.
user_session_id_to_idx = {
    user_session_id: idx
    for idx, user_session_id in enumerate(df["user_session_id"].unique())
}
user_session_idx_to_id = {
    idx: user_session_id
    for user_session_id, idx in user_session_id_to_idx.items()
}

item_id_to_idx = {
    item_id: idx
    for idx, item_id in enumerate(df["item_id"].unique())
}
item_idx_to_id = {idx: item_id for item_id, idx in item_id_to_idx.items()}

# Replace the raw ids with their indices using the dicts above.
df["user_session_id"] = df["user_session_id"].map(user_session_id_to_idx)
df["item_id"] = df["item_id"].map(item_id_to_idx)

# Number of unique user sessions and unique items (across all splits).
n_user_sessions = len(user_session_id_to_idx)
n_items = len(item_id_to_idx)

# Instantiate the dataset from the train / val / test rows.
dataset = UserSessionItemDataset(
    df[df["split"] == "train"],
    df[df["split"] == "val"],
    df[df["split"] == "test"],
    n_user_sessions,
    n_items,
)

In [17]:
# Candidate L2 values to compare.
# NOTE(review): in the recorded output AbsEASE scores best at 64.0, the smallest
# value in this grid, so its optimum may lie below the grid - consider
# extending it downwards.
l2s = [64.0, 256.0, 1024.0]

# Per the recorded output, this reports ndcg @ 100 for EASE and AbsEASE at
# each candidate L2.
hyperparameter_selection(
    dataset,
    l2s,
    ndcg,
    k=100,
)

count    36130.000000
mean        49.365652
std        330.643594
min          1.000000
25%          1.000000
50%          2.000000
75%          8.000000
max      11858.000000
Name: count, dtype: float64
count    36130.000000
mean         6.269278
std         14.097070
min          1.000000
25%          1.000000
50%          2.000000
75%          4.000000
max        100.000000
Name: count, dtype: float64
L2 64.0
Constructing G...
Density of G: 0.8001%
Inverting G...
EASE
ndcg @ 100: 0.10280952670767177 +- 0.0012600981500890316
AbsEASE
ndcg @ 100: 0.11178004817350767 +- 0.0013456606083697574

L2 256.0
Constructing G...
Density of G: 0.8001%
Inverting G...
EASE
ndcg @ 100: 0.10530491136784129 +- 0.0012859555385763435
AbsEASE
ndcg @ 100: 0.10755692755377347 +- 0.0013110915169290455

L2 1024.0
Constructing G...
Density of G: 0.8001%
Inverting G...
EASE
ndcg @ 100: 0.1047011335780279 +- 0.0012814963006538436
AbsEASE
ndcg @ 100: 0.10481405901070356 +- 0.0012883569133610212


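Best L2 by NDCG@100: 256 for EASE (0.1053) and 64 for AbsEASE (0.1118). Note that 64 is the smallest value in the grid, so the optimum for AbsEASE may lie below the range tried.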

In [20]:
# One entry per model: presumably (label, model class, L2); the L2 values are
# the per-model best ones noted after the hyperparameter search above.
models = [
    ("EASE", EASE, 256.0),
    ("AbsEASE", AbsEASE, 64.0),
]

# Cutoffs passed to run_test as `ks`.
cutoffs = [10, 20, 50, 100, 200, 500]
run_test(models, dataset, ks=cutoffs)

Split test
count    36130.000000
mean        49.365652
std        330.643594
min          1.000000
25%          1.000000
50%          2.000000
75%          8.000000
max      11858.000000
Name: count, dtype: float64
count    36130.000000
mean         6.269278
std         14.097070
min          1.000000
25%          1.000000
50%          2.000000
75%          4.000000
max        100.000000
Name: count, dtype: float64
EASE
Constructing G...
Density of G: 0.7978%
Inverting G...
pos_inputs
recall_liked @ 10: 0.02556310647019935 +- 0.00026663082914651426
recall_disliked @ 10: 0.009355076677092898 +- 0.0002899340020287073
ndcg @ 10: 0.035653033045322295 +- 0.000400107480865015

recall_liked @ 20: 0.044817717075154154 +- 0.00035459659067071817
recall_disliked @ 20: 0.01779817804334826 +- 0.0003967999331259308
ndcg @ 20: 0.0493454844877149 +- 0.0004646559189535226

recall_liked @ 50: 0.09075923608341364 +- 0.0005210615471871608
recall_disliked @ 50: 0.04244437074032526 +- 0.0006137069773672246
