In [1]:
import numpy as np
import pandas as pd

from dataset import UserSessionItemDataset
from evaluation import ndcg
from models import EASE, AbsEASE
from pipelines import hyperparameter_selection, run_test

# Fix the seed of NumPy's global (legacy) random generator so that anything
# drawing from it produces the same numbers on every run of the notebook.
SEED = 12345
np.random.seed(SEED)

In [2]:
# Location of the input data, relative to the notebook's working directory.
DATA_FOLDER = "./data/"
RATINGS_CSV = "ratings_processed_BeerAdvocate.csv"

# Load the processed BeerAdvocate ratings into a DataFrame.
# NOTE(review): the preview of this frame contains an "Unnamed: 0" column, which
# usually means the CSV was written together with its index -- consider
# passing index_col=0 once it is confirmed nothing relies on that column.
df = pd.read_csv(f"{DATA_FOLDER}{RATINGS_CSV}")

# Show the first rows as a quick sanity check of the loaded data.
df.head()

Unnamed: 0,index,brewery_id,brewery_name,timestamp,rating,review_aroma,review_appearance,userId,beer_style,review_palate,review_taste,beer_name,beer_abv,item_id,datetime,sessionId
0,614509,199,Ballast Point Brewing Company,1205561385,1.0,4.0,3.5,0110x011,American Double / Imperial IPA,4.0,4.0,Dorado Double IPA,9.6,10386,2008-03-15 06:09:45,1
1,182729,396,AleSmith Brewing Company,1205823873,1.0,5.0,4.0,0110x011,American Double / Imperial IPA,4.5,4.5,YuleSmith (Summer),8.5,7284,2008-03-18 07:04:33,2
2,1134674,863,Russian River Brewing Company,1207011338,1.0,5.0,5.0,0110x011,American Double / Imperial IPA,4.5,4.5,Pliny The Elder,8.0,7971,2008-04-01 00:55:38,3
3,462893,559,Speakeasy Ales & Lagers,1207362193,1.0,4.0,4.0,0110x011,American Double / Imperial IPA,3.5,4.0,Double Daddy Imperial India Pale Ale,9.5,25283,2008-04-05 02:23:13,4
4,886485,147,Stone Brewing Co.,1208211124,1.0,5.0,4.0,0110x011,American Black Ale,4.0,5.0,Stone Sublimely Self-Righteous Ale,8.7,38470,2008-04-14 22:12:04,5


In [17]:
# Keep only the columns needed for the experiments.
# NOTE(review): "user_session_id" and "split" are not among the columns shown in
# the CSV preview earlier in this notebook, and the execution counter jumps from
# 2 to 17, so they are presumably created by cells that are no longer present --
# confirm that "Restart & Run All" reaches this cell without a KeyError.
COLUMNS = ["user_session_id", "item_id", "rating", "split"]
# .copy() makes df an independent frame, so the column assignments below do not
# write into a slice of the loaded data (avoids pandas' SettingWithCopyWarning).
df = df[COLUMNS].copy()

# Create the user-session encoding and the item encoding: every distinct raw id
# is assigned a contiguous integer index (in order of first appearance), and the
# inverse dictionaries map those indices back to the raw ids.
user_session_id_to_idx = {user_session_id: idx for idx, user_session_id in enumerate(df["user_session_id"].unique())}
user_session_idx_to_id = {idx: user_session_id for user_session_id, idx in user_session_id_to_idx.items()}

item_id_to_idx = {item_id: idx for idx, item_id in enumerate(df["item_id"].unique())}
item_idx_to_id = {idx: item_id for item_id, idx in item_id_to_idx.items()}

# Replace the raw ids by their integer indices using the dictionaries above.
df["user_session_id"] = df["user_session_id"].map(user_session_id_to_idx)
df["item_id"] = df["item_id"].map(item_id_to_idx)

# Number of unique user sessions and unique items.
n_user_sessions = len(user_session_id_to_idx)
n_items = len(item_id_to_idx)

# Instantiate the dataset from the rows labelled "train", "val" and "test"
# in the "split" column, together with the two cardinalities.
dataset = UserSessionItemDataset(
    df[df["split"] == "train"],
    df[df["split"] == "val"],
    df[df["split"] == "test"],
    n_user_sessions,
    n_items,
)

In [18]:
# Candidate L2 values to compare.
l2s = [64.0, 256.0, 1024.0]

# Evaluate every candidate with ndcg at cutoff k=100; the recorded output
# reports the score of EASE and AbsEASE for each L2 value.
hyperparameter_selection(dataset, l2s, ndcg, k=100)

L2 64.0
Constructing G...
Density of G: 0.2602%
Inverting G...
EASE
ndcg @ 100: 0.12433571090756053 +- 0.0032288946888075288
AbsEASE
ndcg @ 100: 0.15731621433713747 +- 0.003960681190527962

L2 256.0
Constructing G...
Density of G: 0.2602%
Inverting G...
EASE
ndcg @ 100: 0.13621958212610266 +- 0.0035343284602157724
AbsEASE
ndcg @ 100: 0.15201580606250806 +- 0.003889969431115599

L2 1024.0
Constructing G...
Density of G: 0.2602%
Inverting G...
EASE
ndcg @ 100: 0.14202295547934587 +- 0.0036919280183094227
AbsEASE
ndcg @ 100: 0.14742060563336135 +- 0.0037845749129307815


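Based on the NDCG@100 results above, the best L2 is 1024 for EASE and 64 for AbsEASE. Note that both optima lie on the boundary of the searched grid (the largest value for EASE, the smallest for AbsEASE), so a wider grid might find better values.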

In [19]:
# One entry per model to test: (display name, model class, numeric setting).
# The numeric settings are presumably the L2 values picked during
# hyperparameter selection -- verify against run_test's signature.
models = [
    ("EASE", EASE, 1024.0),
    ("AbsEASE", AbsEASE, 64.0),
]

# NOTE(review): the saved output of this cell ends with
# "TypeError: EASE.fit() missing 1 required positional argument: 'X'".
# That message typically appears when fit() is invoked on the class instead of
# an instance -- confirm how run_test instantiates the classes passed here.
run_test(models, dataset, ks=[10, 20, 50, 100, 200, 500])

Split test
EASE
Constructing G...
Density of G: 0.2602%
Inverting G...
pos_inputs
recall_liked @ 10: 0.021957798436583284 +- 0.0006025498323860573
recall_disliked @ 10: 0.0037197163668640223 +- 0.0007111707248753739
ndcg @ 10: 0.04419161780892222 +- 0.0010531435119450429

recall_liked @ 20: 0.040074191573059234 +- 0.0009339801814301079
recall_disliked @ 20: 0.007239011656870498 +- 0.0009785943592240472
ndcg @ 20: 0.06341842548492532 +- 0.0012832016468521449

recall_liked @ 50: 0.0829221772596738 +- 0.0014458943619883944
recall_disliked @ 50: 0.016319088794178604 +- 0.0014999023298077477
ndcg @ 50: 0.10008262505917082 +- 0.0017115786003953318

recall_liked @ 100: 0.13350399559199042 +- 0.0019848950690388996
recall_disliked @ 100: 0.031201289995167254 +- 0.0020756187849005165
ndcg @ 100: 0.13653119173083766 +- 0.0021492128201289745

recall_liked @ 200: 0.20632883777280042 +- 0.0026118829415214795
recall_disliked @ 200: 0.05519759408318342 +- 0.002734028090183357
ndcg @ 200: 0.18188264688

TypeError: EASE.fit() missing 1 required positional argument: 'X'