In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp

from dataset import UserSessionItemDataset
from evaluation import recall, ndcg, evaluate
from models import EASE, AbsEASE
from pipelines import hyperparameter_selection, run_test

from copy import deepcopy

# Seed NumPy's global RNG. The pandas .sample() calls in this notebook pass no
# random_state, so they draw from this global state.
np.random.seed(12345)

# TODO: extract the preprocessing below into a separate preprocessing step

In [2]:
DATA_FOLDER = 'data/'

# Read the processed ratings, discard the 'Unnamed: 0' column (presumably the index
# written out when the CSV was saved) and use the generic name item_id for movies.
df = (
    pd.read_csv(DATA_FOLDER + 'processedRatingsMovielens.csv')
    .drop(columns=['Unnamed: 0'])
    .rename(columns={"movieId": "item_id"})
)

df.head()

Unnamed: 0,userId,item_id,rating,timestamp,datetime,sessionId
0,3,356,1.0,1439472199,2015-08-13 13:23:19,1
1,3,593,1.0,1439472203,2015-08-13 13:23:23,1
2,3,1,1.0,1439472215,2015-08-13 13:23:35,1
3,3,480,-1.0,1439472219,2015-08-13 13:23:39,1
4,3,2571,1.0,1439472221,2015-08-13 13:23:41,1


In [3]:
# Derive one identifier per (userId, sessionId) pair, then order rows by session and time.
# NOTE: the built-in hash() of a str is salted per Python process unless PYTHONHASHSEED
# is fixed, so these ids (and the resulting sort order) differ between kernel restarts.
# testable_sessions is keyed with the same expression later, so ids agree within one run.
df = df.assign(
    user_session_id=(df['userId'].astype(str) + '_' + df['sessionId'].astype(str)).map(hash)
).sort_values(by=['user_session_id', 'timestamp'])

In [4]:
# (number of user sessions, number of items, number of rating rows)
df["user_session_id"].nunique(), df["item_id"].nunique(), len(df)

(283453, 47889, 4644545)

In [5]:
# Per-session summary table of the sessions that are candidates for evaluation.
testable_sessions = pd.read_csv(f"{DATA_FOLDER}testableSessionsMovielens.csv")

testable_sessions.head()

Unnamed: 0,userId,sessionId,minValue,maxValue,ratingsCount,positiveCount
0,3,1,-1.0,1.0,259,248
1,3,3,-1.0,1.0,33,31
2,3,5,-1.0,1.0,53,52
3,4,1,-1.0,1.0,138,100
4,4,3,-1.0,1.0,5,2


In [6]:
# number of candidate sessions before filtering
testable_sessions.shape[0]

63054

In [7]:
# Smallest number of positive ratings in any candidate session (before filtering).
print(testable_sessions.positiveCount.min())

# A session is kept only if it has at least MIN_INTERACTIONS_PER_SIGN positive and at
# least MIN_INTERACTIONS_PER_SIGN negative interactions.
MIN_INTERACTIONS_PER_SIGN = 3

# Every rating that is not counted as positive is treated as negative here.
testable_sessions["negativeCount"] = testable_sessions["ratingsCount"] - testable_sessions["positiveCount"]

has_enough_positives = testable_sessions.positiveCount >= MIN_INTERACTIONS_PER_SIGN
has_enough_negatives = testable_sessions.negativeCount >= MIN_INTERACTIONS_PER_SIGN
# Explicit copy: later cells add columns to this frame, which would otherwise write
# into a slice of the unfiltered table and trigger pandas' SettingWithCopyWarning.
testable_sessions = testable_sessions[has_enough_positives & has_enough_negatives].copy()
len(testable_sessions)

2


34112

In [8]:
# Create the user_session identifier with the same construction used for df, so that
# the ids of both tables match within this run.
# .assign returns a new frame instead of writing a column into what may be a filtered
# slice, which avoids pandas' SettingWithCopyWarning.
testable_sessions = testable_sessions.assign(
    user_session_id=(
        testable_sessions['userId'].astype(str) + '_' + testable_sessions['sessionId'].astype(str)
    ).map(hash)
)

In [9]:
# Shuffle rows (no random_state is passed, so this draws from NumPy's global RNG)
# and renumber them 0..n-1.
testable_sessions = testable_sessions.sample(frac=1).reset_index(drop=True)

# Bucket sessions by their shuffled position modulo the number of buckets:
# bucket 0 becomes the validation set, every other bucket becomes the test set.
split_number_to_string = {
    0: "val",  # 25% of sessions are validation
    1: "test",
    2: "test",
    3: "test",
}

# Vectorised bucket number, then a single dictionary lookup per row.
testable_sessions["split"] = testable_sessions.index.values % len(split_number_to_string)
testable_sessions["split"] = testable_sessions["split"].map(split_number_to_string)

In [10]:
# Sanity check: each row of testable_sessions must belong to a distinct user session.
assert testable_sessions["user_session_id"].nunique() == len(testable_sessions)

In [11]:
# Report how many testable sessions were assigned to each split.
for split in ("train", "val", "test"):
    n_sessions = (testable_sessions.split == split).sum()
    print(f"number of testable sessions in split {split}: {n_sessions}")

number of testable sessions in split train: 0
number of testable sessions in split test: 25584
number of testable sessions in split val: 8528


In [12]:
# Lookup table: user_session_id -> split name, for every testable session.
user_session_id_to_split = dict(
    zip(
        testable_sessions["user_session_id"].values,
        testable_sessions["split"].values,
    )
)

In [13]:
# Label every rating row with the split of its session; sessions that are not in the
# lookup table (i.e. not testable) fall back to "train".
df["split"] = df["user_session_id"].map(user_session_id_to_split).fillna("train")

In [None]:
train_df = df[df.split == "train"]
eval_df = df[df.split != "train"]

# To simulate real-world data sparsity, the training rows are subsampled per item so
# that the long-tail item distribution becomes sparser: an item with x training rows
# keeps round(x ** SUBSAMPLE_EXPONENT) of them, capped at MAX_INTERACTIONS_PER_ITEM.
SUBSAMPLE_EXPONENT = 2 / 3  # i.e. x ** (1 / 1.5)
MAX_INTERACTIONS_PER_ITEM = 100


def subsample_item_interactions(item_rows, exponent=SUBSAMPLE_EXPONENT, max_count=MAX_INTERACTIONS_PER_ITEM):
    """Return a random subset of the rows belonging to a single item.

    Parameters
    ----------
    item_rows : pandas.DataFrame
        All rows of one item (one group of the groupby below).
    exponent : float
        The group of size x is reduced to round(x ** exponent) rows.
    max_count : int
        Upper bound on the number of rows kept for one item.

    Returns
    -------
    pandas.DataFrame
        min(round(len(item_rows) ** exponent), max_count) rows sampled without
        replacement. No random_state is passed, so the draw uses NumPy's global RNG.
    """
    n_keep = min(round(len(item_rows) ** exponent), max_count)
    return item_rows.sample(n_keep)


# item-count distribution before subsampling
print(train_df.item_id.value_counts().describe())
train_df = train_df.groupby('item_id', group_keys=False).apply(subsample_item_interactions)
# item-count distribution after subsampling
print(train_df.item_id.value_counts().describe())

# In the training split, keep only interactions with (positive) rating 1.
# This filter runs after the subsampling, so the per-item sample is drawn over all
# training ratings and an item can end up with fewer rows than were sampled for it.
train_df = train_df[train_df.rating == 1.]

# Evaluation rows are kept unchanged (all ratings, no subsampling).
df = pd.concat([train_df, eval_df])

# everything up to here should be in preprocessing

## want df with 4 columns: user_session_id, item_id, rating, split
## split = "train", "val", "test"

In [14]:
# Create the user-session encoding and the item encoding: each distinct id gets a
# contiguous integer index in order of first appearance in df.
unique_session_ids = df['user_session_id'].unique()
user_session_id_to_idx = dict(zip(unique_session_ids, range(len(unique_session_ids))))
user_session_idx_to_id = dict(enumerate(unique_session_ids))

unique_item_ids = df['item_id'].unique()
item_id_to_idx = dict(zip(unique_item_ids, range(len(unique_item_ids))))
item_idx_to_id = dict(enumerate(unique_item_ids))

# Replace the raw ids by their indices.
df = df.assign(
    user_session_id=df["user_session_id"].map(user_session_id_to_idx),
    item_id=df["item_id"].map(item_id_to_idx),
)

# Number of distinct user sessions and items.
n_user_sessions = len(unique_session_ids)
n_items = len(unique_item_ids)

# Instantiate the dataset from the three split-specific frames.
split_frames = {name: df[df.split == name] for name in ("train", "val", "test")}
dataset = UserSessionItemDataset(
    split_frames["train"],
    split_frames["val"],
    split_frames["test"],
    n_user_sessions,
    n_items,
)

In [17]:
# Candidate L2 values to compare, scored with ndcg at k=100.
l2s = [64.0, 256.0, 1024.0]

hyperparameter_selection(dataset, l2s, ndcg, k=100)

count    36130.000000
mean        49.365652
std        330.643594
min          1.000000
25%          1.000000
50%          2.000000
75%          8.000000
max      11858.000000
Name: count, dtype: float64
count    36130.000000
mean         6.269278
std         14.097070
min          1.000000
25%          1.000000
50%          2.000000
75%          4.000000
max        100.000000
Name: count, dtype: float64
L2 64.0
Constructing G...
Density of G: 0.8001%
Inverting G...
EASE
ndcg @ 100: 0.10280952670767177 +- 0.0012600981500890316
AbsEASE
ndcg @ 100: 0.11178004817350767 +- 0.0013456606083697574

L2 256.0
Constructing G...
Density of G: 0.8001%
Inverting G...
EASE
ndcg @ 100: 0.10530491136784129 +- 0.0012859555385763435
AbsEASE
ndcg @ 100: 0.10755692755377347 +- 0.0013110915169290455

L2 1024.0
Constructing G...
Density of G: 0.8001%
Inverting G...
EASE
ndcg @ 100: 0.1047011335780279 +- 0.0012814963006538436
AbsEASE
ndcg @ 100: 0.10481405901070356 +- 0.0012883569133610212



Best L2 by ndcg @ 100 in the runs above: 256 for EASE, 64 for AbsEASE.

In [20]:
# One (display name, model class, value) triple per model; the third entry is
# presumably the L2 strength chosen for that model - confirm against run_test.
models = [
    ("EASE", EASE, 256.0),
    ("AbsEASE", AbsEASE, 64.0),
]

run_test(models, dataset, ks=[10, 20, 50, 100, 200, 500])

Split test
count    36130.000000
mean        49.365652
std        330.643594
min          1.000000
25%          1.000000
50%          2.000000
75%          8.000000
max      11858.000000
Name: count, dtype: float64
count    36130.000000
mean         6.269278
std         14.097070
min          1.000000
25%          1.000000
50%          2.000000
75%          4.000000
max        100.000000
Name: count, dtype: float64
EASE
Constructing G...
Density of G: 0.7978%
Inverting G...
pos_inputs
recall_liked @ 10: 0.02556310647019935 +- 0.00026663082914651426
recall_disliked @ 10: 0.009355076677092898 +- 0.0002899340020287073
ndcg @ 10: 0.035653033045322295 +- 0.000400107480865015

recall_liked @ 20: 0.044817717075154154 +- 0.00035459659067071817
recall_disliked @ 20: 0.01779817804334826 +- 0.0003967999331259308
ndcg @ 20: 0.0493454844877149 +- 0.0004646559189535226

recall_liked @ 50: 0.09075923608341364 +- 0.0005210615471871608
recall_disliked @ 50: 0.04244437074032526 +- 0.0006137069773672246
