In [1]:
import pandas as pd
import numpy as np
from time import perf_counter
from tqdm import tqdm
import scipy 
import pickle
import glob
import warnings

import scipy.sparse

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from scipy.sparse import SparseEfficiencyWarning


from metrics import Evaluator, NDCG, NDCG_user, NDCG_MAP, NDCG_AP_user
from my_models import EASE, ParallSynSLIM
from utils_VAE import BaseMultiVAE, TrainableMultVAE, loss_function, naive_sparse2tensor, sparse2torch_sparse

In [2]:
# VARIABLES
min_n_r_users = 200
min_n_r_items = 200
data_path = "/home/mmarzec12/data/"
file_names = ["ratings_chunk_1.csv", "ratings_chunk_2.csv", "ratings_chunk_3.csv",
              "ratings_chunk_4.csv", "ratings_chunk_5.csv"]

savepath = "/home/mmarzec12/models/comparison_literature/"

warnings.simplefilter('ignore',SparseEfficiencyWarning)

## Merging files into 1 file and filtering

In [3]:
cols = ['user_name', 'game_id', 'score', 'timestamp', 'stat_own',
       'stat_preordered', 'stat_wishlist', 'stat_fortrade', 'stat_wanttoplay',
       'stat_prevowned', 'stat_want', 'stat_wanttobuy']
res = pd.DataFrame(columns=cols)
start = perf_counter()
for i, file_name in enumerate(file_names):
    df = pd.read_csv(data_path+file_name)
    filtered = df.drop(["Unnamed: 0"], axis=1)
    res = res.append(filtered)
    end = perf_counter()
    elapsed = end - start
    print("Loading file {} and adding it to dataframe took {} minutes.".format(i+1, round(elapsed/60, 2)))
    start = perf_counter()
print("Final dataframe shape is {}".format(res.shape))
print("We have {} different users (with ratings>0).".format(len(res.user_name.unique())))
print("We have {} different games.".format(len(res.game_id.unique())))

Loading file 1 and adding it to dataframe took 0.2 minutes.
Loading file 2 and adding it to dataframe took 0.21 minutes.
Loading file 3 and adding it to dataframe took 0.22 minutes.
Loading file 4 and adding it to dataframe took 0.23 minutes.
Loading file 5 and adding it to dataframe took 0.19 minutes.
Final dataframe shape is (47623627, 12)
We have 513378 different users (with ratings>0).
We have 107330 different games.


In [4]:
res["game_id"] = res["game_id"].astype(int)
res = res.reset_index()
res = res.drop(["index"], axis=1)
del df

## Removing users and items from tail

In [5]:
i = 1 # iterations counter
# initiate rating count for games
tmp = res[(res.score>0) | (res.stat_own==1) | (res.stat_prevowned==1)].copy()
items_g = tmp.groupby("game_id")["score"].count().reset_index().rename({"score":"n_ratings"}, axis=1)


while items_g.n_ratings.min() < min_n_r_items:
    # filtering games
    filtered = items_g[items_g.n_ratings > min_n_r_items]
    tmp = tmp.merge(filtered, on="game_id", how="inner").drop(["n_ratings"], axis=1)
    diff_i = len(items_g) - len(filtered)
    
    # checking condition for users
    users_g = tmp.groupby("user_name")["score"].count().reset_index().rename({"score":"n_ratings"}, axis=1)
    if users_g.n_ratings.min() < min_n_r_users:
        # filtering users
        filtered = users_g[users_g.n_ratings > min_n_r_users]
        tmp = tmp.merge(filtered, on="user_name", how="inner").drop(["n_ratings"], axis=1)
        diff_u = len(users_g) - len(filtered)
    else:
        break
    
    print("After {} iteration {} users and {} games were removed.".format(i, diff_u, diff_i))
    
    # recalculate ratings for games
    items_g = tmp.groupby("game_id")["score"].count().reset_index().rename({"score":"n_ratings"}, axis=1)
    
    # increase iteration count
    i += 1

print("We have started with {} users and {} games".format(len(res.user_name.unique()), len(res.game_id.unique())))
print("We have finished with {} users and {} games".format(len(tmp.user_name.unique()), len(tmp.game_id.unique())))

del filtered, items_g

After 1 iteration 414684 users and 80415 games were removed.
After 2 iteration 1075 users and 3197 games were removed.
After 3 iteration 38 users and 129 games were removed.
We have started with 513378 users and 107330 games
We have finished with 42790 users and 15613 games


In [None]:
tmp.shape

In [None]:
tmp.head(2)

In [6]:
unique_users = tmp.user_name.unique()
unique_games = tmp.game_id.unique()

n_users, n_items = len(unique_users), len(unique_games)

# dictonaries to map users to unique ids and vice vers
us_to_ids = {u:i for i,u in enumerate(unique_users)}
ids_to_us = {i:u for i,u in enumerate(unique_users)}

# dictonaries to map games to unique ids and vice vers
gs_to_ids = {g:i for i,g in enumerate(unique_games)}
ids_to_gs = {i:g for i,g in enumerate(unique_games)}

# input length for sequencial models (BGG seq rec article)
L = 5

In [None]:
#tmp2.sort_values("timestamp", ascending=True).groupby('user_name', sort=False).agg(lala=("user_name", "cumcount"))

In [7]:
res = tmp.sort_values("timestamp", ascending=True).groupby('user_name', sort=False)\
      .agg(user_interaction_number=("user_name", "cumcount")).join(tmp, how="inner")\
      .sort_values("user_interaction_number", ascending=True)
res["user_interaction_number"] += 1

In [8]:
to_sample = res.shape[0] - n_users * L
ids = np.array(range(n_users * L, res.shape[0]))
np.random.shuffle(ids)
train_ids = list(range(n_users*L)) + ids[:int(0.7*to_sample)].tolist()
test_ids = ids[int(0.7*to_sample):int(0.9*to_sample)]
val_ids = ids[int(0.9*to_sample):]

# train test split (validation will not be used)
train = res.iloc[train_ids,][["user_name", "game_id"]]
test = res.iloc[test_ids,][["user_name", "game_id"]]
val = res.iloc[val_ids,][["user_name", "game_id"]]

In [9]:
# creating sparse matrix with data
row = [us_to_ids[us] for us in train.user_name]
col = [gs_to_ids[g] for g in train.game_id]
data = [1] * train.shape[0]

train_data = scipy.sparse.coo_matrix((data, (row, col)), shape=(len(unique_users), len(unique_games))).tocsr()


tmp = test.groupby("user_name").apply(lambda df: [gs_to_ids[g] for g in df.game_id.tolist()])    
true = {us_to_ids[us]:games for us,games in tmp.iteritems()}

tmp = val.groupby("user_name").apply(lambda df: [gs_to_ids[g] for g in df.game_id.tolist()])
val_dict = {us_to_ids[us]:games for us,games in tmp.iteritems()}

del tmp, res, train

In [50]:
#scipy.sparse.save_npz("/home/mmarzec12/data/"+"train_data_big", train_data)

In [43]:
#with open("/home/mmarzec12/data/"+"test_data_big", "wb") as handle:
#    pickle.dump(true, handle, protocol=pickle.HIGHEST_PROTOCOL)

#with open("/home/mmarzec12/data/"+"val_data_big", "wb") as handle:
#    pickle.dump(val_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
#train_data = scipy.sparse.load_npz("/home/mmarzec12/data/"+"train_data_big.npz")
#savepath = "/home/mmarzec12/data/"
#true = pd.read_pickle("/home/mmarzec12/data/"+"test_data_big")

## SLIM

In [10]:
best_params = pd.read_pickle("/home/mmarzec12/models/slim/"+"slim_best_params")
l1_reg = best_params["l1_reg"]
l2_reg = best_params["l2_reg"]

In [11]:
slim = ParallSynSLIM(l1_reg, l2_reg)
        
# train the model
start = perf_counter()
slim.fit(train_data)
end = perf_counter()
print(f"Time elapsed = {round((end-start)/60, 2)} minutes.")

start = 18449.76455276
end = 19151.315333362
Learning all 15613 vectors took 11.71 minutes.
In W matrix we have 2716014 nonzero elements (1.114%).
Time elapsed = 11.72 minutes.


In [12]:
k = 10
recs = slim.calculate_top_k(train_data, ids_to_gs, ids_to_us, k=k)

  0%|          | 0/42790 [00:00<?, ?it/s]

In [13]:
tmp = {us_to_ids[us]:[gs_to_ids[g] for g in games] for us,games in recs.items()}
recs = tmp

In [14]:
k = 10
dict_metrics = NDCG_MAP(k, true, recs)

res = {}
res["model_name"] = "SLIM"
res["min_n_users"] = min_n_r_users
res["min_n_items"] = min_n_r_items
res["NDCG"] = dict_metrics["NDCG"]
res["MAP"] = dict_metrics["MAP"]
res["k"] = k
with open(savepath+"SLIM_200", "wb") as handle:
    pickle.dump(res, handle, protocol=pickle.HIGHEST_PROTOCOL)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 42790/42790 [00:00<00:00, 123336.87it/s]


In [15]:
res

{'model_name': 'SLIM',
 'min_n_users': 200,
 'min_n_items': 200,
 'NDCG': 0.4462523373480932,
 'MAP': 0.2824251614566555,
 'k': 10}

## EASE

In [16]:
reg = pd.read_pickle("/home/mmarzec12/models/ease/"+"ease_best_params")
k = 10
gram = train_data.T @ train_data

ease = EASE(regularization=reg)
ease.fit(gram)
del gram
recs = ease.calculate_top_k(train_data, ids_to_gs, ids_to_us, k=k)

tmp = {us_to_ids[us]:[gs_to_ids[g] for g in games] for us,games in recs.items()}
recs = tmp

  0%|          | 0/42790 [00:00<?, ?it/s]

In [17]:
k = 10
dict_metrics = NDCG_MAP(k, true, recs)

res = {}
res["model_name"] = "EASE"
res["min_n_users"] = min_n_r_users
res["min_n_items"] = min_n_r_items
res["NDCG"] = dict_metrics["NDCG"]
res["MAP"] = dict_metrics["MAP"]
res["k"] = k
with open(savepath+"EASE_200", "wb") as handle:
    pickle.dump(res, handle, protocol=pickle.HIGHEST_PROTOCOL)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 42790/42790 [00:00<00:00, 120120.95it/s]


In [18]:
res

{'model_name': 'EASE',
 'min_n_users': 200,
 'min_n_items': 200,
 'NDCG': 0.5461252600999648,
 'MAP': 0.37754507061062187,
 'k': 10}

## VAE

In [19]:
best_params = pd.read_pickle("/home/mmarzec12/models/vae/model_tuning/"+"vae_best_params")
base_params = best_params["base_params"]
base_params["encoder_dims"][0] = n_items
base_params["decoder_dims"][-1] = n_items
train_params = best_params["train_params"]
train_params["n_epochs"] = 50

model = TrainableMultVAE(base_params["encoder_dims"], base_params["decoder_dims"], base_params["dropout"])

optimizer = optim.Adam(model.parameters(), **train_params["optimizer_kwargs"])
criterion = loss_function
model.fit(train_data, optimizer, criterion, val_data=None, n_epochs=train_params["n_epochs"],
          k=train_params["k"], beta=train_params["beta"])

recs = model.predict_dict(train_data)

Training phase...
| epoch   1 |  100/ 168 batches | ms/batch 179.21 | loss 2909.59
Training took 29.6 seconds.
Training phase...
| epoch   2 |  100/ 168 batches | ms/batch 176.76 | loss 2752.71
Training took 29.3 seconds.
Training phase...
| epoch   3 |  100/ 168 batches | ms/batch 176.71 | loss 2715.21
Training took 29.31 seconds.
Training phase...
| epoch   4 |  100/ 168 batches | ms/batch 176.62 | loss 2697.63
Training took 29.31 seconds.
Training phase...
| epoch   5 |  100/ 168 batches | ms/batch 176.52 | loss 2684.61
Training took 29.3 seconds.
Training phase...
| epoch   6 |  100/ 168 batches | ms/batch 176.45 | loss 2666.85
Training took 29.27 seconds.
Training phase...
| epoch   7 |  100/ 168 batches | ms/batch 176.38 | loss 2663.76
Training took 29.28 seconds.
Training phase...
| epoch   8 |  100/ 168 batches | ms/batch 176.74 | loss 2666.00
Training took 29.3 seconds.
Training phase...
| epoch   9 |  100/ 168 batches | ms/batch 176.28 | loss 2662.67
Training took 29.31 secon

In [20]:
k = 10
metrics = NDCG_MAP(k, true, recs)

res = {}
res["model_name"] = "VAE"
res["min_n_users"] = min_n_r_users
res["min_n_items"] = min_n_r_items
res["NDCG"] = metrics["NDCG"]
res["MAP"] = metrics["MAP"]
res["k"] = k
with open(savepath+"VAE_200", "wb") as handle:
    pickle.dump(res, handle, protocol=pickle.HIGHEST_PROTOCOL)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 42790/42790 [00:00<00:00, 96586.47it/s]


In [21]:
metrics

{'NDCG': 0.4172961831571981, 'MAP': 0.2533637689788075}

In [None]:
games = pd.read_csv("/home/mmarzec12/data/games_data.csv")

In [21]:
def NDCG_AP_user(k, true, predicted, denoms):
    idcg = sum(denoms[:min(len(true),k)])
    true_set = set(true)
    dcg = 0
    n_relevant = 0
    ap = 0
    for i,item in enumerate(predicted):
        if item in true_set:
            dcg += denoms[i]
            n_relevant += 1
            ap += (n_relevant/(i+1))
    
    if n_relevant == 0:
        n_relevant = 1
    return {"NDCGu":dcg / idcg, "APu":ap / n_relevant}
    
def NDCG_MAP(k, true_dict, predicted_dict):
    
    denoms = 1. / np.log2(np.arange(2, k + 2))
    n_users = len(true_dict.keys())
    ndcg_ = 0
    map_ = 0
    
    for uid in tqdm(true_dict.keys()):
        res = NDCG_AP_user(k, true_dict[uid], predicted_dict[uid], denoms)
        ndcg_ += res["NDCGu"]
        map_ + res["APu"]
        
    return {"NDCG": ndcg_/n_users, "MAP": map_/n_users}
    