In [1]:
import pandas as pd
import numpy as np

import scipy.sparse
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

from tqdm.notebook import tqdm

from metrics import Evaluator
from utils_VAE import BaseMultiVAE, TrainableMultVAE, loss_function, naive_sparse2tensor, sparse2torch_sparse

In [2]:
# --- Paths ------------------------------------------------------------------
mypath = "/home/mmarzec12/data/"
savepath = "/home/mmarzec12/models/vae/"

# --- Raw inputs -------------------------------------------------------------
explicit = pd.read_csv(mypath + "explicit_train.csv")
validation = pd.read_csv(mypath + "leave_one_out_validation.csv")

# Validation interactions, once as (user, item) pairs and once as a
# user -> item lookup (a later row for the same user overrides an earlier one).
validation_list = list(zip(validation.user_name, validation.game_id))
validation_dict = dict(zip(validation.user_name, validation.game_id))

# --- Index spaces -----------------------------------------------------------
# The user/game vocabularies are taken from the *unfiltered* explicit ratings.
unique_users = explicit.user_name.unique()
unique_games = explicit.game_id.unique()

n_users = len(unique_users)
n_items = len(unique_games)

# Bidirectional maps between raw user names and dense row ids.
us_to_ids = dict(zip(unique_users, range(n_users)))
ids_to_us = dict(enumerate(unique_users))

# Bidirectional maps between raw game ids and dense column ids.
gs_to_ids = dict(zip(unique_games, range(n_items)))
ids_to_gs = dict(enumerate(unique_games))

# --- Implicit feedback ------------------------------------------------------
implicit = pd.read_csv(mypath + "implicit_train.csv")
implicit["score"] = 1

# --- Explicit feedback: keep only ratings strictly above 6 --------------------
print(f"There is {np.sum(explicit.score <= 6)} rows with score <= 6.")
explicit = explicit.loc[explicit.score > 6]

# --- Merge both sources and binarise every interaction to 1 -------------------
joined = (
    pd.concat([explicit, implicit])[["user_name", "game_id", "score"]]
    .assign(score=1)
)

# --- Sparse user x item interaction matrix ------------------------------------
# A user or game missing from the explicit vocabulary raises KeyError here.
row = list(map(us_to_ids.__getitem__, joined.user_name))
col = list(map(gs_to_ids.__getitem__, joined.game_id))
data = joined["score"]

interactions_coo = scipy.sparse.coo_matrix((data, (row, col)), shape=(n_users, n_items))
train_data = interactions_coo.tocsr()

# Coordinate-format views of the same interactions.
user_loc = row
item_loc = col
ratings = data.to_numpy()

There is 1362961 rows with score <= 6.


In [3]:
# --- Architecture -------------------------------------------------------------
latent_dim = 200
dim_layers = [300, 600]
# Encoder widths: n_items -> 300 -> 600 -> latent; the decoder mirrors them.
encoder_dims = [n_items, *dim_layers, latent_dim]
decoder_dims = [latent_dim, *reversed(dim_layers), n_items]

# --- Training hyper-parameters --------------------------------------------------
n_epochs = 50
k = 5
dropout = 0.3

params = {}

model = TrainableMultVAE(encoder_dims, decoder_dims, dropout=dropout)

# weight_decay == 0 means no weight decay is applied
optimizer_kwargs = dict(weight_decay=0, lr=5e-4)
optimizer = optim.Adam(model.parameters(), **optimizer_kwargs)
criterion = loss_function

# Validation pairs translated from raw names/ids to dense matrix indices.
val_data = [
    (us_to_ids[user], gs_to_ids[game])
    for user, game in validation_list
]

beta = 0.4

model.fit(
    train_data,
    optimizer,
    criterion,
    val_data=val_data,
    n_epochs=n_epochs,
    k=k,
)


Training phase...
| epoch   1 |  100/ 427 batches | ms/batch 19.12 | loss 642.20
| epoch   1 |  200/ 427 batches | ms/batch 18.03 | loss 621.39
| epoch   1 |  300/ 427 batches | ms/batch 18.21 | loss 611.36
| epoch   1 |  400/ 427 batches | ms/batch 18.35 | loss 608.24
Training took 7.83 seconds.
Evaluation phase...
Validating took 2.82 seconds.
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 10.65s | NDCG@10 0.128 | ERR@10 0.106 | HR@10 0.195
-----------------------------------------------------------------------------------------
Training phase...
| epoch   2 |  100/ 427 batches | ms/batch 18.49 | loss 609.96
| epoch   2 |  200/ 427 batches | ms/batch 18.34 | loss 605.07
| epoch   2 |  300/ 427 batches | ms/batch 18.15 | loss 596.36
| epoch   2 |  400/ 427 batches | ms/batch 18.28 | loss 602.22
Training took 7.79 seconds.
Evaluation phase...
Validating took 2.82 seconds.
---------------------------------------------

KeyboardInterrupt: 

In [None]:
class BaseMultiVAE(nn.Module):
    """Variational autoencoder built from fully connected Tanh stacks.

    The encoder's final linear layer is twice as wide as the latent space:
    the first half of its output is the mean, the second half the
    log-variance of the approximate posterior.

    Parameters
    ----------
    encoder_layers : sequence of int
        Layer widths of the encoder, input size first, latent size last.
    decoder_layers : sequence of int
        Layer widths of the decoder, latent size first, output size last.
    dropout : float, default 0.5
        Dropout probability applied to the normalized input.

    Raises
    ------
    ValueError
        If the encoder's last width differs from the decoder's first width.
    """

    def __init__(self, encoder_layers, decoder_layers, dropout=0.5):
        super().__init__()

        if encoder_layers[-1] != decoder_layers[0]:
            raise ValueError('Output from Encoder must have the same dimension as input to the Decoder.')

        # Work on private copies: doubling the last encoder width in place
        # would corrupt the caller's list and make a second model built from
        # the same lists fail the dimension check above.
        encoder_dims = list(encoder_layers)
        decoder_dims = list(decoder_layers)

        # last layer of encoder is for both mean and variance
        encoder_dims[-1] = encoder_dims[-1] * 2

        self.encoder_dims = encoder_dims
        self.decoder_dims = decoder_dims
        self.latent_dim = decoder_dims[0]

        self.encoder_layers = self.initialize_layers(encoder_dims)
        self.decoder_layers = self.initialize_layers(decoder_dims)

        self.dropout_ = nn.Dropout(dropout)

    def initialize_layers(self, layers, nonlinearity="tanh"):
        """Build an ``nn.Sequential`` of Linear layers separated by Tanh.

        No activation follows the final Linear layer. ``nonlinearity`` is
        accepted for interface compatibility but is not consulted: the
        activation is always ``nn.Tanh``.
        """
        res = []
        last = len(layers) - 2

        for i in range(len(layers) - 1):
            layer = nn.Linear(layers[i], layers[i + 1])
            self.init_weights(layer)
            res.append(layer)

            if i != last:
                res.append(nn.Tanh())

        return nn.Sequential(*res)

    def forward(self, input_):
        """Return ``(reconstruction, mu, logvar)`` for a 2-D input batch."""
        mu, logvar = self.encode(input_)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

    def encode(self, input_):
        """Normalize, apply dropout, run the encoder and split its output.

        Returns ``(mu, logvar)``, each with ``latent_dim`` columns.
        """
        x = F.normalize(input_)
        x = self.dropout_(x)
        x = self.encoder_layers(x)

        # The encoder emits 2 * latent_dim columns: means first, then log-variances.
        mu = x[:, :self.latent_dim]
        logvar = x[:, self.latent_dim:]
        return mu, logvar

    def reparameterize(self, mu, logvar):
        """Sample ``z = mu + eps * std`` in training mode; return ``mu`` otherwise."""
        if self.training:
            std = torch.exp(0.5 * logvar)
            eps = torch.randn_like(std)
            return eps.mul(std).add_(mu)
        # Outside training the posterior mean is used deterministically.
        return mu

    def decode(self, z):
        """Map latent codes through the decoder stack and return its raw output."""
        return self.decoder_layers(z)

    def init_weights(self, layer):
        """Initialize a Linear layer in place.

        Weights are drawn from N(0, 2 / (fan_in + fan_out)) (Xavier-style),
        biases from a normal distribution with standard deviation 0.001.
        """
        fan_out, fan_in = layer.weight.size()
        std = np.sqrt(2.0 / (fan_in + fan_out))
        layer.weight.data.normal_(0.0, std)

        # Normal Initialization for Biases
        layer.bias.data.normal_(0.0, 0.001)

In [None]:
class TrainableMultVAE(BaseMultiVAE):
    """``BaseMultiVAE`` with a built-in training / validation loop.

    Parameters
    ----------
    encoder_layers, decoder_layers : sequence of int
        Layer widths, forwarded to ``BaseMultiVAE``.
    dropout : float, default 0.5
        Input dropout probability, forwarded to ``BaseMultiVAE``.
    """

    def __init__(self, encoder_layers, decoder_layers, dropout=0.5):
        super().__init__(encoder_layers, decoder_layers, dropout=dropout)

    def fit(self, train_data, optimizer, criterion, n_epochs=100, batch_size=256, total_anneal_steps=200000,
              anneal_cap=0.2, log_interval=100, k=10, val_data=None):
        """Train for ``n_epochs`` epochs, optionally validating after each one.

        Parameters
        ----------
        train_data : sparse matrix supporting row indexing, ``.shape`` and
            ``.nonzero()``; rows are users, columns are items.
        optimizer : optimizer exposing ``zero_grad()`` and ``step()``.
        criterion : callable invoked as
            ``criterion(recon_batch, data, mu, logvar, anneal)``; must return
            a scalar tensor.
        n_epochs, batch_size : int
        total_anneal_steps : int
            Number of optimizer steps over which the annealing weight grows
            linearly from 0; ``<= 0`` uses ``anneal_cap`` from the start.
        anneal_cap : float
            Upper bound of the annealing weight.
        log_interval : int
            Print a running-loss line every ``log_interval`` batches.
        k : int
            Number of recommendations per user used during validation.
        val_data : optional
            Passed to ``validate``; when ``None`` validation is skipped and
            the epoch summary carries no metrics.

        Side effects: populates ``NDCGs``, ``ERRs``, ``HRs``,
        ``training_time_seconds`` and ``validating_time_seconds``.
        """
        self.n_epochs = n_epochs
        self.batch_size = batch_size
        self.validation = val_data is not None
        self.optimizer = optimizer
        self.criterion = criterion
        self.k = k
        self.total_anneal_steps = total_anneal_steps
        self.anneal_cap = anneal_cap
        self.log_interval = log_interval
        self.n_users, self.n_items = train_data.shape
        self.NDCGs = []
        self.ERRs = []
        self.HRs = []
        self.training_time_seconds = []
        self.validating_time_seconds = []
        # Optimizer-step counter for annealing. It lives on the instance so
        # the schedule keeps advancing across epochs instead of restarting at
        # zero every epoch (which would keep the weight far below the cap).
        self.update_count = 0

        idxlist = list(range(self.n_users))

        for epoch in range(n_epochs):
            np.random.shuffle(idxlist)

            epoch_start_time = time.time()
            print("Training phase...")
            self.train_one_epoch(train_data, idxlist, epoch_num=epoch+1)
            training_seconds = round(time.time() - epoch_start_time, 2)
            print(f"Training took {training_seconds} seconds.")
            self.training_time_seconds.append(training_seconds)

            # Metrics are only available when validation data was supplied.
            metrics = None
            if self.validation:
                print("Evaluation phase...")
                val_epoch_start_time = time.time()
                metrics = self.validate(train_data, val_data)
                validating_seconds = round(time.time() - val_epoch_start_time, 2)
                print(f"Validating took {validating_seconds} seconds.")
                self.validating_time_seconds.append(validating_seconds)

            epoch_seconds = time.time() - epoch_start_time
            print('-' * 89)
            if metrics is None:
                print('| end of epoch {:3d} | time: {:4.2f}s |'.format(epoch+1, epoch_seconds))
            else:
                ndcg, err, hr = metrics
                # Report the cut-off actually used rather than a fixed "@10".
                print('| end of epoch {:3d} | time: {:4.2f}s | '
                        'NDCG@{k} {:5.3f} | ERR@{k} {:5.3f} | HR@{k} {:5.3f}'.format(
                            epoch+1, epoch_seconds, ndcg, err, hr, k=self.k))
            print('-' * 89)

    def train_one_epoch(self, train_data, idxlist, epoch_num):
        """Run one pass over ``train_data`` in the user order given by ``idxlist``.

        Relies on the attributes set by ``fit`` (``n_users``, ``batch_size``,
        ``optimizer``, ``criterion``, annealing settings, ``log_interval``).
        """
        self.train()
        if not hasattr(self, "update_count"):
            self.update_count = 0

        batch_starts = range(0, self.n_users, self.batch_size)
        n_batches = len(batch_starts)
        running_loss = 0.0
        batches_since_log = 0
        start_time = time.time()

        for batch_idx, start_idx in enumerate(batch_starts):
            end_idx = min(start_idx + self.batch_size, self.n_users)

            # select the rows of this mini-batch and densify them
            data = naive_sparse2tensor(train_data[idxlist[start_idx:end_idx]])

            # annealing weight: linear ramp up to anneal_cap
            if self.total_anneal_steps > 0:
                anneal = min(self.anneal_cap,
                             1. * self.update_count / self.total_anneal_steps)
            else:
                anneal = self.anneal_cap

            self.optimizer.zero_grad()
            recon_batch, mu, logvar = self.forward(data)

            loss = self.criterion(recon_batch, data, mu, logvar, anneal)
            loss.backward()
            running_loss += loss.item()
            self.optimizer.step()

            self.update_count += 1
            batches_since_log += 1

            if batch_idx % self.log_interval == 0 and batch_idx > 0:
                elapsed = time.time() - start_time
                # Average over the batches actually accumulated; the first
                # window holds log_interval + 1 batches (indices 0..log_interval).
                print('| epoch {:3d} | {:4d}/{:4d} batches | ms/batch {:4.2f} | '
                        'loss {:4.2f}'.format(
                            epoch_num, batch_idx, n_batches,
                            elapsed * 1000 / batches_since_log,
                            running_loss / batches_since_log))

                start_time = time.time()
                running_loss = 0.0
                batches_since_log = 0

    def validate(self, train_data, val_data):
        """Compute top-``k`` recommendations for every user and score them.

        val_data - list with n_users tuples of (user_id, item_id)

        Items a user already has in ``train_data`` are excluded from the
        ranking. Returns ``(ndcg, err, hr)`` as reported by ``Evaluator`` and
        appends them to ``NDCGs`` / ``ERRs`` / ``HRs``.
        """
        self.eval()
        val_res = {}
        with torch.no_grad():
            for start_idx in range(0, self.n_users, self.batch_size):
                end_idx = min(start_idx + self.batch_size, self.n_users)

                data = train_data[start_idx:end_idx]
                data_tensor = naive_sparse2tensor(data)

                # predict
                pred, _, _ = self.forward(data_tensor)
                # exclude examples from train set
                pred[data.nonzero()] = -float("Inf")
                _, rec = torch.topk(pred, self.k, dim=-1)

                # rows of `rec` are consecutive users starting at start_idx
                for offset, u_rec in enumerate(rec.numpy()):
                    val_res[start_idx + offset] = u_rec

        # evaluation
        ev = Evaluator(k=self.k, true=val_data, predicted=val_res)
        ev.calculate_metrics()
        ndcg, err, hr = ev.ndcg, ev.err, ev.hr
        self.NDCGs.append(ndcg)
        self.ERRs.append(err)
        self.HRs.append(hr)
        return ndcg, err, hr

    def save_model(self, path):
        """Save the model's ``state_dict`` to ``path`` with ``torch.save``."""
        torch.save(self.state_dict(), path)

    def save_model_params(self, path):
        """Pickle the instance ``__dict__`` (settings, metrics, modules) to ``path``."""
        # Imported locally so the method works even when the surrounding
        # notebook/module has not imported pickle.
        import pickle

        with open(path, "wb") as handle:
            pickle.dump(self.__dict__, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Small n_items -> 600 -> 200 encoder with a mirrored decoder, used to
# inspect the layer stacks of the plain BaseMultiVAE.
encoder_dims = [n_items, 600, 200]
decoder_dims = encoder_dims[::-1]
model = BaseMultiVAE(encoder_dims, decoder_dims)

In [None]:
# Bare last expression: the notebook renders the repr of the model's encoder stack.
model.encoder_layers

In [None]:
# Bare last expression: the notebook renders the repr of the model's decoder stack.
model.decoder_layers

In [None]:
# Training run with the default fit() settings and a
# n_items -> 600 -> 200 encoder mirrored by the decoder.
encoder_dims = [n_items, 600, 200]
decoder_dims = encoder_dims[::-1]

model = TrainableMultVAE(encoder_dims, decoder_dims)

# weight_decay == 0 means no weight decay is applied
optimizer_kwargs = dict(weight_decay=0, lr=5e-4)
optimizer = optim.Adam(model.parameters(), **optimizer_kwargs)
criterion = loss_function

# Validation pairs translated from raw names/ids to dense matrix indices.
val_data = [
    (us_to_ids[user], gs_to_ids[game])
    for user, game in validation_list
]

model.fit(train_data, optimizer, criterion, val_data=val_data)

In [None]:
#m = torch.load(savepath+"vae1")

In [None]:
#model2 = TrainableMultVAE(*args, **kwargs)