In [17]:
from recbole.quick_start import load_data_and_model
import numpy as np
import pandas as pd
import torch.nn.functional as F
import torch
import os

# VAE

In [18]:
# Display the current working directory: the dataset location used by this
# notebook (`dataset/onion`) is resolved relative to it.
os.getcwd()

'/home/marta/jku/RecBole/recbole/properties'

The onion dataset used for training the model should be stored at

```python
os.getcwd() + '/dataset/onion/onion.inter'
```

In this case `'/home/marta/jku/RecBole/recbole/properties/dataset/onion/onion.inter'`

In [20]:
# `onion.inter` is the file that MUST appear in this listing. Look for it by
# name: os.listdir returns entries in arbitrary order, so its position in the
# output is not meaningful.
os.listdir(os.path.join(os.getcwd(), 'dataset', 'onion'))

['onion_timestamp_thresh.inter', 'onion_binarized_thresh.inter', 'onion.inter']

The best model has hyperparams:

```python
anneal_cap:1.0, latent_dimension:1000, learning_rate:0.001, mlp_hidden_size:[], total_anneal_steps:5000.0
Valid result:
recall@10 : 0.461    mrr@10 : 0.6899    ndcg@10 : 0.4949    hit@10 : 0.7519    precision@10 : 0.1979
Test result:
recall@10 : 0.4594    mrr@10 : 0.685    ndcg@10 : 0.4955    hit@10 : 0.7424    precision@10 : 0.196
```

The variable `model_path` is the path to where the model is stored.


In [22]:
# Absolute path to the saved MultiVAE checkpoint (.pth). This value is specific
# to the author's machine; point it at your own checkpoint before running.
model_path = '/home/marta/jku/RecBole/saved/MultiVAE-Dec-05-2022_17-02-47.pth'

Run the following cell only once per session: it loads the model and the dataset, and does not need to be repeated each time you compute embeddings.

In [23]:
# Restore the checkpoint together with its config and dataset. The two values
# bound to `_` (the train and validation data, per RecBole's documentation of
# `load_data_and_model`) are not needed here and are discarded.
(config, model, dataset, _, _, test_data) = load_data_and_model(model_file=model_path)


06 Dec 15:31    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/onion
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = True

Training Hyper Parameters:
epochs = 300
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [0.8, 0.1, 0.1]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
topk = [10]
valid_metric = MRR@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_separat

In [24]:
def get_users_embeddings(model, rating_matrix, device='cpu'):
    """
    Compute the latent embedding of every user in ``rating_matrix``.

    Each row is L2-normalised, passed through ``model.encoder``, and the first
    ``model.lat_dim / 2`` columns of the encoder output are returned (for a
    MultiVAE-style encoder this half is presumably the mean vector ``mu`` and
    the other half the log-variance -- confirm against the model definition).

    :param model: trained model exposing ``encoder`` and ``lat_dim``.
        NOTE: the model is moved to ``device`` in place.
    :param rating_matrix: array-like of shape (n_users, n_items); a NumPy
        array or a torch tensor. It is converted to float32.
    :param device: device on which the encoder is evaluated (default 'cpu').
    :return: detached tensor of shape (n_users, lat_dim / 2) living on
        ``device``; call ``.cpu()`` before ``.numpy()`` if it is not the CPU.
    """
    model = model.to(device)
    # as_tensor accepts both ndarrays and tensors (from_numpy rejects tensors).
    ratings = torch.as_tensor(rating_matrix).float().to(device)
    # Inference only: skip building the autograd graph for the whole matrix.
    with torch.no_grad():
        encoded = model.encoder(F.normalize(ratings))
    return encoded[:, : int(model.lat_dim / 2)]

Then, every time you want to get the embeddings for ALL users, run the command:
```python
get_users_embeddings(model, ratings)
```

### EXAMPLE
To show how to use this, I randomly initialize a numpy array that mimics the ratings of all users, with integer values from 0 to 4 (the upper bound `high=5` of `np.random.randint` is exclusive).

In [25]:
number_of_users_to_embed = model.n_users

# Stand-in interaction matrix: one row per user, one column per item.
# `high` is exclusive, so the sampled integers lie in {0, 1, 2, 3, 4}.
random_ratings = np.random.randint(
    low=0,
    high=5,
    size=(number_of_users_to_embed, model.n_items),
)

# float32 copy of the random matrix. It must be derived from `random_ratings`:
# `ratings` itself does not exist yet on a fresh kernel.
ratings = random_ratings.astype(np.float32)

In [27]:
%%time
# Embed every user of the random matrix; %%time reports the cost of the full pass.
embeddings = get_users_embeddings(model, random_ratings)
embeddings

CPU times: user 4.55 s, sys: 436 ms, total: 4.98 s
Wall time: 4.98 s


tensor([[-1.2680,  1.4486, -1.3804,  ..., -2.3318, -1.7511,  2.3152],
        [-1.1380,  1.3699, -1.3971,  ..., -2.2866, -1.8190,  2.3561],
        [-1.0934,  1.1066, -1.2829,  ..., -2.3980, -1.7706,  2.2853],
        ...,
        [-1.1269,  1.2225, -1.4497,  ..., -2.3581, -1.8424,  2.3597],
        [-1.2089,  1.3597, -1.5738,  ..., -2.3124, -1.8852,  2.4201],
        [-1.1323,  1.2751, -1.5294,  ..., -2.4074, -1.8562,  2.4560]])

Getting all the embeddings takes roughly 5 seconds.

If needed, you can convert the embeddings to a numpy array (or call any other method of PyTorch's `torch.Tensor` class on them):


In [28]:
# Convert only while `embeddings` is still a tensor, so that re-running this
# cell is a no-op instead of raising AttributeError on the NumPy array.
# `.cpu()` makes the conversion work even if the tensor lives on a GPU.
if isinstance(embeddings, torch.Tensor):
    embeddings = embeddings.detach().cpu().numpy()
embeddings

array([[-1.2679526,  1.4485517, -1.3804485, ..., -2.3318236, -1.7511073,
         2.3152251],
       [-1.1379719,  1.3698629, -1.3970743, ..., -2.2865617, -1.8190303,
         2.3561013],
       [-1.0934396,  1.1066333, -1.2828555, ..., -2.3980165, -1.7706003,
         2.2853076],
       ...,
       [-1.1268995,  1.2225186, -1.4496777, ..., -2.3580883, -1.8424143,
         2.3596866],
       [-1.2089083,  1.3596578, -1.5738242, ..., -2.3124287, -1.8851516,
         2.420097 ],
       [-1.1323131,  1.2750775, -1.5293826, ..., -2.4073868, -1.8561652,
         2.4559958]], dtype=float32)

The shape is (number of users + 1, embedding dimension): row 0 does not correspond to a real user but to the `[PAD]` placeholder that RecBole inserts at index 0.

In [32]:
# (number of rows, embedding dimension) of the embedding matrix.
embeddings.shape

(14325, 500)

# BPR

In [62]:
# Absolute path to the saved BPR checkpoint (.pth). This value is specific to
# the author's machine; point it at your own checkpoint before running.
model_path = '/home/marta/jku/RecBole/saved/BPR-Dec-05-2022_16-16-44.pth'

# Load the BPR checkpoint so that `model` and `dataset` refer to BPR from here
# on. Without this step `model` is still whatever was loaded earlier in the
# kernel and the cells below only work through hidden state.
config, model, dataset, _, _, test_data = load_data_and_model(model_file=model_path)

In [19]:
import torch

In [54]:
# Build the id tensor on the same device as the model's parameters: a CPU id
# tensor fed to a model that sits on the GPU raises a device-mismatch error.
bpr_device = next(model.parameters()).device
all_user_ids = torch.arange(int(model.n_users), device=bpr_device)

# One embedding row per internal user id (0 .. n_users - 1), as a NumPy array.
embeddings = model.get_user_embedding(all_user_ids).detach().cpu().numpy()

In [40]:
# Map the internal user indices 0 .. n_users - 1 back to the original user-id
# tokens. In the displayed output, index 0 maps to the '[PAD]' token.
dataset.id2token(dataset.uid_field, np.arange(model.n_users))

array(['[PAD]', '51918', '70143', ..., '32569', '86959', '21778'],
      dtype='<U6')

In [57]:
# One row per internal user id, indexed by the original user-id token.
bpr_embeddings = pd.DataFrame(embeddings)
bpr_embeddings['user_id'] = dataset.id2token(dataset.uid_field, np.arange(model.n_users))
bpr_embeddings = bpr_embeddings.set_index('user_id')

# Persist the table as a tab-separated file (the index is written as the first column).
bpr_embeddings.to_csv('BPR_embeddings.tsv', sep='\t')

# Keep the preview as the LAST expression: a notebook only displays the value
# of the final expression in a cell, so a `.head()` in the middle is discarded.
bpr_embeddings.head()