# Setup
In your bash shell, run:
```bash
conda create --name RecBole python=3.8
conda activate RecBole
git clone https://github.com/RUCAIBox/RecBole.git && cd RecBole
pip install -e . --verbose
conda install ray
conda install -c anaconda jupyter 
conda install hyperopt
conda install -c anaconda ipykernel
python -m ipykernel install --user --name=RecBole
jupyter lab
```

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

from recbole.quick_start import load_data_and_model

# VAE

In [2]:
# Show the current working directory; the dataset is expected under '<cwd>/dataset/onion'.
os.getcwd()

'/home/marta/jku/kfold/recbole/properties'

The onion dataset used for training the model should be stored at

```python
os.getcwd() + '/dataset/onion/onion.inter'
```

In this case: `'/home/marta/jku/kfold/recbole/properties/dataset/onion/onion.inter'`

In [3]:
# 'onion.inter' is the file that MUST be there.
os.listdir(os.path.join(os.getcwd(), 'dataset', 'onion'))

['onion.inter']

The best model has hyperparams:

```python
anneal_cap:1.0, latent_dimension:1000, learning_rate:0.001, mlp_hidden_size:[], total_anneal_steps:5000.0
Valid result:
recall@10 : 0.461    mrr@10 : 0.6899    ndcg@10 : 0.4949    hit@10 : 0.7519    precision@10 : 0.1979
Test result:
recall@10 : 0.4594    mrr@10 : 0.685    ndcg@10 : 0.4955    hit@10 : 0.7424    precision@10 : 0.196
```

The variable `model_path` is the path to where the model is stored.


In [4]:
# Saved MultiVAE checkpoint to load (machine-specific absolute path; adjust it for your setup).
model_path = '/home/marta/jku/kfold/saved/MultiVAE-Dec-05-2022_17-02-47.pth'

Run the following cells only once, not every time you compute the embeddings.

In [5]:
from recbole.utils import (
    init_logger,
    get_model,
    get_trainer,
    init_seed,
    set_color,
    get_flops,
)

from logging import getLogger


from recbole.data import (
    create_dataset,
    data_preparation,
    save_split_dataloaders,
    load_split_dataloaders,
)

In [6]:
def load_data_and_model(
    model_file,
    device='cpu',
):
    r"""Restore a saved model together with its dataset and split dataloaders.

    This local definition replaces the ``load_data_and_model`` name imported
    from ``recbole.quick_start`` earlier in the notebook; it additionally takes
    a ``device`` argument.

    Args:
        model_file (str): Path of the saved checkpoint file.
        device (str): Device the checkpoint tensors are mapped to and the
            rebuilt model is moved to. Defaults to ``'cpu'``.

    Returns:
        tuple:
            - config (Config): Configuration object stored inside the checkpoint.
            - model (AbstractRecommender): Model rebuilt from the checkpoint weights.
            - dataset (Dataset): The filtered dataset.
            - train_data (AbstractDataLoader): Dataloader for training.
            - valid_data (AbstractDataLoader): Dataloader for validation.
            - test_data (AbstractDataLoader): Dataloader for testing.
    """
    import torch

    saved = torch.load(model_file, map_location=torch.device(device))
    config = saved["config"]

    def _reseed():
        # Seed settings are read from the config each time, exactly as stored in the checkpoint.
        init_seed(config["seed"], config["reproducibility"])

    _reseed()
    init_logger(config)
    log = getLogger()
    log.info(config)

    # Rebuild the dataset and its train/valid/test dataloaders from the stored config.
    dataset = create_dataset(config)
    log.info(dataset)
    train_data, valid_data, test_data = data_preparation(config, dataset)

    # Seed again right before the model is constructed
    # (presumably so initialisation does not depend on data preparation - TODO confirm).
    _reseed()
    model_class = get_model(config["model"])
    model = model_class(config, train_data._dataset).to(device)
    model.load_state_dict(saved["state_dict"])
    model.load_other_parameter(saved.get("other_parameter"))

    return config, model, dataset, train_data, valid_data, test_data


In [7]:
# Restore the checkpoint at `model_path`; the train and validation dataloaders are not needed here.
config, model, dataset, _, _, test_data = load_data_and_model(model_path)


15 Dec 15:31    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/onion
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = True

Training Hyper Parameters:
epochs = 300
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [0.8, 0.1, 0.1]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
topk = [10]
valid_metric = MRR@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_separat

In [14]:
# Map every internal item id (0 .. n_items - 1) back to its original token.
item_tokens = list(dataset.id2token(dataset.iid_field, np.arange(model.n_items)))

# NOTE(review): `items_above_threshold` is not defined in any visible cell - confirm where it
# comes from before running this cell on a fresh kernel.
with open('/home/marta/jku/wallmann/coding_2/wallmann_onion/items_above_threshold.pkl', 'wb') as f:
    pickle.dump(items_above_threshold, f)

# Kept as the last expression so the token list is actually displayed as the cell output.
item_tokens

['[PAD]',
 '26905277',
 '37941692',
 '20289221',
 '34847764',
 '5440568',
 '16244668',
 '17066777',
 '25308630',
 '35727089',
 '14948664',
 '31579245',
 '32650016',
 '2533053',
 '43735960',
 '45926480',
 '10873065',
 '11828220',
 '30775463',
 '22527186',
 '6716156',
 '37423949',
 '24741463',
 '26673008',
 '3351673',
 '29513422',
 '46094145',
 '42006632',
 '18700882',
 '34274002',
 '8405110',
 '10353218',
 '17532668',
 '40483721',
 '2633915',
 '10460617',
 '12150705',
 '21244973',
 '30694781',
 '17088532',
 '26464941',
 '10498359',
 '20209048',
 '6713562',
 '7348943',
 '24867911',
 '21222274',
 '47428625',
 '17683672',
 '41262214',
 '25992559',
 '21084003',
 '44492745',
 '38876497',
 '7022364',
 '11291742',
 '7276924',
 '9837938',
 '32739743',
 '41021483',
 '43965831',
 '7037718',
 '25545448',
 '47136687',
 '11520037',
 '46471994',
 '33209153',
 '5079048',
 '44018018',
 '25209206',
 '30213636',
 '29450686',
 '37203309',
 '10612162',
 '26958343',
 '6627581',
 '26977095',
 '41584879',
 '5

In [8]:
def get_users_embeddings(model, rating_matrix, device='cpu'):
    """
    Embed all users of a rating matrix with the model's encoder.

    Every row of ``rating_matrix`` is L2-normalised, passed through
    ``model.encoder``, and only the first ``model.lat_dim // 2`` output columns
    (named ``mu`` in the original code) are returned.

    :param model: model exposing ``encoder`` and ``lat_dim``; it is moved to
        ``device`` in place.
    :param rating_matrix: NumPy array or torch tensor with one row per user
        (presumably one column per item, matching the encoder input - verify
        against the model).
    :param device: device on which the encoder is evaluated. Defaults to ``'cpu'``.
    :return: CPU float tensor of shape (rows of ``rating_matrix``,
        ``model.lat_dim // 2``), detached from the autograd graph.
    """
    # Honour the requested device; it used to be accepted but ignored.
    model = model.to(device)
    ratings = torch.as_tensor(rating_matrix, dtype=torch.float32, device=device)

    # Inference only: skip building an autograd graph that would be thrown away.
    with torch.no_grad():
        encoded = model.encoder(F.normalize(ratings))

    # Always hand back a CPU tensor so callers can use `.numpy()` whatever the device.
    return encoded[:, : model.lat_dim // 2].cpu()

Then, every time you want to get the embeddings for ALL users, run the command:
```python
get_users_embeddings(model, ratings)
```

### EXAMPLE
To show how to use this, I randomly initialize a numpy array which mimics the ratings of all users, with values between 0 and 5.

In [9]:
# One fake rating row per user known to the model.
number_of_users_to_embed = model.n_users

# `np.random.randint` excludes `high`, so add 1 to make the top rating (5) reachable.
max_rating = 5

random_ratings = np.random.randint(
    low=0,
    high=max_rating + 1,
    size=(number_of_users_to_embed, model.n_items),
).astype(np.float32)

In [10]:
%%time
# Embed every row of the random rating matrix with the loaded model and display the result.
embeddings = get_users_embeddings(model, random_ratings)
embeddings

CPU times: user 6.7 s, sys: 280 ms, total: 6.98 s
Wall time: 6.99 s


tensor([[-1.2680,  1.4486, -1.3804,  ..., -2.3318, -1.7511,  2.3152],
        [-1.1380,  1.3699, -1.3971,  ..., -2.2866, -1.8190,  2.3561],
        [-1.0934,  1.1066, -1.2829,  ..., -2.3980, -1.7706,  2.2853],
        ...,
        [-1.1269,  1.2225, -1.4497,  ..., -2.3581, -1.8424,  2.3597],
        [-1.2089,  1.3597, -1.5738,  ..., -2.3124, -1.8852,  2.4201],
        [-1.1323,  1.2751, -1.5294,  ..., -2.4074, -1.8562,  2.4560]])

Getting all the embeddings takes roughly 7 seconds on my laptop, 5 seconds on my tower computer.

If needed, you can convert the embeddings to a NumPy array (or call any other method of PyTorch's `torch.Tensor` class):


In [11]:
# Rebinds `embeddings` from a torch tensor to a NumPy array.
# NOTE(review): re-running this cell alone fails, because NumPy arrays have no `.numpy()` method.
embeddings = embeddings.numpy()
embeddings

array([[-1.2679526,  1.4485517, -1.3804485, ..., -2.3318236, -1.7511073,
         2.3152251],
       [-1.1379719,  1.3698629, -1.3970743, ..., -2.2865617, -1.8190303,
         2.3561013],
       [-1.0934396,  1.1066333, -1.2828555, ..., -2.3980165, -1.7706003,
         2.2853076],
       ...,
       [-1.1268995,  1.2225186, -1.4496777, ..., -2.3580883, -1.8424143,
         2.3596866],
       [-1.2089083,  1.3596578, -1.5738242, ..., -2.3124287, -1.8851516,
         2.420097 ],
       [-1.1323131,  1.2750775, -1.5293826, ..., -2.4073868, -1.8561652,
         2.4559958]], dtype=float32)

The shape is (number of users + 1, embedding dimension) because RecBole reserves index 0 for a `[PAD]` placeholder user, which is an artifact of RecBole rather than a real user.

In [12]:
# (rows of the rating matrix, model.lat_dim // 2)
embeddings.shape

(14325, 500)

# BPR

In [13]:
# Saved BPR checkpoint to load; rebinds `model_path` (machine-specific absolute path; adjust it for your setup).
model_path = '/home/marta/jku/kfold/saved/BPR-Dec-05-2022_16-16-44.pth'

In [14]:
# Restore the checkpoint at `model_path`; this rebinds `config`, `model`, `dataset` and `test_data`.
config, model, dataset, _, _, test_data = load_data_and_model(model_path)


07 Dec 10:55    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/onion
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = True

Training Hyper Parameters:
epochs = 300
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [0.8, 0.1, 0.1]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
topk = [10]
valid_metric = MRR@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_separat

In [15]:
# Look up the embedding of every internal user id (0 .. n_users - 1) and convert it to a NumPy array.
embeddings = (
    model.get_user_embedding(torch.from_numpy(np.arange(model.n_users)))
    .detach()
    .cpu()
    .numpy()
)

In [16]:
# Original user tokens for every internal user id; as the output shows, index 0 is the '[PAD]' placeholder.
dataset.id2token(dataset.uid_field, np.arange(model.n_users))

array(['[PAD]', '51918', '70143', ..., '32569', '86959', '21778'],
      dtype='<U6')

In [17]:
# Original user tokens, in the same order as the rows of `embeddings`.
user_tokens = dataset.id2token(dataset.uid_field, np.arange(model.n_users))

# One row per user, indexed by the original user token, built in a single chained step.
bpr_embeddings = (
    pd.DataFrame(embeddings)
    .assign(user_id=user_tokens)
    .set_index('user_id')
)

bpr_embeddings.to_csv('BPR_embeddings.tsv', sep='\t')

# Preview goes last: a mid-cell `.head()` is evaluated but never displayed.
bpr_embeddings.head()