In [None]:
%%capture
!pip install kmeans_pytorch ray recbole

In [5]:
# Imported here because this cell runs before the main import cell; without it a
# fresh "Restart & Run All" fails with NameError on `drive`.
from google.colab import drive

# Mount Google Drive so the CSV inputs and the JSON output below are reachable.
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
import ast
import json
import logging
import os
import pickle
import time
import warnings
from collections import Counter
from logging import getLogger
from pathlib import Path
from random import randint, random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from google.colab import drive
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.data.interaction import Interaction
from recbole.model.sequential_recommender import Caser, GRU4Rec
from recbole.quick_start import run_recbole
from recbole.trainer import Trainer
from recbole.utils import init_logger, init_seed
from scipy.sparse import coo_matrix, hstack
from sklearn.metrics.pairwise import (
    cosine_distances,
    cosine_similarity,
    euclidean_distances,
)

In [4]:
# Ignore every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

# Load data

In [6]:
# Single place to change when the processed KION CSVs live somewhere else.
DATA_DIR = Path("/content/drive/MyDrive/data_original")

# Load the three pre-processed tables: interactions, user features, item features.
interactions_df = pd.read_csv(DATA_DIR / "interactions_processed_kion.csv")
users_df = pd.read_csv(DATA_DIR / "users_processed_kion.csv")
items_df = pd.read_csv(DATA_DIR / "items_processed_kion.csv")

In [7]:
# Parse the last-watch date, then derive an integer epoch value used as the time field.
# assumes the parsed column is datetime64[ns], so // 10**9 yields whole seconds — TODO confirm on newer pandas
watch_dates = pd.to_datetime(interactions_df["last_watch_dt"], format="%Y-%m-%d")
interactions_df["t_dat"] = watch_dates
interactions_df["timestamp"] = watch_dates.to_numpy().astype(np.int64) // 10**9

In [8]:
# Keep only the three interaction columns and rename them to "<field>:<type>" headers,
# the form written to the .inter file below.
recbole_columns = {
    "user_id": "user_id:token",
    "item_id": "item_id:token",
    "timestamp": "timestamp:float",
}
df = interactions_df[list(recbole_columns)].rename(columns=recbole_columns)

In [9]:
# Create the dataset folder; exist_ok keeps the cell re-runnable
# (a plain `mkdir` reports an error when the folder already exists).
os.makedirs("recbox_data", exist_ok=True)

In [10]:
# Write the interactions as a tab-separated file without the pandas index column.
df.to_csv("recbox_data/recbox_data.inter", sep="\t", index=False)

# Обучение моделей

In [11]:
# RecBole settings shared by every training run in this notebook.
parameter_dict = {
    # Empty data_path: the "recbox_data" folder is looked up relative to the working directory.
    "data_path": "",
    "USER_ID_FIELD": "user_id",
    "ITEM_ID_FIELD": "item_id",
    "TIME_FIELD": "timestamp",
    # `device` is not a user option: RecBole computes config["device"] itself from
    # `use_gpu`, so the previous "device": "GPU" entry was silently overwritten.
    "use_gpu": True,
    # Keep only users and items with at least 40 interactions.
    "user_inter_num_interval": "[40,inf)",
    "item_inter_num_interval": "[40,inf)",
    "load_col": {"inter": ["user_id", "item_id", "timestamp"]},
    # NOTE(review): newer RecBole releases appear to use `train_neg_sample_args`
    # instead of `neg_sampling` — confirm against the installed version.
    "neg_sampling": None,
    "epochs": 10,
    # 9:0:1 split leaves no validation part, which is why the runs below report
    # best_valid_score = -inf and models are compared on the test metrics only.
    "eval_args": {"split": {"RS": [9, 0, 1]}, "group_by": "user", "order": "TO", "mode": "full"},
}
config = Config(model="MultiVAE", dataset="recbox_data", config_dict=parameter_dict)

# init random seed
init_seed(config["seed"], config["reproducibility"])

# logger initialization
init_logger(config)
logger = getLogger()
# Create handlers
# NOTE(review): init_logger may already attach a console handler; if log lines
# show up twice, drop this extra StreamHandler — confirm.
c_handler = logging.StreamHandler()
c_handler.setLevel(logging.INFO)
logger.addHandler(c_handler)

# write config info into log
# logger.info(config)



In [12]:
# Build the RecBole dataset object from `config`, then log its summary.
dataset = create_dataset(config)
logger.info(dataset)

In [13]:
# Split the dataset into train / validation / test parts
# (presumably following the `eval_args` split in `parameter_dict` — confirm).
train_data, valid_data, test_data = data_preparation(config, dataset)

## Выбор архитектуры

In [14]:
%%time
model_list = ["MultiVAE", "MultiDAE", "MacridVAE", "NeuMF"]

for model_name in model_list:
    print(f"running {model_name}...")
    start = time.time()
    result = run_recbole(model=model_name, dataset="recbox_data", config_dict=parameter_dict)
    t = time.time() - start
    print(f"It took {t/60:.2f} mins")
    print(result)

running MultiVAE...


command line args [-f /root/.local/share/jupyter/runtime/kernel-448bdf4a-8481-485c-b525-87157a440bc9.json] will not be used in RecBole
Max value of user's history interaction records has reached 20.9471766848816% of the total.
Train     0: 100%|███████████████████████████| 7/7 [00:00<00:00, 12.79it/s, GPU RAM: 0.38 G/14.75 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00, 14.85it/s, GPU RAM: 0.38 G/14.75 G]
Train     2: 100%|███████████████████████████| 7/7 [00:00<00:00, 14.05it/s, GPU RAM: 0.38 G/14.75 G]
Train     3: 100%|███████████████████████████| 7/7 [00:00<00:00, 15.07it/s, GPU RAM: 0.38 G/14.75 G]
Train     4: 100%|███████████████████████████| 7/7 [00:00<00:00, 15.53it/s, GPU RAM: 0.38 G/14.75 G]
Train     5: 100%|███████████████████████████| 7/7 [00:00<00:00, 15.14it/s, GPU RAM: 0.38 G/14.75 G]
Train     6: 100%|███████████████████████████| 7/7 [00:00<00:00, 14.06it/s, GPU RAM: 0.38 G/14.75 G]
Train     7: 100%|███████████████████████████| 7/7 [00:00<00:00, 1

It took 3.33 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0834), ('mrr@10', 0.1671), ('ndcg@10', 0.0816), ('hit@10', 0.3466), ('precision@10', 0.0462)])}
running MultiDAE...


command line args [-f /root/.local/share/jupyter/runtime/kernel-448bdf4a-8481-485c-b525-87157a440bc9.json] will not be used in RecBole
Max value of user's history interaction records has reached 20.9471766848816% of the total.
Train     0: 100%|███████████████████████████| 7/7 [00:00<00:00,  8.75it/s, GPU RAM: 0.38 G/14.75 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00, 15.04it/s, GPU RAM: 0.38 G/14.75 G]
Train     2: 100%|███████████████████████████| 7/7 [00:00<00:00, 12.46it/s, GPU RAM: 0.38 G/14.75 G]
Train     3: 100%|███████████████████████████| 7/7 [00:00<00:00,  7.12it/s, GPU RAM: 0.38 G/14.75 G]
Train     4: 100%|███████████████████████████| 7/7 [00:00<00:00,  7.06it/s, GPU RAM: 0.38 G/14.75 G]
Train     5: 100%|███████████████████████████| 7/7 [00:00<00:00, 12.36it/s, GPU RAM: 0.38 G/14.75 G]
Train     6: 100%|███████████████████████████| 7/7 [00:00<00:00,  8.09it/s, GPU RAM: 0.38 G/14.75 G]
Train     7: 100%|███████████████████████████| 7/7 [00:00<00:00, 1

It took 4.46 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0837), ('mrr@10', 0.1657), ('ndcg@10', 0.0814), ('hit@10', 0.3466), ('precision@10', 0.0463)])}
running MacridVAE...


command line args [-f /root/.local/share/jupyter/runtime/kernel-448bdf4a-8481-485c-b525-87157a440bc9.json] will not be used in RecBole
Max value of user's history interaction records has reached 20.9471766848816% of the total.
Train     0: 100%|███████████████████████████| 7/7 [00:07<00:00,  1.08s/it, GPU RAM: 0.95 G/14.75 G]
Train     1: 100%|███████████████████████████| 7/7 [00:24<00:00,  3.45s/it, GPU RAM: 0.95 G/14.75 G]
Train     2: 100%|███████████████████████████| 7/7 [00:05<00:00,  1.21it/s, GPU RAM: 0.95 G/14.75 G]
Train     3: 100%|███████████████████████████| 7/7 [00:01<00:00,  3.67it/s, GPU RAM: 0.95 G/14.75 G]
Train     4: 100%|███████████████████████████| 7/7 [00:01<00:00,  3.66it/s, GPU RAM: 0.95 G/14.75 G]
Train     5: 100%|███████████████████████████| 7/7 [00:01<00:00,  3.63it/s, GPU RAM: 0.95 G/14.75 G]
Train     6: 100%|███████████████████████████| 7/7 [00:01<00:00,  3.68it/s, GPU RAM: 0.95 G/14.75 G]
Train     7: 100%|███████████████████████████| 7/7 [00:02<00:00,  

It took 9.06 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0827), ('mrr@10', 0.1548), ('ndcg@10', 0.0775), ('hit@10', 0.3469), ('precision@10', 0.0455)])}
running NeuMF...


command line args [-f /root/.local/share/jupyter/runtime/kernel-448bdf4a-8481-485c-b525-87157a440bc9.json] will not be used in RecBole
Train     0: 100%|███████████████████████| 755/755 [00:36<00:00, 20.50it/s, GPU RAM: 0.95 G/14.75 G]
Train     1: 100%|███████████████████████| 755/755 [00:36<00:00, 20.84it/s, GPU RAM: 0.95 G/14.75 G]
Train     2: 100%|███████████████████████| 755/755 [00:37<00:00, 20.37it/s, GPU RAM: 0.95 G/14.75 G]
Train     3: 100%|███████████████████████| 755/755 [00:37<00:00, 20.20it/s, GPU RAM: 0.95 G/14.75 G]
Train     4: 100%|███████████████████████| 755/755 [00:36<00:00, 20.65it/s, GPU RAM: 0.95 G/14.75 G]
Train     5: 100%|███████████████████████| 755/755 [00:36<00:00, 20.89it/s, GPU RAM: 0.95 G/14.75 G]
Train     6: 100%|███████████████████████| 755/755 [00:36<00:00, 20.82it/s, GPU RAM: 0.95 G/14.75 G]
Train     7: 100%|███████████████████████| 755/755 [00:35<00:00, 21.05it/s, GPU RAM: 0.95 G/14.75 G]
Train     8: 100%|███████████████████████| 755/755 [00:36

It took 10.59 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0687), ('mrr@10', 0.1181), ('ndcg@10', 0.0607), ('hit@10', 0.3008), ('precision@10', 0.038)])}
CPU times: user 23min 27s, sys: 1min 31s, total: 24min 58s
Wall time: 27min 27s


# Получение предсказаний для оффлайн инференса

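Лучшей по совокупности метрик и скорости обучения оказалась `MultiVAE`: качество на уровне `MultiDAE` (ndcg@10 0.0816 против 0.0814), а обучение заняло меньше всего времени (3.33 мин против 4.46 мин).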

In [18]:
from recbole.model.general_recommender.multivae import MultiVAE

In [20]:
# Retrain the selected architecture with the shared settings and keep the returned metrics dict.
result = run_recbole(
    model="MultiVAE",
    dataset="recbox_data",
    config_dict=parameter_dict,
)

command line args [-f /root/.local/share/jupyter/runtime/kernel-448bdf4a-8481-485c-b525-87157a440bc9.json] will not be used in RecBole
Max value of user's history interaction records has reached 20.9471766848816% of the total.
Train     0: 100%|███████████████████████████| 7/7 [00:00<00:00, 12.64it/s, GPU RAM: 1.25 G/14.75 G]
Train     1: 100%|███████████████████████████| 7/7 [00:00<00:00, 14.43it/s, GPU RAM: 1.25 G/14.75 G]
Train     2: 100%|███████████████████████████| 7/7 [00:00<00:00, 14.02it/s, GPU RAM: 1.25 G/14.75 G]
Train     3: 100%|███████████████████████████| 7/7 [00:00<00:00, 13.39it/s, GPU RAM: 1.25 G/14.75 G]
Train     4: 100%|███████████████████████████| 7/7 [00:00<00:00, 13.64it/s, GPU RAM: 1.25 G/14.75 G]
Train     5: 100%|███████████████████████████| 7/7 [00:00<00:00, 14.39it/s, GPU RAM: 1.25 G/14.75 G]
Train     6: 100%|███████████████████████████| 7/7 [00:00<00:00, 14.52it/s, GPU RAM: 1.25 G/14.75 G]
Train     7: 100%|███████████████████████████| 7/7 [00:00<00:00, 1

In [16]:
# Display the metrics dict returned by the most recent run_recbole call.
result

{'best_valid_score': -inf,
 'valid_score_bigger': True,
 'best_valid_result': None,
 'test_result': OrderedDict([('recall@10', 0.0837),
              ('mrr@10', 0.1657),
              ('ndcg@10', 0.0814),
              ('hit@10', 0.3466),
              ('precision@10', 0.0463)])}

In [19]:
# Rebuild the architecture on the full dataset and restore the trained weights.
model = MultiVAE(config, dataset=dataset)

# The checkpoint file name contains a timestamp, so a hard-coded name breaks on every
# re-run; pick the most recently written MultiVAE checkpoint instead.
checkpoint_paths = sorted(Path("/content/saved").glob("MultiVAE-*.pth"), key=os.path.getmtime)
if not checkpoint_paths:
    raise FileNotFoundError(
        "No MultiVAE checkpoint found in /content/saved; run the training cell first."
    )

# map_location lets a checkpoint saved on GPU load on whatever device config selected.
# NOTE(review): recent torch versions may require weights_only=False for this checkpoint — confirm.
checkpoint = torch.load(checkpoint_paths[-1], map_location=config["device"])
model.load_state_dict(checkpoint["state_dict"])

Max value of user's history interaction records has reached 23.254401942926535% of the total.


<All keys matched successfully>

In [23]:
# Move the model to config["device"]; as the cell's last expression, the returned module is displayed.
model.to(config["device"])

MultiVAE(
  (encoder): Sequential(
    (0): Linear(in_features=3294, out_features=600, bias=True)
    (1): Tanh()
    (2): Linear(in_features=600, out_features=128, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=64, out_features=600, bias=True)
    (1): Tanh()
    (2): Linear(in_features=600, out_features=3294, bias=True)
  )
)

In [24]:
def recommend_item(external_user_id, dataset, model, k=10):
    """Return the top-``k`` item tokens the model scores highest for one user.

    Args:
        external_user_id: User token as it appears in the raw data (before
            RecBole remaps it to an internal id).
        dataset: RecBole dataset providing the token/id mappings and the
            interaction rows.
        model: Trained recommender exposing ``full_sort_predict``.
        k: Number of items to return. Clamped to the number of real items
            (``dataset.item_num - 1``) so ``torch.topk`` cannot fail.

    Returns:
        ``[]`` when the user is unknown, is the ``"[PAD]"`` placeholder, or has
        no interaction rows; otherwise a list holding one inner list of item
        tokens, best first (callers unwrap it with ``[0]``).

    Note:
        Only item index 0 is masked out (presumably RecBole's padding slot);
        items the user already interacted with are not filtered here.
    """
    known_users = dataset.field2token_id[dataset.uid_field]
    if external_user_id == "[PAD]" or external_user_id not in known_users:
        return []

    # Use the function's own arguments rather than the notebook-level `config`
    # and `test_data` globals, so it works with any dataset/model pair.
    device = getattr(model, "device", None)
    if device is None:
        device = next(model.parameters()).device
    item_num = dataset.item_num

    top_k = min(k, item_num - 1)
    if top_k <= 0:
        return []

    model.eval()
    with torch.no_grad():
        uid_series = dataset.token2id(dataset.uid_field, [external_user_id])
        # Select every interaction row that belongs to this user.
        index = np.isin(dataset[dataset.uid_field].numpy(), uid_series)
        new_inter = dataset[index].to(device)
        new_scores = model.full_sort_predict(new_inter).view(-1, item_num)
        if new_scores.shape[0] == 0:
            return []
        # Index 0 must never be recommended.
        new_scores[:, 0] = -np.inf
        # All selected rows belong to the same user, so the first row is used.
        recommended_item_indices = torch.topk(new_scores, top_k).indices[0].tolist()
    return dataset.id2token(dataset.iid_field, [recommended_item_indices]).tolist()

In [26]:
from tqdm.notebook import tqdm

In [27]:
# Build {user token: list of recommended item tokens} for every known user.
# Users for which recommend_item returns an empty list are left out.
users = dataset.field2token_id[dataset.uid_field]
scored_users = ((uid, recommend_item(uid, dataset, model)) for uid in tqdm(users))
# recommend_item wraps its result in an outer list; [0] unwraps the item tokens.
recos = {uid: found[0] for uid, found in scored_users if found}

  0%|          | 0/13355 [00:00<?, ?it/s]

In [29]:
# Persist the per-user recommendations to Drive as JSON for offline inference.
output_path = "/content/drive/MyDrive/recbone.json"
with open(output_path, "w") as out_file:
    json.dump(recos, out_file)