In [1]:
%load_ext autoreload
%autoreload 2

import os

# Move to the project root (two levels above the notebook) so that relative
# paths such as 'data/steam' resolve. The starting directory is remembered the
# first time the cell runs; a plain relative chdir would climb two more levels
# on every re-execution of this cell.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, "../../"))
print(os.getcwd())

C:\Users\Milosz\thesis-recsys


In [2]:
import pandas as pd
import numpy as np
import functools
import operator
import json
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import pickle

import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader, IterableDataset
from torch.utils.tensorboard import SummaryWriter
import torch_geometric
import torch_geometric.transforms as T
from torch_geometric.sampler import NegativeSampling
from torch_geometric.loader import LinkNeighborLoader, NeighborLoader
from torch_geometric.data import HeteroData
from torch_geometric.utils import to_scipy_sparse_matrix

from sklearn.metrics import roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler

from models.gnn.sage import GraphSAGE
from scripts.train_graph import train_epoch, test

torch.set_printoptions(precision=2, sci_mode=False)
# Ripple sets are sub-sampled with pandas `.sample`, which draws from NumPy's
# global RNG when no random_state is given, so seed NumPy as well as torch.
np.random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x2184e9d9a10>

In [3]:
from dataset.kg import RippleDataset, collate_fn
from utils import load_model
from models.kg import RippleNet

In [4]:
# Locations of the data artifacts and of the trained checkpoint.
dir_art = 'data/steam'
model_path = "runs/RippleNet/2023-11-22_19-02-09/model.pth"

# Number of DataLoader worker processes (0 = load in the main process).
num_workers = 0

# Run on the GPU whenever PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [5]:
# Load the two pickled artifact bundles stored under `dir_art`.
# NOTE(review): unpickling can execute arbitrary code; only load artifacts
# produced by this project.
with open(os.path.join(dir_art, 'knowledge_graph.pkl'), "rb") as f:
    knowledge_graph = pd.read_pickle(f)
with open(os.path.join(dir_art, 'matrix.pkl'), "rb") as f:
    matrix = pd.read_pickle(f)

# Entries of the knowledge-graph bundle used by the cells below.
valid_set = knowledge_graph["valid_set"]
ripple_sets_valid = knowledge_graph['ripple_sets_valid']
RELATIONS_MAP = knowledge_graph["relations_map"]
ENTITY_MAP = knowledge_graph["entity_map"]

# Interaction matrices; presumably one row per user and one column per item — TODO confirm.
train_csr = matrix['train_csr']
valid_csr = matrix['valid_csr']
# Keep only the rows of valid_csr whose entries do not sum to zero.
relevance_mask = np.asarray((valid_csr.sum(axis=1) != 0)).ravel()
valid_csr = valid_csr[relevance_mask]

# NOTE(review): `users` comes from valid_set while valid_csr rows come from the
# mask above; later cells rely on both being in the same order — confirm.
users = valid_set['user_id'].unique()
# Item ids 1..n, where n is the number of columns of valid_csr.
items = np.arange(valid_csr.shape[1]) + 1

In [6]:
from torch.nn.utils.rnn import pad_sequence

In [7]:
from time import time

In [8]:
def _pad_ripple(tensor, width):
    """Zero-pad ``tensor`` along dim 1 up to ``width`` entries."""
    missing = width - tensor.shape[1]
    if missing == 0:
        return tensor
    filler = tensor.new_zeros((tensor.shape[0], missing, *tensor.shape[2:]))
    return torch.cat((tensor, filler), dim=1)


def collate_fn(batch):
    """Merge dataset items into a single ``(edge_index, ripple_sets)`` pair.

    This notebook-level definition replaces the ``collate_fn`` imported from
    ``dataset.kg`` earlier in the notebook.

    Args:
        batch: Sequence of ``(edge_index, ripple_sets)`` pairs, where
            ``ripple_sets`` is a sequence of tensors whose first dimension
            matches the rows of ``edge_index``.

    Returns:
        Tuple ``(edge_index, ripple_sets)``. Edge indices are concatenated
        along dim 0. For a single-sample batch the sample's own ripple sets
        are returned unchanged. For larger batches each ripple set is
        zero-padded along dim 1 to a common width and concatenated, so its
        rows stay aligned with the concatenated edge index (previously only
        the first sample's ripple sets were returned).
    """
    edge_indices, ripple_sets = zip(*batch)
    edge_index = torch.cat(edge_indices)
    if len(ripple_sets) == 1:
        return edge_index, ripple_sets[0]

    merged = []
    # zip(*...) groups the tensors of all samples that belong to the same ripple set.
    for same_set in zip(*ripple_sets):
        width = max(t.shape[1] for t in same_set)
        merged.append(torch.cat([_pad_ripple(t, width) for t in same_set], dim=0))
    return edge_index, merged


class IterableRippleDataset(Dataset):
    """Map-style dataset yielding one block of users per index for full-catalogue scoring.

    Despite its name this class implements ``__len__``/``__getitem__`` rather
    than ``__iter__``. Item ``idx`` covers the users
    ``users[idx * user_batch_size : (idx + 1) * user_batch_size]`` and pairs
    each of them with every item id ``1..n_items``.

    Args:
        users: 1-D NumPy array of user ids to score.
        items: Array of candidate items. Only its length is used; item ids are
            generated as ``1..len(items)``.
        ripple_sets: Sequence whose first two elements are the pandas ripple-set
            tables, each indexed by a level named ``user_id``.
        user_batch_size: Number of users per dataset item (must be >= 1).
        max_ripple_size: Maximum number of ripple-set rows sampled per user
            and table.
    """

    def __init__(self, users, items, ripple_sets, user_batch_size, max_ripple_size=750):
        # The zero-argument form binds to this instance; the previous
        # super(IterableRippleDataset).__init__() created an unbound super
        # object and never reached the parent initialiser.
        super().__init__()
        if user_batch_size < 1:
            raise ValueError("user_batch_size must be a positive integer")

        self.df_ripple_set1 = ripple_sets[0]
        self.df_ripple_set2 = ripple_sets[1]

        self.users = users
        self.items = items
        self.n_users = users.shape[0]
        self.n_items = items.shape[0]
        self.user_batch_size = user_batch_size
        self.max_ripple_size = max_ripple_size

    def sample_ripple_set(self, ripple_set, batch_users):
        """Sample, pad and repeat the ripple-set rows of ``batch_users``.

        Args:
            ripple_set: pandas object indexed by a ``user_id`` level.
            batch_users: Array of user ids, in the order they appear in the batch.

        Returns:
            Tensor whose first dimension has ``len(batch_users) * n_items``
            entries: each user's (zero-padded) sample repeated once per item,
            in ``batch_users`` order.

        Raises:
            KeyError: If a user in ``batch_users`` has no rows in ``ripple_set``.
        """
        cap = self.max_ripple_size

        def sample_rows(rows):
            # Random draw without replacement; uses NumPy's global RNG.
            return rows.sample(n=min(cap, rows.shape[0])).values

        in_batch = ripple_set.index.isin(batch_users, level='user_id')
        per_user = ripple_set[in_batch].groupby('user_id').apply(sample_rows)

        # groupby returns users in sorted order and drops users without rows;
        # realign with batch_users so row i matches the i-th user of the batch.
        missing = [user for user in batch_users if user not in per_user.index]
        if missing:
            raise KeyError(f"no ripple set rows for user_id(s): {missing}")
        per_user = per_user.loc[list(batch_users)]

        # Pad once per user, then repeat per item (same result as padding
        # n_items copies of every user, without the redundant tensors).
        padded = pad_sequence(
            [torch.tensor(rows) for rows in per_user],
            batch_first=True,
            padding_value=0,
        )
        return padded.repeat_interleave(self.n_items, dim=0)

    def get_batch_data(self, batch):
        """Build the model inputs for the users starting at offset ``batch``.

        Args:
            batch: Offset of the first user of the block within ``self.users``.

        Returns:
            Tuple ``(edge_index, [ripple_set1, ripple_set2])`` where
            ``edge_index`` has one ``(user_id, item_id)`` row per user/item pair.
        """
        u_end = min(batch + self.user_batch_size, self.n_users)
        batch_users = self.users[batch:u_end]

        # Every user is paired with every item id 1..n_items.
        u_id = torch.from_numpy(np.repeat(batch_users, self.n_items))
        i_id = torch.arange(1, self.n_items + 1).repeat(len(batch_users))

        ripple_set1 = self.sample_ripple_set(self.df_ripple_set1, batch_users)
        ripple_set2 = self.sample_ripple_set(self.df_ripple_set2, batch_users)

        return torch.column_stack((u_id, i_id)), [ripple_set1, ripple_set2]

    def __len__(self):
        """Number of user blocks (ceiling division, so no trailing empty block)."""
        return (self.n_users + self.user_batch_size - 1) // self.user_batch_size

    def __getitem__(self, idx):
        """Return the data of the ``idx``-th block of users."""
        if not 0 <= idx < len(self):
            raise IndexError(f"block index {idx} out of range for {len(self)} blocks")
        return self.get_batch_data(idx * self.user_batch_size)

In [16]:
# Evaluate on the first 103 users, five users per dataset item.
eval_dataset = IterableRippleDataset(
    users=users[:103],
    items=items,
    ripple_sets=ripple_sets_valid,
    user_batch_size=5,
)
# batch_size=1 because every dataset item is already a whole block of
# (user, item) pairs.
eval_loader = DataLoader(
    eval_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=num_workers,
    collate_fn=collate_fn,
    drop_last=False,
)

In [20]:
@torch.no_grad()
def recommend_k_kg(model, dataloader, device, k=10):
    """Score every batch of ``dataloader`` with ``model`` and return probabilities.

    Args:
        model: Module called as ``model(edge_index, ripple_sets)``.
        dataloader: Iterable yielding ``(edge_index, ripple_sets)`` pairs.
        device: Device the inputs are moved to before the forward pass.
        k: Currently unused; kept so existing callers keep working.

    Returns:
        CPU tensor with the sigmoid of the model outputs of all batches,
        concatenated along dim 0 in iteration order.
    """
    model.eval()
    preds = []
    for edge_index, ripple_sets in tqdm(dataloader):
        edge_index = edge_index.to(device)
        ripple_sets = [rs.to(device) for rs in ripple_sets]

        # Move each batch to host memory immediately so the scores of the
        # whole evaluation set never accumulate on the device.
        preds.append(model(edge_index, ripple_sets).sigmoid().cpu())
    return torch.cat(preds, dim=0)

In [21]:
# Restore the trained RippleNet from the checkpoint named in the config cell
# (`model_path`) instead of repeating the path literal here.
# NOTE(review): n_entities is the largest id in ENTITY_MAP; if entity ids start
# at 0 the model may need max + 1 — confirm against RippleNet.
model = load_model(
    RippleNet,
    model_path,
    model_kwargs={
        "emb_dim": 16,
        "n_relations": 4,
        "n_entities": max(ENTITY_MAP.values())
    },
    device=device
)

In [22]:
# Sigmoid scores for every (user, item) pair served by eval_loader, as a CPU tensor.
prob = recommend_k_kg(model, eval_loader, device)

  0%|          | 0/21 [00:00<?, ?it/s]

In [24]:
# Show only the shape (this is what the recorded output contains); displaying
# the tensor itself would print one row per (user, item) pair.
prob.shape

torch.Size([126793, 1])

In [25]:
# One row per evaluated user, one column per item; the width is taken from
# `items` rather than a hard-coded item count.
prob_full = prob.reshape(-1, items.shape[0])

In [39]:
from scripts.eval import recommend_k, recommendation_relevance
from metrics import ndcg_k, recall_k, precision_k

In [34]:
# Ask recommend_k (scripts.eval) for k=100 recommendations per row of prob_full.
# NOTE(review): prob_full only covers users[:103], while train_csr is passed
# unfiltered and unsliced; confirm that recommend_k matches its rows to the
# rows of prob_full (presumably it is used to skip already-seen items).
recommendations = recommend_k(
    prob_full=prob_full,
    past_interactions=train_csr,
    k=100,
    user_batch_size=10000
)

In [44]:
# Keep a reference to the current valid_csr, then restrict it to its first
# 103 rows to match the 103 users that were scored.
# NOTE(review): this cell rebinds valid_csr, so running it a second time makes
# valid_csr_true point at the already-sliced matrix; re-run the loading cell
# first if the full matrix is needed again.
valid_csr_true = valid_csr
valid_csr = valid_csr[:103]

In [46]:
# Precision, recall and NDCG of the recommendations at several cut-offs k;
# each list in output_metrics holds one value per cut-off, in loop order.
output_metrics = {"precision": [], "recall": [], "ndcg": []}
for k in tqdm([1, 2, 5, 10, 20, 50, 100]):
    # Only the first k recommended items of every user are evaluated.
    reco_k = recommendations[:, :k]
    reco_rel, rel_mask = recommendation_relevance(reco_k, valid_csr)

    prec_k = precision_k(reco_rel, valid_csr, rel_mask, k)
    rec_k = recall_k(reco_rel, valid_csr, rel_mask, k)
    # ndcg_k receives the relevance matrix as a plain ndarray.
    n_k = ndcg_k(reco_rel.getA(), valid_csr, rel_mask, k)

    # The dict keeps insertion order: precision, recall, ndcg.
    for metric_name, score in zip(output_metrics, (prec_k, rec_k, n_k)):
        output_metrics[metric_name].append(score)

  0%|          | 0/7 [00:00<?, ?it/s]