In [1]:
%load_ext autoreload
%autoreload 2

import os
os.chdir("../../")
print(os.getcwd())

C:\Users\Milosz\Projects\thesis-recsys


In [2]:
import copy
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from features.store import FeatureStore
from scripts.eval import collate_fn, IterableDatasetTest, FeaturelessIterableDatasetTest
from scripts.eval_knowledge_graph import IterableRippleDataset
from scripts.eval_knowledge_graph import collate_fn as collate_fn_kg
from models import DeepFM, NCF, MF
from models.gnn import GraphSAGE, GATConv, GNN
from models.kg import RippleNet
from utils import load_model

In [121]:
# Artifact directory and compute device shared by every benchmark cell below.
dir_art = "data/steam"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the three pickled artifacts. Unpickling can execute arbitrary code,
# so these files must come from a trusted source.
data_path = os.path.join(dir_art, 'data.pkl')
with open(data_path, "rb") as f:
    data = pd.read_pickle(f)

graph_path = os.path.join(dir_art, 'graph.pkl')
with open(graph_path, "rb") as f:
    graph = pd.read_pickle(f)

knowledge_graph_path = os.path.join(dir_art, 'knowledge_graph.pkl')
with open(knowledge_graph_path, "rb") as f:
    knowledge_graph = pd.read_pickle(f)

# Attribute tables and schemes consumed by the FeatureStore-based models.
items_datastore = data['items_datastore']
users_datastore = data['users_datastore']
item_attr = items_datastore.dataframe.df
user_attr = users_datastore.dataframe.df
scheme_relations = data['relations_datastore'].scheme
scheme_items = items_datastore.scheme
scheme_users = users_datastore.scheme

# Graph splits and node-feature shapes consumed by the GNN models.
train_data, valid_data = graph['train_data'], graph['valid_data']

user_shape = train_data['user'].x.shape
app_shape = train_data['app'].x.shape

# Knowledge-graph inputs consumed by RippleNet.
valid_set = knowledge_graph["valid_set"]
ripple_sets_valid = knowledge_graph['ripple_sets_valid']
RELATIONS_MAP = knowledge_graph["relations_map"]
ENTITY_MAP = knowledge_graph["entity_map"]

In [122]:
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``.

    Works on Python scalars and NumPy arrays; the result has the same shape
    as the input.

    Written as a ``def`` rather than a lambda bound to a name so the function
    reports its own name in tracebacks and profiler output. The formula is
    unchanged.
    """
    return 1 / (1 + np.exp(-x))

In [123]:
# DeepFM (complex): 16-dim sparse/varlen embeddings, hidden layers [128, 64].

model_path = "runs/DeepFM/2024-01-11_20-07-25/model.pth"
log_dir = os.path.dirname(model_path)
# Parent folder of the run directory, i.e. "DeepFM" for this path.
model_name = os.path.basename(os.path.dirname(log_dir))

model_cls = DeepFM
feature_store = FeatureStore(
    scheme_relations,
    scheme_items,
    scheme_users,
    emb_dims={"sparse": 16, "varlen": 16},
)
model_kwargs = {"feature_store": feature_store, "hidden_dim": [128, 64], "device": device}

# Evaluation loader; only its first batch is used as the timing input.
eval_dataset = IterableDatasetTest(feature_store, user_attr, item_attr, user_batch_size=1)
eval_loader = DataLoader(
    eval_dataset,
    batch_size=1,
    shuffle=False,
    drop_last=False,
    collate_fn=collate_fn,
)

model = load_model(cls=model_cls, model_path=model_path, model_kwargs=model_kwargs, device=device)
model.eval()

first_batch = next(iter(eval_loader))
batch_user = first_batch.to(device)

In [124]:
%%timeit -n 20 -r 7
with torch.no_grad():
    output = model(batch_user).sigmoid()

2.92 ms ± 1.07 ms per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [11]:
# DeepFM: 4-dim sparse/varlen embeddings, hidden layers [64, 16].

model_path = "runs/DeepFM/2024-01-11_20-32-34/model.pth"
log_dir = os.path.dirname(model_path)
# Parent folder of the run directory, i.e. "DeepFM" for this path.
model_name = os.path.basename(os.path.dirname(log_dir))

model_cls = DeepFM
feature_store = FeatureStore(
    scheme_relations,
    scheme_items,
    scheme_users,
    emb_dims={"sparse": 4, "varlen": 4},
)
model_kwargs = {"feature_store": feature_store, "hidden_dim": [64, 16], "device": device}

# Evaluation loader; only its first batch is used as the timing input.
eval_dataset = IterableDatasetTest(feature_store, user_attr, item_attr, user_batch_size=1)
eval_loader = DataLoader(
    eval_dataset,
    batch_size=1,
    shuffle=False,
    drop_last=False,
    collate_fn=collate_fn,
)

model = load_model(cls=model_cls, model_path=model_path, model_kwargs=model_kwargs, device=device)
model.eval()

first_batch = next(iter(eval_loader))
batch_user = first_batch.to(device)

In [126]:
%%timeit -n 20 -r 7
with torch.no_grad():
    output = model(batch_user).sigmoid()

1.8 ms ± 40.8 µs per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [127]:
# MF

model_path = "runs/MF/2024-01-07_22-57-42/model.pth"
log_dir = os.path.dirname(model_path)
model_name = os.path.basename(os.path.dirname(log_dir))

# The MF FeatureStore is built from schemes with empty feature lists.
# Clear the lists on shallow copies: `scheme_items` / `scheme_users` are the
# same objects the DeepFM cells pass to FeatureStore, so emptying them in
# place would silently break those cells if they are re-run after this one.
scheme_items_mf = copy.copy(scheme_items)
scheme_users_mf = copy.copy(scheme_users)
scheme_items_mf.features, scheme_users_mf.features = [], []
feature_store = FeatureStore(scheme_relations, scheme_items_mf, scheme_users_mf,
                             emb_dims={"sparse": 16, "varlen": 16})
model_cls = MF
model_kwargs = {
    "feature_store": feature_store,
    "device": device
}
# The featureless dataset only needs the user and item counts.
eval_dataset = FeaturelessIterableDatasetTest(user_attr.shape[0], item_attr.shape[0], user_batch_size=int(1))
eval_loader = DataLoader(eval_dataset, shuffle=False, batch_size=1, collate_fn=collate_fn, drop_last=False)
model = load_model(
    cls=model_cls,
    model_path=model_path,
    model_kwargs=model_kwargs,
    device=device
)
model.eval()

# First batch from the loader, moved to the target device, is the timing input.
batch_user = next(iter(eval_loader)).to(device)

In [128]:
%%timeit -n 20 -r 7
with torch.no_grad():
    output = model(batch_user).sigmoid()

306 µs ± 7.89 µs per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [24]:
# GraphSAGE: load the checkpoint and precompute embeddings for the timing cell.

model_path = "runs/GraphSAGE/2024-01-11_20-48-39/model.pth"
gnn_model = GraphSAGE(hidden_channels=32, out_channels=32)
gnn_kwargs = {
    "gnn_model": gnn_model,
    "entities_shapes": {"user": user_shape, "app": app_shape},
    "hidden_channels": 32,
    "metadata": train_data.metadata(),
}
model = load_model(cls=GNN, model_path=model_path, model_kwargs=gnn_kwargs, device=device)

valid_on_device = valid_data.to(device)
model_emb_matrix = model.evaluate(valid_on_device)

# Scoring inputs as NumPy arrays: the first 'user' embedding and the 'app'
# embeddings with their first two axes swapped.
user_matrix = model_emb_matrix['user']
app_matrix = model_emb_matrix['app']
user_emb = user_matrix[0].cpu().numpy()
items_emb = app_matrix.transpose(0, 1).cpu().numpy()

In [96]:
%%timeit -n 20 -r 7
with torch.no_grad():
    output = sigmoid(np.dot(user_emb, items_emb))

139 µs ± 32.6 µs per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [27]:
# GATConv: load the checkpoint and precompute embeddings for the timing cell.

model_path = "runs/GATConv/2024-01-11_21-05-06/model.pth"
gnn_model = GATConv(hidden_channels=32, out_channels=32)
gnn_kwargs = {
    "gnn_model": gnn_model,
    "entities_shapes": {"user": user_shape, "app": app_shape},
    "hidden_channels": 32,
    "metadata": train_data.metadata(),
}
model = load_model(cls=GNN, model_path=model_path, model_kwargs=gnn_kwargs, device=device)

valid_on_device = valid_data.to(device)
model_emb_matrix = model.evaluate(valid_on_device)

# Scoring inputs as NumPy arrays: the first 'user' embedding and the 'app'
# embeddings with their first two axes swapped.
user_matrix = model_emb_matrix['user']
app_matrix = model_emb_matrix['app']
user_emb = user_matrix[0].cpu().numpy()
items_emb = app_matrix.transpose(0, 1).cpu().numpy()

In [119]:
%%timeit -n 20 -r 7
with torch.no_grad():
    output = sigmoid(np.dot(user_emb, items_emb))

140 µs ± 39 µs per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [34]:
# RippleNet
model_path = "runs/RippleNet/2024-01-11_21-44-36/model.pth"
model_cls = RippleNet
# NOTE(review): n_relations is hard-coded and n_entities is the largest mapped
# id rather than the number of entries - confirm both match the checkpoint.
model_kwargs = {
    "emb_dim": 16,
    "n_relations": 4,
    "n_entities": max(ENTITY_MAP.values())
}
users = valid_set['user_id'].unique()
# Candidate item ids 1..1231.
# NOTE(review): 1231 is a magic number - presumably the item count; confirm.
items = np.arange(1231) + 1
eval_dataset = IterableRippleDataset(users, items, ripple_sets_valid, int(1))
eval_loader = DataLoader(eval_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn_kg, drop_last=False, num_workers=1)
model = load_model(
    cls=model_cls,
    model_path=model_path,
    model_kwargs=model_kwargs,
    device=device
)
# Switch to inference mode, as the DeepFM and MF cells do before timing.
model.eval()

# First batch from the loader, moved to the target device, is the timing input.
edge_index, ripple_sets = next(iter(eval_loader))
edge_index = edge_index.to(device)
ripple_sets = [rs.to(device) for rs in ripple_sets]

In [94]:
%%timeit -n 20 -r 7
with torch.no_grad():
    output = model(edge_index, ripple_sets).sigmoid()

The slowest run took 6.18 times longer than the fastest. This could mean that an intermediate result is being cached.
3.62 ms ± 1.83 ms per loop (mean ± std. dev. of 7 runs, 20 loops each)
