# 1. Imports

In [378]:
import importlib

In [379]:
import torch
import torch.nn.functional as F

In [380]:
import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, f1_score

In [381]:
from torch_geometric.loader import LinkNeighborLoader
import torch_geometric.transforms as T

In [382]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('device:', device)

device: cpu


# Main

In [383]:
import json
with open('datasets/info.json', 'r') as f:
    datasets_info = json.load(f)

In [384]:
import model.data
import model.plm
importlib.reload(model.data)
importlib.reload(model.plm)
from model.data import get_data_frames, create_graph, create_hetero_graph, create_data_loaders
from model.plm import load_plm_model, get_plm_embeddings, get_items_features

In [385]:
num_user_features = 32
data_name = 'cit_en'
plm_name = 'enbert'

In [386]:
with open('datasets/info.json', 'r') as f:
        datasets_info = json.load(f)

users_csv = datasets_info[data_name]['users_csv']
items_csv = datasets_info[data_name]['items_csv']
user_naming = datasets_info[data_name]['user_naming']
item_naming = datasets_info[data_name]['item_naming']
feats_file = datasets_info[data_name]['plm'][plm_name]
epochs = datasets_info[data_name]['epochs']

df_users, df_items = get_data_frames(users_csv, items_csv, user_naming, item_naming)
edges_coo = create_graph(df_users, df_items, user_naming, item_naming)
item_feature_tensor = torch.load(feats_file, map_location=device)
user_features_init = 'zero'
data = create_hetero_graph(edges_coo=edges_coo,
                            user_features_init=user_features_init,
                            user_feature_size=num_user_features,
                            item_feature_tensor=item_feature_tensor)

--
users shape: (26206, 4)
unique (users, articles): (1644, 2154)
contentId                      -3499919498720038879
personId                       -8845298781299428018
url          http://techcrunch.com/2016/06/07/hiri/
lang                                             en
Name: 0, dtype: object
--
items shape: (2154, 4)
unique articles: 2154
contentId                                 -6451309518266745024
title        Ethereum, a Virtual Currency, Enables Transact...
url          http://www.nytimes.com/2016/03/28/business/dea...
lang                                                        en
Name: 0, dtype: object
HeteroData(
  [1muser[0m={
    node_id=[1644],
    x=[1644, 32]
  },
  [1mitem[0m={
    node_id=[2154],
    x=[2154, 768]
  },
  [1m(user, rates, item)[0m={ edge_index=[2, 26206] },
  [1m(item, rev_rates, user)[0m={ edge_index=[2, 26206] }
)
user num_nodes: 1644
user num_features: 32
item num_nodes: 2154
item num_features: 768
num_edges user->item: 26206
num_edges item-

In [387]:
feature_tensor = torch.load(feats_file, map_location=device)
feature_tensor.shape

torch.Size([2154, 768])

In [388]:
# plm_model_name = 'ptbert'
# plm_model, plm_tokenizer = load_plm_model(plm_model_name, device)

In [389]:
# title_naming = 'title'
# feat_tensor = get_plm_embeddings(plm_model, plm_tokenizer, df_items, title_naming, device)
# print('Output feat_tensor shape:', feat_tensor.shape)

# 4. Data Loaders

In [390]:
neg_sampling_ratio = 1.0        # hyper: 1, 2
batch_size = 128
num_neighbors = [10, 5]         # hyper: [10, 5], [10, 10], [20, 10], [20, 20]

#### Reload

In [392]:
# For this, we first split the set of edges into
# training (80%), validation (10%), and testing edges (10%).
# Across the training edges, we use 70% of edges for message passing,
# and 30% of edges for supervision.
# We further want to generate fixed negative edges for evaluation with a ratio of 2:1.
# Negative edges during training will be generated on-the-fly.
# We can leverage the `RandomLinkSplit()` transform for this from PyG:
transform = T.RandomLinkSplit(
        num_val=0.0,
        num_test=0.9,
        disjoint_train_ratio=0.3,
        is_undirected=True,                             # new
        neg_sampling_ratio=neg_sampling_ratio,          # negative sampling for val and test
        add_negative_train_samples=False,               # negative samples for train generated on-the-fly
        edge_types=("user", "rates", "item"),
        rev_edge_types=("item", "rev_rates", "user")
    )

train_data, val_data, test_data = transform(data)
print("Training data:")
print("==============")
print(train_data)
print()
print("Validation data:")
print("================")
print(val_data)
print()
print("Test data:")
print("================")
print(test_data)

Training data:
HeteroData(
  [1muser[0m={
    node_id=[1644],
    x=[1644, 32]
  },
  [1mitem[0m={
    node_id=[2154],
    x=[2154, 768]
  },
  [1m(user, rates, item)[0m={
    edge_index=[2, 1835],
    edge_label=[786],
    edge_label_index=[2, 786]
  },
  [1m(item, rev_rates, user)[0m={ edge_index=[2, 1835] }
)

Validation data:
HeteroData(
  [1muser[0m={
    node_id=[1644],
    x=[1644, 32]
  },
  [1mitem[0m={
    node_id=[2154],
    x=[2154, 768]
  },
  [1m(user, rates, item)[0m={
    edge_index=[2, 2621],
    edge_label=[0],
    edge_label_index=[2, 0]
  },
  [1m(item, rev_rates, user)[0m={ edge_index=[2, 2621] }
)

Test data:
HeteroData(
  [1muser[0m={
    node_id=[1644],
    x=[1644, 32]
  },
  [1mitem[0m={
    node_id=[2154],
    x=[2154, 768]
  },
  [1m(user, rates, item)[0m={
    edge_index=[2, 2621],
    edge_label=[47170],
    edge_label_index=[2, 47170]
  },
  [1m(item, rev_rates, user)[0m={ edge_index=[2, 2621] }
)


In [357]:
print(test_data)
test_data['user', 'rates', 'item'].edge_label.unique()

HeteroData(
  [1muser[0m={
    node_id=[1644],
    x=[1644, 32]
  },
  [1mitem[0m={
    node_id=[2154],
    x=[2154, 768]
  },
  [1m(user, rates, item)[0m={
    edge_index=[2, 23586],
    edge_label=[5240],
    edge_label_index=[2, 5240]
  },
  [1m(item, rev_rates, user)[0m={ edge_index=[2, 23586] }
)


tensor([0., 1.])

In [344]:
print(val_data)
val_data['user', 'rates', 'item'].edge_label.unique()

HeteroData(
  [1muser[0m={
    node_id=[1644],
    x=[1644, 32]
  },
  [1mitem[0m={
    node_id=[2154],
    x=[2154, 768]
  },
  [1m(user, rates, item)[0m={
    edge_index=[2, 23586],
    edge_label=[5240],
    edge_label_index=[2, 5240]
  },
  [1m(item, rev_rates, user)[0m={ edge_index=[2, 23586] }
)


tensor([0., 1.])

In [345]:
print('Train data:')
print('=============')
print('present num_edges user->item:', train_data["user", "rates", "item"].num_edges)
print('to be predicted positive num_edges user->item:', train_data["user", "rates", "item"].edge_label_index.shape[1])
print('to be predicted edge classes:', torch.unique(train_data["user", "rates", "item"].edge_label))
print('negative edge classes [0.] would be generated during training on-the-fly')

Train data:
present num_edges user->item: 16511
to be predicted positive num_edges user->item: 7075
to be predicted edge classes: tensor([1.])
negative edge classes [0.] would be generated during training on-the-fly


In [346]:
# train data doesn't have negative edges yet
# valid data already has negative edges

In [347]:
# edge_index        - edges from src nodes (edge_index[0][i]) 
#                   - to dst nodes (edge_index[1][i])
# edge_label        - contains labels of edges, 1 - positive, 0 - negative
# edge_label_index  - same as edge_index, but contains only those edge indices
#                   - on which we have labels, e.g. node234 -> node432: label=1

In [348]:
# Training data:
# edge_index=[2, 1040140],      - number of edges to construct graph, 70% of edges for message passing,
#                               - edges already present in the graph
# edge_label=[445773],          - number of edges for training, 30% of edges for supervision.
#                               - labels of missing edges, [0, 1] - 0 for negative edge, 1 for positive
# edge_label_index=[2, 445773]  - edges that are absent in the graph for training 

So, we see that all the nodes are present both in training and validation data!

As well as all the item_node features.

Only the edge_index changes: which users connected to which items. Some of them are removed for train and val. 

We are now ready to create a mini-batch loader that will generate subgraphs that can be used as input into our GNN. While this step is not strictly necessary for small-scale graphs, it is absolutely necessary to apply GNNs on larger graphs that do not fit onto GPU memory otherwise. Here, we make use of the loader.LinkNeighborLoader which samples multiple hops from both ends of a link and creates a subgraph from it. Here, edge_label_index serves as the "seed links" to start sampling from.

In [349]:
print(val_data["user", "rates", "item"].edge_label.unique())

tensor([0., 1.])


In [358]:
# In the first hop, we sample at most 10 neighbors.
# In the second hop, we sample at most 5 neighbors.
# In addition, during training, we want to sample negative edges on-the-fly with
# a ratio of 2:1.
# We can make use of the `loader.LinkNeighborLoader` from PyG:
# Define seed edges:
edge_label_index = train_data["user", "rates", "item"].edge_label_index
edge_label = train_data["user", "rates", "item"].edge_label

train_loader = LinkNeighborLoader(
        data=train_data,
        num_neighbors=num_neighbors,
        neg_sampling_ratio=neg_sampling_ratio,          # generate train neg samples on the fly
        subgraph_type="bidirectional",
        edge_label_index=(("user", "rates", "item"), train_data["user", "rates", "item"].edge_label_index),
        edge_label=train_data["user", "rates", "item"].edge_label,
        batch_size=batch_size,
        shuffle=True                                 # shuffle=True
    )

# Inspect a sample:
sampled_data = next(iter(train_loader))

print("Sampled Train mini-batch:")
print("===================")
print(sampled_data)

assert sampled_data["user", "rates", "item"].edge_label_index.size(1) == (neg_sampling_ratio + 1) * batch_size # 2 edges (1 pos + 1 neg) * batch_size
assert len(sampled_data["user", "rates", "item"].edge_label.unique()) == 2
assert sampled_data["user", "rates", "item"].edge_label.min() == 0
assert sampled_data["user", "rates", "item"].edge_label.max() == 1

Sampled Train mini-batch:
HeteroData(
  [1muser[0m={
    node_id=[958],
    x=[958, 32],
    n_id=[958]
  },
  [1mitem[0m={
    node_id=[1296],
    x=[1296, 768],
    n_id=[1296]
  },
  [1m(user, rates, item)[0m={
    edge_index=[2, 6475],
    edge_label=[256],
    edge_label_index=[2, 256],
    input_id=[128]
  },
  [1m(item, rev_rates, user)[0m={ edge_index=[2, 6475] }
)


In [351]:
sampled_data['user', 'rates', 'item'].edge_label.unique()

tensor([0., 1.])

In [352]:
any(sampled_data['user', 'rates', 'item'].edge_index[0] != sampled_data['item', 'rev_rates', 'user'].edge_index[1])

False

(user, rates, item).edge_index != (item, rev_rates, user).edge_index

Here, we make use of the loader.LinkNeighborLoader which samples multiple hops from both ends of a link and creates a subgraph from it.

In [373]:
# Define the validation seed edges:
edge_label_index = val_data["user", "rates", "item"].edge_label_index
edge_label = val_data["user", "rates", "item"].edge_label

val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=num_neighbors,
    neg_sampling_ratio=0.0,                                 # already generated for val
    subgraph_type="bidirectional",
    edge_label_index=(("user", "rates", "item"), val_data["user", "rates", "item"].edge_label_index),
    edge_label=val_data["user", "rates", "item"].edge_label,
    batch_size=int((1 + neg_sampling_ratio) * batch_size),  # to account for already sampled neg edges
    shuffle=False                                           # (1 pos + n neg) * batch_size
)

sampled_data = next(iter(val_loader))

print("Sampled Validation mini-batch:")
print("===================")
print(sampled_data)

assert sampled_data["user", "rates", "item"].edge_label_index.size(1) == (neg_sampling_ratio + 1) * batch_size
assert sampled_data["user", "rates", "item"].edge_label.min() >= 0
assert sampled_data["user", "rates", "item"].edge_label.max() <= 1

Sampled Validation mini-batch:
HeteroData(
  [1muser[0m={
    node_id=[964],
    x=[964, 32],
    n_id=[964]
  },
  [1mitem[0m={
    node_id=[1295],
    x=[1295, 768],
    n_id=[1295]
  },
  [1m(user, rates, item)[0m={
    edge_index=[2, 7894],
    edge_label=[256],
    edge_label_index=[2, 256],
    input_id=[256]
  },
  [1m(item, rev_rates, user)[0m={ edge_index=[2, 7894] }
)


In [374]:
for batch_data in tqdm.tqdm(val_loader):
    if len(batch_data['user', 'rates', 'item'].edge_label.unique()) == 2:
        elabel = batch_data['user', 'rates', 'item'].edge_label
        eindex = batch_data['user', 'rates', 'item'].edge_label_index
        print(batch_data)
        print(elabel)
        
        newei = torch.stack([eindex[0][elabel==1], eindex[1][elabel==1]], dim=0)
        print(newei.shape)
        break

 48%|████▊     | 10/21 [00:00<00:00, 122.01it/s]

HeteroData(
  [1muser[0m={
    node_id=[1025],
    x=[1025, 32],
    n_id=[1025]
  },
  [1mitem[0m={
    node_id=[1287],
    x=[1287, 768],
    n_id=[1287]
  },
  [1m(user, rates, item)[0m={
    edge_index=[2, 7065],
    edge_label=[256],
    edge_label_index=[2, 256],
    input_id=[256]
  },
  [1m(item, rev_rates, user)[0m={ edge_index=[2, 7065] }
)
tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,




In [375]:
# get indices where it is 1


In [376]:
sampled_data["user", "rates", "item"].edge_label.unique()

tensor([1.])

In [377]:
# Define the test seed edges:
edge_label_index = test_data["user", "rates", "item"].edge_label_index
edge_label = test_data["user", "rates", "item"].edge_label

test_loader = LinkNeighborLoader(
    data=test_data,
    num_neighbors=num_neighbors,
    edge_label_index=(("user", "rates", "item"), edge_label_index),
    edge_label=edge_label,
    batch_size=int((neg_sampling_ratio + 1) * batch_size),
    shuffle=False,
)

# sampled_data = next(iter(test_loader))

# print("Sampled Test mini-batch:")
# print("===================")
# print(sampled_data)

# assert sampled_data["user", "rates", "item"].edge_label_index.size(1) == (neg_sampling_ratio + 1) * batch_size
# assert sampled_data["user", "rates", "item"].edge_label.min() >= 0
# assert sampled_data["user", "rates", "item"].edge_label.max() <= 1