In [1]:
import os.path as osp

import torch
from tqdm import tqdm
import pandas as pd

from torch_geometric.datasets import AmazonBook
from torch_geometric.nn import LightGCN
from torch_geometric.utils import degree

In [2]:
# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
device = (torch.device('cuda') if torch.cuda.is_available()
          else torch.device('cpu'))

# Training interactions; the columns used later in this notebook are
# customer_id and article_id.
transactions = pd.read_csv("data/train.csv")
transactions

Unnamed: 0.1,Unnamed: 0,customer_id,article_id,year,month,day
0,479507,232477,28866,20,1,18
1,5328148,306128,7094,20,6,18
2,4779175,41132,12896,20,6,4
3,4852517,189305,34181,20,6,6
4,7130496,238987,18606,20,7,31
...,...,...,...,...,...,...
6997367,500812,186580,26577,20,1,19
6997368,4897929,11831,1305,20,6,8
6997369,3916280,255785,1332,20,5,14
6997370,2789100,64400,16263,20,4,9


In [3]:
# Build the [2, num_edges] training edge tensor: row 0 holds customer ids and
# row 1 holds article ids. Going through a single NumPy array lets torch copy
# the data in bulk instead of walking two multi-million-row pandas Series
# element by element in Python.
edge_index = torch.tensor(
    transactions[["customer_id", "article_id"]].to_numpy().T,
    dtype=torch.long,
)
edge_index

tensor([[232477, 306128,  41132,  ..., 255785,  64400,  91322],
        [ 28866,   7094,  12896,  ...,   1332,  16263,  25534]])

In [4]:
# Held-out interactions used as evaluation targets. The frame is deliberately
# not called `test`: this notebook later defines a function with that name,
# and sharing it would make the binding depend on cell execution order.
test_transactions = pd.read_csv("data/test.csv")

# [2, num_test_edges] tensor: row 0 = customer ids, row 1 = article ids.
# Built through one NumPy array so torch copies the values in bulk.
edge_label_index = torch.tensor(
    test_transactions[["customer_id", "article_id"]].to_numpy().T,
    dtype=torch.long,
)

In [5]:
from torch_geometric.data import HeteroData
# Start from an empty heterogeneous graph; the following cells attach the
# customer/article edge tensors and node counts to it.
data = HeteroData()

In [6]:
# Attach both edge sets to the customer -> article relation: the training
# edges as `edge_index` and the held-out edges as `edge_label_index`.
purchases = data['customer', 'purchases', 'article']
purchases.edge_index = edge_index
purchases.edge_label_index = edge_label_index

In [7]:
# Reverse relation: the same training edges with source and target rows swapped.
data['article', 'purchased_by', 'customer'].edge_index = edge_index.flip(0)

In [8]:
# Summarise the graph after both edge relations have been attached.
print(data)

HeteroData(
  (customer, purchases, article)={
    edge_index=[2, 6997372],
    edge_label_index=[2, 1749343],
  },
  (article, purchased_by, customer)={ edge_index=[2, 6997372] }
)


In [9]:
# Print the forward and the reverse edge tensors one after the other; the
# second should be the first with its two rows exchanged.
for relation in (('customer', 'purchases', 'article'),
                 ('article', 'purchased_by', 'customer')):
    print(data[relation].edge_index)

tensor([[232477, 306128,  41132,  ..., 255785,  64400,  91322],
        [ 28866,   7094,  12896,  ...,   1332,  16263,  25534]])
tensor([[ 28866,   7094,  12896,  ...,   1332,  16263,  25534],
        [232477, 306128,  41132,  ..., 255785,  64400,  91322]])


In [10]:
# Node counts per type, taken as the number of distinct ids in the training set.
num_customers = transactions.customer_id.nunique()
num_articles = transactions.article_id.nunique()

# A distinct-id count is only a valid node count when ids are dense integers
# starting at 0 and the held-out edges reference no id beyond that range.
# The later cells rely on this (article indices are shifted by num_customers
# and used to index score matrices), so fail loudly here instead of producing
# silently corrupted indices.
highest_customer_id = max(int(edge_index[0].max()),
                          int(edge_label_index[0].max()))
highest_article_id = max(int(edge_index[1].max()),
                         int(edge_label_index[1].max()))
assert highest_customer_id < num_customers, (
    f"customer id {highest_customer_id} is out of range for "
    f"{num_customers} customer nodes; ids must be dense 0..N-1")
assert highest_article_id < num_articles, (
    f"article id {highest_article_id} is out of range for "
    f"{num_articles} article nodes; ids must be dense 0..N-1")

In [11]:
# Declare the node counts explicitly: no node feature tensors are attached to
# this graph, so the sizes are set by hand from the counts computed above.
data['customer'].num_nodes = num_customers
data['article'].num_nodes = num_articles

In [12]:
# Summarise the graph again now that the per-type node counts are set.
print(data)

HeteroData(
  customer={ num_nodes=343166 },
  article={ num_nodes=36806 },
  (customer, purchases, article)={
    edge_index=[2, 6997372],
    edge_label_index=[2, 1749343],
  },
  (article, purchased_by, customer)={ edge_index=[2, 6997372] }
)


In [13]:
# Drop the training frame; the tensors and node counts derived from it are
# all that the remaining cells use.
del transactions

In [14]:
# Collapse the typed graph into a single homogeneous graph and move it to the
# target device. `data` is rebound here, so the conversion is guarded: running
# this cell a second time would otherwise call `to_homogeneous()` on the
# already-converted object.
if isinstance(data, HeteroData):
    data = data.to_homogeneous().to(device)
print(data)

Data(edge_index=[2, 13994744], edge_label_index=[2, 1749343], node_type=[379972], edge_type=[13994744])


In [15]:
from torch_geometric.loader import DataLoader

batch_size = 1024

# Keep only edges whose source index is smaller than their target index.
# The homogeneous graph contains every purchase in both directions, so this
# retains exactly one copy of each training edge.
mask = data.edge_index[0] < data.edge_index[1]
train_edge_label_index = data.edge_index[:, mask]

# The loader iterates over shuffled column positions of
# `train_edge_label_index`, not over the edges themselves.
num_train_edges = train_edge_label_index.size(1)
train_loader = DataLoader(
    range(num_train_edges),
    batch_size=batch_size,
    shuffle=True,
)

In [16]:
# Inspect the combined edge tensor of the homogeneous graph (both directions
# of every training purchase).
print(data.edge_index.shape)
data.edge_index

torch.Size([2, 13994744])


tensor([[232477, 306128,  41132,  ..., 344498, 359429, 368700],
        [372032, 350260, 356062,  ..., 255785,  64400,  91322]],
       device='cuda:0')

In [17]:
# Inspect the one-directional training edges selected by `mask`.
print(train_edge_label_index.shape)
train_edge_label_index

torch.Size([2, 6997372])


tensor([[232477, 306128,  41132,  ..., 255785,  64400,  91322],
        [372032, 350260, 356062,  ..., 344498, 359429, 368700]],
       device='cuda:0')

In [18]:
# import torch_geometric.transforms as T
# data = T.ToSparseTensor(data)
# print(data)

In [19]:
# Boolean selector over the columns of data.edge_index (source < target).
mask

tensor([ True,  True,  True,  ..., False, False, False], device='cuda:0')

In [20]:
# Number of selected edges; should equal the number of training purchases.
mask.sum()

tensor(6997372, device='cuda:0')

In [21]:
# Total node count of the homogeneous graph (customers plus articles).
data.num_nodes

379972

In [22]:
# Sanity check: the largest source index must be smaller than data.num_nodes.
data.edge_index[0].max()

tensor(379971, device='cuda:0')

In [23]:
# LightGCN over every node of the homogeneous graph: 32-dimensional
# embeddings propagated through 2 layers.
lightgcn = LightGCN(num_nodes=data.num_nodes, embedding_dim=32, num_layers=2)
model = lightgcn.to(device)

# Adam over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [24]:
def train():
    """Run one pass over the training edges and return the mean loss.

    For every batch of positive (customer, article) edges, one negative edge
    per positive is formed by keeping the customer and drawing a uniformly
    random article node index. Both sets are scored in a single forward pass
    and optimised with the model's recommendation loss.

    Reads the notebook-level ``train_loader``, ``train_edge_label_index``,
    ``data``, ``model``, ``optimizer``, ``num_customers``, ``num_articles``
    and ``device``.

    Returns:
        float: the loss averaged over all positive examples seen in the epoch.
    """
    total_loss = total_examples = 0

    for index in tqdm(train_loader):
        # Positive edges: the columns of the training edge tensor selected by
        # this batch of positions.
        pos_edge_label_index = train_edge_label_index[:, index]

        # Negative edges: same customers, random article nodes. Article nodes
        # occupy the index range [num_customers, num_customers + num_articles).
        neg_edge_label_index = torch.stack([
            pos_edge_label_index[0],
            torch.randint(num_customers, num_customers + num_articles,
                          (index.numel(), ), device=device)
        ], dim=0)

        # Positives first, negatives second, so `.chunk(2)` below splits the
        # scores back into the two groups. This name is local to the function
        # and does not touch the notebook-level `edge_label_index`.
        edge_label_index = torch.cat([
            pos_edge_label_index,
            neg_edge_label_index,
        ], dim=1)

        optimizer.zero_grad()
        pos_rank, neg_rank = model(data.edge_index, edge_label_index).chunk(2)

        loss = model.recommendation_loss(
            pos_rank,
            neg_rank,
            node_id=edge_label_index.unique(),
        )
        loss.backward()
        optimizer.step()

        # Weight each batch loss by its number of positives so the final
        # average is per example rather than per batch.
        total_loss += float(loss) * pos_rank.numel()
        total_examples += pos_rank.numel()

    return total_loss / total_examples

In [25]:
@torch.no_grad()
def test(k: int):
    """Compute precision@k and recall@k on the held-out edges.

    Customers are scored against every article in slices of ``batch_size``
    rows. Articles a customer is linked to in the training edges are set to
    ``-inf`` so they cannot appear in the top-k list. Both metrics are
    averaged over customers that have at least one held-out edge.

    Reads the notebook-level ``model``, ``data``, ``train_edge_label_index``,
    ``num_customers`` and ``batch_size``.

    Args:
        k: number of top-scoring articles considered per customer.

    Returns:
        tuple: ``(precision, recall)`` as Python floats.
    """
    all_emb = model.get_embedding(data.edge_index)
    customer_emb = all_emb[:num_customers]
    article_emb = all_emb[num_customers:]

    # Split the edge tensors into their source / target rows once.
    train_src, train_dst = train_edge_label_index
    eval_src, eval_dst = data.edge_label_index

    precision_sum = 0.0
    recall_sum = 0.0
    evaluated = 0
    for lo in range(0, num_customers, batch_size):
        hi = lo + batch_size
        scores = customer_emb[lo:hi] @ article_emb.t()

        # Exclude training edges of the customers in this slice.
        seen = (train_src >= lo) & (train_src < hi)
        scores[train_src[seen] - lo,
               train_dst[seen] - num_customers] = float('-inf')

        # Mark the held-out (customer, article) pairs of this slice.
        held_out = (eval_src >= lo) & (eval_src < hi)
        rows = eval_src[held_out] - lo
        cols = eval_dst[held_out] - num_customers
        relevant = torch.zeros_like(scores, dtype=torch.bool)
        relevant[rows, cols] = True

        # Number of held-out edges per customer row in this slice.
        per_customer = degree(rows, num_nodes=scores.size(0))

        top = scores.topk(k, dim=-1).indices
        hits = relevant.gather(1, top).sum(dim=-1)

        precision_sum += float((hits / k).sum())
        # The clamp avoids dividing by zero for customers without held-out
        # edges; their hit count is zero, so they contribute nothing.
        recall_sum += float((hits / per_customer.clamp(1e-6)).sum())
        evaluated += int((per_customer > 0).sum())

    return precision_sum / evaluated, recall_sum / evaluated

In [None]:
# Train for 100 epochs, evaluating top-20 precision/recall after each one.
# The recorded progress bars below show roughly nine minutes per training
# epoch, so a full run of this cell takes many hours.
for epoch in range(1, 101):
    loss = train()
    precision, recall = test(k=20)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Precision@20: '
          f'{precision:.4f}, Recall@20: {recall:.4f}')

100%|███████████████████████████████████████| 6834/6834 [09:17<00:00, 12.26it/s]


Epoch: 001, Loss: 0.3752, Precision@20: 0.0032, Recall@20: 0.0137


100%|███████████████████████████████████████| 6834/6834 [09:15<00:00, 12.31it/s]


Epoch: 002, Loss: 0.3346, Precision@20: 0.0039, Recall@20: 0.0172


100%|███████████████████████████████████████| 6834/6834 [09:15<00:00, 12.30it/s]


Epoch: 003, Loss: 0.3046, Precision@20: 0.0044, Recall@20: 0.0191


100%|███████████████████████████████████████| 6834/6834 [09:15<00:00, 12.30it/s]


Epoch: 004, Loss: 0.2755, Precision@20: 0.0051, Recall@20: 0.0226


100%|███████████████████████████████████████| 6834/6834 [09:18<00:00, 12.24it/s]


Epoch: 005, Loss: 0.2417, Precision@20: 0.0058, Recall@20: 0.0259


100%|███████████████████████████████████████| 6834/6834 [09:18<00:00, 12.23it/s]


Epoch: 006, Loss: 0.2173, Precision@20: 0.0062, Recall@20: 0.0278


 73%|████████████████████████████▍          | 4987/6834 [06:47<02:30, 12.26it/s]

# Reference dataset for comparison: take the first (index 0) graph stored in
# the AmazonBook dataset rooted at ./data.
amazon = AmazonBook("./data")[0]

# Summary of the AmazonBook reference graph.
print(amazon)

# Summary of this notebook's own graph, for a side-by-side comparison.
print(data)

2380730
13994744

# Display the edge_label_index stored on the reference dataset's
# ('user', 'rates', 'book') relation.
amazon['user','rates','book'].edge_label_index