In [1]:
import os.path as osp

import torch
from tqdm import tqdm
import pandas as pd

from torch_geometric.datasets import AmazonBook
from torch_geometric.nn import LightGCN
from torch_geometric.utils import degree

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
transactions = pd.read_csv("data/train.csv")
transactions

Unnamed: 0.1,Unnamed: 0,customer_id,article_id,year,month,day
0,479507,232477,28866,20,1,18
1,5328148,306128,7094,20,6,18
2,4779175,41132,12896,20,6,4
3,4852517,189305,34181,20,6,6
4,7130496,238987,18606,20,7,31
...,...,...,...,...,...,...
6997367,500812,186580,26577,20,1,19
6997368,4897929,11831,1305,20,6,8
6997369,3916280,255785,1332,20,5,14
6997370,2789100,64400,16263,20,4,9


In [3]:
# customers, articles = torch.LongTensor(transactions.customer_id), torch.LongTensor(transactions.article_id)
# edge_index = torch.stack((torch.cat([customers, articles]), torch.cat([articles, customers])))
# edge_index.shape
edge_index = torch.LongTensor([transactions.customer_id, transactions.article_id])
edge_index

tensor([[232477, 306128,  41132,  ..., 255785,  64400,  91322],
        [ 28866,   7094,  12896,  ...,   1332,  16263,  25534]])

In [4]:
from torch_geometric.data import HeteroData
data = HeteroData()

In [5]:
data['customer', 'purchases', 'article'].edge_index = edge_index

In [6]:
data['article', 'purchased_by', 'customer'].edge_index = edge_index[[1,0]]

In [7]:
print(data)

HeteroData(
  (customer, purchases, article)={ edge_index=[2, 6997372] },
  (article, purchased_by, customer)={ edge_index=[2, 6997372] }
)


In [8]:
print(data['customer', 'purchases', 'article'].edge_index)
print(data['article', 'purchased_by', 'customer'].edge_index)

tensor([[232477, 306128,  41132,  ..., 255785,  64400,  91322],
        [ 28866,   7094,  12896,  ...,   1332,  16263,  25534]])
tensor([[ 28866,   7094,  12896,  ...,   1332,  16263,  25534],
        [232477, 306128,  41132,  ..., 255785,  64400,  91322]])


In [9]:
num_customers = transactions.customer_id.nunique()
num_articles = transactions.article_id.nunique()

In [10]:
data['customer'].num_nodes = num_customers
data['article'].num_nodes = num_articles

In [11]:
print(data)

HeteroData(
  customer={ num_nodes=343166 },
  article={ num_nodes=36806 },
  (customer, purchases, article)={ edge_index=[2, 6997372] },
  (article, purchased_by, customer)={ edge_index=[2, 6997372] }
)


In [12]:
del(transactions)

In [13]:
data = data.to_homogeneous().to(device)
print(data)

Data(edge_index=[2, 13994744], node_type=[379972], edge_type=[13994744])


In [14]:
from torch_geometric.loader import DataLoader
batch_size = 128
mask = data.edge_index[0] < data.edge_index[1]
train_edge_label_index = data.edge_index[:, mask]
# train_loader = torch.utils.data.DataLoader(
train_loader = DataLoader(
    range(train_edge_label_index.size(1)),
    shuffle=True,
    batch_size=batch_size,
)

In [32]:
print(data.edge_index.shape)
data.edge_index

torch.Size([2, 13994744])


tensor([[232477, 306128,  41132,  ..., 344498, 359429, 368700],
        [372032, 350260, 356062,  ..., 255785,  64400,  91322]],
       device='cuda:0')

In [30]:
print(train_edge_label_index.shape)
train_edge_label_index

torch.Size([2, 6997372])


tensor([[232477, 306128,  41132,  ..., 255785,  64400,  91322],
        [372032, 350260, 356062,  ..., 344498, 359429, 368700]],
       device='cuda:0')

In [16]:
# import torch_geometric.transforms as T
# data = T.ToSparseTensor(data)
# print(data)

In [17]:
mask

tensor([ True,  True,  True,  ..., False, False, False], device='cuda:0')

In [18]:
mask.sum()

tensor(6997372, device='cuda:0')

In [19]:
data.num_nodes

379972

In [20]:
data.edge_index[0].max()

tensor(379971, device='cuda:0')

In [25]:
model = LightGCN(
    num_nodes=data.num_nodes,
    embedding_dim=32,
    num_layers=2,
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [33]:
def train():
    total_loss = total_examples = 0

    for index in tqdm(train_loader):
        # Sample positive and negative labels.
        pos_edge_label_index = train_edge_label_index[:, index]
        print(f"pos:{pos_edge_label_index.shape}\n", pos_edge_label_index)
        neg_edge_label_index = torch.stack([
            pos_edge_label_index[0],
            torch.randint(num_customers, num_customers + num_articles,
                          (index.numel(), ), device=device)
        ], dim=0)
        print(f"neg:{neg_edge_label_index.shape}\n", neg_edge_label_index)
        edge_label_index = torch.cat([
            pos_edge_label_index,
            neg_edge_label_index,
        ], dim=1)
        print(f"edge: {edge_label_index.shape}\n", edge_label_index)
        exit()

        optimizer.zero_grad()
        pos_rank, neg_rank = model(data.edge_index, edge_label_index).chunk(2)

        loss = model.recommendation_loss(
            pos_rank,
            neg_rank,
            node_id=edge_label_index.unique(),
        )
        loss.backward()
        optimizer.step()

        total_loss += float(loss) * pos_rank.numel()
        total_examples += pos_rank.numel()

    return total_loss / total_examples

In [34]:
@torch.no_grad()
def test(k: int):
    emb = model.get_embedding(data.edge_index)
    user_emb, book_emb = emb[:num_users], emb[num_users:]

    precision = recall = total_examples = 0
    for start in range(0, num_users, batch_size):
        end = start + batch_size
        logits = user_emb[start:end] @ book_emb.t()

        # Exclude training edges:
        mask = ((train_edge_label_index[0] >= start) &
                (train_edge_label_index[0] < end))
        logits[train_edge_label_index[0, mask] - start,
               train_edge_label_index[1, mask] - num_users] = float('-inf')

        # Computing precision and recall:
        ground_truth = torch.zeros_like(logits, dtype=torch.bool)
        mask = ((data.edge_label_index[0] >= start) &
                (data.edge_label_index[0] < end))
        ground_truth[data.edge_label_index[0, mask] - start,
                     data.edge_label_index[1, mask] - num_users] = True
        node_count = degree(data.edge_label_index[0, mask] - start,
                            num_nodes=logits.size(0))

        topk_index = logits.topk(k, dim=-1).indices
        isin_mat = ground_truth.gather(1, topk_index)

        precision += float((isin_mat.sum(dim=-1) / k).sum())
        recall += float((isin_mat.sum(dim=-1) / node_count.clamp(1e-6)).sum())
        total_examples += int((node_count > 0).sum())

    return precision / total_examples, recall / total_examples

In [35]:
for epoch in range(1, 101):
    loss = train()
    precision, recall = test(k=20)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Precision@20: '
          f'{precision:.4f}, Recall@20: {recall:.4f}')

  0%|                                                 | 0/54667 [00:00<?, ?it/s]

pos: tensor([[267578, 105122, 280048,  82298,  13651, 342911, 173934, 154147,  84566,
         332007, 108656,  89341,   2133, 308653, 161498,  91863, 324956, 131229,
         231066, 142844, 245581,  74492, 263755, 311800, 256827, 209071,  47845,
         231042, 110262, 303504, 229259, 334767, 236353, 270880,  72797,  72169,
         317684, 186707,  81299, 171684, 154825, 175758, 127207, 173379, 172259,
         231141,  33509, 229027, 132632, 200426, 272027,  20397, 282246, 125405,
         303762, 285029,  86414,  84751, 250071, 116366, 341140, 227483, 310501,
         193842,  64260, 245230, 168037, 112871, 114231, 120177,  32958, 204531,
         237212, 123252,  20935,  40078, 156825,  27175, 241362, 315223,  42059,
         208452,  63959, 179433, 187568, 199556,  36593,  65350, 250067, 202184,
         335741,  87259,  52969,  11920,  81852, 156151, 104906, 233301, 294754,
           4920,  81928,  14321, 111019, 294306,  74899,  12469,   7868, 194470,
         195294, 183993




OutOfMemoryError: CUDA out of memory. Tried to allocate 1.67 GiB. GPU 0 has a total capacty of 9.78 GiB of which 1.42 GiB is free. Including non-PyTorch memory, this process has 8.01 GiB memory in use. Of the allocated memory 6.06 GiB is allocated by PyTorch, and 1.31 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

amazon = AmazonBook("./data")
amazon = amazon[0]

print(amazon)

print(data)

2380730
13994744