In [None]:
import random
import numpy as np

import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.nn import SGConv

In [1]:
import torch
import torch.nn.functional as F

import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, SAGEConv

from ogb.nodeproppred import PygNodePropPredDataset, Evaluator

In [2]:
device = f'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device)

dataset = PygNodePropPredDataset(name='ogbn-products',
                                 transform=T.ToSparseTensor())
data = dataset[0]

# data comes with train-val-test splits predefined for comparison
split_idx = dataset.get_idx_split()
train_idx = split_idx['train'].to(device)

In [3]:
data

Data(adj_t=[2449029, 2449029, nnz=123718280], x=[2449029, 100], y=[2449029, 1])

In [13]:
split_idx

{'train': tensor([     0,      1,      2,  ..., 196612, 196613, 196614]),
 'valid': tensor([196615, 196616, 196617,  ..., 235935, 235936, 235937]),
 'test': tensor([ 235938,  235939,  235940,  ..., 2449026, 2449027, 2449028])}

In [4]:
dir(data)

['__apply__',
 '__call__',
 '__cat_dim__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__inc__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__num_nodes__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'adj_t',
 'apply',
 'clone',
 'coalesce',
 'contains_isolated_nodes',
 'contains_self_loops',
 'contiguous',
 'cpu',
 'cuda',
 'debug',
 'edge_attr',
 'edge_index',
 'face',
 'from_dict',
 'is_coalesced',
 'is_directed',
 'is_undirected',
 'keys',
 'normal',
 'num_edge_features',
 'num_edges',
 'num_faces',
 'num_features',
 'num_node_features',
 'num_nodes',
 'pin_memory',
 'pos',
 'to',
 'to_dict',
 'to_namedtuple',
 'x',
 'y']

### Exploring dataset

In [43]:
print(f' dataset has {data.num_nodes} nodes where each node has a {data.num_node_features} dim feature vector')
print(f' dataset has {data.num_edges} edges where each edge has a {data.num_edge_features} dim feature vector')
print(f' dataset has {dataset.num_classes} classes')


 dataset has 2449029 nodes where each node has a 100 dim feature vector
 dataset has 123718280 edges where each edge has a 0 dim feature vector
 dataset has 47 classes


In [10]:
data.x.shape

torch.Size([2449029, 100])

In [21]:
data.adj_t

SparseTensor(row=tensor([      0,       0,       0,  ..., 2449028, 2449028, 2449028]),
             col=tensor([    384,    2412,    7554,  ..., 1787657, 1864057, 2430488]),
             size=(2449029, 2449029), nnz=123718280, density=0.00%)

In [46]:
split_idx['train'].shape, split_idx['valid'].shape, split_idx['test'].shape

(torch.Size([196615]), torch.Size([39323]), torch.Size([2213091]))

### Creating the model

In [67]:
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim, dropout=0.2):
        super().__init__()
        self.dropout = dropout
        self.conv1 = SAGEConv(in_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        self.conv3 = SAGEConv(hidden_dim, out_dim)
    
    def forward(self, data):
        x = self.conv1(data.x, data.adj_t)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout)
        
        x = self.conv2(x, data.adj_t)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout)
        
        x = self.conv3(x, data.adj_t)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout)
        return torch.log_softmax(x, dim=-1)

### Training & Evaluation

In [70]:
def train(model, data, train_idx, optimizer):
    model.train()

    optimizer.zero_grad()
    out = model(data)[train_idx]
    loss = F.nll_loss(out, data.y.squeeze(1)[train_idx])
    loss.backward()
    optimizer.step()

    return loss.item()


@torch.no_grad()
def test(model, data, split_idx, evaluator):
    model.eval()

    out = model(data)
    y_pred = out.argmax(dim=-1, keepdim=True)

    train_acc = evaluator.eval({
        'y_true': data.y[split_idx['train']],
        'y_pred': y_pred[split_idx['train']],
    })['acc']
    valid_acc = evaluator.eval({
        'y_true': data.y[split_idx['valid']],
        'y_pred': y_pred[split_idx['valid']],
    })['acc']
    test_acc = evaluator.eval({
        'y_true': data.y[split_idx['test']],
        'y_pred': y_pred[split_idx['test']],
    })['acc']

    return train_acc, valid_acc, test_acc

In [72]:
lr = 1e-4 
epochs = 50 
hidden_dim = 75
evaluator = Evaluator(name='ogbn-products')

model = GraphSAGE(in_dim=data.num_node_features, 
                 hidden_dim=hidden_dim, 
                 out_dim=dataset.num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(1, 1 + epochs):
    loss = train(model, data, train_idx, optimizer)
    result = test(model, data, split_idx, evaluator)
    #logger.add_result(run, result)

    if epoch % 10 == 0:
        train_acc, valid_acc, test_acc = result
        print(f'Epoch: {epoch}/{epochs}, '
              f'Loss: {loss:.4f}, '
              f'Train: {100 * train_acc:.2f}%, '
              f'Valid: {100 * valid_acc:.2f}% '
              f'Test: {100 * test_acc:.2f}%')

Epoch: 10/50, Loss: 3.8133, Train: 3.71%, Valid: 3.44% Test: 3.21%
Epoch: 20/50, Loss: 3.7366, Train: 8.86%, Valid: 8.51% Test: 6.50%
Epoch: 30/50, Loss: 3.6572, Train: 16.06%, Valid: 15.33% Test: 11.35%
Epoch: 40/50, Loss: 3.5739, Train: 24.13%, Valid: 23.08% Test: 16.64%
Epoch: 50/50, Loss: 3.4854, Train: 31.80%, Valid: 30.76% Test: 22.00%
