In [1]:
import torch
from ogb.nodeproppred import NodePropPredDataset
from torch_geometric.data import DataLoader
from torch_geometric.utils import scatter
# Temporarily set weights_only to False when loading
original_torch_load = torch.load

def torch_load_with_weights_only_false(*args, **kwargs):
    kwargs['weights_only'] = False
    return original_torch_load(*args, **kwargs)

torch.load = torch_load_with_weights_only_false

# Load the dataset
dataset = NodePropPredDataset(name="ogbn-proteins", root="dataset/")

# Restore torch.load to its original function
torch.load = original_torch_load

split_idx = dataset.get_idx_split()  # which ones are training nodes, val, etc

print("Dataset loaded successfully!")

Dataset loaded successfully!


graph_data: 
- edge_index: COO format (coordinate format) 
    - shape: (2, 79122504)
    - 79122504 edges
    - 0: source node, 1: destination node
- edge_feat:
    - shape: (79122504, 8)
    - 8 dimensional feature vector for each edge
- node_feat:
    - Node features: none for this dataset
- node_species:
    - shape: (132534, 1)
    - Species identifiers for each node
- num_nodes:
    - 132534 nodes

node_labels:
- shape: (132534, 112)
- 112 types of labels

In [2]:
from torch_geometric.utils import scatter
from torch_geometric.loader import RandomNodeLoader
from torch_geometric.data import Data

# Access the single graph
graph_data, node_labels = dataset[0]

# Split the nodes based on the split indices
split_idx = dataset.get_idx_split()  # which ones are training, val, test nodes
train_idx = split_idx["train"]
valid_idx = split_idx["valid"]
test_idx = split_idx["test"]

# Create masks
train_mask = torch.zeros(graph_data['num_nodes'], dtype=torch.bool)
valid_mask = torch.zeros(graph_data['num_nodes'], dtype=torch.bool)
test_mask = torch.zeros(graph_data['num_nodes'], dtype=torch.bool)

train_mask[train_idx] = True
valid_mask[valid_idx] = True
test_mask[test_idx] = True

# Convert edge features and edge index to tensors
graph_data['edge_feat'] = torch.tensor(graph_data['edge_feat'], dtype=torch.float32)
graph_data['edge_index'] = torch.tensor(graph_data['edge_index'], dtype=torch.long)

# initialize node features by aggregating edge features
graph_data['node_feat'] = scatter(graph_data['edge_feat'], 
                                  graph_data['edge_index'][1],
                                  dim=0,
                                  dim_size=graph_data['num_nodes'],
                                  reduce='sum')

# load graph into PyG data object
data = Data(
    edge_index=graph_data['edge_index'],
    edge_attr=graph_data['edge_feat'],
    num_nodes=graph_data['num_nodes'],
    y=node_labels,
    x=graph_data['node_feat'],
    train_mask=train_mask,
    valid_mask=valid_mask,
    test_mask=test_mask
)


train_loader = RandomNodeLoader(data, num_parts=40, shuffle=True,
                                num_workers=5)
test_loader = RandomNodeLoader(data, num_parts=5, num_workers=5)

In [None]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool
import torch
from ogb.nodeproppred import Evaluator, PygNodePropPredDataset
from torch.nn import LayerNorm, Linear, ReLU
from tqdm import tqdm
from torch_geometric.loader import RandomNodeLoader
from torch_geometric.nn import DeepGCNLayer, GENConv
from torch_geometric.utils import scatter

class DeeperGCN(torch.nn.Module):
    def __init__(self, in_channels, edge_in_channels, hidden_channels, num_classes, num_layers):
        super().__init__()
        torch.manual_seed(12345)

        self.node_encoder = Linear(in_channels, hidden_channels)    
        self.edge_encoder = Linear(edge_in_channels, hidden_channels)
        self.layers = torch.nn.ModuleList()

        for i in range(0, num_layers):
            # num_layers is layers in MLP
            conv = GENConv(hidden_channels, hidden_channels, aggr='softmax',
                           t=1.0, learn_t=True, num_layers=2, norm='layer')
            norm = LayerNorm(hidden_channels, elementwise_affine=True)
            act = ReLU(inplace=True)
            layer = DeepGCNLayer(conv, norm, act, block='res+', dropout=0.1)
            self.layers.append(layer)

        self.lin = Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index, edge_feat):
        # encode node and edge features to higher dimensional space
        h = self.node_encoder(x)
        edge_embeddings = self.edge_encoder(edge_feat)

        # pass through layers
        for layer in self.layers:
            h = layer(h, edge_index, edge_embeddings)
  
        # drop out and classifier
        h = F.dropout(h, p=0.1, training=self.training)
        h = self.lin(h)
        
        return h
    

def train(model, optimizer, criterion, train_loader):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.train()
    total_loss = total_examples = 0
    for batch in train_loader:
        optimizer.zero_grad()
        batch = batch.to(device)
        out = model(batch.x, batch.edge_index, batch.edge_attr)
        loss = criterion(out[batch.train_mask], batch.y[batch.train_mask])
        loss.backward()
        optimizer.step()

        total_loss += float(loss) * int(data.train_mask.sum())
        total_examples += int(data.train_mask.sum())

    return total_loss / total_examples

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DeeperGCN(in_channels=8, edge_in_channels=8, hidden_channels=64, num_classes=112, num_layers=3).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.BCEWithLogitsLoss()
evaluator = Evaluator('ogbn-proteins')


In [7]:
loss = train(model, optimizer, criterion, train_loader)

RuntimeError: [enforce fail at alloc_cpu.cpp:115] data. DefaultCPUAllocator: not enough memory: you tried to allocate 20255361024 bytes.

In [15]:
y_true = {'train': [], 'valid': [], 'test': []}
y_pred = {'train': [], 'valid': [], 'test': []}

for data in test_loader:
    out = model(data.x, data.edge_index, data.edge_attr)

    for split in y_true.keys():
        mask = data[f'{split}_mask']
        y_true[split].append(data.y[mask].cpu())
        y_pred[split].append(out[mask].cpu())

train_rocauc = evaluator.eval({
        'y_true': torch.cat(y_true['train'], dim=0),
        'y_pred': torch.cat(y_pred['train'], dim=0),
    })['rocauc']

valid_rocauc = evaluator.eval({
    'y_true': torch.cat(y_true['valid'], dim=0),
    'y_pred': torch.cat(y_pred['valid'], dim=0),
})['rocauc']

test_rocauc = evaluator.eval({
    'y_true': torch.cat(y_true['test'], dim=0),
    'y_pred': torch.cat(y_pred['test'], dim=0),
})['rocauc']

TypeError: DataLoader found invalid type: '<class 'numpy.int64'>'

In [4]:
from torch_geometric.utils import to_networkx
from torch_geometric.data import Data
import networkx as nx
import matplotlib.pyplot as plt
import torch

def visualize_graph(G, color):
    plt.figure(figsize=(7,7))
    plt.xticks([])
    plt.yticks([])
    nx.draw_networkx(G, pos=nx.spring_layout(G, seed=42), with_labels=False,
                     node_color=color, cmap="Set2")
    plt.show()

graph_data, node_labels = dataset[0]

# Convert to PyTorch tensors
edge_index = torch.tensor(graph_data['edge_index'], dtype=torch.long)
edge_attr = torch.tensor(graph_data['edge_feat'], dtype=torch.float)
node_species = torch.tensor(graph_data['node_species'], dtype=torch.long)
node_labels = torch.tensor(node_labels, dtype=torch.float)

# Create PyG Data object
pyg_data = Data(edge_index=edge_index, edge_attr=edge_attr, y=node_labels, num_nodes=graph_data['num_nodes'])

# # data = dataset[0]
# G = to_networkx(pyg_data, to_undirected=True)
# visualize_graph(G, color=pyg_data.y)