## Pytorch-Geometric examples relavent to the ML-for-TN project

Note: torch and torch-geometric (a.k.a. pyg) should be installed separately (NS's experience.)


Messing around with examples

In [67]:
# %automagic

import torch
from torch_geometric.data import Data

In [12]:
'''
Parameters:
data.x: Node feature matrix with shape [num_nodes, num_node_features]
data.edge_index: Graph connectivity in COO format with shape [2, num_edges] and type torch.long
data.edge_attr: Edge feature matrix with shape [num_edges, num_edge_features]
data.y: Target to train against (may have arbitrary shape), e.g., node-level targets of shape [num_nodes, *] or graph-level targets of shape [1, *]
data.pos: Node position matrix with shape [num_nodes, num_dimensions]
'''

edge_index = torch.tensor([[0, 1, 1, 2],
                           [1, 0, 2, 1]], dtype=torch.long)
x = torch.tensor([[-1], [0], [1]], dtype=torch.float)

data = Data(x=x, edge_index=edge_index)

# Data(edge_index=[2, 4], x=[3, 1])
display(data)
dir(data)

print("\n** Node features")
print(data.x)
print("\n** Edge indices")
print(data.edge_index)
print("\n** Edge features")
print(data.edge_attr)
print("\n** Target to train against")
print(data.y)
print()
print(type(data))
print(type(data.x))

Data(x=[3, 1], edge_index=[2, 4])


** Node features
tensor([[-1.],
        [ 0.],
        [ 1.]])

** Edge indices
tensor([[0, 1, 1, 2],
        [1, 0, 2, 1]])

** Edge features
None

** Target to train against
None

<class 'torch_geometric.data.data.Data'>
<class 'torch.Tensor'>


In [22]:
# Validate data (make sure no issues, e.g. edge indices out of bounds)
data.validate(raise_on_error=True)
print("Keys")
print(data.keys())
print("data['x']")
print(data['x'])
for key, item in data:
    print(f'{key} found in data: \n item {item}')

print("Other command results:")
print('edge_attr' in data)
print(data.num_nodes)
print(data.num_edges)
print(data.num_node_features)
print(data.has_isolated_nodes())
print(data.has_self_loops())
print(data.is_directed())

# # Transfer data object to GPU.
# Fails on my laptop, of course. NO GPU. Also, installation did not use GPU
# device = torch.device('cuda')
# data = data.to(device)

# Using CPU instead
device = torch.device('cpu')
data = data.to(device)

Keys
['edge_index', 'x']
data['x']
tensor([[-1.],
        [ 0.],
        [ 1.]])
x found in data: 
 item tensor([[-1.],
        [ 0.],
        [ 1.]])
edge_index found in data: 
 item tensor([[0, 1, 1, 2],
        [1, 0, 2, 1]])
Other command results:
False
3
4
1
False
False
False


In [33]:
# Take in the ENZYME dataset from the example
from torch_geometric.datasets import TUDataset

dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES')
print(dataset)

print( len(dataset) )
print( dataset.num_classes )  # Classification task
print( dataset.num_node_features )


ENZYMES(600)
600
6
3


In [35]:
# Look at one graph
# 37 nodes. Each node has 3 features.
data = dataset[0]
print(data)
print( data.is_undirected() )
print( data.x )
print( data.y ) # A 1x1 matrix, i.e. 'y' is just a single value
print(data.edge_index)

# Shuffle the dataset
dataset = dataset.shuffle()

# Example of splitting dataset (training/testing)
train_dataset = dataset[:540]
test_dataset = dataset[540:]


Data(edge_index=[2, 168], x=[37, 3], y=[1])
True
tensor([[1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.]])
tensor([5])
tensor([[ 0,  0,  0,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,
          3,  4,  4,  4,  4,  5,  5,  5,  5

In [40]:
# Now try Cora dataset
from torch_geometric.datasets import Planetoid

In [47]:
# This dataset is just one large graph.
# One learns on it by using "masks" i.e. subsets of the graph.
dataset = Planetoid(root='/tmp/Cora', name='Cora')
# >>> Cora()
len(dataset)
# >>> 1
dataset.num_classes
# >>> 7
dataset.num_node_features
# >>> 1433

# "Here, the dataset contains only a *single*, undirected citation graph:"
print(dataset)
data = dataset[0]
print(data)
# >>> Data(edge_index=[2, 10556], test_mask=[2708],
#          train_mask=[2708], val_mask=[2708], x=[2708, 1433], y=[2708])

'''
train_mask denotes against which nodes to train (140 nodes),
val_mask denotes which nodes to use for validation, e.g., to perform early stopping (500 nodes),
test_mask denotes against which nodes to test (1000 nodes).
'''

data.is_undirected()
# >>> True
data.train_mask.sum().item()
# >>> 140
data.val_mask.sum().item()
# >>> 500
data.test_mask.sum().item()
# >>> 1000


Cora()
Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])


1000

## Mini-batches

In [48]:
'''
torch_geometric.data.Batch inherits from torch_geometric.data.Data and contains an additional attribute called batch

'batch' is a column vector which maps each node to its respective graph in the batch:
'''

from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader

dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES', use_node_attr=True)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

for batch in loader:
    print(batch)
    # >>> DataBatch(batch=[1082], edge_index=[2, 4066], x=[1082, 21], y=[32])

    print(batch.num_graphs)
    # >>> 32

DataBatch(edge_index=[2, 3816], x=[978, 21], y=[32], batch=[978], ptr=[33])
32
DataBatch(edge_index=[2, 3640], x=[946, 21], y=[32], batch=[946], ptr=[33])
32
DataBatch(edge_index=[2, 4284], x=[1127, 21], y=[32], batch=[1127], ptr=[33])
32
DataBatch(edge_index=[2, 4330], x=[1108, 21], y=[32], batch=[1108], ptr=[33])
32
DataBatch(edge_index=[2, 3720], x=[1005, 21], y=[32], batch=[1005], ptr=[33])
32
DataBatch(edge_index=[2, 3774], x=[970, 21], y=[32], batch=[970], ptr=[33])
32
DataBatch(edge_index=[2, 3558], x=[900, 21], y=[32], batch=[900], ptr=[33])
32
DataBatch(edge_index=[2, 3838], x=[1071, 21], y=[32], batch=[1071], ptr=[33])
32
DataBatch(edge_index=[2, 4270], x=[1072, 21], y=[32], batch=[1072], ptr=[33])
32
DataBatch(edge_index=[2, 3866], x=[996, 21], y=[32], batch=[996], ptr=[33])
32
DataBatch(edge_index=[2, 4374], x=[1149, 21], y=[32], batch=[1149], ptr=[33])
32
DataBatch(edge_index=[2, 3736], x=[1011, 21], y=[32], batch=[1011], ptr=[33])
32
DataBatch(edge_index=[2, 4618], x=[125

In [50]:
from torch_geometric.utils import scatter
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader

dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES', use_node_attr=True)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

for data in loader:
    print( data )

    print( data.num_graphs )
    x = scatter(data.x, data.batch, dim=0, reduce='mean')
    print( x.size() )


DataBatch(edge_index=[2, 4550], x=[1199, 21], y=[32], batch=[1199], ptr=[33])
32
torch.Size([32, 21])
DataBatch(edge_index=[2, 3774], x=[974, 21], y=[32], batch=[974], ptr=[33])
32
torch.Size([32, 21])
DataBatch(edge_index=[2, 3342], x=[881, 21], y=[32], batch=[881], ptr=[33])
32
torch.Size([32, 21])
DataBatch(edge_index=[2, 3892], x=[1025, 21], y=[32], batch=[1025], ptr=[33])
32
torch.Size([32, 21])
DataBatch(edge_index=[2, 3808], x=[982, 21], y=[32], batch=[982], ptr=[33])
32
torch.Size([32, 21])
DataBatch(edge_index=[2, 4218], x=[1061, 21], y=[32], batch=[1061], ptr=[33])
32
torch.Size([32, 21])
DataBatch(edge_index=[2, 3416], x=[966, 21], y=[32], batch=[966], ptr=[33])
32
torch.Size([32, 21])
DataBatch(edge_index=[2, 3528], x=[891, 21], y=[32], batch=[891], ptr=[33])
32
torch.Size([32, 21])
DataBatch(edge_index=[2, 4356], x=[1158, 21], y=[32], batch=[1158], ptr=[33])
32
torch.Size([32, 21])
DataBatch(edge_index=[2, 4204], x=[1161, 21], y=[32], batch=[1161], ptr=[33])
32
torch.Size(

## Data Transforms

In [51]:
# 17,000 3D shape point clouds and per point labels from 16 shape categories
from torch_geometric.datasets import ShapeNet

dataset = ShapeNet(root='/tmp/ShapeNet', categories=['Airplane'])

dataset[0]

Downloading https://shapenet.cs.stanford.edu/media/shapenetcore_partanno_segmentation_benchmark_v0_normal.zip
Extracting /tmp/ShapeNet/shapenetcore_partanno_segmentation_benchmark_v0_normal.zip
Processing...
Done!


Data(x=[2518, 3], y=[2518], pos=[2518, 3], category=[1])

In [53]:
# We can convert the point cloud dataset into a graph dataset by generating nearest neighbor graphs from the point clouds via transforms:
import torch_geometric.transforms as T
from torch_geometric.datasets import ShapeNet

dataset = ShapeNet(root='/tmp/ShapeNet', categories=['Airplane'],
                    pre_transform=T.KNNGraph(k=6))

dataset[0]
# Data(edge_index=[2, 15108], pos=[2518, 3], y=[2518])



Data(x=[2518, 3], y=[2518], pos=[2518, 3], category=[1])

In [None]:
# In addition, we can use the transform argument to randomly augment a Data object, e.g., translating each node position by a small number:

import torch_geometric.transforms as T
from torch_geometric.datasets import ShapeNet

dataset = ShapeNet(root='/tmp/ShapeNet', categories=['Airplane'],
                    pre_transform=T.KNNGraph(k=6),
                    transform=T.RandomJitter(0.01))

dataset[0]
>>> Data(edge_index=[2, 15108], pos=[2518, 3], y=[2518])


## Learning Methods on Graphs

"After learning about data handling, datasets, loader and transforms in PyG, it’s time to implement our first graph neural network!"

In [54]:
# We first need to load the Cora dataset:
from torch_geometric.datasets import Planetoid

dataset = Planetoid(root='/tmp/Cora', name='Cora')
# >>> Cora()

In [56]:
# Now let’s implement a two-layer GCN:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # Two GCN convolution layers
        self.conv1 = GCNConv(dataset.num_node_features, 16)
        self.conv2 = GCNConv(16, dataset.num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)
        

In [57]:
# Run
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
data = dataset[0].to(device)  # Again, this is single graph
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

model.train()
for epoch in range(200):
    optimizer.zero_grad()  # Reset gradients
    out = model(data)
    # Negative log likelihood loss. For training classification problem.
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

In [58]:
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.8050


In [66]:
torch.set_printoptions(threshold=100)
# Test mask
print("Test mask")
print( data.test_mask )
# Predicted
print("Predicted")
print( pred )
# Actual
print("Actual")
print( data.y )
# 
print( correct )

Test mask
tensor([False, False, False,  ...,  True,  True,  True])
Predicted
tensor([3, 4, 4,  ..., 0, 3, 3])
Actual
tensor([3, 4, 4,  ..., 3, 3, 3])
tensor(805)
