# Installation

In [1]:
# Install required packages.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

[K     |████████████████████████████████| 7.9 MB 3.6 MB/s 
[K     |████████████████████████████████| 3.5 MB 2.7 MB/s 
[K     |████████████████████████████████| 407 kB 5.0 MB/s 
[K     |████████████████████████████████| 45 kB 3.2 MB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone


# Data Part

In [2]:
import torch
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures

# Load the PubMed graph from the Planetoid collection; NormalizeFeatures is
# attached as the dataset transform.
dataset = Planetoid(
    root='data/Planetoid',
    name='PubMed',
    transform=NormalizeFeatures(),
)

# Dataset-level summary (a leading empty entry reproduces the blank first line).
print('\n'.join([
    '',
    f'Dataset: {dataset}:',
    '==================',
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
]))

# The dataset holds a single graph; everything below works on that object.
data = dataset[0]

print()
print(data)

# Graph-level statistics, printed one per line under a separator rule.
print('\n'.join([
    '===============================================================================================================',
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    f'Average node degree: {data.num_edges / data.num_nodes:.2f}',
    f'Number of training nodes: {data.train_mask.sum()}',
    f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.3f}',
    f'Has isolated nodes: {data.has_isolated_nodes()}',
    f'Has self-loops: {data.has_self_loops()}',
    f'Is undirected: {data.is_undirected()}',
]))

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.test.index
Processing...



Dataset: PubMed():
Number of graphs: 1
Number of features: 500
Number of classes: 3

Data(x=[19717, 500], edge_index=[2, 88648], y=[19717], train_mask=[19717], val_mask=[19717], test_mask=[19717])
Number of nodes: 19717
Number of edges: 88648
Average node degree: 4.50
Number of training nodes: 60
Training node label rate: 0.003
Has isolated nodes: False
Has self-loops: False
Is undirected: True


Done!


In [3]:
# Per-node feature dimensionality as reported by the dataset object.
dataset.num_node_features

500

In [4]:
# Feature count via the dataset's `num_features` attribute (500 in the recorded output).
dataset.num_features

500

In [5]:
# Feature count read from the single graph object rather than the dataset
# (also 500 in the recorded output).
data.num_features

500

> Cluster GCN implementation

PyTorch Geometric provides a **two-stage implementation** of the Cluster-GCN algorithm:
1. [**`ClusterData`**](https://pytorch-geometric.readthedocs.io/en/latest/modules/loader.html#torch_geometric.loader.ClusterData) converts a `Data` object into a dataset of subgraphs containing `num_parts` partitions.
2. Given a user-defined `batch_size`, [**`ClusterLoader`**](https://pytorch-geometric.readthedocs.io/en/latest/modules/loader.html#torch_geometric.loader.ClusterLoader) implements the stochastic partitioning scheme in order to create mini-batches.

In [6]:
from torch_geometric.loader import ClusterData, ClusterLoader
# Seed PyTorch's global RNG before building the shuffled loader.
torch.manual_seed(12)

# Stage 1: split the full graph into 128 partitions.
cluster_data = ClusterData(data, num_parts=128)

# Stage 2: serve shuffled mini-batches, each built from 32 of those partitions.
train_loader = ClusterLoader(cluster_data, batch_size=32, shuffle=True)

Computing METIS partitioning...
Done!


In [7]:
# Walk through one full pass of the loader, describing every mini-batch and
# tallying how many nodes were visited overall.
total_num_nodes = 0
for step, sub_data in enumerate(train_loader):
    print(
        f'Step {step + 1}:',
        '=======',
        f'Number of nodes in the current batch: {sub_data.num_nodes}',
        sep='\n',
    )
    # The batch repr is followed by one empty line.
    print(sub_data, end='\n\n')
    total_num_nodes = total_num_nodes + sub_data.num_nodes

print(f'Iterated over {total_num_nodes} of {data.num_nodes} nodes!')

Step 1:
Number of nodes in the current batch: 4943
Data(x=[4943, 500], y=[4943], train_mask=[4943], val_mask=[4943], test_mask=[4943], edge_index=[2, 17208])

Step 2:
Number of nodes in the current batch: 4936
Data(x=[4936, 500], y=[4936], train_mask=[4936], val_mask=[4936], test_mask=[4936], edge_index=[2, 17894])

Step 3:
Number of nodes in the current batch: 4910
Data(x=[4910, 500], y=[4910], train_mask=[4910], val_mask=[4910], test_mask=[4910], edge_index=[2, 15078])

Step 4:
Number of nodes in the current batch: 4928
Data(x=[4928, 500], y=[4928], train_mask=[4928], val_mask=[4928], test_mask=[4928], edge_index=[2, 17214])

Iterated over 19717 of 19717 nodes!


# Model Part

In [9]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GraphConv

class GCN(torch.nn.Module):
    """Two-layer graph network built from ``GraphConv`` layers.

    Args:
        hidden_channels: Width of the hidden representation between the two
            convolutions.
        in_channels: Input feature size. When ``None`` (default) it is read
            from the notebook-level ``dataset.num_node_features``.
        out_channels: Number of output scores per node. When ``None``
            (default) it is read from the notebook-level
            ``dataset.num_classes``.
        dropout: Dropout probability applied to the hidden representation
            while the module is in training mode.
    """

    def __init__(self, hidden_channels, in_channels=None, out_channels=None, dropout=0.5):
        super().__init__()
        # Re-seed the global RNG so every construction starts from the same
        # random state before the layers are created.
        torch.manual_seed(12345)

        # Fall back to the globally loaded dataset only when sizes are not
        # given explicitly; this keeps `GCN(hidden_channels=...)` working.
        if in_channels is None:
            in_channels = dataset.num_node_features
        if out_channels is None:
            out_channels = dataset.num_classes

        self.dropout = dropout
        self.conv1 = GraphConv(in_channels, hidden_channels)
        self.conv2 = GraphConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return the second convolution's raw per-node output.

        Args:
            x: Node feature tensor passed to the first convolution.
            edge_index: Graph connectivity passed to both convolutions.
        """
        x = self.conv1(x, edge_index)
        x = x.relu()
        # Dropout is only active while `self.training` is True.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return x

# Build the network with a hidden width of 16 and print its layer summary.
model = GCN(hidden_channels=16)
print(model)

GCN(
  (conv1): GraphConv(500, 16)
  (conv2): GraphConv(16, 3)
)


In [10]:
from IPython.display import Javascript
# Colab-specific: the JavaScript calls `google.colab.output.setIframeHeight`
# with `maxHeight: 300`, presumably to cap the height of this cell's output
# area so the long epoch log scrolls — verify outside Colab, where the
# `google.colab` object is not expected to exist.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

# Construct a fresh network so training does not continue from an earlier instance.
model = GCN(hidden_channels=16)

# Adam over all model parameters, with weight decay.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)

# Cross-entropy objective applied to the model's per-node outputs.
criterion = torch.nn.CrossEntropyLoss()

def train():
    """Run one training epoch over the Cluster-GCN mini-batches.

    Uses the notebook-level ``model``, ``train_loader``, ``criterion`` and
    ``optimizer``. For every mini-batch the loss is computed only on the
    nodes selected by ``train_mask``, followed by one optimizer step.

    Returns:
        float: Training loss averaged over all training nodes seen during the
        epoch. Each batch loss is weighted by the batch's number of training
        nodes, which assumes ``criterion`` averages over nodes. ``nan`` is
        returned when no batch contained a training node.
    """
    model.train()

    weighted_loss = 0.0
    num_train_nodes = 0
    for sub_data in train_loader:  # Iterate over each mini-batch.
        batch_train_nodes = int(sub_data.train_mask.sum())
        if batch_train_nodes == 0:
            # No labelled training node in this batch: a loss over an empty
            # selection is not meaningful, and stepping the optimizer anyway
            # would still move the parameters. Skip the batch entirely.
            continue

        out = model(sub_data.x, sub_data.edge_index)  # Single forward pass.
        # Loss is computed solely on the training nodes of this batch.
        loss = criterion(out[sub_data.train_mask], sub_data.y[sub_data.train_mask])
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
        optimizer.zero_grad()  # Clear gradients.

        weighted_loss += loss.item() * batch_train_nodes
        num_train_nodes += batch_train_nodes

    if num_train_nodes == 0:
        return float('nan')
    return weighted_loss / num_train_nodes

def test():
      model.eval()
      out = model(data.x, data.edge_index)
      pred = out.argmax(dim=1)  # Use the class with highest probability.
      
      accs = []
      for mask in [data.train_mask, data.val_mask, data.test_mask]:
          correct = pred[mask] == data.y[mask]  # Check against ground-truth labels.
          accs.append(int(correct.sum()) / int(mask.sum()))  # Derive ratio of correct predictions.
      return accs

# Train for 50 epochs, evaluating on the full graph after every epoch.
# Whatever train() returns is bound to `loss`, but it is not part of the log
# line printed below.
for epoch in range(1, 51):
    loss = train()
    train_acc, val_acc, test_acc = test()
    print(f'Epoch: {epoch:03d}, Train: {train_acc:.4f}, Val Acc: {val_acc:.4f}, Test Acc: {test_acc:.4f}')

<IPython.core.display.Javascript object>

Epoch: 001, Train: 0.6667, Val Acc: 0.6180, Test Acc: 0.6160
Epoch: 002, Train: 0.6500, Val Acc: 0.5900, Test Acc: 0.5640
Epoch: 003, Train: 0.9500, Val Acc: 0.7220, Test Acc: 0.7010
Epoch: 004, Train: 0.9500, Val Acc: 0.7320, Test Acc: 0.7080
Epoch: 005, Train: 0.9667, Val Acc: 0.7300, Test Acc: 0.7070
Epoch: 006, Train: 0.9667, Val Acc: 0.7460, Test Acc: 0.7220
Epoch: 007, Train: 0.9833, Val Acc: 0.7680, Test Acc: 0.7400
Epoch: 008, Train: 0.9833, Val Acc: 0.7600, Test Acc: 0.7400
Epoch: 009, Train: 0.9833, Val Acc: 0.7620, Test Acc: 0.7500
Epoch: 010, Train: 0.9500, Val Acc: 0.7660, Test Acc: 0.7570
Epoch: 011, Train: 0.9500, Val Acc: 0.7700, Test Acc: 0.7560
Epoch: 012, Train: 0.9833, Val Acc: 0.7760, Test Acc: 0.7650
Epoch: 013, Train: 1.0000, Val Acc: 0.7680, Test Acc: 0.7560
Epoch: 014, Train: 1.0000, Val Acc: 0.7640, Test Acc: 0.7360
Epoch: 015, Train: 1.0000, Val Acc: 0.7680, Test Acc: 0.7350
Epoch: 016, Train: 1.0000, Val Acc: 0.7680, Test Acc: 0.7580
Epoch: 017, Train: 1.000