In [1]:
import torch
import random
import matplotlib.pyplot as plt



In [2]:
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures

citeseer_dataset = Planetoid(root = "Citeseer_dataset", name = "Citeseer", transform = NormalizeFeatures())

In [3]:
print(len(citeseer_dataset))
print(citeseer_dataset.num_classes)
print(citeseer_dataset.num_features)
citeseer_graph = citeseer_dataset[0]
citeseer_graph

1
6
3703


Data(x=[3327, 3703], edge_index=[2, 9104], y=[3327], train_mask=[3327], val_mask=[3327], test_mask=[3327])

In [4]:
citeseer_graph.x

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [5]:
print("Training samples: ", citeseer_graph.train_mask.sum().item())
print("Validation samples: ", citeseer_graph.val_mask.sum().item())
print("Test samples: ", citeseer_graph.test_mask.sum().item())

Training samples:  120
Validation samples:  500
Test samples:  1000


In [6]:
citeseer_graph.y

tensor([3, 1, 5,  ..., 3, 1, 5])

In [7]:
print(f'Number of nodes: {citeseer_graph.num_nodes}')
print(f'Number of edges: {citeseer_graph.num_edges}')
print(f'Average node degree: {citeseer_graph.num_edges / citeseer_graph.num_nodes:.2f}')
print(f'Has isolated nodes: {citeseer_graph.has_isolated_nodes()}')
print(f'Has self-loops: {citeseer_graph.has_self_loops()}')
print(f'Is undirected: {citeseer_graph.is_undirected()}')

Number of nodes: 3327
Number of edges: 9104
Average node degree: 2.74
Has isolated nodes: True
Has self-loops: False
Is undirected: True


We swap out all GCNConv instances with GATConv layers that make use of attention. Building a 2-layer GAT model that makes use of 8 attention heads in the first layer and 1 attention head in the second layer , also using  a dropout ratio of 0.6 inside and outside each GATConv call, and  a hidden_channels dimensions of 8 per head. Document link-
https://towardsdatascience.com/graph-attention-networks-in-python-975736ac5c0c

In [8]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import  GATConv

class GAT(torch.nn.Module):
    def __init__(self, input_channels, output_channels, hidden_channels = 8, num_heads = 8):
        super().__init__()
        torch.manual_seed(123456)
        
        self.gatconv1 = GATConv(
            in_channels = input_channels, 
            out_channels = hidden_channels, 
            heads = num_heads
        )
        
        self.gatconv2 = GATConv(
            in_channels = hidden_channels * num_heads, 
            out_channels = output_channels, 
            heads = 1
        )
    
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        x = F.dropout(x, p = 0.6, training = self.training)
        
        x = self.gatconv1(x, edge_index)
        x = F.elu(x)
        
        x = F.dropout(x, p = 0.6, training = self.training)
        x = self.gatconv2(x, edge_index)
        
        return x

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

citeseer_graph = citeseer_dataset[0].to(device)

input_channels = citeseer_dataset.num_features

# hidden_channels = 32
hidden_channels = 8

output_channels = citeseer_dataset.num_classes

In [10]:
model = GAT(
    input_channels = input_channels, 
    output_channels = output_channels
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr = 0.005, weight_decay = 5e-4)

criterion = nn.CrossEntropyLoss()

In [11]:
print("Number of parameters: ", sum(p.numel() for p in model.parameters()))

Number of parameters:  237586


In [12]:
%%time
num_epochs = 200

for epoch in range(num_epochs):
    model.train()

    optimizer.zero_grad()
    
    out = model(citeseer_graph)
    
    loss = criterion(out[citeseer_graph.train_mask], citeseer_graph.y[citeseer_graph.train_mask])
    loss.backward()
    
    optimizer.step()
    
    # Get predictions on the training data
    pred_train = out.argmax(dim = 1)
    
    correct_train = (
        pred_train[citeseer_graph.train_mask] == citeseer_graph.y[citeseer_graph.train_mask]
    ).sum()
    
    acc_train = int(correct_train) / int(citeseer_graph.train_mask.sum())

    # Print training loss
    if (epoch + 1) % 10 == 0:
        print(f"Epoch: {epoch + 1:03d}, Train Loss: {loss:.3f}, Train Acc: {acc_train:.3f}")

Epoch: 010, Train Loss: 1.700, Train Acc: 0.858
Epoch: 020, Train Loss: 1.559, Train Acc: 0.933
Epoch: 030, Train Loss: 1.403, Train Acc: 0.908
Epoch: 040, Train Loss: 1.218, Train Acc: 0.942
Epoch: 050, Train Loss: 1.033, Train Acc: 0.892
Epoch: 060, Train Loss: 0.860, Train Acc: 0.942
Epoch: 070, Train Loss: 0.701, Train Acc: 0.925
Epoch: 080, Train Loss: 0.614, Train Acc: 0.942
Epoch: 090, Train Loss: 0.574, Train Acc: 0.942
Epoch: 100, Train Loss: 0.471, Train Acc: 0.933
Epoch: 110, Train Loss: 0.419, Train Acc: 0.975
Epoch: 120, Train Loss: 0.356, Train Acc: 0.992
Epoch: 130, Train Loss: 0.365, Train Acc: 0.975
Epoch: 140, Train Loss: 0.378, Train Acc: 0.967
Epoch: 150, Train Loss: 0.352, Train Acc: 0.983
Epoch: 160, Train Loss: 0.336, Train Acc: 0.983
Epoch: 170, Train Loss: 0.299, Train Acc: 0.975
Epoch: 180, Train Loss: 0.343, Train Acc: 0.958
Epoch: 190, Train Loss: 0.307, Train Acc: 0.967
Epoch: 200, Train Loss: 0.284, Train Acc: 0.975
CPU times: total: 56.7 s
Wall time: 13.9

In [13]:
model.eval()

with torch.no_grad():
    pred = model(citeseer_graph).argmax(dim = 1)

    correct = (pred[citeseer_graph.test_mask] == citeseer_graph.y[citeseer_graph.test_mask]).sum()

    test_acc = int(correct) / int(citeseer_graph.test_mask.sum())

print(f'Test Accuracy: {test_acc:.4f}')

Test Accuracy: 0.7050
