In [1]:
## install torch-geometric in colab
!pip install torch-scatter -f https://data.pyg.org/whl/torch-1.9.0+cu111.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-1.9.0+cu111.html
!pip install torch-geometric

Looking in links: https://data.pyg.org/whl/torch-1.9.0+cu111.html
Looking in links: https://data.pyg.org/whl/torch-1.9.0+cu111.html


# 第三次作业

本次作业我们加强对图神经网络的实践，包括GAT和GraphSAGE。具体地，我们需要（1）实现PyG中的GATConv，（2）利用PyG中的采样（Sampling）功能来完成GraphSAGE。

## 1. GAT 代码填空

GAT里的聚合过程可以表示为

$$\mathbf{x}^{\prime}_i = \alpha_{i,i}\mathbf{\Theta}\mathbf{x}_{i} +
\sum_{j \in \mathcal{N}(i)} \alpha_{i,j}\mathbf{\Theta}\mathbf{x}_{j}$$

其中节点i和节点j之间的注意力分数为

$$\alpha_{i,j} =
\frac{
\exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top}
[\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_j]
\right)\right)}
{\sum_{k \in \mathcal{N}(i) \cup \{ i \}}
\exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top}
[\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_k]
\right)\right)}.$$

下面的代码改编自PyG某一版本的GATConv实现。我做了诸多简化来让它简单易读且适应于当前版本。

注：这里我们没有用最新版PyG中GATConv的实现，因为最新版本的GATConv不是特别好懂。

实际上我们参考的是1.3.2版本的GATConv，见该链接：https://github.com/pyg-team/pytorch_geometric/blob/881d5ba2aefc26328eeeaa17fd7ef6daaae06ef4/torch_geometric/nn/conv/gat_conv.py

In [2]:
import torch
from torch import Tensor
import torch.nn.functional as F
from torch.nn import Parameter
from torch_sparse import SparseTensor, set_diag
from torch_geometric.nn.dense.linear import Linear
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.utils import remove_self_loops, add_self_loops, softmax
from torch_geometric.nn.inits import glorot, zeros
import torch.optim as optim

class GATConv(MessagePassing):
    """
    参数说明
    ------   
    in_channels: 输入神经元的数量
    out_channels: 输出神经元的数量
    heads: 注意力机制head的数量
    concat: 如果concat是True，那么最后的输出就是拼接每个head的输出；如果concat是False，
        那么最后的输出就是对每个head的输出求平均.
    negative_slope: LeakyReLU中(-∞,0)的部分的斜率。
    dropout: 对注意力分数的dropout概率。
    bias: 偏置项
    """

    def __init__(self, in_channels,
                 out_channels, heads=1, concat=True,
                 negative_slope=0.2, dropout=0.0,
                 bias=True, **kwargs):
        
        kwargs.setdefault('aggr', 'add')
        super(GATConv, self).__init__(node_dim=0, **kwargs)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.heads = heads
        self.concat = concat
        self.negative_slope = negative_slope
        self.dropout = dropout

        self.lin = Linear(in_channels, heads * out_channels,
                              bias=False, weight_initializer='glorot')

        # 注意力机制中的参数
        self.att = Parameter(torch.Tensor(1, heads, 2*out_channels))

        if bias and concat:
            self.bias = Parameter(torch.Tensor(heads * out_channels))
        elif bias and not concat:
            self.bias = Parameter(torch.Tensor(out_channels))
        else:
            self.register_parameter('bias', None)


        self.reset_parameters()

    def reset_parameters(self):
        """初始化参数"""
        self.lin.reset_parameters()
        glorot(self.att)
        zeros(self.bias)
    
    def forward(self, x, edge_index, size=None):
        """前向传播"""
        edge_index, _ = remove_self_loops(edge_index)
        edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))
        
        x = self.lin(x)
        ## x[N, self.heads * self.out_channels]
        output = self.propagate(edge_index, size=size, x=x) # 得到聚合信息后的节点特征
        ## output[N, self.heads, self.out_channels]
        
        if self.concat is True:
            #################
            #### 代码填空 ####
            ################
            output = output.view(-1, self.heads * self.out_channels)
        else:
            #################
            #### 代码填空 ####
            ################
            output = output.mean(dim=1)

        if self.bias is not None:
            output = output + self.bias
        return output

    def message(self, edge_index_i, x_i, x_j, size_i):
        """计算注意力分数。
    
        参数说明
        ----
        edge_index_i: 边的序号的第一维，对应x_i的邻居节点
        x_i: source节点的节点特征
        x_j: target节点的节点特征
        size_i: source节点的节点数量
        """
        
        ###############################################
        #### 代码填空，计算softmax之前的注意力分数alpha ####
        ###############################################

        x_j = x_j.view(-1, self.heads, self.out_channels)
        x_i = x_i.view(-1, self.heads, self.out_channels)

        alpha = (torch.cat([x_i, x_j], dim=-1) * self.att).sum(dim=-1)

        alpha = F.leaky_relu(alpha, self.negative_slope)
        alpha = softmax(src=alpha, index=edge_index_i, num_nodes=size_i)
        
        # 对注意力分数alpha进行dropout
        alpha = F.dropout(alpha, p=self.dropout, training=self.training)
        
        ##################################
        #### 代码填空，完成需要返回的变量 ####
        #################################

        ## from node j to node i
        return x_j * alpha.view(-1, self.heads, 1)

In [3]:
class GAT(torch.nn.Module):
    """ 2层GAT.
    
    参数说明
    ----------
    nfeat : 输入特征的维度
    nhid : 隐藏神经元的数量
    nclass : 输出神经元的数量，也即类别的数量
    heads: 注意力机制中的head数量
    output_heads: 输出层的head数量
    dropout : dropout中的概率
    with_bias: 是否带有偏置项
    """

    def __init__(self, nfeat, nhid, nclass, heads=8, output_heads=1, dropout=0.5, with_bias=True):

        super(GAT, self).__init__()

        self.conv1 = GATConv(
            nfeat,
            nhid,
            heads=heads,
            dropout=dropout,
            bias=with_bias)

        self.conv2 = GATConv(
            nhid * heads,
            nclass,
            heads=output_heads,
            concat=False,
            dropout=dropout,
            bias=with_bias)

        self.dropout = dropout

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.elu(self.conv1(x, edge_index)) # 按照原论文的设置，我们使用ELu作为激活函数
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

    def initialize(self):
        """初始化GAT的参数.
        """
        self.conv1.reset_parameters()
        self.conv2.reset_parameters()

In [4]:
def train(model, data, lr=0.01, weight_decay=5e-4, epochs=200):
    """训练模型"""
    #################
    #### 代码填空 ####
    ################

    ## copy from sample codes
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    labels = data.y
    train_mask = data.train_mask

    for i in range(epochs):
        model.train()
        optimizer.zero_grad()
        output = model(data)

        loss = F.nll_loss(output[train_mask], labels[train_mask]) # 用训练集中的节点来计算损失函数
        loss.backward()
        optimizer.step()

        if i % 10 == 0:
            print('Epoch {}, training loss: {}'.format(i, loss.item()))

@torch.no_grad()
def test(model, data):
    """测试模型在测试集上的性能"""
    #################
    #### 代码填空 ####
    ################

    ## copy from sample codes
    model.eval()
    test_mask = data.test_mask
    labels = data.y 
    output = model(data) # 得到模型输出
    loss_test = F.nll_loss(output[test_mask], labels[test_mask])
    preds = output[test_mask].argmax(1) # 得到预测值
    acc_test = preds.eq(labels[test_mask]).cpu().numpy().mean() # 得到准确率
    print("Test set results:",
          "loss= {:.4f}".format(loss_test.item()),
          "accuracy= {:.4f}".format(acc_test))
    return preds, output, acc_test.item()

In [5]:
from torch_geometric.datasets import Planetoid
dataset = Planetoid(root='./data', name='Cora') # 将数据保存在data文件夹下
data = dataset[0]
nclass = data.y.max().item()+1
gat = GAT(nfeat=data.x.shape[1],
      nhid=8, heads=8, nclass=nclass)
train(gat, data, epochs=100)

Epoch 0, training loss: 1.9829869270324707
Epoch 10, training loss: 0.3864467442035675
Epoch 20, training loss: 0.39203497767448425
Epoch 30, training loss: 0.21190983057022095
Epoch 40, training loss: 0.24609829485416412
Epoch 50, training loss: 0.21547135710716248
Epoch 60, training loss: 0.1505132019519806
Epoch 70, training loss: 0.18310952186584473
Epoch 80, training loss: 0.18345752358436584
Epoch 90, training loss: 0.17470145225524902


In [6]:
preds, output, acc = test(gat, data)

Test set results: loss= 0.7024 accuracy= 0.7970


## 2. GraphSAGE 代码填空

GraphSAGE的核心部分是采样（sampling）。我们利用PyG提供的NeighborSampler来实现采样功能。

这部分的代码填空很少，主要是希望同学们能够通过例子来学会使用NeighborSampler。

另外需要注意的是GraphSAGE中的聚合方式，它有两个变换矩阵：

$$ \mathbf{x}^{\prime}_i = \mathbf{W}_1 \mathbf{x}_i + \mathbf{W}_2 \cdot
        \mathrm{mean}_{j \in \mathcal{N(i)}} \mathbf{x}_j
$$

In [7]:
from torch_geometric.datasets import Planetoid
from torch_geometric.loader import NeighborSampler
dataset = Planetoid(root='./data', name='Cora') # 将数据保存在data文件夹下
data = dataset[0]
nclass = data.y.max().item()+1

sizes=[10,5] # 表示第一层采样10个邻居，第二层采样5个邻居
train_idx = torch.arange(data.num_nodes)[data.train_mask]
train_loader = NeighborSampler(data.edge_index, node_idx=train_idx,
                               sizes=sizes, batch_size=128,
                               shuffle=True, num_workers=0) 

In [8]:
from torch_geometric.nn import SAGEConv

class GraphSAGE(torch.nn.Module):
    """ 2层GraphSAGE
    
    参数说明
    ----------
    nfeat : 输入特征的维度
    nhid : 隐藏神经元的数量
    nclass : 输出神经元的数量，也即类别的数量
    dropout : dropout中的概率
    with_bias: 是否带有偏置项
    """

    def __init__(self, nfeat, nhid, nclass, dropout=0.5, with_bias=True):
        super(GraphSAGE, self).__init__()
        self.convs = torch.nn.ModuleList()
        self.convs.append(SAGEConv(nfeat, nhid, bias=with_bias))
        self.convs.append(SAGEConv(nhid, nclass, bias=with_bias))
        self.dropout = dropout
        
    def reset_parameters(self):
        """初始化模型参数"""
        for conv in self.convs:
            conv.reset_parameters()
            conv.reset_parameters()

    def forward(self, x, adjs):
        """对应于neighborsampler的前向传播"""
        num_layers = len(adjs)
        for i, (edge_index, _, size) in enumerate(adjs):
            x_target = x[:size[1]]  # x_target是目标节点，最后一层的目标节点就是带标签的节点
            x = self.convs[i]((x, x_target), 
                              edge_index) # x是邻居节点的特征，x_target是目标节点的特征，它们对应不同的特征变换矩阵
            if i != num_layers - 1:
                x = F.relu(x)
                x = F.dropout(x, p=self.dropout, training=self.training)
        return x.log_softmax(dim=-1)       
            
    def inference(self, data):
        """模型测试阶段的前向传播，不采样邻居节点，直接使用所有的邻居。可参考GCN中forward()的实现"""
        #################
        #### 代码填空 ####
        ################

        x, edge_index = data.x, data.edge_index

        x = F.relu(self.convs[0](x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.relu(self.convs[1](x, edge_index))

        return x.log_softmax(dim=-1) 

In [9]:
def train(model, train_loader, epochs, device='cpu', lr=0.01, weight_decay=5e-4):
    """训练阶段，这部分我们就不再设置填空作业，而是选择让同学们自行理解"""
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    
    x = data.x.to(device)
    y = data.y.squeeze().to(device)
    
    for it in range(epochs):
        model.train()

        total_loss = 0
        for batch_size, n_id, adjs in train_loader:
            # `n_id`是被采样的节点（包含了有标签的节点，和无标签的邻居节点）
            # `adjs`里面对应了每一层采样的邻接矩阵，包含了`(edge_index, e_id, size)`
            adjs = [adj.to(device) for adj in adjs]

            optimizer.zero_grad()
            out = model(x[n_id], adjs)
            loss = F.nll_loss(out, y[n_id[:batch_size]]) # n_id[:batch_size]表示的采样的带标签的节点
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        loss = total_loss / len(train_loader)
        if it % 10 ==0:
            print('Epoch:', it, 'training loss:', total_loss)

In [10]:
@torch.no_grad()
def test(model):
    """测试模型在测试集上的性能"""
    model.eval() # eval()把dropout的概率设置为0（不使用dropout）
    test_mask = data.test_mask
    labels = data.y 
    output = model.inference(data) # 得到模型输出
    loss_test = F.nll_loss(output[test_mask], labels[test_mask])
    preds = output[test_mask].argmax(1) # 得到预测值
    acc_test = preds.eq(labels[test_mask]).cpu().numpy().mean() # 得到准确率
    print("Test set results:",
          "loss= {:.4f}".format(loss_test.item()),
          "accuracy= {:.4f}".format(acc_test))
    return preds, output, acc_test.item()

In [11]:
sage = GraphSAGE(nfeat=data.x.shape[1], nhid=16, nclass=nclass)
train(sage, train_loader, epochs=100, device='cpu')

Epoch: 0 training loss: 3.8576111793518066
Epoch: 10 training loss: 0.44551892578601837
Epoch: 20 training loss: 0.167935810983181
Epoch: 30 training loss: 0.09276534151285887
Epoch: 40 training loss: 0.06609238497912884
Epoch: 50 training loss: 0.02603654097765684
Epoch: 60 training loss: 0.04685967415571213
Epoch: 70 training loss: 0.04303382057696581
Epoch: 80 training loss: 0.09111708216369152
Epoch: 90 training loss: 0.11578271351754665


In [12]:
pred, output, acc_test = test(sage)
acc_test

Test set results: loss= 0.9304 accuracy= 0.7510


0.751