# 第三次作业

本次作业我们加强对图神经网络的实践，包括GAT和GraphSAGE。具体地，我们需要（1）实现DGL中的GATConv，（2）利用DGL中的采样（Sampling）功能来部分完成GraphSAGE。

## 1. GAT 代码填空

GAT里的聚合过程可以表示为

$$\mathbf{x}^{\prime}_i = \alpha_{i,i}\mathbf{\Theta}\mathbf{x}_{i} +
\sum_{j \in \mathcal{N}(i)} \alpha_{i,j}\mathbf{\Theta}\mathbf{x}_{j}$$

其中节点i和节点j之间的注意力分数计算如下。注意分母里的节点是包含节点$i$自身的。

$$\alpha_{i,j} =
\frac{
\exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top}
[\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_j]
\right)\right)}
{\sum_{k \in \mathcal{N}(i) \cup \{ i \}}
\exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top}
[\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_k]
\right)\right)}.$$

下面的代码并不遵循DGL的GATConv实现，而是按照上面公式实现的版本。DGL版本的GATConv使用了更加高效和更节省内存的实现，同时也更复杂和不容易理解，具体请参考链接：https://docs.dgl.ai/_modules/dgl/nn/pytorch/conv/gatconv.html#GATConv

*注意：*在DGL里对于边上的softmax计算有专门的函数[dgl.nn.functional.edge_softmax](https://docs.dgl.ai/generated/dgl.nn.functional.edge_softmax.html#dgl.nn.functional.edge_softmax)。在完成作业的时候，需要使用`edge_softmax`实现对每个节点按边进行softmax计算$\alpha_{i,j}$。

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

from dgl import function as fn
from dgl.nn.functional import edge_softmax


class GATConv(nn.Module):
    """
    参数说明
    ------   
    in_feats: 输入神经元的数量
    out_feats: 输出神经元的数量
    num_heads: 注意力机制head的数量
    attn_drop: 对注意力分数的dropout概率
    negative_slope: LeakyReLU中(-∞,0)的部分的斜率
    activation: 激活函数
    bias: 偏置项
    """
    
    def __init__(self,
                 in_feats,
                 out_feats,
                 num_heads,
                 attn_drop=0.,
                 negative_slope=0.2,
                 activation=None,
                 bias=True):
        super(GATConv, self).__init__()
        self.in_feats=in_feats
        self.num_heads = num_heads
        self.out_feats = out_feats
        self.fc = nn.Linear(self.in_feats, out_feats * num_heads, bias=False)

        # 注意力机制中的参数
        self.attn = nn.Parameter(torch.FloatTensor(size=(1, num_heads, 2 * out_feats)))  # concatinate后维度会变为2倍

        self.attn_drop = nn.Dropout(attn_drop)
        self.leaky_relu = nn.LeakyReLU(negative_slope)
        if bias:
            self.bias = nn.Parameter(torch.FloatTensor(size=(num_heads * out_feats,)))
        else:
            self.register_buffer('bias', None)

        self.activation = activation

        self.reset_parameters()
        
    def reset_parameters(self):
        gain = nn.init.calculate_gain('relu')
        nn.init.xavier_normal_(self.fc.weight, gain=gain)
        nn.init.xavier_normal_(self.attn, gain=gain)
        if self.bias is not None:
            nn.init.constant_(self.bias, 0)

    def forward(self, graph, feat):

        # 对原始输入特征进行变换处理
        feat_head = self.fc(feat).view(*feat.shape[:-1], self.num_heads, self.out_feats)
        
        #################

        # 1. 把转换后的特征赋到点上。通过自定义lambda方法把源和邻居节点的特征concat起来，存到边上
        #### 代码填空 ####
                
        # 2. 取出边上的特征，进行多头attention转换，并做LeakyRelu计算
        #### 代码填空 ####

        # 3. 使用DGL的edg_softmax函数完成按边的softmax计算
        #### 代码填空 ####
        
        # 4. 对softmax值做dropout后，赋给边
        #### 代码填空 ####
        
        # 5. 用softmax值和feat_head的特征，通过消息函数和聚合函数完成GAT的核心计算
        #### 代码填空 ####
        
        # 6. 从节点特征里获取GAT的计算结果
        #### 代码填空 ####
        
        ################
            
        # 使用bias项
        if self.bias is not None:
            output = output + self.bias.view(*((1,) * len(feat.shape[:-1])), self.num_heads, self.out_feats)
        
        # 使用activation函数
        if self.activation:
            output = self.activation(output)

        return output

Using backend: pytorch


In [2]:
class GAT(torch.nn.Module):
    """ 2层GAT.
    
    参数说明
    ----------
    nfeat : 输入特征的维度
    nhid : 隐藏神经元的数量
    nclass : 输出神经元的数量，也即类别的数量
    heads: 注意力机制中的head数量
    attn_drop: 对注意力分数的dropout概率
    activation: 使用的激活函数
    with_bias: 是否带有偏置项
    """

    def __init__(self, 
                 nfeat, 
                 nhid, 
                 nclass, 
                 heads=8, 
                 attn_drop=0.5, 
                 activation=F.elu, # 按照原论文的设置，我们使用ELu作为激活函数
                 with_bias=True):

        super(GAT, self).__init__()

        self.conv1 = GATConv(
            in_feats=nfeat,
            out_feats=nhid,
            num_heads=heads,
            attn_drop=attn_drop, 
            activation=activation,
            bias=with_bias)

        self.conv2 = GATConv(
            in_feats=nhid * heads,
            out_feats=nclass,
            num_heads=nclass,
            attn_drop=attn_drop, 
            activation=activation,
            bias=with_bias)

    def forward(self, g, features):
        h = self.conv1(g, features).flatten(1)   # flatten的目的是把多头attention的输出变换成一个头
        h = self.conv2(g, h).flatten(1)
        return F.log_softmax(h, dim=1)

    def initialize(self):
        """初始化GAT的参数.
        """
        self.conv1.reset_parameters()
        self.conv2.reset_parameters()

In [3]:
def train(model, g, lr=0.01, weight_decay=5e-4, epochs=200):
    """训练模型"""
    #################
    #### 代码填空 ####
    ################

@torch.no_grad()
def test(model, g):
    """测试模型在测试集上的性能"""
    #################
    #### 代码填空 ####
    ################

In [4]:
import dgl
from dgl.data import CoraGraphDataset

dataset = CoraGraphDataset('./data') # 将数据保存在data文件夹下

g = dataset[0]
nclass = g.ndata['label'].max().item() + 1

# 为了配合GAT的算法，给图上的节点添加自环的边
g = dgl.remove_self_loop(g)
g = dgl.add_self_loop(g)

gat = GAT(nfeat=g.ndata['feat'].shape[1],
      nhid=8, heads=8, nclass=nclass)
train(gat, g, epochs=100)

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.
Epoch 0, training loss: 3.891892671585083
Epoch 10, training loss: 2.5167133808135986
Epoch 20, training loss: 1.8516827821731567
Epoch 30, training loss: 1.4740606546401978
Epoch 40, training loss: 1.3435750007629395
Epoch 50, training loss: 1.0301473140716553
Epoch 60, training loss: 0.9806718826293945
Epoch 70, training loss: 0.7711170315742493
Epoch 80, training loss: 0.6751642227172852
Epoch 90, training loss: 0.7746800184249878


In [5]:
preds, output, acc = test(gat, g)

Test set results: loss= 0.8297 accuracy= 0.8170


## 2. GraphSage 代码填空

GraphSAGE的核心部分是采样（sampling）。我们利用DGL提供的MultiLayerNeighborSampler来实现采样功能，并使用DGL提供的NodeDataLoader来完成迷你批次样本的构建。

*注意：*MultiLayerNeighborSampler的入参fanouts表示的是对于每一阶邻居的采样的数量。例如，fanouts=\[10,5\]表示从一阶邻居里随机地采样10个邻居，再从这10个邻居的邻居(二阶邻居)里，随机采样5个邻居。更多的含义可以查看[DGL的MultiLayerNeighborSampler文档](https://docs.dgl.ai/api/python/dgl.dataloading.html#neighbor-sampler)。

这部分的代码填空很少，主要是希望同学们能够通过例子来学会使用MultiLayerNeighborSampler。关于使用DGL进行大图迷你批次训练的内容，可以查看DGL[《用户指南》](https://docs.dgl.ai/guide_cn/index.html)里的[第6章：在大图上的随机（批次）训练](https://docs.dgl.ai/guide_cn/minibatch.html)的内容

另外需要注意的是GraphSAGE中的聚合方式，它有两个变换矩阵：

$$ \mathbf{x}^{\prime}_i = \mathbf{W}_1 \mathbf{x}_i + \mathbf{W}_2 \cdot
        \mathrm{mean}_{j \in \mathcal{N(i)}} \mathbf{x}_j
$$

In [6]:
import dgl
from dgl.data import CoraGraphDataset
from dgl.dataloading import MultiLayerNeighborSampler
from dgl.dataloading import NodeDataLoader

dataset = CoraGraphDataset('./data') # 将数据保存在data文件夹下

g = dataset[0]

train_idx = torch.arange(g.num_nodes())[g.ndata['train_mask']]

sampler = MultiLayerNeighborSampler(fanouts=[10,5])
train_loader = NodeDataLoader(g, train_idx,
                              sampler, batch_size=128,
                              shuffle=True, num_workers=0)

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.


In [7]:
from dgl.nn import SAGEConv

class GraphSAGE(nn.Module):
    """ 2层GraphSAGE
    
    参数说明
    ----------
    nfeat : 输入特征的维度
    nhid : 隐藏神经元的数量
    nclass : 输出神经元的数量，也即类别的数量
    dropout : dropout中的概率
    with_bias: 是否带有偏置项
    """

    def __init__(self, nfeat, nhid, nclass, dropout=0.5, with_bias=True):
        super(GraphSAGE, self).__init__()
        self.convs = nn.ModuleList()
        # 这里我们使用平均`mean`来聚合邻居的特征，DGL的SAGEConv还有其他的聚合方法选择，比如`pool`,`lstm`。
        self.convs.append(SAGEConv(nfeat, nhid, aggregator_type='mean', bias=with_bias, activation=F.relu))
        self.convs.append(SAGEConv(nhid, nclass, aggregator_type='mean', bias=with_bias))
        
        self.droput = dropout
        
    def reset_parameters(self):
        """初始化模型参数"""
        for conv in self.convs:
            conv.reset_parameters()
            conv.reset_parameters()

    def forward(self, blocks, features):
        
        h = self.convs[0](blocks[0], features)
        h = F.dropout(h, p=self.droput, training=self.training)
        h = self.convs[1](blocks[1], h)

        return h.log_softmax(dim=-1)       
            
    def inference(self, g, features):
        """模型测试阶段的前向传播，不采样邻居节点，直接使用所有的邻居。可参考GCN中forward()的实现"""
        #################
        #### 代码填空 ####
        ################

In [8]:
def train(model, g, train_loader, epochs, device='cpu', lr=0.01, weight_decay=5e-4):
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    
    features = g.ndata['feat'].to(device)
    labels = g.ndata['label'].to(device)
    
    for it in range(epochs):
        model.train()

        total_loss = 0
        for input_nodes, seeds, mfgs in train_loader:
            # 'input_nodes' 是采样子图里所有节点的ID
            # 'seeds'是采样的种子节点ID，也是需要预测和计算loss的节点
            # 'mfgs':采样后的多层子图
            batch_inputs = features[input_nodes]  # 获取子图所有节点的特征
            batch_labels = labels[seeds]            # 获取种子节点对应的标签
            
            mfgs = [mfg.to(device) for mfg in mfgs]

            out = model(mfgs, batch_inputs)
            loss = F.nll_loss(out, batch_labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        loss = total_loss / len(train_loader)
        if it % 10 ==0:
            print('Epoch:', it, 'training loss:', total_loss)


In [9]:
@torch.no_grad()
def test(model):
    """Evaluate GNN performance on test set.
    """
    model.eval() # eval()把dropout的概率设置为0（不使用dropout）
    test_mask = g.ndata['test_mask']
    labels = g.ndata['label'] 
    features = g.ndata['feat']
    output = model.inference(g, features) # 得到模型输出
    loss_test = F.nll_loss(output[test_mask], labels[test_mask])
    preds = output[test_mask].argmax(1) # 得到预测值
    acc_test = preds.eq(labels[test_mask]).cpu().numpy().mean() # 得到准确率
    print("Test set results:",
          "loss= {:.4f}".format(loss_test.item()),
          "accuracy= {:.4f}".format(acc_test))
    return preds, output, acc_test.item()

In [10]:
print(nclass)

sage = GraphSAGE(nfeat=g.ndata['feat'].shape[1], nhid=16, nclass=nclass)
train(sage, g, train_loader, epochs=100, device='cpu')

7
Epoch: 0 training loss: 3.8636229038238525
Epoch: 10 training loss: 2.7999221086502075
Epoch: 20 training loss: 1.5960562229156494
Epoch: 30 training loss: 0.9994849264621735
Epoch: 40 training loss: 0.8969895541667938
Epoch: 50 training loss: 0.6504308879375458
Epoch: 60 training loss: 0.5231107324361801
Epoch: 70 training loss: 0.3678404539823532
Epoch: 80 training loss: 0.36875565350055695
Epoch: 90 training loss: 0.3202539086341858


In [11]:
pred, output, acc_test = test(sage)
acc_test

Test set results: loss= 0.7128 accuracy= 0.8020


0.802