In [1]:
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.10.0+cu113.html

Looking in links: https://data.pyg.org/whl/torch-1.10.0+cu113.html


# 第五课 图上的其他深度学习模型

前面的课程中我们介绍了许多图神经网络模型。除了图神经网络，针对图数据的深度学习模型还有很多，比如图上的自编码器、变分自编码器、循环神经网络和生成对抗网络等。在这一课中，我们对自编码器和变分自编码器进行代码实践，其中包括对模型细节及其应用的讲解。

## 0. 链接预测数据集

链接预测（link prediction）是常见的与图有关的任务。该任务旨在预测两个节点之间是否存在链接（link），即是否存在边。

关于链接预测的数据集，我们可以从节点分类任务的数据集直接构造。比如我们之前常用的Cora数据集，就可以无视掉它的节点标签，把Cora图里面的边当成训练/测试数据。下面我们具体来实践一下。

In [2]:
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
import torch
import torch_geometric

# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Pre-processing pipeline; the steps are listed in the order they are composed.
# NOTE(review): this is passed as `transform`, so presumably it is re-applied on
# every `dataset[0]` access and the unseeded random split changes each time — confirm.
transform = T.Compose([
    # Normalize the node features.
    T.NormalizeFeatures(),
    # Place the graph on the CPU or GPU chosen above.
    T.ToDevice(device),
    # Key step: build the link-prediction splits from the graph's edges.
    T.RandomLinkSplit(
        num_val=0.05,                      # share of edges used for validation
        num_test=0.1,                      # share of edges used for testing
        is_undirected=True,
        split_labels=True,                 # separate pos_* / neg_* label attributes
        add_negative_train_samples=False,  # training negatives are sampled during training
    ),
])


# Load Cora (stored under ./) and unpack the train / validation / test splits.
dataset = Planetoid(root='./', name='Cora', transform=transform)
train_data, val_data, test_data = dataset[0]

下面我们来看一下具体的数据长什么样：
* 我们不需要关注y, train_mask, val_mask, test_mask；这些是节点分类里需要用到的信息。
* pos_edge_label_index是正边样本的索引，pos_edge_label是其标签（全为1）
* neg_edge_label_index是负边样本的索引，neg_edge_label是其标签（全为0）

In [3]:
train_data

Data(x=[2708, 1433], edge_index=[2, 8976], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], pos_edge_label=[4488], pos_edge_label_index=[2, 4488])

In [4]:
train_data.pos_edge_label, train_data.pos_edge_label_index

(tensor([1., 1., 1.,  ..., 1., 1., 1.], device='cuda:0'),
 tensor([[1256,  181, 1628,  ...,  484,  306, 2607],
         [2175, 1359, 1139,  ..., 1367,  487, 1003]], device='cuda:0'))

In [5]:
val_data

Data(x=[2708, 1433], edge_index=[2, 8976], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], pos_edge_label=[263], pos_edge_label_index=[2, 263], neg_edge_label=[263], neg_edge_label_index=[2, 263])

In [6]:
val_data.neg_edge_label

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [7]:
test_data 

Data(x=[2708, 1433], edge_index=[2, 9502], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], pos_edge_label=[527], pos_edge_label_index=[2, 527], neg_edge_label=[527], neg_edge_label_index=[2, 527])

值得注意的是：
* train_data中没有自带负边样本neg_edge_label_index，因为我们会在训练过程中自己采样负样本。
* train_data和val_data里面的图是一样的（edge_index是一样的），但是它们的pos_edge_label_index（正边样本）和neg_edge_label_index（负边样本）不一样。可以看到train_data中有4488个正边样本，而val_data中只有263个正边样本（二者比例约为85:5）。
* test_data中的图和train_data的图不一样了。可以看到test_data中的edge_index要多一些（9502 − 8976 = 526条），多出来的正是验证集的263条正边：因为图是无向的，每条边在edge_index中按两个方向各存一次，所以是2 × 263 = 526条。也就是说，测试时用于消息传递的图同时包含了训练边和验证边。

## 1. 自编码器

针对图数据的自编码器我们称之为GAE (Graph AutoEncoder)。它包含两个组成部分：编码器（encoder）和解码器（decoder）。图上的编码器最常用的是GCN；解码器则通常用内积来实现。具体地，给定两个节点的节点表示，解码器计算二者的内积，再经过sigmoid函数映射到(0, 1)区间，其结果作为两个节点之间存在边的概率。

In [8]:
from torch_geometric.nn import GCNConv

首先构造编码器，由两层GCN组成。

In [9]:
class GCNEncoder(torch.nn.Module):
    """Encoder built from two stacked GCN layers.

    The first layer outputs ``2 * out_channels`` features and the second
    layer reduces them to ``out_channels``.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        hidden_channels = 2 * out_channels
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Encode the nodes.

        x: node features fed to the first GCN layer
        edge_index: edge index handed to both GCN layers

        Returns the output of the second GCN layer (no activation applied).
        """
        hidden = torch.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

然后构建解码器，将给定的节点对映射到[0，1]之间，以表示边存在的概率。

In [10]:
class InnerProductDecoder(torch.nn.Module):
    """Decoder that reconstructs graph structure from embedding inner products."""

    def forward(self, z, edge_index, sigmoid=True):
        """Score every node pair listed in ``edge_index``.

        z: node embeddings, indexed by node id along the first dimension
        edge_index: node pairs to score; row 0 and row 1 hold the two endpoints
        sigmoid: when True the inner products are passed through a sigmoid,
            otherwise the raw inner products are returned
        """
        first, second = edge_index[0], edge_index[1]
        score = torch.sum(z[first] * z[second], dim=1)
        if not sigmoid:
            return score
        return torch.sigmoid(score)

In [11]:
class GAE(torch.nn.Module):
    """Graph auto-encoder made of an encoder and a decoder.

    encoder: module that turns the graph inputs into node embeddings
    decoder: optional module that scores node pairs from the embeddings;
        when omitted an ``InnerProductDecoder`` is used
    """
    def __init__(self, encoder, decoder=None):
        super().__init__()
        self.encoder = encoder
        # Honour a caller-supplied decoder; previously the argument was
        # silently ignored and the inner-product decoder was always used.
        self.decoder = InnerProductDecoder() if decoder is None else decoder

    def encode(self, *args, **kwargs):
        """Run the encoder; all arguments are forwarded unchanged."""
        return self.encoder(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Run the decoder; all arguments are forwarded unchanged."""
        return self.decoder(*args, **kwargs)

    def recon_loss(self, z, pos_edge_index, neg_edge_index=None):
        """Binary cross-entropy over positive and negative edges.

        Parameters
        ----
        z: output of the encoder
        pos_edge_index: edge index of the positive edges
        neg_edge_index: edge index of the negative edges; when None, negatives
            are drawn with ``torch_geometric.utils.negative_sampling``

        Returns the sum of the mean positive loss and the mean negative loss.
        """
        EPS = 1e-15  # small constant that keeps the argument of log away from 0

        # Positive edges should be decoded to a probability close to 1.
        pos_loss = -torch.log(
            self.decoder(z, pos_edge_index) + EPS).mean()

        if neg_edge_index is None:
            # Sample negatives among the z.size(0) nodes.
            neg_edge_index = torch_geometric.utils.negative_sampling(pos_edge_index, z.size(0))
        # Negative edges should be decoded to a probability close to 0.
        neg_loss = -torch.log(
            1 - self.decoder(z, neg_edge_index) + EPS).mean()

        return pos_loss + neg_loss

In [12]:
# Build a GAE whose encoder maps the dataset's node features to 16-dimensional
# embeddings, then move the model to the selected device.
in_channels, out_channels = dataset.num_features, 16
model = GAE(GCNEncoder(in_channels, out_channels))
model = model.to(device)

In [13]:
# Encode the training graph with the current `model` and display the embeddings
# together with their shape.
latent = model.encode(train_data.x, train_data.edge_index)
latent, latent.shape

(tensor([[-7.1088e-03, -8.9700e-03, -9.6406e-03,  ...,  1.6861e-02,
          -4.9310e-03, -1.5615e-02],
         [ 8.6641e-05, -2.0760e-03, -2.9901e-03,  ...,  5.7016e-04,
          -6.1729e-03, -8.0813e-04],
         [-2.3474e-03, -2.7843e-03, -5.1042e-03,  ...,  1.9495e-03,
          -3.7486e-03, -5.0528e-03],
         ...,
         [ 1.0891e-02, -5.0019e-03, -4.4132e-03,  ..., -3.5658e-03,
          -5.3808e-03,  8.0851e-04],
         [-1.0349e-03, -3.8962e-03, -1.1520e-03,  ..., -3.9173e-03,
          -5.8069e-03, -4.5615e-04],
         [-1.5305e-04, -1.9043e-03, -1.0611e-03,  ..., -4.7798e-03,
          -3.8623e-03,  1.9550e-04]], device='cuda:0', grad_fn=<AddBackward0>),
 torch.Size([2708, 16]))

In [14]:
model.decode(latent, train_data.edge_index)

tensor([0.5001, 0.5000, 0.5000,  ..., 0.5000, 0.5001, 0.5002], device='cuda:0',
       grad_fn=<SigmoidBackward0>)

## 2. 变分自编码器

变分自编码器和自编码器的基本结构相同，都是一个编码器加一个解码器。它们的主要区别是：变分自编码器编码后的隐层表示不再是一个确定的向量，而是用一个高斯分布来表示。具体地，变分自编码器学习的是这个高斯分布的均值（下面用变量`mu`来表示）和对数标准差（下面用变量`logstd`来表示，标准差即`exp(logstd)`）。

In [15]:
# Upper bound used when clamping the encoder's log-std output in VGAE.
MAX_LOGSTD = 10

class VariationalGCNEncoder(torch.nn.Module):
    """GCN encoder with a shared first layer and two output heads.

    One head produces ``mu`` and the other produces ``logstd``; both read the
    ReLU-activated output of the shared first layer.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        hidden_channels = 2 * out_channels
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv_mu = GCNConv(hidden_channels, out_channels)
        self.conv_logstd = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return the pair ``(mu, logstd)`` for the given graph inputs."""
        hidden = torch.relu(self.conv1(x, edge_index))
        mu = self.conv_mu(hidden, edge_index)
        logstd = self.conv_logstd(hidden, edge_index)
        return mu, logstd
    
class VGAE(GAE):
    """Variational graph auto-encoder.

    Subclasses GAE, so the methods defined on GAE are available here as well.
    """

    def __init__(self, encoder, decoder=None):
        super().__init__(encoder, decoder)

    def reparametrize(self, mu, logstd):
        """Return ``mu + noise * exp(logstd)`` in training mode, else ``mu``."""
        if not self.training:
            return mu
        noise = torch.randn_like(logstd)
        std = torch.exp(logstd)
        return mu + noise * std

    def encode(self, *args, **kwargs):
        """Encode the inputs and return a latent sample.

        The encoder's ``mu`` and clamped ``logstd`` are cached on the instance
        so that ``kl_loss`` can be called without arguments afterwards.
        """
        mu, logstd = self.encoder(*args, **kwargs)  # parameters of the latent distribution
        logstd = logstd.clamp(max=MAX_LOGSTD)  # cap the log-std from above
        self.__mu__, self.__logstd__ = mu, logstd
        # The reparametrization keeps the sampling step trainable.
        return self.reparametrize(mu, logstd)

    def kl_loss(self, mu=None, logstd=None):
        """KL term that pulls the latent distribution towards a (0, I) Gaussian prior.

        mu, logstd: distribution parameters; each falls back to the value
            cached by the last ``encode`` call when None. An explicitly
            passed ``logstd`` is clamped to ``MAX_LOGSTD``.
        """
        if mu is None:
            mu = self.__mu__
        if logstd is None:
            logstd = self.__logstd__
        else:
            logstd = logstd.clamp(max=MAX_LOGSTD)
        # Sum the per-dimension terms for each row, then average over rows.
        per_row = torch.sum(1 + 2 * logstd - mu**2 - logstd.exp()**2, dim=1)
        return -0.5 * torch.mean(per_row)

（两个高斯分布的kl loss的公式可以参考该[链接](https://stats.stackexchange.com/questions/234757/how-to-use-kullback-leibler-divergence-if-mean-and-standard-deviation-of-of-two)）

In [16]:
# Rebind `model` to a VGAE built on the variational encoder and move it to the
# selected device.
model = VGAE(VariationalGCNEncoder(in_channels, out_channels))
model = model.to(device)

In [17]:
# Encode the training graph with the current `model` and display the latent
# matrix together with its shape.
# NOTE(review): no eval() call precedes this, so presumably the result is a
# noisy sample from reparametrize rather than mu itself — confirm.
latent = model.encode(train_data.x, train_data.edge_index)
latent, latent.shape

(tensor([[-1.8875,  1.0038, -2.1139,  ...,  0.4677,  0.9943,  0.2378],
         [ 0.9955,  0.3355,  0.4304,  ..., -1.5669, -0.4426,  0.4007],
         [ 2.0066, -0.5882, -0.5222,  ...,  2.1833, -0.6039, -2.2213],
         ...,
         [ 0.3537,  0.1473, -0.8227,  ..., -0.5180, -0.4414,  0.6746],
         [-1.1687, -0.9614,  1.2840,  ..., -0.6344,  0.4673,  0.9024],
         [-1.1162,  0.7176,  0.1490,  ..., -0.6302, -1.2232, -2.3135]],
        device='cuda:0', grad_fn=<AddBackward0>), torch.Size([2708, 16]))

In [18]:
model.decode(latent, train_data.edge_index)

tensor([0.0559, 0.0253, 0.8818,  ..., 0.9288, 0.2694, 0.2617], device='cuda:0',
       grad_fn=<SigmoidBackward0>)

## 3. 训练自编码器和变分自编码器

接下来我们展示自编码器和变分自编码器的训练。

In [19]:
def train_gae(model):
    """Perform one training step of a GAE model and return the loss as a number.

    Uses the notebook-level ``optimizer`` and ``train_data`` objects.
    """
    model.train()
    optimizer.zero_grad()
    graph = train_data
    embeddings = model.encode(graph.x, graph.edge_index)
    recon = model.recon_loss(embeddings, graph.pos_edge_label_index)
    recon.backward()
    optimizer.step()
    return recon.item()

def train_vgae(model):
    """Perform one training step of a VGAE model and return the loss as a number.

    The loss is the reconstruction loss plus the KL loss scaled by
    ``1 / train_data.num_nodes``. Uses the notebook-level ``optimizer`` and
    ``train_data`` objects.
    """
    model.train()
    optimizer.zero_grad()
    graph = train_data
    embeddings = model.encode(graph.x, graph.edge_index)
    recon_term = model.recon_loss(embeddings, graph.pos_edge_label_index)
    kl_term = (1 / graph.num_nodes) * model.kl_loss()  # weighted KL loss
    total = recon_term + kl_term
    total.backward()
    optimizer.step()
    return total.item()

In [20]:
@torch.no_grad()
def test(model, data):
    """Evaluate the model on ``data`` and return ``(AUC, AP)``.

    model: object providing ``eval``, ``encode`` and ``decoder``
    data: split providing ``x``, ``edge_index``, ``pos_edge_label_index`` and
        ``neg_edge_label_index``
    """
    from sklearn.metrics import roc_auc_score, average_precision_score

    model.eval()
    pos_edges, neg_edges = data.pos_edge_label_index, data.neg_edge_label_index

    z = model.encode(data.x, data.edge_index)

    # Ground truth: ones for the positive pairs followed by zeros for the negatives.
    labels = torch.cat(
        [z.new_ones(pos_edges.size(1)), z.new_zeros(neg_edges.size(1))], dim=0)

    # Decoder scores, concatenated in the same positive-then-negative order.
    scores = torch.cat(
        [model.decoder(z, pos_edges), model.decoder(z, neg_edges)], dim=0)

    labels_np = labels.detach().cpu().numpy()
    scores_np = scores.detach().cpu().numpy()

    auc = roc_auc_score(labels_np, scores_np)
    ap = average_precision_score(labels_np, scores_np)
    return auc, ap

训练GAE：

In [21]:
# Fresh GAE on the selected device, optimized with Adam.
model = GAE(GCNEncoder(in_channels, out_channels)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Train for `epochs` steps; every 100th epoch report AUC / AP on the test split.
epochs = 2000
for epoch in range(1, epochs + 1):
    loss = train_gae(model)
    if epoch % 100 != 0:
        continue
    auc, ap = test(model, test_data)
    print('Epoch: {:03d}, Loss_train: {:.4f}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, loss, auc, ap))

Epoch: 100, Loss_train: 0.9176, AUC: 0.9163, AP: 0.9254
Epoch: 200, Loss_train: 0.8826, AUC: 0.9223, AP: 0.9331
Epoch: 300, Loss_train: 0.8710, AUC: 0.9176, AP: 0.9296
Epoch: 400, Loss_train: 0.8556, AUC: 0.9257, AP: 0.9375
Epoch: 500, Loss_train: 0.8309, AUC: 0.9329, AP: 0.9449
Epoch: 600, Loss_train: 0.8150, AUC: 0.9398, AP: 0.9496
Epoch: 700, Loss_train: 0.8107, AUC: 0.9348, AP: 0.9466
Epoch: 800, Loss_train: 0.8108, AUC: 0.9346, AP: 0.9476
Epoch: 900, Loss_train: 0.8108, AUC: 0.9342, AP: 0.9492
Epoch: 1000, Loss_train: 0.7851, AUC: 0.9309, AP: 0.9487
Epoch: 1100, Loss_train: 0.8037, AUC: 0.9298, AP: 0.9481
Epoch: 1200, Loss_train: 0.7919, AUC: 0.9283, AP: 0.9470
Epoch: 1300, Loss_train: 0.7869, AUC: 0.9286, AP: 0.9462
Epoch: 1400, Loss_train: 0.7834, AUC: 0.9296, AP: 0.9462
Epoch: 1500, Loss_train: 0.7873, AUC: 0.9300, AP: 0.9477
Epoch: 1600, Loss_train: 0.7865, AUC: 0.9255, AP: 0.9458
Epoch: 1700, Loss_train: 0.7822, AUC: 0.9240, AP: 0.9438
Epoch: 1800, Loss_train: 0.7951, AUC: 0.

训练VGAE：

In [22]:
# Fresh VGAE on the selected device, optimized with Adam.
model = VGAE(VariationalGCNEncoder(in_channels, out_channels)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Train for `epochs` steps; every 100th epoch report AUC / AP on the test split.
epochs = 2000
for epoch in range(1, epochs + 1):
    loss = train_vgae(model)
    if epoch % 100 != 0:
        continue
    auc, ap = test(model, test_data)
    print('Epoch: {:03d}, Loss_train: {:.4f}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, loss, auc, ap))

Epoch: 100, Loss_train: 1.1583, AUC: 0.7794, AP: 0.7915
Epoch: 200, Loss_train: 0.9447, AUC: 0.9026, AP: 0.9028
Epoch: 300, Loss_train: 0.8988, AUC: 0.9201, AP: 0.9244
Epoch: 400, Loss_train: 0.8732, AUC: 0.9305, AP: 0.9360
Epoch: 500, Loss_train: 0.8605, AUC: 0.9358, AP: 0.9417
Epoch: 600, Loss_train: 0.8746, AUC: 0.9328, AP: 0.9366
Epoch: 700, Loss_train: 0.8559, AUC: 0.9317, AP: 0.9386
Epoch: 800, Loss_train: 0.8634, AUC: 0.9353, AP: 0.9442
Epoch: 900, Loss_train: 0.8655, AUC: 0.9325, AP: 0.9428
Epoch: 1000, Loss_train: 0.8399, AUC: 0.9357, AP: 0.9457
Epoch: 1100, Loss_train: 0.8517, AUC: 0.9379, AP: 0.9469
Epoch: 1200, Loss_train: 0.8432, AUC: 0.9365, AP: 0.9471
Epoch: 1300, Loss_train: 0.8461, AUC: 0.9329, AP: 0.9450
Epoch: 1400, Loss_train: 0.8452, AUC: 0.9362, AP: 0.9469
Epoch: 1500, Loss_train: 0.8295, AUC: 0.9336, AP: 0.9458
Epoch: 1600, Loss_train: 0.8498, AUC: 0.9340, AP: 0.9456
Epoch: 1700, Loss_train: 0.8414, AUC: 0.9320, AP: 0.9435
Epoch: 1800, Loss_train: 0.8257, AUC: 0.