Each value in the `graph_data` dictionary below is a pair of tensors: (source node IDs, destination node IDs).

In [98]:
import dgl
import torch as th
import h5py
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import pygraphviz as pgv
import torch
import torch.nn as nn
import torch.nn.functional as F

In [99]:
# Toy heterograph: three node types (drug, gene, disease) linked by three relations.
# Each relation triple maps to a (source IDs, destination IDs) pair of tensors.
graph_data = {
    relation: (th.tensor(src_ids), th.tensor(dst_ids))
    for relation, src_ids, dst_ids in (
        (('drug', 'interacts', 'drug'), [0, 1], [1, 0]),
        (('drug', 'interacts', 'gene'), [0, 1], [2, 3]),
        (('drug', 'treats', 'disease'), [1, 2], [1, 0]),
    )
}
g = dgl.heterograph(graph_data)
g

Graph(num_nodes={'disease': 2, 'drug': 3, 'gene': 4},
      num_edges={('drug', 'interacts', 'drug'): 2, ('drug', 'interacts', 'gene'): 2, ('drug', 'treats', 'disease'): 2},
      metagraph=[('drug', 'drug'), ('drug', 'gene'), ('drug', 'disease')])

#### Build the graph

In [100]:
# Open the three edge-list files read-only. The handles are kept open because
# the datasets pulled from them in the following cells are read from these files.
f1 = h5py.File('./mesh_path.h5', mode='r')
f2 = h5py.File('./path_reac.h5', mode='r')
f3 = h5py.File('./reac_reac.h5', mode='r')

In [101]:
# Per file, grab the 'src' and 'dis' datasets; they are used below as the
# source and destination node IDs of one relation each.
src1, dis1 = f1['src'], f1['dis']
src2, dis2 = f2['src'], f2['dis']
src3, dis3 = f3['src'], f3['dis']

In [124]:
# Heterograph over meshDescriptor / pathway / reaction nodes.
# Each relation triple maps to a (source IDs, destination IDs) pair of tensors.
graph_data = {
    relation: (th.tensor(src_ids), th.tensor(dst_ids))
    for relation, src_ids, dst_ids in (
        (('meshDescriptor', 'hasEvent', 'pathway'), src1, dis1),
        (('pathway', 'hasEvent', 'reaction'), src2, dis2),
        (('reaction', 'precedingEvent', 'reaction'), src3, dis3),
    )
}
G = dgl.heterograph(graph_data)
G

Graph(num_nodes={'meshDescriptor': 121, 'pathway': 209, 'reaction': 1277},
      num_edges={('meshDescriptor', 'hasEvent', 'pathway'): 345, ('pathway', 'hasEvent', 'reaction'): 1289, ('reaction', 'precedingEvent', 'reaction'): 955},
      metagraph=[('meshDescriptor', 'pathway'), ('pathway', 'reaction'), ('reaction', 'reaction')])

In [127]:
#G.nodes('pathway')

In [110]:
# Inspect the schema of G. The edge type name 'hasEvent' is shared by two
# relations (meshDescriptor->pathway and pathway->reaction), so only the
# canonical (source type, edge type, destination type) triples identify a
# relation unambiguously.
print('Node types:', G.ntypes)
print('Edge types:', G.etypes)
print('Canonical edge types:', G.canonical_etypes)

Node types: ['meshDescriptor', 'pathway', 'reaction']
Edge types: ['hasEvent', 'hasEvent', 'precedingEvent']
Canonical edge types: [('meshDescriptor', 'hasEvent', 'pathway'), ('pathway', 'hasEvent', 'reaction'), ('reaction', 'precedingEvent', 'reaction')]


In [111]:
# Open the label file read-only and fetch the 'label_pathway' dataset.
f4 = h5py.File('./label_path.h5', mode='r')
label_pathway = f4['label_pathway']
label_pathway

<HDF5 dataset "label_pathway": shape (209,), type "<i8">

In [112]:
# Convert the HDF5 label dataset to an int64 tensor and expose it as `labels`,
# the name used by the training code. (The former bare `len(label_pathway)`
# expression was dropped: its value was discarded and never displayed.)
label_pathway = torch.tensor(label_pathway).long()
labels = label_pathway
labels

tensor([  0,   0, 117,   0, 117,  58,   0, 102,  14, 117, 117,   1,   2,  10,
          2,  39,  40,  14,  74, 110,  26, 107,  14,  75,  11,  19,  73,  59,
          8,  11,  62,   9,  62,  97,   9,   9,   9,   9,  12,  11,  28,  45,
         11,  26,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  45,  26,
         11,  11,  24,  11,  11,  45,  75,  11,  26,  86,  73,  11,  75,  11,
         86,  14, 109,  14,  39, 100,  14,  58,  14,  14, 104,  19,  19,  31,
        119,  21,  25,  25,  26,  26,  26,  26,  45,  28,  40,  28,  29,  33,
         33,  33,  33,  75,  38,  39,  40, 102,  40,  40,  40,  63,  95,  41,
         41,  43,  93,  97,  97,  49,  67,  53,  54,  89,  89,  57, 120,  58,
         67,  58,  58, 103,  58,  58, 102,  69,  58,  60,  60,  62,  62,  62,
         63,  65,  65,  65,  67,  67,  67,  67,  67,  67,  67,  67,  67,  67,
         67,  67,  67,  67,  75,  75,  75,  75,  75,  75,  75,  75,  75,  75,
         75,  75,  75,  75,  75,  75,  75,  75,  75,  75,  75,  

In [106]:
# Randomly split the 209 pathway IDs into disjoint train / validation / test sets.
pid = list(range(209))
shuffle = np.random.permutation(pid)
train_idx = torch.tensor(shuffle[0:180]).long()
# Validation starts where training ends. The previous slice [160:200] shared
# 20 samples with the training set, leaking training data into validation.
val_idx = torch.tensor(shuffle[180:200]).long()
test_idx = torch.tensor(shuffle[200:]).long()

In [107]:
import dgl.function as fn
class HeteroRGCNLayer(nn.Module):
    """One relational graph-convolution layer for a heterograph.

    Holds one linear map per edge type name. On each forward pass the source
    features of every relation are projected, averaged over incoming edges per
    relation, and the per-relation results are summed into feature 'h'.

    NOTE(review): weights and message functions are keyed by the bare edge type
    name, so canonical relations that share a name also share one entry here.
    Confirm that is intended before using graphs with duplicated names.
    """

    def __init__(self, in_size, out_size, etypes):
        super().__init__()
        # W_r: one linear projection per relation name.
        per_relation = {}
        for name in etypes:
            per_relation[name] = nn.Linear(in_size, out_size)
        self.weight = nn.ModuleDict(per_relation)

    def forward(self, G, feat_dict):
        """Run message passing on G.

        `feat_dict` maps each node type to its input features. Returns a dict
        mapping every node type of G to its updated 'h' feature.
        """
        funcs = {}
        for canonical in G.canonical_etypes:
            srctype, etype, _ = canonical
            msg_field = 'Wh_%s' % etype
            # Compute W_r * h and store it on the source nodes so it can be sent as a message.
            G.nodes[srctype].data[msg_field] = self.weight[etype](feat_dict[srctype])
            # Per-relation (message_func, reduce_func). Every relation reduces into the
            # same destination field 'h', which is what lets the cross-type reducer combine them.
            funcs[etype] = (fn.copy_u(msg_field, 'm'), fn.mean('m', 'h'))
        # Trigger message passing over all relations at once. The first argument holds the
        # per-relation functions; the second is the cross-type reducer, one of
        # "sum", "max", "min", "mean", "stack".
        G.multi_update_all(funcs, 'sum')
        # Collect the updated node features per node type.
        updated = {}
        for ntype in G.ntypes:
            updated[ntype] = G.nodes[ntype].data['h']
        return updated

In [108]:
class HeteroRGCN(nn.Module):
    """Two-layer heterogeneous R-GCN for graphs without input node features.

    Args:
        G: heterograph; its `ntypes`, `etypes` and `number_of_nodes(ntype)` are
            read to size the embeddings and the two layers.
        in_size: width of the trainable per-node embeddings.
        hidden_size: output width of the first layer.
        out_size: output width of the second layer.
        target_ntype: node type whose second-layer output `forward` returns.
            Defaults to 'paper', the previously hard-coded value, so existing
            callers are unaffected; pass another type for other graphs.
    """

    def __init__(self, G, in_size, hidden_size, out_size, target_ntype='paper'):
        super().__init__()
        # Trainable node embeddings stand in for the missing input features.
        embed_dict = {}
        for ntype in G.ntypes:
            embed = nn.Parameter(torch.empty(G.number_of_nodes(ntype), in_size))
            nn.init.xavier_uniform_(embed)
            embed_dict[ntype] = embed
        self.embed = nn.ParameterDict(embed_dict)
        # Build the two graph-convolution layers.
        self.layer1 = HeteroRGCNLayer(in_size, hidden_size, G.etypes)
        self.layer2 = HeteroRGCNLayer(hidden_size, out_size, G.etypes)
        self.target_ntype = target_ntype

    def forward(self, G):
        """Return the second-layer output for nodes of type `target_ntype`."""
        h_dict = self.layer1(G, self.embed)
        h_dict = {k: F.leaky_relu(h) for k, h in h_dict.items()}
        h_dict = self.layer2(G, h_dict)
        # Only the prediction target's node type is returned.
        return h_dict[self.target_ntype]

#### Tutorial examples

In [113]:
import scipy.io
import urllib.request

# The commented lines show how the ACM dataset can be downloaded;
# this cell loads a local copy instead.
#data_url = 'https://data.dgl.ai/dataset/ACM.mat'
#data_file_path = '/tmp/ACM.mat'

#urllib.request.urlretrieve(data_url, data_file_path)
data = scipy.io.loadmat('./ACM.mat')
# List the variable names stored in the .mat file.
print(list(data))


['__header__', '__version__', '__globals__', 'TvsP', 'PvsA', 'PvsV', 'AvsF', 'VvsC', 'PvsL', 'PvsC', 'A', 'C', 'F', 'L', 'P', 'T', 'V', 'PvsT', 'CNormPvsA', 'RNormPvsA', 'CNormPvsC', 'RNormPvsC', 'CNormPvsT', 'RNormPvsT', 'CNormPvsV', 'RNormPvsV', 'CNormVvsC', 'RNormVvsC', 'CNormAvsF', 'RNormAvsF', 'CNormPvsL', 'RNormPvsL', 'stopwords', 'nPvsT', 'nT', 'CNormnPvsT', 'RNormnPvsT', 'nnPvsT', 'nnT', 'CNormnnPvsT', 'RNormnnPvsT', 'PvsP', 'CNormPvsP', 'RNormPvsP']


In [114]:
#print(data['PvsA'])

In [115]:
# ACM heterograph: every adjacency matrix is used twice, once as stored and once
# transposed, so each relation also has a reverse relation.
G = dgl.heterograph({
    (src_type, relation, dst_type): data[key].transpose() if reverse else data[key]
    for src_type, relation, dst_type, key, reverse in (
        ('paper', 'written-by', 'author', 'PvsA', False),
        ('author', 'writing', 'paper', 'PvsA', True),
        ('paper', 'citing', 'paper', 'PvsP', False),
        ('paper', 'cited', 'paper', 'PvsP', True),
        ('paper', 'is-about', 'subject', 'PvsL', False),
        ('subject', 'has', 'paper', 'PvsL', True),
    )
})
print(G)

Graph(num_nodes={'author': 17431, 'paper': 12499, 'subject': 73},
      num_edges={('paper', 'written-by', 'author'): 37055, ('author', 'writing', 'paper'): 37055, ('paper', 'citing', 'paper'): 30789, ('paper', 'cited', 'paper'): 30789, ('paper', 'is-about', 'subject'): 12499, ('subject', 'has', 'paper'): 12499},
      metagraph=[('author', 'paper'), ('paper', 'author'), ('paper', 'paper'), ('paper', 'paper'), ('paper', 'subject'), ('subject', 'paper')])


In [46]:
# Draw a metagraph with graphviz.
def plot_graph(nxg):
    """Render the multigraph `nxg` to 'graph.png' using the 'dot' layout.

    `nxg` must provide `edges(keys=True)` yielding (u, v, key) triples; each
    edge is drawn directed from u to v and labelled with its key.
    """
    ag = pgv.AGraph(strict=False, directed=True)
    for u, v, k in nxg.edges(keys=True):
        ag.add_edge(u, v, label=k)
    ag.layout('dot')
    ag.draw('graph.png')


# Depending on the DGL release, `metagraph` is either a method or a property
# (see the DGL API docs). Call it when it is callable, so that plot_graph
# receives the graph itself rather than a bound method.
plot_graph(g.metagraph() if callable(g.metagraph) else g.metagraph)

In [119]:
pvc = data['PvsC'].tocsr()
# Find all papers published at the three conferences KDD, ICML and VLDB.
c_selected = [0, 11, 13]  # conference IDs for KDD, ICML, VLDB respectively
p_selected = pvc[:, c_selected].tocoo()  # COO rows are the IDs of the selected papers

# Generate labels from the conference column index of each stored entry.
# Copy first: `pvc.indices` is the matrix's own index array, and relabelling it
# in place would silently corrupt `pvc`.
# NOTE(review): using the indices as per-paper labels assumes one stored entry
# per paper row; confirm against the dataset.
labels = pvc.indices.copy()
labels[labels == 11] = 1
labels[labels == 13] = 2
labels = torch.tensor(labels).long()

# Split the selected papers into training, validation and test sets.
pid = p_selected.row
shuffle = np.random.permutation(pid)
train_idx = torch.tensor(shuffle[0:800]).long()
val_idx = torch.tensor(shuffle[800:900]).long()
test_idx = torch.tensor(shuffle[900:]).long()

pid

array([    0,     1,     2, ..., 10805, 10806, 12420], dtype=int32)

In [85]:
# 创建模型. 模型有三个输出，分别对应任务中的三个会议
model = HeteroRGCN(G, 10, 10, 3)
opt = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
best_val_acc = 0
best_test_acc = 0
for epoch in range(100):
    logits = model(G)
    # 仅针对标记节点计算损失。
    loss = F.cross_entropy(logits[train_idx], labels[train_idx])
    pred = logits.argmax(1)
    train_acc = (pred[train_idx] == labels[train_idx]).float().mean()
    val_acc = (pred[val_idx] == labels[val_idx]).float().mean()
    test_acc = (pred[test_idx] == labels[test_idx]).float().mean()
    if best_val_acc < val_acc:
        best_val_acc = val_acc
        best_test_acc = test_acc
    opt.zero_grad()
    loss.backward()
    opt.step()
    if epoch % 5 == 0:
        print('Loss %.4f, Train Acc %.4f, Val Acc %.4f (Best %.4f), Test Acc %.4f (Best %.4f)' % (
            loss.item(),
            train_acc.item(),
            val_acc.item(),
            best_val_acc.item(),
            test_acc.item(),
            best_test_acc.item(),
        ))

Loss 1.2874, Train Acc 0.2663, Val Acc 0.3000 (Best 0.3000), Test Acc 0.2797 (Best 0.2797)
Loss 1.0460, Train Acc 0.3837, Val Acc 0.3300 (Best 0.3300), Test Acc 0.3317 (Best 0.3317)
Loss 0.8836, Train Acc 0.5688, Val Acc 0.4800 (Best 0.5100), Test Acc 0.5360 (Best 0.5226)
Loss 0.7587, Train Acc 0.6175, Val Acc 0.4700 (Best 0.5100), Test Acc 0.5042 (Best 0.5226)
Loss 0.6168, Train Acc 0.7800, Val Acc 0.5400 (Best 0.5400), Test Acc 0.5620 (Best 0.5620)
Loss 0.4694, Train Acc 0.8575, Val Acc 0.6900 (Best 0.6900), Test Acc 0.6474 (Best 0.6474)
Loss 0.3237, Train Acc 0.9588, Val Acc 0.7500 (Best 0.7500), Test Acc 0.7178 (Best 0.7178)
Loss 0.2142, Train Acc 0.9825, Val Acc 0.7700 (Best 0.7900), Test Acc 0.7889 (Best 0.7764)
Loss 0.1441, Train Acc 0.9962, Val Acc 0.7700 (Best 0.7900), Test Acc 0.7898 (Best 0.7764)
Loss 0.1018, Train Acc 1.0000, Val Acc 0.7800 (Best 0.7900), Test Acc 0.7931 (Best 0.7764)
Loss 0.0754, Train Acc 0.9987, Val Acc 0.7800 (Best 0.7900), Test Acc 0.7956 (Best 0.7764)