value means srcID, dstID

In [104]:
import dgl
import torch as th
import h5py
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import pygraphviz as pgv
import torch
import torch.nn as nn
import torch.nn.functional as F

#### Build the graph

In [105]:
f1 = h5py.File('./mesh_path.h5','r')
f2 = h5py.File('./path_reac.h5','r')
f3 = h5py.File('./reac_reac.h5','r')

In [106]:
src1 = f1['src']
src2 = f2['src']
src3 = f3['src']
dis1 = f1['dis']
dis2 = f2['dis']
dis3 = f3['dis']

In [107]:
graph_data = {
    ('meshDescriptor', 'conclude', 'pathway'): (th.tensor(src1), th.tensor(dis1)),
    ('pathway', 'hasEvent', 'reaction'): (th.tensor(src2), th.tensor(dis2)),
    ('reaction', 'precedingEvent', 'reaction'): (th.tensor(src3), th.tensor(dis3)),
    ('pathway', 'belong', 'meshDescriptor'): (th.tensor(dis1), th.tensor(src1)),
    ('reaction', 'eventOf', 'pathway'): (th.tensor(dis2), th.tensor(src2)),
    ('reaction', 'laterEvent', 'reaction'): (th.tensor(dis3), th.tensor(src3))
}
G = dgl.heterograph(graph_data)
G

Graph(num_nodes={'meshDescriptor': 121, 'pathway': 209, 'reaction': 1277},
      num_edges={('meshDescriptor', 'conclude', 'pathway'): 345, ('pathway', 'hasEvent', 'reaction'): 1289, ('reaction', 'precedingEvent', 'reaction'): 955, ('pathway', 'belong', 'meshDescriptor'): 345, ('reaction', 'eventOf', 'pathway'): 1289, ('reaction', 'laterEvent', 'reaction'): 955},
      metagraph=[('meshDescriptor', 'pathway'), ('pathway', 'reaction'), ('pathway', 'meshDescriptor'), ('reaction', 'reaction'), ('reaction', 'reaction'), ('reaction', 'pathway')])

In [108]:
#G.nodes('pathway')

In [109]:
print('Node types:', G.ntypes)
print('Edge types:', G.etypes)
print('Canonical edge types:', G.canonical_etypes)

Node types: ['meshDescriptor', 'pathway', 'reaction']
Edge types: ['conclude', 'hasEvent', 'precedingEvent', 'belong', 'eventOf', 'laterEvent']
Canonical edge types: [('meshDescriptor', 'conclude', 'pathway'), ('pathway', 'hasEvent', 'reaction'), ('reaction', 'precedingEvent', 'reaction'), ('pathway', 'belong', 'meshDescriptor'), ('reaction', 'eventOf', 'pathway'), ('reaction', 'laterEvent', 'reaction')]


In [110]:
f4 = h5py.File('./label_path.h5','r')
label_pathway = f4['label_pathway']
label_pathway

<HDF5 dataset "label_pathway": shape (209,), type "<i8">

In [111]:
label_pathway = torch.tensor(label_pathway).long()
len(label_pathway)
labels = label_pathway
labels

tensor([  0,   0, 117,   0, 117,  58,   0, 102,  14, 117, 117,   1,   2,  10,
          2,  39,  40,  14,  74, 110,  26, 107,  14,  75,  11,  19,  73,  59,
          8,  11,  62,   9,  62,  97,   9,   9,   9,   9,  12,  11,  28,  45,
         11,  26,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  45,  26,
         11,  11,  24,  11,  11,  45,  75,  11,  26,  86,  73,  11,  75,  11,
         86,  14, 109,  14,  39, 100,  14,  58,  14,  14, 104,  19,  19,  31,
        119,  21,  25,  25,  26,  26,  26,  26,  45,  28,  40,  28,  29,  33,
         33,  33,  33,  75,  38,  39,  40, 102,  40,  40,  40,  63,  95,  41,
         41,  43,  93,  97,  97,  49,  67,  53,  54,  89,  89,  57, 120,  58,
         67,  58,  58, 103,  58,  58, 102,  69,  58,  60,  60,  62,  62,  62,
         63,  65,  65,  65,  67,  67,  67,  67,  67,  67,  67,  67,  67,  67,
         67,  67,  67,  67,  75,  75,  75,  75,  75,  75,  75,  75,  75,  75,
         75,  75,  75,  75,  75,  75,  75,  75,  75,  75,  75,  

In [112]:
pid = list(range(209))
shuffle = np.random.permutation(pid)
train_idx = torch.tensor(shuffle[0:180]).long()
val_idx = torch.tensor(shuffle[160:200]).long()
test_idx = torch.tensor(shuffle[200:]).long()

In [113]:
import dgl.function as fn
class HeteroRGCNLayer(nn.Module):
    def __init__(self, in_size, out_size, etypes):
        super(HeteroRGCNLayer, self).__init__()
        # W_r for each relation
        self.weight = nn.ModuleDict({
                name : nn.Linear(in_size, out_size) for name in etypes
            })
    def forward(self, G, feat_dict):
        funcs = {}
        for srctype, etype, dsttype in G.canonical_etypes:
            Wh = self.weight[etype](feat_dict[srctype])
            G.nodes[srctype].data['Wh_%s' % etype] = Wh
            funcs[etype] = (fn.copy_u('Wh_%s' % etype, 'm'), fn.mean('m', 'h'))
        G.multi_update_all(funcs, 'sum')
        return {ntype : G.nodes[ntype].data['h'] for ntype in G.ntypes}

In [114]:
class HeteroRGCN(nn.Module):
    def __init__(self, G, in_size, hidden_size, out_size):
        super(HeteroRGCN, self).__init__()
        # embedding layer for each nodes
        embed_dict = {ntype : nn.Parameter(torch.Tensor(G.number_of_nodes(ntype), in_size))
                      for ntype in G.ntypes}
        for key, embed in embed_dict.items():
            nn.init.xavier_uniform_(embed)
        self.embed = nn.ParameterDict(embed_dict)
        self.layer1 = HeteroRGCNLayer(in_size, hidden_size, G.etypes)
        self.layer2 = HeteroRGCNLayer(hidden_size, out_size, G.etypes)
    def forward(self, G):
        h_dict = self.layer1(G, self.embed)
        h_dict = {k : F.leaky_relu(h) for k, h in h_dict.items()}
        h_dict = self.layer2(G, h_dict)
        return h_dict['pathway']

In [117]:
model = HeteroRGCN(G, 10, 10, 208)
opt = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
best_val_acc = 0
best_test_acc = 0
for epoch in range(100):
    logits = model(G)
    loss = F.cross_entropy(logits[train_idx], labels[train_idx])
    pred = logits.argmax(1)
    train_acc = (pred[train_idx] == labels[train_idx]).float().mean()
    val_acc = (pred[val_idx] == labels[val_idx]).float().mean()
    test_acc = (pred[test_idx] == labels[test_idx]).float().mean()
    if best_val_acc < val_acc:
        best_val_acc = val_acc
        best_test_acc = test_acc
    opt.zero_grad()
    loss.backward()
    opt.step()
    if epoch % 5 == 0:
        print('Loss %.4f, Train Acc %.4f, Val Acc %.4f (Best %.4f), Test Acc %.4f (Best %.4f)' % (
            loss.item(),
            train_acc.item(),
            val_acc.item(),
            best_val_acc.item(),
            test_acc.item(),
            best_test_acc.item(),
        ))

Loss 5.2890, Train Acc 0.0389, Val Acc 0.0250 (Best 0.0250), Test Acc 0.0000 (Best 0.0000)
Loss 4.9694, Train Acc 0.0500, Val Acc 0.0500 (Best 0.0500), Test Acc 0.0000 (Best 0.0000)
Loss 4.5734, Train Acc 0.0500, Val Acc 0.0500 (Best 0.0500), Test Acc 0.0000 (Best 0.0000)
Loss 4.0820, Train Acc 0.1167, Val Acc 0.1250 (Best 0.1250), Test Acc 0.1111 (Best 0.1111)
Loss 3.5576, Train Acc 0.1833, Val Acc 0.1500 (Best 0.1500), Test Acc 0.1111 (Best 0.1111)
Loss 3.0865, Train Acc 0.2167, Val Acc 0.2000 (Best 0.2000), Test Acc 0.1111 (Best 0.1111)
Loss 2.6789, Train Acc 0.2222, Val Acc 0.2000 (Best 0.2000), Test Acc 0.1111 (Best 0.1111)
Loss 2.2919, Train Acc 0.3778, Val Acc 0.3250 (Best 0.3250), Test Acc 0.2222 (Best 0.2222)
Loss 1.9160, Train Acc 0.5389, Val Acc 0.4000 (Best 0.4000), Test Acc 0.2222 (Best 0.2222)
Loss 1.5626, Train Acc 0.6667, Val Acc 0.3750 (Best 0.4000), Test Acc 0.2222 (Best 0.2222)
Loss 1.2382, Train Acc 0.7778, Val Acc 0.5000 (Best 0.5000), Test Acc 0.2222 (Best 0.2222)

#### Tutorial examples

In [61]:
import scipy.io
import urllib.request

#data_url = 'https://data.dgl.ai/dataset/ACM.mat'
#data_file_path = '/tmp/ACM.mat'

#urllib.request.urlretrieve(data_url, data_file_path)
data = scipy.io.loadmat('./ACM.mat')
print(list(data.keys()))
type(data)


['__header__', '__version__', '__globals__', 'TvsP', 'PvsA', 'PvsV', 'AvsF', 'VvsC', 'PvsL', 'PvsC', 'A', 'C', 'F', 'L', 'P', 'T', 'V', 'PvsT', 'CNormPvsA', 'RNormPvsA', 'CNormPvsC', 'RNormPvsC', 'CNormPvsT', 'RNormPvsT', 'CNormPvsV', 'RNormPvsV', 'CNormVvsC', 'RNormVvsC', 'CNormAvsF', 'RNormAvsF', 'CNormPvsL', 'RNormPvsL', 'stopwords', 'nPvsT', 'nT', 'CNormnPvsT', 'RNormnPvsT', 'nnPvsT', 'nnT', 'CNormnnPvsT', 'RNormnnPvsT', 'PvsP', 'CNormPvsP', 'RNormPvsP']


dict

In [47]:
#print(data['PvsA'])

In [62]:
G = dgl.heterograph({
        ('paper', 'written-by', 'author') : data['PvsA'],
        ('author', 'writing', 'paper') : data['PvsA'].transpose(),
        ('paper', 'citing', 'paper') : data['PvsP'],
        ('paper', 'cited', 'paper') : data['PvsP'].transpose(),
        ('paper', 'is-about', 'subject') : data['PvsL'],
        ('subject', 'has', 'paper') : data['PvsL'].transpose(),
    })
print(G)

Graph(num_nodes={'author': 17431, 'paper': 12499, 'subject': 73},
      num_edges={('paper', 'written-by', 'author'): 37055, ('author', 'writing', 'paper'): 37055, ('paper', 'citing', 'paper'): 30789, ('paper', 'cited', 'paper'): 30789, ('paper', 'is-about', 'subject'): 12499, ('subject', 'has', 'paper'): 12499},
      metagraph=[('author', 'paper'), ('paper', 'author'), ('paper', 'paper'), ('paper', 'paper'), ('paper', 'subject'), ('subject', 'paper')])


In [49]:
#G.nodes['paper'].data['m']

In [220]:
# 使用 graphviz创建元图.
def plot_graph(nxg):
    ag = pgv.AGraph(strict=False, directed=True)
    for u, v, k in nxg.edges(keys=True):
        ag.add_edge(u, v, label=k)
    ag.layout('dot')
    ag.draw('graph.png')
plot_graph(G.metagraph)

In [50]:
pvc = data['PvsC'].tocsr()
# 找到所有在 KDD, ICML, VLDB三个会议论文上发表的文章
c_selected = [0, 11, 13]  # 三个数字分别代表KDD, ICML, VLDB
p_selected = pvc[:, c_selected].tocoo() # 选出在

# 生成标记
labels = pvc.indices
labels[labels == 11] = 1
labels[labels == 13] = 2
labels = torch.tensor(labels).long()

# 拆分产生训练集、验证集和测试集
pid = p_selected.row
shuffle = np.random.permutation(pid)
train_idx = torch.tensor(shuffle[0:800]).long()
val_idx = torch.tensor(shuffle[800:900]).long()
test_idx = torch.tensor(shuffle[900:]).long()

pid

array([    0,     1,     2, ..., 10805, 10806, 12420], dtype=int32)

In [51]:
#G.nodes['paper'].data['m']

In [54]:
# 创建模型. 模型有三个输出，分别对应任务中的三个会议
model = HeteroRGCN(G, 10, 10, 3)
opt = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
best_val_acc = 0
best_test_acc = 0
for epoch in range(100):
    logits = model(G)
    # 仅针对标记节点计算损失。
    loss = F.cross_entropy(logits[train_idx], labels[train_idx])
    pred = logits.argmax(1)
    train_acc = (pred[train_idx] == labels[train_idx]).float().mean()
    val_acc = (pred[val_idx] == labels[val_idx]).float().mean()
    test_acc = (pred[test_idx] == labels[test_idx]).float().mean()
    if best_val_acc < val_acc:
        best_val_acc = val_acc
        best_test_acc = test_acc
    opt.zero_grad()
    loss.backward()
    opt.step()
    if epoch % 5 == 0:
        print('Loss %.4f, Train Acc %.4f, Val Acc %.4f (Best %.4f), Test Acc %.4f (Best %.4f)' % (
            loss.item(),
            train_acc.item(),
            val_acc.item(),
            best_val_acc.item(),
            test_acc.item(),
            best_test_acc.item(),
        ))

Loss 1.0725, Train Acc 0.4238, Val Acc 0.4800 (Best 0.4800), Test Acc 0.4146 (Best 0.4146)
Loss 0.9391, Train Acc 0.4888, Val Acc 0.5700 (Best 0.5800), Test Acc 0.4690 (Best 0.5092)
Loss 0.8118, Train Acc 0.5663, Val Acc 0.5300 (Best 0.5800), Test Acc 0.5151 (Best 0.5092)
Loss 0.6347, Train Acc 0.8050, Val Acc 0.6500 (Best 0.6500), Test Acc 0.5955 (Best 0.5955)
Loss 0.4281, Train Acc 0.9688, Val Acc 0.7400 (Best 0.7400), Test Acc 0.7010 (Best 0.7010)
Loss 0.2608, Train Acc 0.9937, Val Acc 0.7800 (Best 0.8100), Test Acc 0.7563 (Best 0.7395)
Loss 0.1545, Train Acc 0.9975, Val Acc 0.7800 (Best 0.8100), Test Acc 0.7613 (Best 0.7395)
Loss 0.0938, Train Acc 1.0000, Val Acc 0.7700 (Best 0.8100), Test Acc 0.7621 (Best 0.7395)
Loss 0.0619, Train Acc 1.0000, Val Acc 0.7500 (Best 0.8100), Test Acc 0.7513 (Best 0.7395)
Loss 0.0462, Train Acc 1.0000, Val Acc 0.7400 (Best 0.8100), Test Acc 0.7446 (Best 0.7395)
Loss 0.0378, Train Acc 1.0000, Val Acc 0.7200 (Best 0.8100), Test Acc 0.7420 (Best 0.7395)