### GATNE: General Attributed Multiplex Heterogeneous Network Embedding
- 每个节点在不同类型边中有不同的表示
- 分为两种模型：GATNE-T模型：直推式学习 | GATNE-I模型：归纳式学习【更优】

#### 此代码不支持Windows系统

In [26]:
import argparse
import numpy as np
import multiprocessing
import random
import math

from numpy import random
from collections import defaultdict
from operator import index
from six import iteritems
from sklearn.metrics import auc, f1_score, precision_recall_curve, roc_auc_score
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.nn.parameter import Parameter

### 参数配置

In [43]:
def parse_args():    
    parser = argparse.ArgumentParser()
    
    root = r"C:\Users\sss\Desktop\GATNE/"
    
    parser.add_argument('--input', type=str, default=root+'data/amazon', help='Input dataset path')
    parser.add_argument('--features', type=str, default=None, help='Input node features')
    parser.add_argument('--walk-file', type=str, default=None, help='Input random walks')
    parser.add_argument('--epoch', type=int, default=100, help='Number of epoch. Default is 100.')
    parser.add_argument('--batch-size', type=int, default=64, help='Number of batch_size. Default is 64.')
    parser.add_argument('--eval-type', type=str, default='all', help='The edge type(s) for evaluation.')
    parser.add_argument('--schema', type=str, default=None, help='The metapath schema (e.g., U-I-U,I-U-I).')
    parser.add_argument('--dimensions', type=int, default=200, help='Number of dimensions. Default is 200.')
    parser.add_argument('--edge-dim', type=int, default=10, help='Number of edge embedding dimensions. Default is 10.')
    parser.add_argument('--att-dim', type=int, default=20, help='Number of attention dimensions. Default is 20.')
    parser.add_argument('--walk-length', type=int, default=10, help='Length of walk per source. Default is 10.')
    parser.add_argument('--num-walks', type=int, default=20, help='Number of walks per source. Default is 20.')
    parser.add_argument('--window-size', type=int, default=5, help='Context size for optimization. Default is 5.')
    parser.add_argument('--negative-samples', type=int, default=5, help='Negative samples for optimization. Default is 5.')
    parser.add_argument('--neighbor-samples', type=int, default=10, help='Neighbor samples for aggregation. Default is 10.')
    parser.add_argument('--patience', type=int, default=5, help='Early stopping patience. Default is 5.')
    parser.add_argument('--num-workers', type=int, default=16, help='Number of workers for generating random walks. Default is 16.')
    
    return parser.parse_args(args=[])

In [10]:
class Vocab(object):
    def __init__(self, count, index):
        self.count = count
        self.index = index

### 数据加载及预处理

In [11]:
# 节点与节点的连接关系

def load_training_data(f_name):
    print('We are loading data from:', f_name)
    edge_data_by_type = dict()  # 每个type对应到的相连接节点
    all_nodes = list()  # 所有节点的集合
    with open(f_name, 'r') as f:
        for line in f:
            words = line[:-1].split(' ')  # 去除/n
            if words[0] not in edge_data_by_type:  # edge type涉及到的节点
                edge_data_by_type[words[0]] = list()
            x, y = words[1], words[2]
            edge_data_by_type[words[0]].append((x, y))
            all_nodes.append(x)
            all_nodes.append(y)
    all_nodes = list(set(all_nodes))  # nodes去重
    print('Total training nodes: ' + str(len(all_nodes)))
    return edge_data_by_type  # 每个type连接的点边情况

In [12]:
# 节点与节点的连接关系【有真假之分】

def load_testing_data(f_name):
    print('We are loading data from:', f_name)
    true_edge_data_by_type = dict()  # true样本
    false_edge_data_by_type = dict()  # false样本
    all_nodes = list()
    with open(f_name, 'r') as f:
        for line in f:
            words = line[:-1].split(' ')
            x, y = words[1], words[2]
            if int(words[3]) == 1:  # true 对应到的节点
                if words[0] not in true_edge_data_by_type:
                    true_edge_data_by_type[words[0]] = list()  # true对应到的type相连接节点
                true_edge_data_by_type[words[0]].append((x, y))
            else:  # false 对应到的节点
                if words[0] not in false_edge_data_by_type:
                    false_edge_data_by_type[words[0]] = list()
                false_edge_data_by_type[words[0]].append((x, y))
            all_nodes.append(x)
            all_nodes.append(y)
    all_nodes = list(set(all_nodes))
    return true_edge_data_by_type, false_edge_data_by_type

In [13]:
# 节点特征

def load_feature_data(f_name):
    feature_dic = {}
    with open(f_name, 'r') as f:
        first = True
        for line in f:
            if first:
                first = False
                continue
            items = line.strip().split()
            feature_dic[items[0]] = items[1:]
    return feature_dic

### 随机游走生成MetaPath[采样][负采样]
- walk + initializer -- RWGraph

In [14]:
def walk(args):
    walk_length, start, schema = args
    # 随机游走
    rand = random.Random()
    
    if schema:
        schema_items = schema.split("-")
        assert schema_items[0] == schema_items[-1]  # metapath前后一致; A-B-A
    
    walk = [start]
    while len(walk) < walk_length:
        cur = walk[-1]  # 当前节点
        candidates = []
        for node in G[cur]:  # 和cur节点相连接的节点; 候选节点
            if schema == "" or node_type[node] == schema_items[len(walk) % (len(schema_items) - 1)]:
                candidates.append(node)
        if candidates:
            walk.append(rand.choice(candidates))
        else:
            break
            
    return [str(node) for node in walk]

In [15]:
def initializer(init_G, init_node_type):
    global G
    G = init_G
    global node_type
    node_type = init_node_type

In [16]:
class RWGraph():
    def __init__(self, nx_G, node_type_arr=None, num_workers=0):
        self.G = nx_G
        self.node_type = node_type_arr
        self.num_workers = num_workers
        
    def node_list(self, nodes, num_walks):  
        for loop in range(num_walks):  # 循环num_walks次数
            for node in nodes:
                yield node
                
    def simulate_walks(self, num_walks, walk_length, schema=None):
        all_walks = []
        nodes = list(self.G.keys())  # 节点顶点数量
        random.shuffle(nodes)
        
        if schema is None:
            with multiprocessing.Pool(self.num_workers, initializer=initializer, initargs=(self.G, self.node_type)) as pool:
                all_walks = list(pool.imap(walk, ((walk_length, node, '') for node in tqdm(self.node_list(nodes, num_walks))), chunksize=256))
        else:
            schema_list = schema.split(',')
            for schema_iter in schema_list:
                with multiprocessing.Pool(self.num_workers, initializer=initializer, initargs=(self.G, self.node_type)) as pool:
                    walks = list(pool.imap(walk, ((walk_length, node, schema_iter) for node in tqdm(self.node_list(nodes, num_walks)) if schema_iter.split('-')[0] == self.node_type[node]), chunksize=512))
                all_walks.extend(walks)
                
        return all_walks

- load_node_type + get_G_from_edges + RWGraph -- generate_walks

In [17]:
def load_node_type(f_name):
    print("We are loading node type from:", f_name)
    node_type = {}
    with open(f_name, "r") as f:
        for line in f:
            items = line.strip().split()
            node_type[items[0]] = items[1]
    return node_type

In [18]:
def get_G_from_edges(edges):
    edge_dict = defaultdict(set)
    for edge in edges:
        u, v = str(edge[0]), str(edge[1])
        edge_dict[u].add(v)
        edge_dict[v].add(u)
    return edge_dict  # 每个节点和它相连接的节点

In [19]:
def generate_walks(network_data, num_walks, walk_length, schema, file_name, num_workers):
    if schema is not None:  # schema：节点类型
        # load_node_type
        node_type = load_node_type(file_name + '/node_type.txt')
    else:
        node_type = None

    all_walks = []  # 所有游走的list
    for layer_id, layer_name in enumerate(network_data):
        tmp_data = network_data[layer_name]  # 每个type对应到的点边信息
        # start to do the random walk on a layer
        # get_G_from_edges(tmp_data): 每个节点对应到相连接的点

        # get_G_from_edges
        layer_walker = RWGraph(get_G_from_edges(tmp_data), node_type, num_workers)  # RandomWalk Graph
        print('Generating random walks for layer', layer_id)
        layer_walks = layer_walker.simulate_walks(num_walks, walk_length, schema=schema)  # 生成随机游走的序列; 每个节点游走次数; 游走长度;

        all_walks.append(layer_walks)

    print('Finish generating the walks')

    return all_walks

- generate_walks + generate_vocab + generate_pairs + load_walks + save_walks -- generate

In [20]:
def generate_vocab(all_walks):
    index2word = []
    raw_vocab = defaultdict(int)
    # 随机游走每个单词出现的次数
    for layer_id, walks in enumerate(all_walks):  # 按照type类别
        print('Counting vocab for layer', layer_id)
        for walk in tqdm(walks):
            for word in walk:  # 记录每个单词出现的次数
                raw_vocab[word] += 1

    vocab = {}
    for word, v in iteritems(raw_vocab):
        vocab[word] = Vocab(count=v, index=len(index2word))  # 用一个类表示节点的次数和index
        index2word.append(word)

    index2word.sort(key=lambda word: vocab[word].count, reverse=True)  # 按出现次数降序排列
    for i, word in enumerate(index2word):  # 节点按出现次数降序更新索引
        vocab[word].index = i
    # 节点统计信息; 节点信息
    return vocab, index2word

In [21]:
def generate_pairs(all_walks, vocab, window_size, num_workers):
    pairs = []
    skip_window = window_size // 2
    for layer_id, walks in enumerate(all_walks):
        print('Generating training pairs for layer', layer_id)
        for walk in tqdm(walks):  # type的游走语料进行循环
            for i in range(len(walk)):  # 每个单词循环
                for j in range(1, skip_window + 1):  # 向前向后的窗口长度
                    if i - j >= 0:
                        pairs.append((vocab[walk[i]].index, vocab[walk[i - j]].index, layer_id))  # 向前窗口涉及到的单词
                    if i + j < len(walk):
                        pairs.append((vocab[walk[i]].index, vocab[walk[i + j]].index, layer_id))  # 向后窗口涉及到的单词
    return pairs  # 所有单词上线文的索引, type

In [22]:
def load_walks(walk_file):
    print('Loading walks')
    all_walks = []
    with open(walk_file, 'r') as f:
        for line in f:
            content = line.strip().split()
            layer_id = int(content[0])
            if layer_id >= len(all_walks):
                all_walks.append([])
            all_walks[layer_id].append(content[1:])
    return all_walks

In [23]:
def save_walks(walk_file, all_walks):
    with open(walk_file, 'w') as f:
        for layer_id, walks in enumerate(all_walks):
            print('Saving walks for layer', layer_id)
            for walk in tqdm(walks):
                f.write(' '.join([str(layer_id)] + [str(x) for x in walk]) + '\n')

In [24]:
def generate(network_data, num_walks, walk_length, schema, file_name, window_size, num_workers, walk_file):
    if walk_file is not None:  # 如果没有walk_file，则自己生成walks
        all_walks = load_walks(walk_file)
    else:  # 边的信息; 每个节点游走次数; 游走长度;
        all_walks = generate_walks(network_data, num_walks, walk_length, schema, file_name, num_workers)  # 生成随机游走序列
        save_walks(file_name + '/walks.txt', all_walks)
    vocab, index2word = generate_vocab(all_walks)  # 生成节点index; 降序方式;
    train_pairs = generate_pairs(all_walks, vocab, window_size, num_workers)  # (center, context, type)
    # vocab:节点统计信息; index2word:节点; train_pairs:skip-gram训练样本
    return vocab, index2word, train_pairs

In [25]:
def generate_neighbors(network_data, vocab, num_nodes, edge_types, neighbor_samples):
    edge_type_count = len(edge_types)
    neighbors = [[[] for __ in range(edge_type_count)] for _ in range(num_nodes)]
    for r in range(edge_type_count):
        print('Generating neighbors for layer', r)
        g = network_data[edge_types[r]]  # 每个type涉及到的节点
        for (x, y) in tqdm(g):
            ix = vocab[x].index  # x对应到的索引
            iy = vocab[y].index  # y对应到的索引
            neighbors[ix][r].append(iy)  # 邻居信息
            neighbors[iy][r].append(ix)
        for i in range(num_nodes):
            if len(neighbors[i][r]) == 0:  # 节点在这个类别下，如果没有节点和它连接，邻居就是该节点本身
                neighbors[i][r] = [i] * neighbor_samples
            elif len(neighbors[i][r]) < neighbor_samples:  # 如果邻居节点数量小于采样邻居数量，进行重采样
                neighbors[i][r].extend(list(np.random.choice(neighbors[i][r], size=neighbor_samples-len(neighbors[i][r]))))
            elif len(neighbors[i][r]) > neighbor_samples:  # 如果邻居节点数量大于采样邻居数量，进行邻居大小数量的采样
                neighbors[i][r] = list(np.random.choice(neighbors[i][r], size=neighbor_samples))
    return neighbors  # 每个节点的邻居采样

- 指定一个batch大小的数据集

In [27]:
def get_batches(pairs, neighbors, batch_size):
    n_batches = (len(pairs) + (batch_size - 1)) // batch_size  # 迭代次数
    
    for idx in range(n_batches):
        x, y, t, neigh = [], [], [], []  # src, dst, type, neigh
        for i in range(batch_size):
            index = idx * batch_size + i
            if index >= len(pairs):
                break
            x.append(pairs[index][0])
            y.append(pairs[index][1])
            t.append(pairs[index][2])
            neigh.append(neighbors[pairs[index][0]])  # src的邻居节点
        yield torch.tensor(x), torch.tensor(y), torch.tensor(t), torch.tensor(neigh)

### 构建模型

In [28]:
class GATNEModel(nn.Module):
    def __init__(self, num_nodes, embedding_size, embedding_u_size, edge_type_count, dim_a, features):
        super(GATNEModel, self).__init__()
        self.num_nodes = num_nodes  # 节点数量
        self.embedding_size = embedding_size  # 每个节点输出的embedding_size
        self.embedding_u_size = embedding_u_size  # 节点作为邻居初始化size
        self.edge_type_count = edge_type_count  # 类别数量
        self.dim_a = dim_a  # 中间隐藏层特征数量
        
        self.features = None
        if features is not None:  # GATNE-I
            self.features = features
            feature_dim = self.features.shape[-1]
            self.embed_trans = Parameter(
                torch.FloatTensor(feature_dim, embedding_size)
            )  # [142, 200]; bi-base embedding
            self.u_embed_trans = Parameter(
                torch.FloatTensor(edge_type_count, feature_dim, embedding_u_size)
            )  # [2, 142, 10]; 初始化ui
        else:  # 初始化 base embedding GATNE-T
            self.node_embeddings = Parameter(
                torch.FloatTensor(num_nodes, embedding_size)
            )  # [511, 200]
            self.node_type_embeddings = Parameter(  # 初始化 edge embedding
                torch.FloatTensor(num_nodes, edge_type_count, embedding_u_size)
            )  #  [511, 2, 10]
        self.trans_weights = Parameter(  # [2, 10, 200]: 定义Mr矩阵
            torch.FloatTensor(edge_type_count, embedding_u_size, embedding_size)
        )
        self.trans_weights_s1 = Parameter(  # [2, 10, 20]  计算attention使用
            torch.FloatTensor(edge_type_count, embedding_u_size, dim_a)
        )
        self.trans_weights_s2 = Parameter(
            torch.FloatTensor(edge_type_count, dim_a, 1)
        )  # [2, 20, 1]
        self.reset_parameters()
        
    def reset_parameters(self):
        if self.features is not None:
            self.embed_trans.data.normal_(std=1.0 / math.sqrt(self.embedding_size))
            self.u_embed_trans.data.normal_(std=1.0 / math.sqrt(self.embedding_size))
        else:
            self.node_embeddings.data.uniform_(-1.0, 1.0)
            self.node_type_embeddings.data.uniform_(-1.0, 1.0)
        self.trans_weights.data.normal_(std=1.0 / math.sqrt(self.embedding_size))
        self.trans_weights_s1.data.normal_(std=1.0 / math.sqrt(self.embedding_size))
        self.trans_weights_s2.data.normal_(std=1.0 / math.sqrt(self.embedding_size))
    
    def forward(self, train_inputs, train_types, node_neigh):
        if self.features is None:
            node_embed = self.node_embeddings[train_inputs]  # 每个节点对应的embedding
            node_embed_neighbors = self.node_type_embeddings[node_neigh]  # 每个节点对应的neighbors
        else:  # self.features:节点特征; self.embed_trans
            node_embed = torch.mm(self.features[train_inputs], self.embed_trans)  # [64, 200]
            node_embed_neighbors = torch.einsum('bijk,akm->bijam', self.features[node_neigh], self.u_embed_trans)  # 生成ui; [64, 2, 10, 142] * [2, 142, 10]
        node_embed_tmp = torch.cat([
            node_embed_neighbors[:, i, :, i, :].unsqueeze(1)  # [64, 1, 10, 10]
            for i in range(self.edge_type_count)
        ], dim = 1)
        node_type_embed = torch.sum(node_embed_tmp, dim=2)  # Ui; 对邻居信息求和; [64, 2, 10]
        
        trans_w = self.trans_weights[train_types]  # [64, 10, 200]
        trans_w_s1 = self.trans_weights_s1[train_types]  # [64, 10, 20]
        trans_w_s2 = self.trans_weights_s2[train_types]  # [64, 20, 1]
        
        attention = F.softmax(  # [64, 1, 2]
            torch.matmul(
                torch.tanh(torch.matmul(node_type_embed, trans_w_s1)), trans_w_s2
            ).squeeze(2),
            dim = 1
        ).unsqueeze(1)
        node_type_embed = torch.matmul(attention, node_type_embed)  # [64, 1, 2] * [64, 2, 10] 对node_type_embed做attention求和
        node_embed = node_embed + torch.matmul(node_type_embed, trans_w).squeeze(1)  # [64, 200] + [64, 1, 10] * [64, 10, 200] => [64, 200]
        
        last_node_embed = F.normalize(node_embed, dim=1)  # dim=1, L2-norm; (last_node_embed * last_node_embed).sum(axis=1)
        
        return last_node_embed

### 损失函数

In [48]:
class NSLoss(nn.Module):
    def __init__(self, num_nodes, num_sampled, embedding_size):
        super(NSLoss, self).__init__()
        self.num_nodes = num_nodes
        self.num_sampled = num_sampled
        self.embedding_size = embedding_size
        self.weights = Parameter(
            torch.FloatTensor(num_nodes, embedding_size)
        )  # [511, 200]; Cj
        self.sample_weights = F.normalize(  # [511]; 对节点进行初始化
            torch.Tensor([
                (math.log(k + 2) - math.log(k + 1)) / math.log(num_nodes + 1)
                for k in range(num_nodes)
            ]),
            dim=0,
        )
        self.reset_parameters()
        
    def reset_parameters(self):
        self.weights.data.normal_(std=1.0 / math.sqrt(self.embedding_size))
        
    def forward(self, input, embs, label):
        n = input.shape[0]
        log_target = torch.log(  # torch.mul: 对应位置相乘
            torch.sigmoid(torch.sum(torch.mul(embs, self.weights[label]), 1))  # sigmoid([64]); input_embeddings*labels_embeddings
        )
        negs = torch.multinomial(  # 抽样函数，self.sample_weights的权重抽样
            self.sample_weights, self.num_sampled * n, replacement=True
        ).view(n, self.num_sampled)
        
        noise = torch.neg(self.weights[negs])  # Cj; 所有值 * -1
        sum_log_sampled = torch.sum(
            torch.log(torch.sigmoid(torch.bmm(noise, embs.unsqueeze(2)))), 1  # [64, 5, 1]
        ).squeeze()
        
        loss = log_target + sum_log_sampled
        return -loss.sum() / n

### 模型训练

In [49]:
def train_model(network_data, feature_dic):
    vocab, index2word, train_pairs = generate(network_data, args.num_walks, args.walk_length, args.schema, file_name, args.window_size, args.num_workers, args.walk_file)
    # 生成随机游走训练序列和训练语料
    edge_types = list(network_data.keys())  # 边的类别

    num_nodes = len(index2word)  # 节点数量
    edge_type_count = len(edge_types)  # 类别数量
    epochs = args.epoch
    batch_size = args.batch_size
    embedding_size = args.dimensions  # base embedding_size; embedding_size;
    embedding_u_size = args.edge_dim  # edge_embedding_size
    u_num = edge_type_count
    num_sampled = args.negative_samples
    dim_a = args.att_dim  # 计算attention的中间变量维度
    att_head = 1
    neighbor_samples = args.neighbor_samples  # 邻居采样数量

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # 每个类别节点的邻居节点; 在计算ui时使用;
    neighbors = generate_neighbors(network_data, vocab, num_nodes, edge_types, neighbor_samples)

    features = None
    if feature_dic is not None:  # GATNE-I
        feature_dim = len(list(feature_dic.values())[0])  # 特征长度
        print('feature dimension: ' + str(feature_dim))
        features = np.zeros((num_nodes, feature_dim), dtype=np.float32)  # feature array
        for key, value in feature_dic.items():
            if key in vocab:
                features[vocab[key].index, :] = np.array(value)
        features = torch.FloatTensor(features).to(device)  # 特征矩阵
    # 建立GATNE model
    model = GATNEModel(
        num_nodes, embedding_size, embedding_u_size, edge_type_count, dim_a, features
    )
    nsloss = NSLoss(num_nodes, num_sampled, embedding_size)

    model.to(device)
    nsloss.to(device)

    optimizer = torch.optim.Adam(
        [{"params": model.parameters()}, {"params": nsloss.parameters()}], lr=1e-4
    )

    best_score = 0
    test_score = (0.0, 0.0, 0.0)
    patience = 0
    for epoch in range(epochs):
        random.shuffle(train_pairs)
        batches = get_batches(train_pairs, neighbors, batch_size)

        data_iter = tqdm(
            batches,
            desc="epoch %d" % (epoch),
            total=(len(train_pairs) + (batch_size - 1)) // batch_size,
            bar_format="{l_bar}{r_bar}",
        )
        avg_loss = 0.0

        for i, data in enumerate(data_iter):
            optimizer.zero_grad()  # center, context, types, neigh
            embs = model(data[0].to(device), data[2].to(device), data[3].to(device),)  # 节点的embeddings
            loss = nsloss(data[0].to(device), embs, data[1].to(device))
            loss.backward()
            optimizer.step()

            avg_loss += loss.item()

            if i % 5000 == 0:
                post_fix = {
                    "epoch": epoch,
                    "iter": i,
                    "avg_loss": avg_loss / (i + 1),
                    "loss": loss.item(),
                }
                data_iter.write(str(post_fix))

            '''  调试使用  '''
            if i==0:
                break

        final_model = dict(zip(edge_types, [dict() for _ in range(edge_type_count)]))  # 每个类别下节点的embedding;
        for i in range(num_nodes):
            train_inputs = torch.tensor([i for _ in range(edge_type_count)]).to(device)  # 节点的多个类别，求每个类别下的embedding
            train_types = torch.tensor(list(range(edge_type_count))).to(device)
            node_neigh = torch.tensor(  # 节点在每个类别下的neighbors
                [neighbors[i] for _ in range(edge_type_count)]
            ).to(device)
            node_emb = model(train_inputs, train_types, node_neigh)  # [node1, node1]; [type1, type2]; [node1_neigh, node1_neigh]
            for j in range(edge_type_count):  # 每个节点在各个类别下的embedding
                final_model[edge_types[j]][index2word[i]] = (
                    node_emb[j].cpu().detach().numpy()
                )

        valid_aucs, valid_f1s, valid_prs = [], [], []
        test_aucs, test_f1s, test_prs = [], [], []
        for i in range(edge_type_count):
            if args.eval_type == "all" or edge_types[i] in args.eval_type.split(","):
                tmp_auc, tmp_f1, tmp_pr = evaluate(
                    final_model[edge_types[i]],
                    valid_true_data_by_edge[edge_types[i]],
                    valid_false_data_by_edge[edge_types[i]],
                )
                valid_aucs.append(tmp_auc)
                valid_f1s.append(tmp_f1)
                valid_prs.append(tmp_pr)

                tmp_auc, tmp_f1, tmp_pr = evaluate(
                    final_model[edge_types[i]],
                    testing_true_data_by_edge[edge_types[i]],
                    testing_false_data_by_edge[edge_types[i]],
                )
                test_aucs.append(tmp_auc)
                test_f1s.append(tmp_f1)
                test_prs.append(tmp_pr)
        print("valid auc:", np.mean(valid_aucs))
        print("valid pr:", np.mean(valid_prs))
        print("valid f1:", np.mean(valid_f1s))

        average_auc = np.mean(test_aucs)
        average_f1 = np.mean(test_f1s)
        average_pr = np.mean(test_prs)

        cur_score = np.mean(valid_aucs)
        if cur_score > best_score:
            best_score = cur_score
            test_score = (average_auc, average_f1, average_pr)
            patience = 0
        else:
            patience += 1
            if patience > args.patience:
                print("Early Stopping")
                break
    return test_score

- 模型评价

In [50]:
def get_score(local_model, node1, node2):
    try:
        vector1 = local_model[node1]
        vector2 = local_model[node2]
        return np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))
    except Exception as e:
        pass

### 模型评估

In [51]:
def evaluate(model, true_edges, false_edges):
    true_list = list()
    prediction_list = list()
    true_num = 0
    for edge in true_edges:
        tmp_score = get_score(model, str(edge[0]), str(edge[1]))
        if tmp_score is not None:
            true_list.append(1)
            prediction_list.append(tmp_score)
            true_num += 1

    for edge in false_edges:
        tmp_score = get_score(model, str(edge[0]), str(edge[1]))
        if tmp_score is not None:
            true_list.append(0)
            prediction_list.append(tmp_score)

    sorted_pred = prediction_list[:]
    sorted_pred.sort()
    threshold = sorted_pred[-true_num]

    y_pred = np.zeros(len(prediction_list), dtype=np.int32)
    for i in range(len(prediction_list)):
        if prediction_list[i] >= threshold:
            y_pred[i] = 1  # 预测输出的结果

    y_true = np.array(true_list)  # true label
    y_scores = np.array(prediction_list)  # predict proba
    ps, rs, _ = precision_recall_curve(y_true, y_scores)
    return roc_auc_score(y_true, y_scores), f1_score(y_true, y_pred), auc(rs, ps)

### 主函数

In [52]:
if __name__ == "__main__":
    args = parse_args()
    args.input = r'C:\Users\sss\Desktop\GATNE\data\example'
    args.features = r'C:\Users\sss\Desktop\GATNE\data\example/feature.txt'
    file_name = args.input
    print(args)
    if args.features is not None:  # 每个节点对应到的特征; GATNE-T;
        feature_dic = load_feature_data(args.features)
    else:
        feature_dic = None

    training_data_by_type = load_training_data(file_name + "/train.txt")
    valid_true_data_by_edge, valid_false_data_by_edge = load_testing_data(
        file_name + "/valid.txt"
    )
    testing_true_data_by_edge, testing_false_data_by_edge = load_testing_data(
        file_name + "/test.txt"
    )

    average_auc, average_f1, average_pr = train_model(training_data_by_type, feature_dic)

    print("Overall ROC-AUC:", average_auc)
    print("Overall PR-AUC", average_pr)
    print("Overall F1:", average_f1)

Namespace(att_dim=20, batch_size=64, dimensions=200, edge_dim=10, epoch=100, eval_type='all', features='C:\\Users\\sss\\Desktop\\GATNE\\data\\example/feature.txt', input='C:\\Users\\sss\\Desktop\\GATNE\\data\\example', negative_samples=5, neighbor_samples=10, num_walks=20, num_workers=0, patience=5, schema=None, walk_file=None, walk_length=10, window_size=5)
We are loading data from: C:\Users\sss\Desktop\GATNE\data\example/train.txt
Total training nodes: 511
We are loading data from: C:\Users\sss\Desktop\GATNE\data\example/valid.txt
We are loading data from: C:\Users\sss\Desktop\GATNE\data\example/test.txt
Generating random walks for layer 0


ValueError: Number of processes must be at least 1