In [1]:
import os
import sys
import time
import math
import random
import subprocess
import logging
import argparse

import re
import json
import pickle
import math
import random


from collections import defaultdict

import numpy as np
import scipy
import scipy.sparse as sp

import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm

from sklearn import linear_model
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.preprocessing import Normalizer

In [2]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
sp.csr_matrix

scipy.sparse.csr.csr_matrix

In [4]:
DATASET_NUM_DIC = {
    'epinions': 131828,
    'slashdot': 82140,
    'bitcoin_alpha': 3783,
    'bitcoin_otc': 5881,
}

### Feature Extraction

In [5]:
class FeaExtra(object):
    def __init__(self, dataset='epinions', k=1, debug=False):
        filename = r'C:\Users\sss\Desktop\SiGAT/experiment-data/{}-train-{}.edgelist'.format(dataset, k)  # 导入训练数据
        if debug:
            filename = './test.edgelists'
        res = self.init_edgelists(filename=filename)
        self.pos_in_edgelists, self.pos_out_edgelists, self.neg_in_edgelists, self.neg_out_edgelists = res

    def init_edgelists(self, filename=r'C:\Users\sss\Desktop\SiGAT/experiment-data/epinions-train-1.edgelist'):
        
        pos_out_edgelists = defaultdict(list)
        neg_out_edgelists = defaultdict(list)
        pos_in_edgelists = defaultdict(list)
        neg_in_edgelists = defaultdict(list)
        
        with open(filename) as f:
            for line in f.readlines():
                x, y, z = line.split()
                x = int(x)
                y = int(y)
                z = int(z)
                
                if z == 1:
                    pos_out_edgelists[x].append(y)  # u->v; u向外指向v的有向边
                    pos_in_edgelists[y].append(x)  # v->u: v被u指向
                else:
                    neg_out_edgelists[x].append(y)
                    neg_in_edgelists[y].append(x)
        return pos_in_edgelists, pos_out_edgelists, neg_in_edgelists, neg_out_edgelists

    def get_pos_indegree(self, v):
        return len(self.pos_in_edgelists[v])

    def get_pos_outdegree(self, v):
        return len(self.pos_out_edgelists[v])

    def get_neg_indegree(self, v):
        return len(self.neg_in_edgelists[v])

    def get_neg_outdegree(self, v):
        return len(self.neg_out_edgelists[v])

    def common_neighbors(self, u, v):
        u_neighbors = self.pos_in_edgelists[u] + self.neg_in_edgelists[u] + \
                      self.pos_out_edgelists[u] + self.neg_out_edgelists[u]
        v_neighbors = self.pos_in_edgelists[v] + self.neg_in_edgelists[v] + \
                      self.pos_out_edgelists[v] + self.neg_out_edgelists[v]
        return len(set(u_neighbors).intersection(set(v_neighbors)))

    def feature_part1(self, u, v):
        d_pos_in_u = self.get_pos_indegree(u)
        d_neg_in_v = self.get_neg_indegree(v)
        d_pos_out_u = self.get_pos_outdegree(u)
        d_neg_out_v = self.get_neg_outdegree(v)

        # d_pos_in_v = self.get_pos_indegree(v)
        # d_neg_in_u = self.get_neg_indegree(u)
        # d_pos_out_v = self.get_pos_outdegree(v)
        # d_neg_out_u = self.get_neg_outdegree(u)

        c_u_v = self.common_neighbors(u, v)
        d_out_u = self.get_neg_outdegree(u) + self.get_pos_outdegree(u)
        d_in_v = self.get_neg_indegree(v) + self.get_pos_indegree(v)
        return d_pos_in_u, d_neg_in_v, d_pos_out_u, d_neg_out_v, c_u_v, d_out_u, d_in_v

    def feature_part2(self, u, v):
        """
        /^ \v /^ \^ /v \v /v ^\
        ++
        /^ \v /^ \^ /v \v /v ^\
        +-
        /^ \v /^ \^ /v \v /v ^\
        -+
        /^ \v /^ \^ /v \v /v ^\
        --
        """
        d1_1 = len(set(self.pos_out_edgelists[u]).intersection(set(self.pos_in_edgelists[v])))  # 集合交集
        d1_2 = len(set(self.pos_out_edgelists[u]).intersection(set(self.neg_in_edgelists[v])))
        d1_3 = len(set(self.neg_out_edgelists[u]).intersection(set(self.pos_in_edgelists[v])))
        d1_4 = len(set(self.neg_out_edgelists[u]).intersection(set(self.neg_in_edgelists[v])))

        d2_1 = len(set(self.pos_out_edgelists[u]).intersection(set(self.pos_out_edgelists[v])))
        d2_2 = len(set(self.pos_out_edgelists[u]).intersection(set(self.neg_out_edgelists[v])))
        d2_3 = len(set(self.neg_out_edgelists[u]).intersection(set(self.pos_out_edgelists[v])))
        d2_4 = len(set(self.neg_out_edgelists[u]).intersection(set(self.neg_out_edgelists[v])))

        d3_1 = len(set(self.pos_in_edgelists[u]).intersection(set(self.pos_out_edgelists[v])))
        d3_2 = len(set(self.pos_in_edgelists[u]).intersection(set(self.neg_out_edgelists[v])))
        d3_3 = len(set(self.neg_in_edgelists[u]).intersection(set(self.pos_out_edgelists[v])))
        d3_4 = len(set(self.neg_in_edgelists[u]).intersection(set(self.neg_out_edgelists[v])))

        d4_1 = len(set(self.pos_in_edgelists[u]).intersection(set(self.pos_in_edgelists[v])))
        d4_2 = len(set(self.pos_in_edgelists[u]).intersection(set(self.neg_in_edgelists[v])))
        d4_3 = len(set(self.neg_in_edgelists[u]).intersection(set(self.pos_in_edgelists[v])))
        d4_4 = len(set(self.neg_in_edgelists[u]).intersection(set(self.neg_in_edgelists[v])))

        return d1_1, d1_2, d1_3, d1_4, d2_1, d2_2, d2_3, d2_4, d3_1, d3_2, d3_3, d3_4, d4_1, d4_2, d4_3, d4_4

    def get_features(self, u, v):
        x11 = self.feature_part1(u, v)
        x12 = self.feature_part2(u, v)
        return x11 + x12

### 参数设置

In [6]:
# Training settings
parser = argparse.ArgumentParser()
parser.add_argument('--devices', type=str, default='cpu', help='Devices')
parser.add_argument('--seed', type=int, default=13, help='Random seed.')
parser.add_argument('--epochs', type=int, default=100, help='Number of epochs to train.')
parser.add_argument('--lr', type=float, default=0.001, help='Initial learning rate.')
parser.add_argument('--weight_decay', type=float, default=0.001, help='Weight decay (L2 loss on parameters).')
parser.add_argument('--dataset', default='bitcoin_alpha', help='Dataset')
parser.add_argument('--dim', type=int, default=20, help='Embedding dimension')
parser.add_argument('--fea_dim', type=int, default=20, help='Feature embedding dimension')
parser.add_argument('--batch_size', type=int, default=500, help='Batch size')
parser.add_argument('--dropout', type=float, default=0.0, help='Dropout k')
parser.add_argument('--k', default=1, help='Folder k')
parser.add_argument('--agg', default='attention', choices=['mean', 'attantion'], help='Aggregator choose')

args = parser.parse_args(args=[])

In [7]:
# 输出路径

root = r"C:\Users\sss\Desktop\SiGAT"
OUTPUT_DIR = root + f'/embeddings/sdgnn-{args.agg}'
if not os.path.exists(root + "/" + 'embeddings'):
    os.mkdir(root + "/" + 'embeddings')

if not os.path.exists(OUTPUT_DIR):
        os.mkdir(OUTPUT_DIR)

In [8]:
# 随机种子

random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

<torch._C.Generator at 0x1df3b5ee390>

In [9]:
NEG_LOSS_RATIO = 1
INTERVAL_PRINT = 2

NUM_NODE = DATASET_NUM_DIC[args.dataset]
WEIGHT_DECAY = args.weight_decay
NODE_FEAT_SIZE = args.fea_dim
EMBEDDING_SIZE1 = args.dim
DEVICES = torch.device(args.devices)
LEARNING_RATE = args.lr
BATCH_SIZE = args.batch_size
EPOCHS = args.epochs
DROUPOUT = args.dropout
K = args.k

In [10]:
print(DEVICES)

cpu


### Logistic Function

In [11]:
EMBEDDING_SIZE = 20

SINE_MODEL_PATH_DIC = {
    'epinions': root + '/embeddings/sine_epinions_models',
    'slashdot': root + '/embeddings/sine_slashdot_models',
    'bitcoin_alpha': root + '/embeddings/sine_bitcoin_alpha_models',
    'bitcoin_otc': root + '/embeddings/sine_bitcoin_otc_models'
}

SIDE_MODEL_PATH_DIC = {
    'epinions': root + '/embeddings/side_epinions_models',
    'slashdot': root + '/embeddings/side_slashdot_models',
    'bitcoin_alpha': root + '/embeddings/side_bitcoin_alpha_models',
    'bitcoin_otc': root + '/embeddings/side_bitcoin_otc_models'
}

In [36]:
def read_train_test_data(dataset, k):
    train_X = []
    train_y = []
    with open(root + '/experiment-data/{}-train-{}.edgelist'.format(dataset, k)) as f:
        for line in f:
            i, j, flag = line.split()
            i = int(i)
            j = int(j)
            flag = int((int(flag) + 1) / 2)
            train_X.append((i, j))
            train_y.append(flag)
    test_X = []
    test_y = []
    
    with open(root + '/experiment-data/{}-test-{}.edgelist'.format(dataset, k)) as f:
        for line in f:
            i, j, flag = line.split()
            i = int(i)
            j = int(j)
            flag = int((int(flag) + 1) / 2)
            test_X.append((i, j))
            test_y.append(flag)
            
    return np.array(train_X), np.array(train_y), np.array(test_X), np.array(test_y)

In [13]:
def common_logistic(dataset, k, embeddings, model):
    train_X, train_y, test_X, test_y  = read_train_test_data(dataset, k)
    
    train_X1 = []
    test_X1 = []
    
    for i, j in train_X:
        train_X1.append(np.concatenate([embeddings[i], embeddings[j]]))
        
    for i, j in test_X:
        test_X1.append(np.concatenate([embeddings[i], embeddings[j]]))
        
    logistic_function = linear_model.LogisticRegression()
    logistic_function.fit(train_X1, train_y)
    pred = logistic_function.predict(test_X1)
    pred_p = logistic_function.predict_proba(test_X1)
    
    pos_ratio =  np.sum(test_y) / test_y.shape[0]
    accuracy =  metrics.accuracy_score(test_y, pred)
    f1_score0 =  metrics.f1_score(test_y, pred)
    f1_score1 =  metrics.f1_score(test_y, pred, average='macro')
    f1_score2 =  metrics.f1_score(test_y, pred, average='micro')
    
    auc_score =  metrics.roc_auc_score(test_y, pred_p[:, 1])
    # print("pos_ratio:", pos_ratio)
    # print('accuracy:', accuracy)
    # print("f1_score:", f1_score0)
    # print("macro f1_score:", f1_score1)
    # print("micro f1_score:", f1_score2)
    # print("auc score:", auc_score)

    return pos_ratio, accuracy, f1_score0, f1_score1, f1_score2,  auc_score

In [14]:
def read_emb(fpath, dataset):
    dim = 0
    embeddings = 0
    with open(fpath) as f:
        for i, line in enumerate(f.readlines()):
            if i == 0:
                ll = line.split()
                assert len(ll) == 2, 'First line must be 2 numbers'
                dim = int(ll[1])
                embeddings = np.random.rand(DATASET_NUM_DIC[dataset], dim)
            else:
                line_l = line.split()
                node = line_l[0]
                emb = [float(j) for j in line_l[1:]]
                embeddings[int(node)] = np.array(emb)
                
    return embeddings

In [15]:
def logistic_embedding0(k=1, dataset='epinions'):
    """using random embedding to train logistic

    Keyword Arguments:
        k {int} -- [folder] (default: {1})
        dataset {str} -- [dataset] (default: {'epinions'})

    Returns:
        [type] -- [pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score]
    """
    
    print('random embeddings')
    embeddings = np.random.rand(DATASET_NUM_DIC[dataset], EMBEDDING_SIZE)
    pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score = common_logistic(dataset, k, embeddings, 'random')
    return pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score

In [16]:
def logistic_embedding1(k=1, dataset='epinions'):
    
    """use deepwalk embeddings to train logistic function

    Returns:
        pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score
    """

    fpath = os.path.join(root + "/" + 'embeddings/deepwalk_emb', '{}-{}.emb'.format(dataset, k))
    embeddings = read_emb(fpath, dataset)
    pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score = common_logistic(dataset, k, embeddings, 'deepwalk')
    return pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score

In [17]:
def logistic_embedding2(k=1, dataset='epinions'):
    
    """use node2vec embeddings to train logistic function
    
    Returns:
        pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score
    """

    fpath = os.path.join(root + "/" + 'embeddings/node2vec_emb', '{}-{}.emb'.format(dataset, k))
    embeddings = read_emb(fpath, dataset)
    pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score = common_logistic(dataset, k, embeddings, 'node2vec')
    return pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score

In [18]:
def logistic_embedding3(k=1, dataset='epinions'):
    
    """use line embeddings to train logistic function
    Returns:
        pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score
    """
    
    fpath = os.path.join(root + "/" + 'embeddings/line_emb', '{}-{}.emb'.format(dataset, k))
    embeddings = read_emb(fpath, dataset)
    pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score = common_logistic(dataset, k, embeddings, 'line')
    return pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score

In [19]:
def logistic_embedding4(k=1, dataset='epinions', epoch=6, dirname='graphssa-results'):
    
    """use graphssa to train logistic function
    Returns:
        pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score
    """
    
    print('item: graphssa with feo', k, epoch)

    filename = os.path.join(root + "/" + dirname, 'embedding-{}-{}-{}.npy'.format(dataset, k, epoch))
    embeddings = np.load(filename)
    pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score = common_logistic(dataset, k, embeddings, 'graphssa')
    return pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score

In [20]:
def logistic_embedding5(k=1, dataset='epinions', epoch=50, v0=True):
    
    """use sine embeddings to train logistic function
    Returns:
        pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score
    """

    print('sine', k, 'v0', v0)
    embeddings = []
    if v0:
        filename = os.path.join(SINE_MODEL_PATH_DIC[dataset], str(k) + 'a', str(epoch) + '.p')
    else:
        filename = os.path.join(SINE_MODEL_PATH_DIC[dataset], str(k) + 'b', str(epoch) + '.p')

    # filename = os.path.join('./models/', str(epoch) + '.p')
    print(filename)
    params = ""
    with open(filename, 'rb') as fp:
        params = pickle.load(fp)
        embeddings = params[0].get_value()
    embeddings = embeddings[1:,]
    print(embeddings.shape)
    
    pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score = common_logistic(dataset, k, embeddings, 'sine')
    return pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score

In [21]:
def logistic_embedding6(k=1, dataset='epinions', epoch=1):
    
    """use side embeddings to train logistic function
    Returns:
        pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score
    """
    
    def read_side_emb():
        voc_path = os.path.join(root + "/" + 'embeddings/side', '{}{}.vocab'.format(dataset, k))
        order_dict = defaultdict(int)
        with open(voc_path) as f:
            for index, line in enumerate(f.readlines()):
                num = re.findall(r'b\'(\d+)\'', line)
                order_dict[index] = int("".join(num))
                
        embeddings = np.zeros((DATASET_NUM_DIC[dataset], 50))
        embed_path = os.path.join(root + "/" + 'embeddings/side', '{}{}{}.emb'.format(dataset, k, epoch))
        
        with open(embed_path) as f:
            for i, line in enumerate(f.readlines()):
                line_l = line.split()
                emb = [np.float(j) for j in line_l]
                embeddings[order_dict[i]] = np.array(emb)
                
        return embeddings

    embeddings = read_side_emb()
    pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score = common_logistic(dataset, k, embeddings, 'side')
    return pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score

In [22]:
def logistic_embedding7(k=1, dataset='epinions', dirname="sign2vec"):
    
    """use signet embeddings to train logistic function
    Returns:
        pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score
    """
    
    print('signet', k, dataset)
    filename = os.path.join(root + "/" + 'embeddings', dirname, 'embeddings-{}-{}.npy'.format(dataset, k))
    embeddings = np.load(filename)
    pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score = common_logistic(dataset, k, embeddings, 'signet')
    return pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score

In [23]:
def logistic_embedding8(k=1, dataset='epinions'):
    """use feature to train logistic function
    Returns:
        pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score
    """
    print(dataset, k, 'fea')
    train_X, train_y, test_X, test_y  = read_train_test_data(dataset, k)
    fea = FeaExtra(k=k, dataset=dataset)
    train_X1 = []
    test_X1 = []

    for i, j in train_X:
        train_X1.append(fea.get_features(i, j))

    for i, j in test_X:
        test_X1.append(fea.get_features(i, j))

    logistic = linear_model.LogisticRegression()
    logistic.fit(train_X1, train_y)

    pred = logistic.predict(test_X1)
    pred_p = logistic.predict_proba(test_X1)
    pos_ratio =  np.sum(test_y) / test_y.shape[0]
    accuracy =  metrics.accuracy_score(test_y, pred)
    f1_score0 =  metrics.f1_score(test_y, pred)
    f1_score1 =  metrics.f1_score(test_y, pred, average='macro')
    f1_score2 =  metrics.f1_score(test_y, pred, average='micro')

    auc_score =  metrics.roc_auc_score(test_y, pred_p[:, 1])
    print("pos_ratio:", pos_ratio)
    print('accuracy:', accuracy)
    print("f1_score:", f1_score0)
    print("macro f1_score:", f1_score1)
    print("micro f1_score:", f1_score2)
    print("auc score:",auc_score)

    return pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score

In [24]:
def logistic_embedding9(k=1, dataset='epinions', epoch=10, dirname='sigat'):
    """use sigat embedding to train logistic function
    Returns:
        pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score
    """

    filename = os.path.join(root + "/" + 'embeddings', dirname, 'embedding-{}-{}-{}.npy'.format(dataset, k, epoch))
    embeddings = np.load(filename)
    pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score = common_logistic(dataset, k, embeddings, 'sigat')
    return pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score

In [25]:
def logistic_embedding(k=1, dataset='bitcoin_otc', epoch = 10, dirname='sgae'):

    print(epoch, dataset)
    fpath = os.path.join(dirname, 'embedding-{}-{}-{}.npy'.format(dataset, k, epoch))
    embeddings = np.load(fpath)
    pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score = common_logistic(dataset, k, embeddings, dirname)
    return pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score

### 模型组件

In [26]:
class Encoder(nn.Module):
    """
    Encode features to embeddings
    """

    def __init__(self, features, feature_dim, embed_dim, adj_lists, aggs):
        super(Encoder, self).__init__()

        self.features = features
        self.feat_dim = feature_dim
        self.adj_lists = adj_lists
        self.aggs = aggs

        self.embed_dim = embed_dim
        for i, agg in enumerate(self.aggs):
            self.add_module('agg_{}'.format(i), agg)
            self.aggs[i] = agg.to(DEVICES)

        def init_weights(m):
            if type(m) == nn.Linear:
                torch.nn.init.kaiming_normal_(m.weight)
                fan_in, _ = nn.init._calculate_fan_in_and_fan_out(m.weight)
                bound = 1 / math.sqrt(fan_in)
                nn.init.uniform_(m.bias, -bound, bound)
        self.nonlinear_layer = nn.Sequential(
                nn.Linear((len(adj_lists) + 1) * feature_dim, feature_dim),  # motifs+本身
                nn.Tanh(),
                nn.Linear(feature_dim, embed_dim)
        )

        self.nonlinear_layer.apply(init_weights)  # 初始化


    def forward(self, nodes):
        """
        Generates embeddings for nodes.
        """

        if not isinstance(nodes, list) and nodes.is_cuda:
            nodes = nodes.data.cpu().numpy().tolist()

        neigh_feats = [agg(nodes, adj, ind) for adj, agg, ind in zip(self.adj_lists, self.aggs, range(len(self.adj_lists)))]
        self_feats = self.features(torch.LongTensor(nodes).to(DEVICES))  # 节点本身motifs的特征
        combined = torch.cat([self_feats] + neigh_feats, 1)  # 邻居+节点本身
        combined = self.nonlinear_layer(combined)
        return combined

        k = self.k(self_feats)

In [27]:
class AttentionAggregator(nn.Module):
    def __init__(self, features, in_dim, out_dim, node_num, dropout_rate=DROUPOUT, slope_ratio=0.1):
        super(AttentionAggregator, self).__init__()

        self.features = features
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.dropout = nn.Dropout(dropout_rate)
        self.slope_ratio = slope_ratio
        self.a = nn.Parameter(torch.FloatTensor(out_dim * 2, 1))  # Whi||Whj
        nn.init.kaiming_normal_(self.a.data)

        self.out_linear_layer = nn.Linear(self.in_dim, self.out_dim)  # W*H
        self.unique_nodes_dict = np.zeros(node_num, dtype=np.int32)


    def forward(self, nodes, adj, ind):
        """
        nodes --- list of nodes in a batch
        adj --- sp.csr_matrix
        """
        node_pku = np.array(nodes)
        edges = np.array(adj[nodes, :].nonzero()).T
        edges[:, 0] = node_pku[edges[:, 0]]  # 将选择后的node对应到之前的node编号上

        unique_nodes_list = np.unique(np.hstack((np.unique(edges), np.array(nodes))))

        batch_node_num = len(unique_nodes_list)
        # this dict can map new i to originial node id
        self.unique_nodes_dict[unique_nodes_list] = np.arange(batch_node_num)  # 选择node的编号

        edges[:, 0] = self.unique_nodes_dict[edges[:, 0]]
        edges[:, 1] = self.unique_nodes_dict[edges[:, 1]]

        n2 = torch.LongTensor(unique_nodes_list).to(DEVICES)  # 第一层涉及到的邻居
        f = self.features(n2)
        new_embeddings = self.out_linear_layer(f)  # self.features(n2)

        original_node_edge = np.array([self.unique_nodes_dict[nodes], self.unique_nodes_dict[nodes]]).T
        edges = np.vstack((edges, original_node_edge))  # 加上自连接的边

        edges = torch.LongTensor(edges).to(DEVICES)

        edge_h_2 = torch.cat((new_embeddings[edges[:, 0], :], new_embeddings[edges[:, 1], :]), dim=1)  # Whi||Whj

        edges_h = torch.exp(F.leaky_relu(torch.einsum("ij,jl->il", [edge_h_2, self.a]), self.slope_ratio))  # attention
        indices = edges
        
        matrix = torch.sparse_coo_tensor(indices.t(), edges_h[:, 0], \
                                         torch.Size([batch_node_num, batch_node_num]), device=DEVICES)
        row_sum = torch.sparse.mm(matrix, torch.ones(size=(batch_node_num, 1)).to(DEVICES))  # attention求和

        results = torch.sparse.mm(matrix, new_embeddings)  # attention和embedding内积

        output_emb = results.div(row_sum)  # 归一化

        return output_emb[self.unique_nodes_dict[nodes]]

In [28]:
class MeanAggregator(nn.Module):
    def __init__(self, features, in_dim, out_dim, node_num):
        super(MeanAggregator, self).__init__()

        self.features = features
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.out_linear_layer = nn.Sequential(
            nn.Linear(self.in_dim, self.out_dim),
            nn.Tanh(),
            nn.Linear(self.out_dim, self.out_dim)
        )

        self.unique_nodes_dict = np.zeros(node_num, dtype=np.int32)

    def forward(self, nodes, adj, ind):
        """

        :param nodes:
        :param adj:
        :return:
        """
        mask = [1, 1, 0, 0]
        node_tmp = np.array(nodes)
        edges = np.array(adj[nodes, :].nonzero()).T
        edges[:, 0] = node_tmp[edges[:, 0]]

        unique_nodes_list = np.unique(np.hstack((np.unique(edges), np.array(nodes))))
        batch_node_num = len(unique_nodes_list)
        self.unique_nodes_dict[unique_nodes_list] = np.arange(batch_node_num)

        ## transform 2 new axis
        edges[:, 0] = self.unique_nodes_dict[edges[:, 0]]
        edges[:, 1] = self.unique_nodes_dict[edges[:, 1]]

        n2 = torch.LongTensor(unique_nodes_list).to(DEVICES)
        new_embeddings = self.out_linear_layer(self.features(n2))
        edges = torch.LongTensor(edges).to(DEVICES)

        values = torch.where(edges[:, 0] == edges[:, 1], torch.FloatTensor([mask[ind]]).to(DEVICES), torch.FloatTensor([1]).to(DEVICES))
        # values = torch.ones(edges.shape[0]).to(DEVICES)
        matrix = torch.sparse_coo_tensor(edges.t(), values, torch.Size([batch_node_num, batch_node_num]), device=DEVICES)
        row_sum = torch.spmm(matrix, torch.ones(size=(batch_node_num, 1)).to(DEVICES))
        row_sum = torch.where(row_sum == 0, torch.ones(row_sum.shape).to(DEVICES), row_sum)

        results = torch.spmm(matrix, new_embeddings)
        output_emb = results.div(row_sum)

        return output_emb[self.unique_nodes_dict[nodes]]

In [29]:
class SDGNN(nn.Module):
    def __init__(self, enc):
        super(SDGNN, self).__init__()
        self.enc = enc
        self.score_function1 = nn.Sequential(  # 定义status ranking
            nn.Linear(EMBEDDING_SIZE1, 1),
            nn.Sigmoid()
        )
        self.score_function2 = nn.Sequential(
            nn.Linear(EMBEDDING_SIZE1, 1),
            nn.Sigmoid()
        )
        self.fc = nn.Linear(EMBEDDING_SIZE1 * 2, 1)

    def forward(self, nodes):
        embeds = self.enc(nodes)
        return embeds

    def criterion(self, nodes, pos_neighbors, neg_neighbors, adj_lists1_1, adj_lists2_1, weight_dict):
        pos_neighbors_list = [set.union(pos_neighbors[i]) for i in nodes]  # pos的邻居
        neg_neighbors_list = [set.union(neg_neighbors[i]) for i in nodes]  # neg的邻居
        unique_nodes_list = list(set.union(*pos_neighbors_list).union(*neg_neighbors_list).union(nodes))  # 所有涉及到的节点
        unique_nodes_dict = {n: i for i, n in enumerate(unique_nodes_list)}  # 节点编号
        nodes_embs = self.enc(unique_nodes_list)

        loss_total = 0
        for index, node in enumerate(nodes):
            z1 = nodes_embs[unique_nodes_dict[node], :]  # 节点的embedding
            pos_neigs = list([unique_nodes_dict[i] for i in pos_neighbors[node]])  # 节点的positive邻居
            neg_neigs = list([unique_nodes_dict[i] for i in neg_neighbors[node]])  # 节点的negative邻居
            pos_num = len(pos_neigs)
            neg_num = len(neg_neigs)

            sta_pos_neighs = list([unique_nodes_dict[i] for i in adj_lists1_1[node]])  # pos:u->v 对应的index
            sta_neg_neighs = list([unique_nodes_dict[i] for i in adj_lists2_1[node]])  # neg:u->v
            # 平衡三角形weight
            pos_neigs_weight = torch.FloatTensor([weight_dict[node][i] for i in adj_lists1_1[node]]).to(DEVICES)  # u->v之间满足平衡三角形weight
            neg_neigs_weight = torch.FloatTensor([weight_dict[node][i] for i in adj_lists2_1[node]]).to(DEVICES)

            if pos_num > 0:
                pos_neig_embs = nodes_embs[pos_neigs, :]  # pos节点embedding
                loss_pku = F.binary_cross_entropy_with_logits(torch.einsum("nj,j->n", [pos_neig_embs, z1]),
                                                              torch.ones(pos_num).to(DEVICES))

                if len(sta_pos_neighs) > 0:
                    sta_pos_neig_embs = nodes_embs[sta_pos_neighs, :]  # pos的u->v连接embedding

                    z11 = z1.repeat(len(sta_pos_neighs), 1)  # 重复n次
                    rs = self.fc(torch.cat([z11, sta_pos_neig_embs], 1)).squeeze(-1)  # Z1||pos_emb -> 1
                    loss_pku += F.binary_cross_entropy_with_logits(rs, torch.ones(len(sta_pos_neighs)).to(DEVICES), \
                                                                   weight=pos_neigs_weight
                                                                   )  # 边类别预测
                    s1 = self.score_function1(z1).repeat(len(sta_pos_neighs), 1)  # z1重复n次
                    s2 = self.score_function2(sta_pos_neig_embs)

                    q = torch.where((s1 - s2) > -0.5, torch.Tensor([-0.5]).repeat(s1.shape).to(DEVICES), s1 - s2)  # !!!! 0.5
                    tmp = (q - (s1 - s2))
                    loss_pku += 5 * torch.einsum("ij,ij->", [tmp, tmp])  # ^2

                loss_total += loss_pku

            if neg_num > 0:
                neg_neig_embs = nodes_embs[neg_neigs, :]
                loss_pku = F.binary_cross_entropy_with_logits(torch.einsum("nj,j->n", [neg_neig_embs, z1]),
                                                              torch.zeros(neg_num).to(DEVICES))
                if len(sta_neg_neighs) > 0:
                    sta_neg_neig_embs = nodes_embs[sta_neg_neighs, :]

                    z12 = z1.repeat(len(sta_neg_neighs), 1)
                    rs = self.fc(torch.cat([z12, sta_neg_neig_embs], 1)).squeeze(-1)

                    loss_pku += F.binary_cross_entropy_with_logits(rs, torch.zeros(len(sta_neg_neighs)).to(DEVICES), \
                                                                   weight=neg_neigs_weight)

                    s1 = self.score_function1(z1).repeat(len(sta_neg_neighs), 1)
                    s2 = self.score_function2(sta_neg_neig_embs)

                    q = torch.where(s1 - s2 > 0.5, s1 - s2, torch.Tensor([0.5]).repeat(s1.shape).to(DEVICES))

                    tmp = (q - (s1 - s2))
                    loss_pku += 5 * torch.einsum("ij,ij->", [tmp, tmp])

                loss_total += loss_pku

        return loss_total

In [30]:
def load_data2(filename=''):
    
    adj_lists1 = defaultdict(set)
    adj_lists1_1 = defaultdict(set)
    adj_lists1_2 = defaultdict(set)
    adj_lists2 = defaultdict(set)
    adj_lists2_1 = defaultdict(set)
    adj_lists2_2 = defaultdict(set)
    adj_lists3 = defaultdict(set)

    with open(filename) as fp:
        for i, line in enumerate(fp):
            info = line.strip().split()
            person1 = int(info[0])
            person2 = int(info[1])
            v = int(info[2])
            adj_lists3[person2].add(person1)
            adj_lists3[person1].add(person2)

            if v == 1:
                adj_lists1[person1].add(person2)
                adj_lists1[person2].add(person1)

                adj_lists1_1[person1].add(person2)
                adj_lists1_2[person2].add(person1)
            else:
                adj_lists2[person1].add(person2)
                adj_lists2[person2].add(person1)

                adj_lists2_1[person1].add(person2)
                adj_lists2_2[person2].add(person1)

    return adj_lists1, adj_lists1_1, adj_lists1_2, adj_lists2, adj_lists2_1, adj_lists2_2, adj_lists3

In [31]:
def run(dataset, k):
    num_nodes = DATASET_NUM_DIC[dataset] + 3

    # adj_lists1, adj_lists2, adj_lists3 = load_data(k, dataset)
    filename = root + '/experiment-data/{}-train-{}.edgelist'.format(dataset, k)
    adj_lists1, adj_lists1_1, adj_lists1_2, adj_lists2, adj_lists2_1, adj_lists2_2, adj_lists3 = load_data2(filename)
    print(k, dataset, 'data load!')

    features = nn.Embedding(num_nodes, NODE_FEAT_SIZE)
    features.weight.requires_grad = True

    features = features.to(DEVICES)
    # 有向的四个边; 4个motifs
    adj_lists = [adj_lists1_1, adj_lists1_2,  adj_lists2_1, adj_lists2_2]

  
    weight_dict = defaultdict(dict)
    fea_model = FeaExtra(dataset=dataset, k=k)
    # # u -> v
    for i in adj_lists1_1:
        for j in adj_lists1_1[i]:
            v_list1 = fea_model.feature_part2(i, j)
            mask = [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1]
            counts1 = np.dot(v_list1, mask)  # 构造balance三角形，在后面计算loss时候使用;
            weight_dict[i][j] = counts1

    for i in adj_lists2_1:
        for j in adj_lists2_1[i]:
            v_list1 = fea_model.feature_part2(i, j)
            mask = [0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0]

            counts1 = np.dot(v_list1, mask)
            weight_dict[i][j] = counts1

    adj_lists = adj_lists


    print(len(adj_lists), 'motifs')

    def func(adj_list):
        edges = []
        for a in adj_list:
            for b in adj_list[a]:
                edges.append((a, b))
        edges = np.array(edges)
        adj = sp.csr_matrix((np.ones(len(edges)), (edges[:, 0], edges[:, 1])), shape=(num_nodes, num_nodes))
        return adj

    if args.agg == 'mean':
        aggregator = MeanAggregator
    else:
        aggregator = AttentionAggregator

    adj_lists = list(map(func, adj_lists))
    aggs = [aggregator(features, NODE_FEAT_SIZE, NODE_FEAT_SIZE, num_nodes) for adj in adj_lists]  # 构建motifs下节点的邻居聚合
    enc1 = Encoder(features, NODE_FEAT_SIZE, EMBEDDING_SIZE1, adj_lists, aggs)  # 4个motifs+本身节点 + cls => embedding
    enc1 = enc1.to(DEVICES)


    aggs2 = [aggregator(lambda n: enc1(n), EMBEDDING_SIZE1, EMBEDDING_SIZE1, num_nodes) for _ in adj_lists]
    enc2 = Encoder(lambda n: enc1(n), EMBEDDING_SIZE1, EMBEDDING_SIZE1, adj_lists, aggs2)

    model = SDGNN(enc2)
    model = model.to(DEVICES)

    # print(model.train())
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                        list(model.parameters()) + list(enc1.parameters()) \
                                        + list(features.parameters())),
                                 lr=LEARNING_RATE,
                                 weight_decay=WEIGHT_DECAY
                                 )

    for epoch in range(EPOCHS + 2):
        total_loss = []
        if epoch % INTERVAL_PRINT == 1:
            model.eval()
            all_embedding = np.zeros((NUM_NODE, EMBEDDING_SIZE1))
            for i in range(0, NUM_NODE, BATCH_SIZE):
                begin_index = i
                end_index = i + BATCH_SIZE if i + BATCH_SIZE < NUM_NODE else NUM_NODE
                values = np.arange(begin_index, end_index)
                embed = model.forward(values.tolist())
                embed = embed.data.cpu().numpy()
                all_embedding[begin_index: end_index] = embed

            fpath = os.path.join(OUTPUT_DIR, 'embedding-{}-{}-{}.npy'.format(dataset, k, str(epoch)))
            np.save(fpath, all_embedding)
            pos_ratio, accuracy, f1_score0, f1_score1, f1_score2, auc_score = logistic_embedding(k=k, dataset=dataset,
                                                                                                 epoch=epoch,
                                                                                                dirname=OUTPUT_DIR)
            model.train()

        time1 = time.time()
        nodes_pku = np.random.permutation(NUM_NODE).tolist()
        for batch in range(NUM_NODE // BATCH_SIZE):
            optimizer.zero_grad()
            b_index = batch * BATCH_SIZE
            e_index = (batch + 1) * BATCH_SIZE
            nodes = nodes_pku[b_index:e_index]

            loss = model.criterion(
                nodes, adj_lists1, adj_lists2, adj_lists1_1, adj_lists2_1, weight_dict  # i,j节点的平衡三角形数量
            )
            total_loss.append(loss.data.cpu().numpy())

            loss.backward()
            optimizer.step()
        print(f'epoch: {epoch}, loss: {np.mean(total_loss)}, time: {time.time()-time1}')

In [34]:
def main():
    print('NUM_NODE', NUM_NODE)
    print('WEIGHT_DECAY', WEIGHT_DECAY)
    print('NODE_FEAT_SIZE', NODE_FEAT_SIZE)
    print('EMBEDDING_SIZE1', EMBEDDING_SIZE1)
    print('LEARNING_RATE', LEARNING_RATE)
    print('BATCH_SIZE', BATCH_SIZE)
    print('EPOCHS', EPOCHS)
    print('DROUPOUT', DROUPOUT)
    
    print("--" * 20)
    
    dataset = args.dataset
    run(dataset=dataset, k=K)

In [37]:
main()

NUM_NODE 3783
WEIGHT_DECAY 0.001
NODE_FEAT_SIZE 20
EMBEDDING_SIZE1 20
LEARNING_RATE 0.001
BATCH_SIZE 500
EPOCHS 100
DROUPOUT 0.0
----------------------------------------
1 bitcoin_alpha data load!
4 motifs


  super(Adam, self).__init__(params, defaults)


epoch: 0, loss: 2969.317138671875, time: 17.703672170639038
1 bitcoin_alpha
epoch: 1, loss: 2110.322998046875, time: 17.17906427383423
epoch: 2, loss: 1721.8831787109375, time: 17.487241744995117
3 bitcoin_alpha


KeyboardInterrupt: 