### 使用在二部图中
- 【二部图：共有两类节点；节点之间的关系分为显式和隐式两种：显式关系指两点直接相连，隐式关系指两点通过中间节点相连】

In [4]:
import random
import os
import sys
import math
import networkx as nx
import numpy as np
import itertools
import pandas as pd

from io import open
from os import path
from glob import glob
from random import shuffle
from sklearn import preprocessing, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score,auc,precision_recall_fscore_support

from six.moves import range, zip, zip_longest
from six import iterkeys
from collections import defaultdict, Iterable
from networkx.algorithms import bipartite as bi


from multiprocessing import cpu_count
from multiprocessing import Pool

from itertools import product, permutations
from concurrent.futures import ProcessPoolExecutor

from scipy.io import loadmat
from scipy.sparse import issparse

from argparse import ArgumentParser, FileType, ArgumentDefaultsHelpFormatter
from datasketch import MinHashLSHForest, MinHash, MinHashLSH

### 数据预处理

In [3]:
class DataUtils(object):
    """File helpers for tab-separated ``user<TAB>item<TAB>rating`` data.

    Default file names are resolved relative to ``model_path``.  All files
    are read and written as UTF-8 so the helpers agree with each other.
    """

    def __init__(self, model_path):
        self.model_path = model_path

    def rename(self, datafile):
        """Copy ``datafile`` to ``<model_path>/_ratings.dat`` with prefixed ids.

        Every user id gets a leading "u" and every item id a leading "i" so
        the two node types can be told apart by their first character.

        NOTE(review): the output is named ``_ratings.dat`` while
        ``split_data`` reads ``ratings.dat`` -- confirm which name is intended.

        Raises:
            ValueError: if a line does not have exactly three tab-separated
                fields.
        """
        target = os.path.join(self.model_path, "_ratings.dat")
        with open(target, "w", encoding="UTF-8") as fw, \
                open(datafile, "r", encoding="UTF-8") as fin:
            for line in fin:
                user, item, rating = line.strip().split("\t")
                fw.write("u" + user + "\t" + "i" + item + "\t" + rating + "\n")

    def split_data(self, percent):
        """Split ``ratings.dat`` per user into train and test files.

        For each user, ``int(n_items * percent)`` randomly chosen items go to
        ``ratings_train.dat``; the remaining ones go to ``ratings_test.dat``.

        Args:
            percent: fraction of each user's items used for training.

        Returns:
            ``(test_user, test_item, test_rate)``: the set of users and the
            set of items present in the test split, and a
            ``{user: {item: float(rating)}}`` mapping of the test ratings.
        """
        test_user, test_item, test_rate, rating = set(), set(), {}, {}
        ratings_path = os.path.join(self.model_path, "ratings.dat")
        train_path = os.path.join(self.model_path, "ratings_train.dat")
        test_path = os.path.join(self.model_path, "ratings_test.dat")
        with open(ratings_path, "r", encoding="UTF-8") as fin, \
                open(train_path, "w", encoding="UTF-8") as ftrain, \
                open(test_path, "w", encoding="UTF-8") as ftest:
            for line in fin:
                user, item, rate = line.strip().split("\t")
                rating.setdefault(user, {})[item] = rate

            for u, user_ratings in rating.items():
                # random.sample needs a real sequence (dict views are rejected
                # by current Python versions), hence the explicit list.
                item_list = list(user_ratings)
                train_items = set(
                    random.sample(item_list, int(len(item_list) * percent))
                )
                for item in item_list:
                    row = u + "\t" + item + "\t" + user_ratings[item] + "\n"
                    if item in train_items:
                        ftrain.write(row)
                        continue
                    test_rate.setdefault(u, {})[item] = float(user_ratings[item])
                    test_user.add(u)
                    test_item.add(item)
                    ftest.write(row)

        return test_user, test_item, test_rate

    def read_data(self, filename=None):
        """Read a whitespace-separated ``user item rating`` file.

        Args:
            filename: file to read; defaults to
                ``<model_path>/ratings_test.dat``.

        Returns:
            ``(users, items, rates)``: the sets of user and item ids seen and
            a ``{user: {item: float(rating)}}`` mapping.
        """
        if filename is None:
            filename = os.path.join(self.model_path, "ratings_test.dat")
        users, items, rates = set(), set(), {}
        with open(filename, "r", encoding="UTF-8") as fin:
            for line in fin:
                user, item, rate = line.strip().split()
                rates.setdefault(user, {})[item] = float(rate)
                users.add(user)
                items.add(item)

        return users, items, rates

#### 负采样操作

In [4]:
def construct_lsh(obj_dict):
    """Build two MinHash LSH indexes over the entries of ``obj_dict``.

    Each value of ``obj_dict`` is iterated and every element (a string) is
    fed into a 128-permutation MinHash; the resulting signature is inserted
    under the entry's key into both indexes.

    Returns:
        ``(lsh_0, lsh_5, keys, ms)``: the index created with threshold 0, the
        index created with threshold 0.6, the list of keys and the list of
        MinHash signatures (same order as ``keys``).
    """
    lsh_0 = MinHashLSH(threshold=0, num_perm=128, params=None)
    lsh_5 = MinHashLSH(threshold=0.6, num_perm=128, params=None)
    keys = []
    ms = []
    for key, members in obj_dict.items():
        signature = MinHash(num_perm=128)
        for member in members:
            signature.update(member.encode('utf8'))
        keys.append(key)
        ms.append(signature)
        for index in (lsh_0, lsh_5):
            index.insert(key, signature)
    return lsh_0, lsh_5, keys, ms

In [5]:
def call_get_negs_by_lsh(sample_num, obj_dict):
    """Sample negative candidates for every key of ``obj_dict`` via LSH.

    For each not-yet-handled key, the candidates are all keys minus those
    returned by the threshold-0 index for that key and for every key the
    threshold-0.6 index considers highly similar to it.  Up to ``sample_num``
    candidates are drawn at random and assigned to the key and to all of its
    highly similar keys that were not handled before.

    Args:
        sample_num: maximum number of negatives per key.
        obj_dict: mapping ``key -> iterable of strings`` (see ``construct_lsh``).

    Returns:
        ``{key: [negative keys]}``.  Keys handled in the same round share the
        *same* list object, so in-place edits of one entry affect the others.
    """
    lsh_0, lsh_5, keys, ms = construct_lsh(obj_dict)
    # O(1) lookups instead of list membership tests / list.index per hit.
    position = {key: idx for idx, key in enumerate(keys)}
    all_keys = set(keys)
    visited = set()
    negs_dict = {}
    for i, signature in enumerate(ms):
        if i in visited:
            continue
        visited.add(i)
        record = [i]
        candidates = all_keys - set(lsh_0.query(signature))
        for similar_key in set(lsh_5.query(signature)):
            ind = position[similar_key]
            if ind not in visited:
                visited.add(ind)
                record.append(ind)
            candidates -= set(lsh_0.query(ms[ind]))
        sampled = random.sample(list(candidates), min(sample_num, len(candidates)))
        for j in record:
            negs_dict[keys[j]] = sampled
    return negs_dict

In [6]:
def get_negs_by_lsh(user_dict, item_dict, num_negs):
    """Sample LSH-based negatives for both node types.

    The per-node sample size is ``max(300, int(n * 0.01 * num_negs))`` where
    ``n`` is the number of nodes of the *same* type (the item side previously
    reused the user count by mistake).

    Args:
        user_dict: ``{user: iterable of item ids}``.
        item_dict: ``{item: iterable of user ids}``.
        num_negs: scaling factor for the sample size.

    Returns:
        ``(negs_u, negs_v)``: negatives for users and for items, as returned
        by ``call_get_negs_by_lsh``.
    """
    sample_num_u = max(300, int(len(user_dict) * 0.01 * num_negs))
    sample_num_v = max(300, int(len(item_dict) * 0.01 * num_negs))
    negs_u = call_get_negs_by_lsh(sample_num_u, user_dict)  # negatives among "u" nodes
    negs_v = call_get_negs_by_lsh(sample_num_v, item_dict)  # negatives among "v" nodes
    return negs_u, negs_v

In [7]:
class Graph(defaultdict):
    """Adjacency-list graph stored as ``{node: [neighbour, ...]}``.

    Missing nodes default to an empty list.  When ``isWeight`` is true the
    per-node container is expected to be a ``{neighbour: weight}`` dict
    instead; the random-walk methods and ``make_undirected`` only support
    the list form.
    """

    # Class-level fallbacks kept for backward compatibility; __init__ gives
    # every instance its own ``act`` dict so graphs do not share state.
    act = {}
    isWeight = False

    def __init__(self):
        super(Graph, self).__init__(list)
        self.act = {}

    def setIsWeight(self, isWeight):
        """Switch between list adjacency (False) and dict adjacency (True)."""
        self.isWeight = isWeight

    def initAct(self):
        """Reset the ``act`` value of every node currently in the graph to 0."""
        for node in self.keys():
            self.act[node] = 0

    def nodes(self):
        """Return a view of the graph's nodes."""
        return self.keys()

    def adjacency_iter(self):
        """Return an iterator over ``(node, adjacency)`` pairs."""
        return iter(self.items())

    def subgraph(self, nodes=None):
        """Return the subgraph induced by ``nodes`` (empty when omitted)."""
        subgraph = Graph()
        if nodes is None:
            return subgraph

        node_seq = list(nodes)
        members = set(node_seq)  # O(1) membership for the neighbour filter
        for n in node_seq:
            if n in self:
                subgraph[n] = [x for x in self[n] if x in members]

        return subgraph

    def make_undirected(self):
        """Add the reverse of every edge, then normalise the adjacency lists."""
        # Snapshot the keys: appending to self[other] may create new entries,
        # which would otherwise invalidate the iteration.
        for v in list(self.keys()):
            for other in self[v]:
                if v != other:
                    self[other].append(v)

        self.make_consistent()

        return self

    def make_consistent(self):
        """Sort and de-duplicate every adjacency and drop self loops.

        Weighted graphs are delegated to ``make_consistent_dict``.
        """
        if self.isWeight:
            return self.make_consistent_dict()

        for k in list(self):
            self[k] = sorted(set(self[k]))
        self.remove_self_loops()

        return self

    def sortedDictValues(self, adict):
        """Return the values of ``adict`` as a list ordered by key."""
        return [adict[key] for key in sorted(adict)]

    def make_consistent_dict(self):
        """Order every ``{neighbour: weight}`` adjacency by neighbour and drop self loops.

        The mapping itself is kept (only re-ordered), so neighbour ids stay
        available to ``remove_self_loops`` and to later lookups.
        """
        for k in list(self):
            adjacency = self[k]
            self[k] = {nbr: adjacency[nbr] for nbr in sorted(adjacency)}
        self.remove_self_loops_dict()

        return self

    def remove_self_loops_dict(self):
        """Remove ``node -> node`` entries from dict adjacencies."""
        for x in self:
            self[x].pop(x, None)

        return self

    def remove_self_loops(self):
        """Remove every ``node -> node`` edge, for either adjacency form."""
        if self.isWeight:
            return self.remove_self_loops_dict()

        for x in self:
            adjacency = self[x]
            if x in adjacency:
                # In-place so that existing references to the list stay valid;
                # removes all occurrences, not just the first one.
                adjacency[:] = [nbr for nbr in adjacency if nbr != x]

        return self

    def check_self_loops(self):
        """Return True if any node lists itself as a neighbour."""
        for x in self:
            for y in self[x]:
                if x == y:
                    return True
        return False

    def has_edge(self, v1, v2):
        """Return True if ``v1`` and ``v2`` are adjacent in either direction.

        Unknown nodes are not inserted into the graph by this check.
        """
        return v2 in self.get(v1, ()) or v1 in self.get(v2, ())

    def degree(self, nodes=None):
        """Return the degree of one node, or ``{node: degree}`` for several.

        A string (or any non-iterable value) is treated as a single node id;
        any other iterable is treated as a collection of nodes.  Unknown
        nodes have degree 0 and are not inserted into the graph.
        """
        if isinstance(nodes, (str, bytes)) or not hasattr(nodes, "__iter__"):
            return len(self.get(nodes, ()))
        return {v: len(self.get(v, ())) for v in nodes}

    def order(self):
        "Returns the number of nodes in the graph"
        return len(self)

    def number_of_edges(self):
        "Returns half the sum of all degrees (the edge count of an undirected graph)"
        return sum(len(adjacency) for adjacency in self.values()) // 2

    def number_of_nodes(self):
        "Returns the number of nodes in the graph"
        return self.order()

    def _two_hop_step(self, cur, neighbours, rand):
        """Pick a random neighbour-of-a-neighbour of ``cur`` other than ``cur``.

        Returns None when no such node exists, so callers can stop instead of
        retrying forever.
        """
        reachable = any(
            node != cur for mid in neighbours for node in self.get(mid, ())
        )
        if not reachable:
            return None
        while True:
            hop = self.get(rand.choice(neighbours))
            if not hop:
                continue
            candidate = rand.choice(hop)
            if candidate != cur:
                return candidate

    def random_walk(self, nodes, path_length, alpha=0, rand=random.Random(), start=None):
        """
        Returns a truncated random walk that moves two hops per step.

        nodes: candidates for the start node when ``start`` is None.
        path_length: Length of the random walk.
        alpha: probability of restarts.
        rand: random generator; the default instance is shared between calls.
        start: the start node of the random walk.

        The walk ends early when the current node has no neighbours or no
        two-hop neighbour other than itself.
        """
        # ``is not None`` so that falsy node ids such as 0 are honoured.
        path = [start] if start is not None else [rand.choice(nodes)]

        while len(path) < path_length:
            cur = path[-1]
            neighbours = self.get(cur)
            if not neighbours:
                break
            if rand.random() >= alpha:
                add_node = self._two_hop_step(cur, neighbours, rand)
                if add_node is None:
                    break
                path.append(add_node)
            else:
                path.append(path[0])
        return path

    def random_walk_restart(self, nodes, percentage, alpha=0, rand=random.Random(), start=None):
        """
        Returns a random walk whose length is decided by a stop probability.

        nodes: candidates for the start node when ``start`` is None.
        percentage: probability of stopping walking
        alpha: probability of restarts.
        rand: random generator used for every draw, including the stop test.
        start: the start node of the random walk.
        """
        path = [start] if start is not None else [rand.choice(nodes)]

        while rand.random() > percentage:
            cur = path[-1]
            neighbours = self.get(cur)
            if not neighbours:
                break
            if rand.random() >= alpha:
                if all(nbr == cur for nbr in neighbours):
                    # Only self loops left: nothing to step to.
                    break
                add_node = rand.choice(neighbours)
                while add_node == cur:
                    add_node = rand.choice(neighbours)
                path.append(add_node)
            else:
                path.append(path[0])
        return path

    def random_walk_restart_for_large_bipartite_graph(self, nodes, percentage, alpha=0, rand=random.Random(), start=None):
        """
        Returns a stop-probability random walk over two-hop neighbours.

        nodes: candidates for the start node when ``start`` is None.
        percentage: probability of stopping walking
        alpha: probability of restarts.
        rand: random generator used for every draw, including the stop test.
        start: the start node of the random walk.
        """
        path = [start] if start is not None else [rand.choice(nodes)]

        while rand.random() > percentage:
            cur = path[-1]
            direct = self.get(cur)
            if not direct:
                break
            if rand.random() >= alpha:
                two_hop = set()
                for nei in direct:
                    two_hop.update(self.get(nei, ()))
                if not two_hop:
                    break
                two_hop = list(two_hop)
                add_node = rand.choice(two_hop)
                # If cur is the only candidate the walk stays on cur.
                while add_node == cur and len(two_hop) > 1:
                    add_node = rand.choice(two_hop)
                path.append(add_node)
            else:
                path.append(path[0])
        return path

In [8]:
def calculateAct(self, node):
    """Placeholder: binds the graph to a local name and returns None.

    ``node`` is accepted but not used yet.
    """
    G = self
    return None

- TODO add build_walks in here

In [9]:
def build_deepwalk_corpus(G, num_paths, path_length, alpha=0, rand=random.Random(), node_type="u"):
    """Generate fixed-length walks from every node of one type.

    Args:
        G: graph exposing ``nodes()`` and ``random_walk(...)``.
        num_paths: number of walks started from each selected node.
        path_length: length passed to ``G.random_walk``.
        alpha: restart probability passed to ``G.random_walk``.
        rand: random generator used for every shuffle and passed to the walks;
            the default instance is shared between calls.
        node_type: only nodes whose first character equals this are used.

    Returns:
        The list of walks in shuffled order.
    """
    nodes = [node for node in G.nodes() if node[0] == node_type]

    walks = []
    for _ in range(num_paths):
        rand.shuffle(nodes)
        for node in nodes:
            walks.append(G.random_walk(nodes, path_length, alpha=alpha, rand=rand, start=node))

    # Shuffle with ``rand`` (not the global generator) so that a seeded
    # ``rand`` yields a reproducible corpus.
    rand.shuffle(walks)

    return walks

In [10]:
def build_deepwalk_corpus_random(G, hits_dict, percentage, maxT, minT, alpha=0, rand=random.Random()):
    """Generate stop-probability walks, with a per-node walk count.

    Each node starts ``max(ceil(maxT * hits_dict[node]), minT)`` walks.

    Args:
        G: graph exposing ``nodes()`` and ``random_walk_restart(...)``.
        hits_dict: ``{node: score}`` used to scale the number of walks.
        percentage: stop probability passed to the walk.
        maxT: multiplier applied to the node's score.
        minT: lower bound for the number of walks per node.
        alpha: restart probability passed to the walk.
        rand: random generator used for the walks and the final shuffle;
            the default instance is shared between calls.

    Returns:
        The list of walks in shuffled order.
    """
    walks = []
    nodes = list(G.nodes())
    for node in nodes:
        num_paths = max(int(math.ceil(maxT * hits_dict[node])), minT)
        for _ in range(num_paths):
            walks.append(G.random_walk_restart(nodes, percentage, rand=rand, alpha=alpha, start=node))

    # Shuffle with ``rand`` so a seeded generator gives a reproducible corpus.
    rand.shuffle(walks)
    return walks

In [11]:
def build_deepwalk_corpus_random_for_large_bibartite_graph(G, hits_dict, percentage, maxT, minT, alpha=0, rand = random.Random(), node_type='u'):
    """Generate stop-probability two-hop walks from every node of one type.

    Each selected node starts ``max(ceil(maxT * hits_dict[node]), minT)``
    walks via ``G.random_walk_restart_for_large_bipartite_graph``.

    Args:
        G: graph exposing ``nodes()`` and the walk method named above.
        hits_dict: ``{node: score}`` used to scale the number of walks.
        percentage: stop probability passed to the walk.
        maxT: multiplier applied to the node's score.
        minT: lower bound for the number of walks per node.
        alpha: restart probability passed to the walk.
        rand: random generator used for the walks and the final shuffle;
            the default instance is shared between calls.
        node_type: only nodes whose first character equals this are used.

    Returns:
        The list of walks in shuffled order.
    """
    nodes = [node for node in G.nodes() if node[0] == node_type]

    walks = []
    for node in nodes:
        num_paths = max(int(math.ceil(maxT * hits_dict[node])), minT)
        for _ in range(num_paths):
            walks.append(G.random_walk_restart_for_large_bipartite_graph(nodes, percentage, rand=rand, alpha=alpha, start=node))

    # Shuffle with ``rand`` so a seeded generator gives a reproducible corpus.
    rand.shuffle(walks)

    return walks

In [12]:
def build_deepwalk_corpus_iter(G, num_paths, path_length, alpha=0, rand=random.Random(0)):
    """Lazily yield fixed-length walks, ``num_paths`` per node.

    Args:
        G: graph exposing ``nodes()`` and ``random_walk(...)``.
        num_paths: number of passes over all nodes.
        path_length: length passed to ``G.random_walk``.
        alpha: restart probability passed to ``G.random_walk``.
        rand: random generator; the default is a single instance seeded with
            0 that is shared between calls.

    Yields:
        One walk (as returned by ``G.random_walk``) per node and pass.
    """
    nodes = list(G.nodes())

    for _ in range(num_paths):
        rand.shuffle(nodes)
        for node in nodes:
            # random_walk takes the candidate nodes first, then the length.
            yield G.random_walk(nodes, path_length, rand=rand, alpha=alpha, start=node)

In [13]:
def clique(size):
    """Return the complete graph on nodes ``1..size``.

    One adjacency row is built per node, instead of enumerating every
    permutation of the node list (which grows factorially with ``size``).
    """
    members = list(range(1, size + 1))
    rows = [[node] + [other for other in members if other != node] for node in members]
    return from_adjlist(rows)

In [14]:
# http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python
def grouper(n, iterable, padvalue=None):
    """Split ``iterable`` into consecutive ``n``-tuples, padding the last one.

    grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')
    """
    # n references to the *same* iterator: each output tuple pulls n items.
    shared = iter(iterable)
    columns = [shared] * n
    return zip_longest(*columns, fillvalue=padvalue)

In [15]:
def parse_adjacencylist(f):
    """Parse adjacency-list lines into ``[node, sorted unique neighbours...]`` rows.

    Args:
        f: iterable of text lines.  Falsy entries (e.g. the ``None`` padding
            produced by ``grouper``), lines starting with "#" and blank lines
            are skipped.

    Returns:
        A list of rows of ints: the node id followed by its neighbours in
        ascending order without duplicates.
    """
    adjlist = []
    for l in f:
        if not l or l[0] == "#":
            continue
        introw = [int(x) for x in l.strip().split()]
        if not introw:
            continue
        # De-duplicate first, then sort, so the order is actually kept.
        adjlist.append([introw[0]] + sorted(set(introw[1:])))

    return adjlist

In [16]:
def parse_adjacencylist_unchecked(f):
    """Parse adjacency-list lines into rows of ints, keeping them as written.

    Falsy entries and lines starting with "#" are skipped; neighbours are
    neither sorted nor de-duplicated.
    """
    return [
        [int(token) for token in line.strip().split()]
        for line in f
        if line and line[0] != "#"
    ]

In [17]:
def from_adjlist_unchecked(adjlist):
    """Build a Graph from rows of ``node, neighbour, ...`` without normalising.

    The neighbours of each row are stored exactly as given; a later row for
    the same node replaces the earlier one.
    """
    graph = Graph()
    for row in adjlist:
        graph[row[0]] = row[1:]
    return graph

In [18]:
def from_adjlist(adjlist):
    """Build a Graph from rows of ``node, neighbour, ...``.

    Each node's neighbours are stored sorted and without duplicates; a later
    row for the same node replaces the earlier one.
    """
    graph = Graph()
    for row in adjlist:
        unique_neighbors = set(row[1:])
        graph[row[0]] = sorted(unique_neighbors)
    return graph

In [19]:
def load_adjacencylist(file_, undirected=False, chunksize=10000, unchecked=True):
    """Load a Graph from an adjacency-list file, parsing chunks in worker processes.

    Args:
        file_: path of the adjacency-list file.
        undirected: when true, ``make_undirected`` is applied to the result.
        chunksize: number of lines handed to a worker at a time.
        unchecked: selects the ``*_unchecked`` parser/converter pair instead
            of the normalising one.

    Returns:
        The loaded Graph.
    """
    if unchecked:
        parse_func, convert_func = parse_adjacencylist_unchecked, from_adjlist_unchecked
    else:
        parse_func, convert_func = parse_adjacencylist, from_adjlist

    adjlist = []
    with open(file_) as f:
        with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
            line_chunks = grouper(int(chunksize), f)
            for adj_chunk in executor.map(parse_func, line_chunks):
                adjlist.extend(adj_chunk)

    G = convert_func(adjlist)
    return G.make_undirected() if undirected else G

In [20]:
def load_edgelist(file_, undirected=True):
    """Load a Graph from a UTF-8 edge-list file.

    The first two whitespace-separated fields of every line are taken as the
    edge's endpoints; further fields are ignored.  With ``undirected`` the
    reverse edge is added as well.  The result is passed through
    ``make_consistent`` before it is returned.
    """
    graph = Graph()
    with open(file_, encoding="UTF-8") as f:
        for line in f:
            fields = line.strip().split()
            x, y = fields[:2]
            graph[x].append(y)
            if undirected:
                graph[y].append(x)
    graph.make_consistent()
    return graph

In [21]:
def load_edgelist_from_matrix(matrix, undirected=True):
    """Build a Graph from a ``{node: iterable of neighbours}`` mapping.

    With ``undirected`` the reverse of every edge is added too.  The result
    is passed through ``make_consistent`` before it is returned.
    """
    graph = Graph()
    for x in matrix.keys():
        targets = matrix[x]
        for y in targets:
            graph[x].append(y)
            if undirected:
                graph[y].append(x)
    graph.make_consistent()
    return graph

In [22]:
def load_edgelist_w(file_, undirected=True):
    """Load a weighted Graph from an ``x y w`` edge-list file.

    The first three whitespace-separated fields of every line are read as two
    integer node ids and a float weight.  Adjacencies are stored as
    ``{neighbour: weight}`` dicts; with ``undirected`` the reverse entry is
    stored as well.  The result is passed through ``make_consistent``.
    """
    graph = Graph()
    graph.setIsWeight(True)
    graph.initAct()
    with open(file_) as f:
        for line in f:
            x, y, w = line.strip().split()[:3]
            x, y, w = int(x), int(y), float(w)

            # An empty adjacency (including the default empty list) is
            # replaced by a dict before the weight is stored.
            for node in (x, y):
                if len(graph[node]) == 0:
                    graph[node] = {}
            graph[x][y] = w

            if undirected:
                graph[y][x] = w

    graph.make_consistent()

    return graph

In [23]:
def load_matfile(file_, variable_name="network", undirected=True):
    """Load a Graph from the variable ``variable_name`` of a MATLAB file.

    The variable is handed to ``from_numpy`` together with ``undirected``.
    """
    contents = loadmat(file_)
    return from_numpy(contents[variable_name], undirected)

In [24]:
def from_networkx(G_input, undirected=True):
    """Convert a NetworkX graph into a Graph.

    Every neighbour listed by ``G_input[x]`` becomes an entry in ``x``'s
    adjacency list; nodes without neighbours are not added.  With
    ``undirected`` the result is passed through ``make_undirected``.
    """
    G = Graph()

    # ``nodes()`` is available on old and current NetworkX releases, whereas
    # ``nodes_iter()`` no longer exists in current ones.
    for x in G_input.nodes():
        for y in G_input[x]:
            G[x].append(y)

    if undirected:
        G.make_undirected()

    return G

In [25]:
def from_numpy(x, undirected=True):
    """Build a Graph from a SciPy sparse matrix.

    Every stored entry ``(i, j)`` becomes an edge ``i -> j``; the stored
    values are ignored.  With ``undirected`` the reverse edges are added.
    The result is passed through ``make_consistent``.

    Raises:
        Exception: if ``x`` is not a sparse matrix.
    """
    if not issparse(x):
        raise Exception("Dense matrices not yet supported.")

    graph = Graph()
    coo = x.tocoo()
    for i, j in zip(coo.row, coo.col):
        graph[i].append(j)

    if undirected:
        graph.make_undirected()

    graph.make_consistent()
    return graph

In [26]:
class GraphUtils(object):
    """Builds the bipartite training graph and derives walks, contexts and negatives.

    Node ids are expected to be strings whose first character is "u" for
    users and "i" for items; several methods select nodes by that character.
    """

    def __init__(self, model_path):
        self.model_path = model_path
        self.G = nx.Graph()
        self.edge_dict_u = {}
        self.edge_dict_v = {}
        self.edge_list = []
        self.node_u = []
        self.node_v = []
        self.authority_u, self.authority_v = {}, {}
        self.walks_u, self.walks_v = [], []
        self.G_u, self.G_v = None, None
        self.fw_u = os.path.join(self.model_path, "homogeneous_u.dat")
        self.fw_v = os.path.join(self.model_path, "homogeneous_v.dat")
        self.negs_u = {}
        self.negs_v = {}
        self.context_u = {}
        self.context_v = {}

    def construct_training_graph(self, filename=None):
        """Read ``user<TAB>item<TAB>rating`` lines and build the bipartite graph.

        Fills ``edge_dict_u``/``edge_dict_v`` (``{node: {neighbour: rating}}``),
        ``node_u``/``node_v`` (lists of node ids), ``edge_list`` (the
        user->item triples) and the NetworkX graph ``self.G``.

        Args:
            filename: file to read; defaults to
                ``<model_path>/ratings_train.dat``.
        """
        if filename is None:
            filename = os.path.join(self.model_path, "ratings_train.dat")
        edge_list_u_v = []
        edge_list_v_u = []
        with open(filename, encoding="UTF-8") as fin:
            for line in fin:
                user, item, rating = line.strip().split("\t")
                weight = float(rating)
                self.edge_dict_u.setdefault(user, {})[item] = weight
                self.edge_dict_v.setdefault(item, {})[user] = weight
                edge_list_u_v.append((user, item, weight))
                edge_list_v_u.append((item, user, weight))
        # Real lists (not dict views) so that they can be indexed and pass the
        # list checks made further down the pipeline.
        self.node_u = list(self.edge_dict_u)
        self.node_v = list(self.edge_dict_v)
        self.G.add_nodes_from(self.node_u, bipartite=0)
        self.G.add_nodes_from(self.node_v, bipartite=1)
        self.G.add_weighted_edges_from(edge_list_u_v + edge_list_v_u)
        self.edge_list = edge_list_u_v

    def calculate_centrality(self, mode="hits"):
        """Compute min-max normalised centrality scores per node type.

        Args:
            mode: "degree_centrality" uses ``nx.degree_centrality``; any other
                value uses the authority scores of ``nx.hits``.

        The scores of "u" nodes and of "i" nodes are normalised separately to
        [0, 1] and stored in ``authority_u`` / ``authority_v``.  When all
        scores of a type are equal, every node of that type gets 0.
        """
        if mode == "degree_centrality":
            a = nx.degree_centrality(self.G)
        else:
            _, a = nx.hits(self.G)  # (hubs, authorities)

        for prefix, target in (("u", self.authority_u), ("i", self.authority_v)):
            scores = {node: float(a[node]) for node in self.G.nodes() if node[0] == prefix}
            if not scores:
                continue
            low, high = min(scores.values()), max(scores.values())
            span = high - low
            for node, score in scores.items():
                target[node] = (score - low) / span if span != 0 else 0

    def _biadjacency(self):
        """Return ``(A, A^T, index->u node, index->v node)`` for ``self.G``.

        ``A`` is the weighted CSR biadjacency matrix with rows ordered like
        ``node_u`` and columns ordered like ``node_v``.
        """
        # The builtin ``float`` is passed because the ``np.float`` alias no
        # longer exists in current NumPy releases.
        A = bi.biadjacency_matrix(self.G, self.node_u, self.node_v, dtype=float, weight='weight', format='csr')
        index_row = dict(enumerate(self.node_u))
        index_item = dict(enumerate(self.node_v))
        return A, A.transpose(), index_row, index_item

    def homogeneous_graph_random_walks(self, percentage, maxT, minT):
        """Project the bipartite graph onto each node type via files, then walk.

        ``A * A^T`` and ``A^T * A`` are written to ``fw_u`` / ``fw_v`` and read
        back as graphs; the resulting graphs and walks are stored in
        ``G_u``/``walks_u`` and ``G_v``/``walks_v``.
        """
        A, AT, index_row, index_item = self._biadjacency()
        self.save_homogenous_graph_to_file(A.dot(AT), self.fw_u, index_row, index_row)
        self.save_homogenous_graph_to_file(AT.dot(A), self.fw_v, index_item, index_item)
        self.G_u, self.walks_u = self.get_random_walks_restart(self.fw_u, self.authority_u, percentage=percentage, maxT=maxT, minT=minT)
        self.G_v, self.walks_v = self.get_random_walks_restart(self.fw_v, self.authority_v, percentage=percentage, maxT=maxT, minT=minT)

    def get_random_walks_restart(self, datafile, hits_dict, percentage, maxT, minT):
        """Load the edge list in ``datafile`` and generate walks on it.

        Args:
            datafile: edge-list file; defaults to
                ``<model_path>/ratings_train.dat``.
            hits_dict: ``{node: score}`` scaling the number of walks per node.
            percentage, maxT, minT: forwarded to ``build_deepwalk_corpus_random``.

        Returns:
            ``(G, walks)``.
        """
        if datafile is None:
            datafile = os.path.join(self.model_path, "ratings_train.dat")
        G = load_edgelist(datafile, undirected=True)
        print("number of nodes: {}".format(len(G.nodes())))
        print("walking...")
        walks = build_deepwalk_corpus_random(G, hits_dict, percentage=percentage, maxT = maxT, minT = minT, alpha=0)
        print("walking...ok")
        return G, walks

    def homogeneous_graph_random_walks_for_large_bipartite_graph(self, percentage, maxT, minT):
        """Like ``homogeneous_graph_random_walks`` but keeps the projections in memory."""
        A, AT, index_row, index_item = self._biadjacency()
        matrix_u = self.get_homogenous_graph(A.dot(AT), self.fw_u, index_row, index_row)
        matrix_v = self.get_homogenous_graph(AT.dot(A), self.fw_v, index_item, index_item)
        self.G_u, self.walks_u = self.get_random_walks_restart_for_large_bipartite_graph(matrix_u, self.authority_u, percentage=percentage, maxT=maxT, minT=minT)
        self.G_v, self.walks_v = self.get_random_walks_restart_for_large_bipartite_graph(matrix_v, self.authority_v, percentage=percentage, maxT=maxT, minT=minT)

    def homogeneous_graph_random_walks_for_large_bipartite_graph_without_generating(self, datafile, percentage, maxT, minT):
        """Walk directly on the bipartite edge list, once per node type."""
        self.G_u, self.walks_u = self.get_random_walks_restart_for_large_bipartite_graph_without_generating(datafile, self.authority_u, percentage=percentage, maxT=maxT, minT=minT, node_type='u')
        self.G_v, self.walks_v = self.get_random_walks_restart_for_large_bipartite_graph_without_generating(datafile, self.authority_v, percentage=percentage, maxT=maxT, minT=minT,node_type='i')

    def get_random_walks_restart_for_large_bipartite_graph(self, matrix, hits_dict, percentage, maxT, minT):
        """Build a graph from ``{node: [neighbours]}`` and generate walks on it.

        Returns:
            ``(G, walks)``.
        """
        G = load_edgelist_from_matrix(matrix, undirected=True)
        print("number of nodes: {}".format(len(G.nodes())))
        print("walking...")
        walks = build_deepwalk_corpus_random(G, hits_dict, percentage=percentage, maxT = maxT, minT = minT, alpha=0)
        print("walking...ok")
        return G, walks

    def get_random_walks_restart_for_large_bipartite_graph_without_generating(self, datafile, hits_dict, percentage, maxT, minT, node_type='u'):
        """Load the bipartite edge list and walk from nodes of ``node_type``.

        Args:
            datafile: edge-list file; defaults to
                ``<model_path>/ratings_train.dat``.
            node_type: first character of the node ids to start walks from.

        Returns:
            ``(G, walks)``.
        """
        if datafile is None:
            datafile = os.path.join(self.model_path, "ratings_train.dat")
        G = load_edgelist(datafile, undirected=True)
        cnt = sum(1 for n in G.nodes() if n[0] == node_type)
        print("number of nodes: {}".format(cnt))
        print("walking...")
        walks = build_deepwalk_corpus_random_for_large_bibartite_graph(G, hits_dict, percentage=percentage, maxT = maxT, minT = minT, alpha=0,node_type=node_type)
        print("walking...ok")
        return G, walks

    def save_words_and_sentences_to_file(self, filenodes, filesentences):
        """Write the graph's nodes (one per line) and walks (space-separated).

        NOTE(review): the class never sets ``self.walks``; when that attribute
        is absent the user walks followed by the item walks are written --
        confirm that this is the intended corpus.
        """
        with open(filenodes, "w") as fw:
            for node in self.G.nodes():
                fw.write(node + "\n")

        walks = getattr(self, "walks", None)
        if walks is None:
            walks = list(self.walks_u) + list(self.walks_v)

        with open(filesentences, "w") as fs:
            for nodes in walks:
                if nodes:
                    fs.write(" ".join(nodes) + "\n")

    def get_negs(self, num_negs):
        """Compute and store LSH-based negatives for both node types."""
        self.negs_u, self.negs_v = get_negs_by_lsh(self.edge_dict_u, self.edge_dict_v, num_negs)
        return self.negs_u, self.negs_v

    def get_context_and_fnegatives(self, G, walks, win_size, num_negs, table):
        """Collect window contexts and table-drawn negatives for every walk position.

        Args:
            G: a Graph or any iterable of nodes; only its node count is used.
            walks: list of walks (lists of nodes).
            win_size: number of positions taken on each side of the centre.
            num_negs: negatives drawn per position.
            table: indexable; entries ``0 .. node count - 1`` are candidates.

        Returns:
            ``(context_dict, new_neg_dict)``, both ``{centre: [list, ...]}``
            with one inner list per occurrence of the centre.  The context
            list here includes the centre node itself.

        NOTE(review): the negative draw retries until it finds an entry that
        is not in the window, so it does not terminate if every candidate in
        ``table`` is part of the window.
        """
        node_count = len(list(G))  # iterating a Graph yields its nodes
        print("context...")
        context_dict = {}
        new_neg_dict = {}
        for walk in walks:
            for pos, center in enumerate(walk):
                start = max(0, pos - win_size)
                end = min(len(walk), pos + win_size + 1)
                labels_list = list(walk[start:end])
                neg_sample = []
                while len(neg_sample) < num_negs:
                    candidate = table[random.choice(range(node_count))]
                    if candidate in labels_list:
                        continue
                    neg_sample.append(candidate)
                context_dict.setdefault(center, []).append(labels_list)
                new_neg_dict.setdefault(center, []).append(neg_sample)
        print("context...ok")
        return context_dict, new_neg_dict

    def get_context_and_negatives(self, G, walks, win_size, num_negs, negs_dict):
        """Collect window contexts and sampled negatives for every walk position.

        Args:
            G: unused; kept for interface compatibility (any value is accepted).
            walks: list of walks (lists of nodes).
            win_size: number of positions taken on each side of the centre.
            num_negs: maximum number of negatives sampled per position.
            negs_dict: ``{node: list of candidate negatives}``.  The lists are
                edited in place: every node seen in a centre's window is
                removed from that centre's candidates.

        Returns:
            ``(context_dict, new_neg_dict)``, both ``{centre: [list, ...]}``
            with one inner list per occurrence of the centre.  The context
            list excludes the centre node itself.
        """
        print("context...")
        context_dict = {}
        new_neg_dict = {}
        for walk in walks:
            for pos, center in enumerate(walk):
                start = max(0, pos - win_size)
                end = min(len(walk), pos + win_size + 1)
                negs = negs_dict[center]
                labels_list = []
                for node in walk[start:end]:
                    if node in negs:
                        negs.remove(node)  # a context node must not be a negative
                    if node != center:
                        labels_list.append(node)
                neg_sample = random.sample(negs, min(num_negs, len(negs)))
                context_dict.setdefault(center, []).append(labels_list)
                new_neg_dict.setdefault(center, []).append(neg_sample)
        print("context...ok")
        return context_dict, new_neg_dict

    def save_homogenous_graph_to_file(self, A, datafile, index_row, index_item):
        """Write the stored entries of sparse matrix ``A`` as a UTF-8 edge list.

        Each line is ``row node<TAB>column node<TAB>value``; ``index_row`` and
        ``index_item`` map matrix indices to node ids.
        """
        A = A.tocsr()
        data, indptr, indices = A.data, A.indptr, A.indices
        with open(datafile, 'w', encoding="UTF-8") as fw:
            for row in range(A.shape[0]):
                for pos in range(indptr[row], indptr[row + 1]):
                    fw.write(index_row.get(row) + "\t" + index_item.get(indices[pos]) + "\t" + str(data[pos]) + "\n")

    def get_homogenous_graph(self, A, datafile, index_row, index_item):
        """Return the stored entries of sparse matrix ``A`` as ``{node: [nodes]}``.

        ``datafile`` is accepted for interface compatibility but is not
        touched (the file used to be truncated without anything being
        written to it).
        """
        A = A.tocsr()
        indptr, indices = A.indptr, A.indices
        matrix = {}
        for row in range(A.shape[0]):
            r = index_row.get(row)
            for pos in range(indptr[row], indptr[row + 1]):
                matrix.setdefault(r, []).append(index_item.get(indices[pos]))

        return matrix

    def read_sentences_and_homogeneous_graph(self, filesentences=None, datafile=None):
        """Load a graph from ``datafile`` and space-separated walks from ``filesentences``.

        Returns:
            ``(G, walks)``.
        """
        G = load_edgelist(datafile, undirected=True)
        walks = []
        with open(filesentences, "r") as fin:
            for line in fin:
                walks.append(line.strip().split(" "))
        return G, walks

In [27]:
def init_embedding_vectors(node_u, node_v, node_list_u, node_list_v, args):
    """Fill the two node tables with random, L2-normalised vectors.

    For every node in ``node_u`` (users) and then every node in ``node_v``
    (items), the matching table receives a dict with two ``(1, args.d)``
    arrays: ``'embedding_vectors'`` and ``'context_vectors'``.

    Args:
        node_u: iterable of user node names.
        node_v: iterable of item node names.
        node_list_u: dict filled in place for the user nodes.
        node_list_v: dict filled in place for the item nodes.
        args: object whose ``d`` attribute is the embedding size.
    """
    # Users are processed before items, and for each node the embedding is
    # drawn before the context vector, so the random stream is consumed in
    # a fixed order.
    for names, table in ((node_u, node_list_u), (node_v, node_list_v)):
        for name in names:
            raw_embedding = np.random.random([1, args.d])
            raw_context = np.random.random([1, args.d])
            table[name] = {
                'embedding_vectors': preprocessing.normalize(raw_embedding, norm='l2'),
                'context_vectors': preprocessing.normalize(raw_context, norm='l2'),
            }

In [28]:
def walk_generator(gul, args):
    """Compute node centrality on ``gul`` and run the selected random-walk routine.

    ``args.large`` picks the routine: 0 -> ``homogeneous_graph_random_walks``,
    1 -> the ``..._for_large_bipartite_graph`` variant, 2 -> the
    ``..._without_generating`` variant (which also receives
    ``args.train_data``). Any other value runs no walk routine.

    Args:
        gul: graph helper exposing ``calculate_centrality`` and the walk methods.
        args: namespace with ``mode``, ``large``, ``p`` (stop probability),
            ``maxT`` / ``minT`` (walk bounds) and ``train_data``.

    Returns:
        The same ``gul`` object.
    """
    gul.calculate_centrality(args.mode)  # centrality measure chosen by args.mode (e.g. HITS)

    variant = args.large
    if variant == 0:
        gul.homogeneous_graph_random_walks(
            percentage=args.p, maxT=args.maxT, minT=args.minT)
        return gul
    if variant == 1:
        gul.homogeneous_graph_random_walks_for_large_bipartite_graph(
            percentage=args.p, maxT=args.maxT, minT=args.minT)
        return gul
    if variant == 2:
        gul.homogeneous_graph_random_walks_for_large_bipartite_graph_without_generating(
            datafile=args.train_data, percentage=args.p, maxT=args.maxT, minT=args.minT)
    return gul

In [29]:
def get_context_and_negative_samples(gul, args):
    """Collect context windows and negative samples for both node types.

    Negative candidates come from ``gul.get_negs(args.ns)``. They are then
    passed, together with the walks, to ``gul.get_context_and_negatives``.
    When ``args.large == 0`` the first argument of that call is the graph
    (``gul.G_u`` / ``gul.G_v``); otherwise it is the node collection
    (``gul.node_u`` / ``gul.node_v``).

    Returns:
        ``(context_dict_u, neg_dict_u, context_dict_v, neg_dict_v,
        gul.node_u, gul.node_v)``.
    """
    from_graph = args.large == 0

    neg_dict_u, neg_dict_v = gul.get_negs(args.ns)  # negative-sampling candidates
    print("negative samples is ok...")

    source_u = gul.G_u if from_graph else gul.node_u
    context_dict_u, neg_dict_u = gul.get_context_and_negatives(
        source_u, gul.walks_u, args.ws, args.ns, neg_dict_u)

    source_v = gul.G_v if from_graph else gul.node_v
    context_dict_v, neg_dict_v = gul.get_context_and_negatives(
        source_v, gul.walks_v, args.ws, args.ns, neg_dict_v)

    # contexts and negatives for type u, then for type v, then both node sets
    return context_dict_u, neg_dict_u, context_dict_v, neg_dict_v, gul.node_u, gul.node_v

In [30]:
def skip_gram(center, contexts, negs, node_list, lam, pa):
    """Run one skip-gram step with negative sampling for a (center, context) pair.

    The context node is labelled 1 and every node in ``negs`` is labelled 0
    (a context node that also appears in ``negs`` ends up labelled 0). For
    each labelled node that exists in ``node_list`` its context vector is
    updated in place, and the gradient with respect to the center node's
    embedding is accumulated and returned. Labelled nodes missing from
    ``node_list`` are skipped.

    Args:
        center: name of the center node; its ``'embedding_vectors'`` are read.
        contexts: name of the positive context node.
        negs: iterable of negative-sample node names.
        node_list: dict ``name -> {'embedding_vectors', 'context_vectors'}``.
        lam: learning rate.
        pa: trade-off weight multiplied into both the update and the loss.

    Returns:
        ``(update, loss)``: ``update`` is a ``(1, d)`` array to add to the
        center node's embedding, ``loss`` is the weighted log-likelihood
        summed over the labelled nodes.
    """
    # Indicator function: 1 for the observed context, 0 for the negatives.
    I_z = {contexts: 1}
    for node in negs:
        I_z[node] = 0

    V = np.array(node_list[center]["embedding_vectors"])  # copy of the center node's embedding
    update = np.zeros((1, V.size))
    loss = 0

    for u, label in I_z.items():
        entry = node_list.get(u)
        if entry is None:
            # Unknown node: nothing to update (previously fell through to a KeyError).
            continue
        Theta = np.array(entry["context_vectors"])  # copy, so the update below does not alias it
        # .item() extracts the scalar; float() on an ndim > 0 array is deprecated in NumPy.
        X = V.dot(Theta.T).item()
        sigmod = 1.0 / (1 + math.exp(-X))
        step = pa * lam * (label - sigmod)
        update += step * Theta
        entry["context_vectors"] += step * V
        try:
            loss += pa * (label * math.log(sigmod) + (1 - label) * math.log(1 - sigmod))  # cross-entropy
        except ValueError:
            # The sigmoid saturated to exactly 0.0 or 1.0, so log(0) is undefined;
            # this term is left out of the loss.
            pass

    return update, loss

In [31]:
def KL_divergence(edge_dict_u, u, v, node_list_u, node_list_v, lam, gamma):
    """Compute the explicit-relation update for one observed edge ``(u, v)``.

    The edge weight ``e_ij = edge_dict_u[u][v]`` scales a step that pulls the
    two embeddings together: with ``s = sigmoid(U . V)``, both updates are
    ``gamma * lam * e_ij * (1 - s)`` times the *other* node's embedding.

    Args:
        edge_dict_u: nested dict ``u -> v -> weight``.
        u, v: names of the two end points.
        node_list_u, node_list_v: dicts ``name -> {'embedding_vectors': ...}``.
        lam: learning rate.
        gamma: trade-off weight of this term.

    Returns:
        ``(update_u, update_v, loss)`` where the updates are arrays to add to
        the respective embeddings and ``loss`` is
        ``gamma * e_ij * log(s)`` (0 if the logarithm is undefined).
    """
    e_ij = edge_dict_u[u][v]
    U = np.array(node_list_u[u]['embedding_vectors'])
    V = np.array(node_list_v[v]['embedding_vectors'])

    # Inner product of the two embeddings. The previous `U.dot(V, T)` referenced
    # an undefined name `T`; `V.T` is what was meant.
    X = U.dot(V.T).item()
    sigmod = 1.0 / (1 + math.exp(-X))

    # The former `/ math.log(math.e, math.e)` factor was a division by 1 and is dropped.
    scale = gamma * lam * (e_ij * (1 - sigmod))
    update_u = scale * V
    update_v = scale * U

    try:
        loss = gamma * e_ij * math.log(sigmod)
    except ValueError:
        # sigmoid underflowed to exactly 0.0; leave this edge out of the loss
        loss = 0

    return update_u, update_v, loss

In [32]:
def precision_and_recall(ranked_list, ground_list):
    """Return ``(precision, recall)`` of a ranked list against the ground truth.

    Precision is the number of hits divided by ``len(ranked_list)``; recall
    is the number of hits divided by ``len(ground_list)``. A metric whose
    denominator is empty is reported as 0.0 instead of raising
    ``ZeroDivisionError``.
    """
    hits = sum(1 for candidate in ranked_list if candidate in ground_list)
    n_ranked = len(ranked_list)
    n_truth = len(ground_list)
    pre = hits / (1.0 * n_ranked) if n_ranked else 0.0
    rec = hits / (1.0 * n_truth) if n_truth else 0.0
    return pre, rec

In [33]:
def AP(ranked_list, ground_truth):
    """Return the average precision of ``ranked_list`` against ``ground_truth``.

    The precision at every rank holding a relevant item is summed and the
    sum is divided by ``len(ground_truth)``. Returns 0.0 when there is no hit.
    """
    found = 0
    precision_total = 0.0
    for position, candidate in enumerate(ranked_list, start=1):
        if candidate not in ground_truth:
            continue
        found += 1
        precision_total += found / float(position)

    if not found:
        return 0.0
    return precision_total / len(ground_truth)

In [34]:
def RR(ranked_list, ground_list):
    """Return the reciprocal rank of the first relevant item, or 0 if none is found."""
    first_hit = next(
        (position for position, candidate in enumerate(ranked_list, start=1)
         if candidate in ground_list),
        None,
    )
    if first_hit is None:
        return 0
    return 1 / float(first_hit)

In [35]:
def IDCG(n):
    """Return the ideal DCG for ``n`` relevant items: the sum of 1 / log2(rank + 1) over ranks 1..n."""
    return sum(1 / math.log(position + 2, 2) for position in range(n))

In [36]:
def nDCG(ranked_list, ground_truth):
    """Return the normalised DCG of ``ranked_list`` with binary relevance.

    Every ranked item found in ``ground_truth`` contributes
    ``1 / log2(rank + 1)`` (rank is 1-based). The total is divided by
    ``IDCG(len(ground_truth))``. An empty ground truth yields 0.0 instead of
    a ``ZeroDivisionError``.
    """
    idcg = IDCG(len(ground_truth))
    if idcg == 0:
        # No relevant items: the ideal DCG is zero, so the ratio is undefined.
        return 0.0

    dcg = 0
    for rank, candidate in enumerate(ranked_list, start=1):
        if candidate in ground_truth:
            dcg += 1 / math.log(rank + 1, 2)
    return dcg / idcg

In [37]:
def top_N(test_u, test_v, test_rate, node_list_u, node_list_v, top_n):
    """Score every (user, item) test pair and return averaged ranking metrics.

    A pair is scored by the inner product of the two embeddings; a pair whose
    user or item has no embedding scores 0. For each user the ``top_n``
    best-scored items are compared with that user's ``top_n`` best-rated
    items from ``test_rate``.

    Args:
        test_u: collection of test user names.
        test_v: collection of test item names.
        test_rate: nested dict ``user -> item -> rating``.
        node_list_u, node_list_v: dicts ``name -> {'embedding_vectors': (1, d) array}``.
        top_n: length of the recommendation list.

    Returns:
        ``(f1, map, mrr, mndcg)``: F1 of the mean precision and mean recall,
        mean average precision, mean reciprocal rank, mean nDCG. All four
        are 0.0 when ``test_u`` is empty; F1 is 0.0 when precision and
        recall are both zero.
    """
    def by_value(pair):
        return pair[1]

    def mean(values):
        return sum(values) / len(values)

    recommend_dict = {}
    for u in test_u:
        entry_u = node_list_u.get(u)
        # The user embedding does not depend on the item, so fetch it once per user.
        U = None if entry_u is None else np.array(entry_u["embedding_vectors"])
        scores = {}
        for v in test_v:
            entry_v = node_list_v.get(v)
            if U is None or entry_v is None:
                scores[v] = 0.0
            else:
                V = np.array(entry_v["embedding_vectors"])
                scores[v] = float(U.dot(V.T)[0][0])
        recommend_dict[u] = scores

    precision_list = []
    recall_list = []
    ap_list = []
    ndcg_list = []
    rr_list = []

    for u in test_u:
        # Python 3's sorted() takes no positional comparison function and has
        # no cmp(); sort on the score/rating with key= instead.
        ranked = sorted(recommend_dict[u].items(), key=by_value, reverse=True)[:top_n]
        truth = sorted(test_rate[u].items(), key=by_value, reverse=True)[:top_n]
        tmp_r_list = [item for item, _ in ranked]
        tmp_t_list = [item for item, _ in truth]

        pre, rec = precision_and_recall(tmp_r_list, tmp_t_list)
        precision_list.append(pre)
        recall_list.append(rec)
        ap_list.append(AP(tmp_r_list, tmp_t_list))
        rr_list.append(RR(tmp_r_list, tmp_t_list))
        ndcg_list.append(nDCG(tmp_r_list, tmp_t_list))

    if not precision_list:
        # No test users: nothing to average.
        return 0.0, 0.0, 0.0, 0.0

    precision = mean(precision_list)
    recall = mean(recall_list)
    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    return f1, mean(ap_list), mean(rr_list), mean(ndcg_list)

In [38]:
def generateFeatureFile(filecase, filevector_u, filevector_v, fileout, factors):
    """Join labelled (u, v) cases with their embedding vectors into a feature file.

    Args:
        filecase: tab-separated input; column 0 is the u node, column 1 the
            v node, the last column the label.
        filevector_u: space-separated file, node name followed by its vector.
        filevector_v: same format for the v nodes.
        fileout: output path; each line is the u vector, the v vector and a
            0/1 label, all tab-separated.
        factors: vector length used to build an all-'0' vector for a node
            that has no entry in the vector files.
    """
    def read_vectors(path):
        # name -> list of the remaining tokens on the line (kept as strings)
        table = {}
        with open(path, "r") as handle:
            for row in handle:
                tokens = row.strip().split(" ")
                table[tokens[0]] = tokens[1:]
        return table

    vectors_u = read_vectors(filevector_u)
    vectors_v = read_vectors(filevector_v)

    with open(filecase, 'r') as fc, open(fileout, 'w') as fo:
        for row in fc:
            fields = row.strip().split("\t")
            # Unknown nodes get (and keep) an all-zero placeholder vector.
            feat_u = vectors_u.setdefault(fields[0], ['0'] * factors)
            feat_v = vectors_v.setdefault(fields[1], ['0'] * factors)
            # Only the exact string '1' counts as a positive label.
            label = 1 if fields[-1] == '1' else 0
            fo.write('{}\t{}\t{}\n'.format('\t'.join(feat_u), '\t'.join(feat_v), label))

In [39]:
def link_prediction(args):
    """Evaluate the saved embeddings on link prediction with logistic regression.

    Feature files for the train and test cases are generated with
    ``generateFeatureFile`` into a temporary directory, an L2-regularised
    logistic regression (``C=0.001``) is fitted on the train features and
    scored on the test features.

    Args:
        args: namespace providing ``case_train``, ``case_test``,
            ``vectors_u``, ``vectors_v`` and ``d`` (embedding size).

    Returns:
        ``(auc_roc, average_precision)`` on the test cases.
    """
    import tempfile

    # The feature files are scratch data: keep them in a temporary directory
    # (instead of a machine-specific absolute path) that is removed even when
    # a later step raises.
    with tempfile.TemporaryDirectory() as work_dir:
        features_train = os.path.join(work_dir, 'features_train.dat')
        features_test = os.path.join(work_dir, 'features_test.dat')
        generateFeatureFile(args.case_train, args.vectors_u, args.vectors_v, features_train, args.d)
        generateFeatureFile(args.case_test, args.vectors_u, args.vectors_v, features_test, args.d)
        df_data_train = pd.read_csv(features_train, header=None, sep='\t', encoding='utf-8')
        df_data_test = pd.read_csv(features_test, header=None, sep='\t', encoding='utf-8')

    # With header=None the columns are 0..k-1 and the last one is the label.
    label_train = len(df_data_train.keys()) - 1
    X_train = df_data_train.drop(label_train, axis=1)
    y_train = df_data_train[label_train]

    # Use the test frame's own column count (it was taken from the train frame before).
    label_test = len(df_data_test.keys()) - 1
    X_test = df_data_test.drop(label_test, axis=1)
    X_test = X_test.fillna(X_test.mean())
    y_test = df_data_test[label_test]

    lg = LogisticRegression(penalty='l2', C=0.001)
    lg.fit(X_train, y_train)
    lg_y_pred_est = lg.predict_proba(X_test)[:, 1]  # probability of the positive class
    fpr, tpr, _ = metrics.roc_curve(y_test, lg_y_pred_est)
    average_precision = average_precision_score(y_test, lg_y_pred_est)

    return metrics.auc(fpr, tpr), average_precision

In [48]:
def train_by_sampling(args):
    """Train the embeddings, walking through each node's context windows in order.

    Steps: build the training graph, generate random walks, derive context
    windows and negative samples, initialise the vectors, then run up to
    ``args.max_iter`` epochs over the shuffled edge list. For every edge
    ``(u, v)`` one context window of ``u`` and one of ``v`` are trained with
    ``skip_gram`` (while unused windows remain for that node in the epoch),
    followed by the explicit-edge update from ``KL_divergence``. Training
    stops early when the loss changes by less than 1e-3 between epochs.
    The vectors are then saved, and the recommendation / link-prediction
    metrics are printed when ``args.rec`` / ``args.lip`` are set.

    Args:
        args: namespace with the hyper-parameters and file paths used below.
    """
    # NOTE(review): the output root is a machine-specific absolute path.
    model_path = os.path.join(r'C:\Users\sss\Desktop\BiNE/', args.model_name)
    if os.path.exists(model_path) is False:
        os.makedirs(model_path)

    alpha, beta, gamma, lam = args.alpha, args.beta, args.gamma, args.lam
    print('======== experiment settings =========')
    print('alpha : %0.4f, beta : %0.4f, gamma : %0.4f, lam : %0.4f, p : %0.4f, ws : %d, ns : %d, maxT : % d, minT : %d, max_iter : %d, d : %d' % (alpha, beta, gamma, lam, args.p, args.ws, args.ns,args.maxT,args.minT,args.max_iter, args.d))

    print('========== processing data ===========')
    dul = DataUtils(model_path)
    if args.rec:
        test_user, test_item, test_rate = dul.read_data(args.test_data)

    print("constructing graph....")
    gul = GraphUtils(model_path)
    gul.construct_training_graph(args.train_data)
    edge_dict_u = gul.edge_dict_u  # edges as a nested dict u -> v -> weight
    edge_list = gul.edge_list  # edges as a list of (u, v, weight) triples
    walk_generator(gul, args)  # generate the random walks

    print("getting context and negative samples....")
    context_dict_u, neg_dict_u, context_dict_v, neg_dict_v, node_u, node_v = get_context_and_negative_samples(gul, args)
    node_list_u, node_list_v = {}, {}
    init_embedding_vectors(node_u, node_v, node_list_u, node_list_v, args)  # random initial vectors
    last_loss, epsilon = 0, 1e-3
    # Denominator of the progress percentage; clamped so max_iter == 1 does
    # not divide by zero.
    progress_span = max(args.max_iter - 1, 1)

    print("============== training ==============")
    for epoch in range(0, args.max_iter):
        loss = 0
        # Per-epoch cursor into each node's list of context windows.
        visited_u = dict.fromkeys(node_list_u, 0)
        visited_v = dict.fromkeys(node_list_v, 0)
        random.shuffle(edge_list)
        for u, v, _ in edge_list:
            length = len(context_dict_u[u])  # number of context windows of u
            random.shuffle(context_dict_u[u])
            if visited_u.get(u) < length:
                index_list = list(range(visited_u.get(u), min(visited_u.get(u) + 1, length)))
                for index in index_list:
                    context_u = context_dict_u[u][index]  # one context window of u
                    # Only the contexts are shuffled; the author's note is that the
                    # negatives are random anyway, so the pairing need not be kept.
                    neg_u = neg_dict_u[u][index]
                    for z in context_u:  # one skip-gram step per node in the window
                        tmp_z, tmp_loss = skip_gram(u, z, neg_u, node_list_u, lam, alpha)
                        node_list_u[u]['embedding_vectors'] += tmp_z  # update the center embedding
                        loss += tmp_loss
                # The cursor moves three positions past the window just used.
                visited_u[u] = index_list[-1] + 3

            length = len(context_dict_v[v])
            random.shuffle(context_dict_v[v])
            if visited_v.get(v) < length:
                index_list = list(range(visited_v.get(v), min(visited_v.get(v) + 1, length)))
                for index in index_list:
                    context_v = context_dict_v[v][index]
                    neg_v = neg_dict_v[v][index]
                    for z in context_v:
                        tmp_z, tmp_loss = skip_gram(v, z, neg_v, node_list_v, lam, beta)
                        node_list_v[v]['embedding_vectors'] += tmp_z
                        loss += tmp_loss
                visited_v[v] = index_list[-1] + 3

            # Explicit-relation term for the observed edge (u, v).
            update_u, update_v, tmp_loss = KL_divergence(edge_dict_u, u, v, node_list_u, node_list_v, lam, gamma)
            loss += tmp_loss
            node_list_u[u]['embedding_vectors'] += update_u
            node_list_v[v]['embedding_vectors'] += update_v

        # Author's note: this is gradient ascent, so a larger loss is better.
        delta_loss = abs(loss - last_loss)
        # The learning rate grows 5% when the loss dropped, else shrinks 5%.
        if last_loss > loss:
            lam *= 1.05
        else:
            lam *= 0.95
        last_loss = loss
        if delta_loss < epsilon:
            break
        sys.stdout.write("\r[%s%s]%0.2f%%" % ("*" * epoch, " " * (args.max_iter - epoch), epoch * 100.0 / progress_span))
        sys.stdout.flush()

    save_to_file(node_list_u, node_list_v, model_path, args)
    print("")
    if args.rec:
        print("============== testing ===============")
        f1, mean_ap, mrr, mndcg = top_N(test_user, test_item, test_rate, node_list_u, node_list_v, args.top_n)
        print('recommendation metrics: F1 : %0.4f, MAP : %0.4f, MRR : %0.4f, NDCG : %0.4f' % (round(f1,4), round(mean_ap,4), round(mrr,4), round(mndcg,4)))
    if args.lip:
        print("============== testing ===============")
        auc_roc, auc_pr = link_prediction(args)
        print('link prediction metrics: AUC_ROC : %0.4f, AUC_PR : %0.4f' % (round(auc_roc,4), round(auc_pr,4)))

In [None]:
# Run the sampling-based trainer. `args` is built by the argument-parser
# statements further down in this file, which must have been executed first.
train_by_sampling(args)

alpha : 0.0100, beta : 0.0100, gamma : 0.1000, lam : 0.0100, p : 0.1500, ws : 5, ns : 4, maxT :  32, minT : 1, max_iter : 50, d : 128
constructing graph....


In [1]:
def train(args):
    """Train the embeddings, picking one random context window per visited node.

    Same pipeline as ``train_by_sampling`` (graph construction, random walks,
    context / negative extraction, vector initialisation, epochs over the
    shuffled edge list, saving and optional evaluation). The difference is
    the window choice: a node's skip-gram step runs the first time the node
    is met in an epoch and afterwards with probability 0.05, each time on
    one randomly sampled context window.

    Args:
        args: namespace with the hyper-parameters and file paths used below.
    """
    # NOTE(review): the output root is a machine-specific absolute path.
    model_path = os.path.join(r'C:\Users\sss\Desktop\BiNE/', args.model_name)
    if os.path.exists(model_path) is False:
        os.makedirs(model_path)
    alpha, beta, gamma, lam = args.alpha, args.beta, args.gamma, args.lam
    print('======== experiment settings =========')
    print('alpha : %0.4f, beta : %0.4f, gamma : %0.4f, lam : %0.4f, p : %0.4f, ws : %d, ns : %d, maxT : % d, minT : %d, max_iter : %d, d : %d' % (alpha, beta, gamma, lam, args.p, args.ws, args.ns,args.maxT,args.minT,args.max_iter, args.d))
    print('========== processing data ===========')
    dul = DataUtils(model_path)
    if args.rec:
        test_user, test_item, test_rate = dul.read_data(args.test_data)
    print("constructing graph....")
    gul = GraphUtils(model_path)
    gul.construct_training_graph(args.train_data)
    edge_dict_u = gul.edge_dict_u
    edge_list = gul.edge_list
    walk_generator(gul, args)

    print("getting context and negative samples....")
    context_dict_u, neg_dict_u, context_dict_v, neg_dict_v, node_u, node_v = get_context_and_negative_samples(gul, args)
    node_list_u, node_list_v = {}, {}
    init_embedding_vectors(node_u, node_v, node_list_u, node_list_v, args)

    last_loss, epsilon = 0, 1e-3
    # Denominator of the progress percentage; clamped so max_iter == 1 does
    # not divide by zero.
    progress_span = max(args.max_iter - 1, 1)

    print("============== training ==============")
    for epoch in range(0, args.max_iter):
        loss = 0
        # 0 until the node has been trained once in this epoch.
        visited_u = dict.fromkeys(node_list_u, 0)
        visited_v = dict.fromkeys(node_list_v, 0)

        random.shuffle(edge_list)
        for (u, v, _) in edge_list:
            if visited_u.get(u) == 0 or random.random() > 0.95:
                length = len(context_dict_u[u])
                # At most one randomly chosen window (none if u has no contexts).
                index_list = random.sample(list(range(length)), min(length, 1))
                for index in index_list:
                    context_u = context_dict_u[u][index]
                    neg_u = neg_dict_u[u][index]
                    for z in context_u:
                        tmp_z, tmp_loss = skip_gram(u, z, neg_u, node_list_u, lam, alpha)
                        # skip_gram returns the gradient for the *center* node's
                        # embedding, so it is applied to u (it was applied to z
                        # before, unlike train_by_sampling).
                        node_list_u[u]['embedding_vectors'] += tmp_z
                        loss += tmp_loss
                visited_u[u] = 1
            if visited_v.get(v) == 0 or random.random() > 0.95:
                length = len(context_dict_v[v])
                index_list = random.sample(list(range(length)), min(length, 1))
                for index in index_list:
                    context_v = context_dict_v[v][index]
                    neg_v = neg_dict_v[v][index]
                    for z in context_v:
                        tmp_z, tmp_loss = skip_gram(v, z, neg_v, node_list_v, lam, beta)
                        node_list_v[v]['embedding_vectors'] += tmp_z
                        loss += tmp_loss
                visited_v[v] = 1

            # Explicit-relation term for the observed edge (u, v).
            update_u, update_v, tmp_loss = KL_divergence(edge_dict_u, u, v, node_list_u, node_list_v, lam, gamma)
            loss += tmp_loss
            node_list_u[u]['embedding_vectors'] += update_u
            node_list_v[v]['embedding_vectors'] += update_v

        delta_loss = abs(loss - last_loss)
        # The learning rate grows 5% when the loss dropped, else shrinks 5%.
        if last_loss > loss:
            lam *= 1.05
        else:
            lam *= 0.95
        last_loss = loss
        if delta_loss < epsilon:
            break
        sys.stdout.write("\r[%s%s]%0.2f%%" % ("*" * epoch, " " * (args.max_iter - epoch), epoch * 100.0 / progress_span))
        sys.stdout.flush()

    save_to_file(node_list_u, node_list_v, model_path, args)
    print("")
    if args.rec:
        print("============== testing ===============")
        f1, mean_ap, mrr, mndcg = top_N(test_user, test_item, test_rate, node_list_u, node_list_v, args.top_n)
        print('recommendation metrics: F1 : %0.4f, MAP : %0.4f, MRR : %0.4f, NDCG : %0.4f' % (round(f1,4), round(mean_ap,4), round(mrr,4), round(mndcg,4)))
    if args.lip:
        print("============== testing ===============")
        auc_roc, auc_pr = link_prediction(args)
        print('link prediction metrics: AUC_ROC : %0.4f, AUC_PR : %0.4f' % (round(auc_roc,4), round(auc_pr,4)))

In [2]:
def ndarray_tostring(array):
    """Format the first row of ``array`` as text.

    Every element of ``array[0]`` is converted with ``str`` and stripped,
    and each is followed by a single space; the line ends with a newline.
    """
    pieces = [str(value).strip() + " " for value in array[0]]
    pieces.append("\n")
    return "".join(pieces)

def save_to_file(node_list_u,node_list_v,model_path,args):
    """Write the learned embeddings to ``args.vectors_u`` and ``args.vectors_v``.

    Each output line is the node name, a space, and the node's
    ``'embedding_vectors'`` formatted by ``ndarray_tostring``.
    ``model_path`` is accepted but not used.
    """
    def dump(path, table):
        with open(path, "w") as out:
            for name, entry in table.items():
                out.write(name + " " + ndarray_tostring(entry['embedding_vectors']))

    dump(args.vectors_u, node_list_u)
    dump(args.vectors_v, node_list_v)

### 参数设置

In [5]:
parser = ArgumentParser("BiNE", formatter_class=ArgumentDefaultsHelpFormatter, conflict_handler='resolve')

# Command-line options as (flag, add_argument keyword arguments), registered
# in this order.
_OPTION_SPECS = (
    # input / output files
    ('--train-data', {'default': r'../data/rating_train.dat', 'help': 'Input graph file.'}),
    ('--test-data', {'default': r'../data/rating_test.dat'}),
    ('--vectors-u', {'default': r'../data/vectors_u.dat', 'help': "file of embedding vectors of U"}),
    ('--vectors-v', {'default': r'../data/vectors_v.dat', 'help': "file of embedding vectors of V"}),
    ('--case-train', {'default': r'../data/wiki/case_train.dat', 'help': "file of training data for LR"}),
    ('--case-test', {'default': r'../data/wiki/case_test.dat', 'help': "file of testing data for LR"}),
    # model and training hyper-parameters
    ('--model-name', {'default': 'default', 'help': 'name of model.'}),
    ('--ws', {'default': 5, 'type': int, 'help': 'window size.'}),
    ('--ns', {'default': 4, 'type': int, 'help': 'number of negative samples.'}),
    ('--d', {'default': 128, 'type': int, 'help': 'embedding size.'}),
    ('--maxT', {'default': 32, 'type': int, 'help': 'maximal walks per vertex.'}),
    ('--minT', {'default': 1, 'type': int, 'help': 'minimal walks per vertex.'}),
    ('--p', {'default': 0.15, 'type': float, 'help': 'walk stopping probability.'}),
    ('--alpha', {'default': 0.01, 'type': float, 'help': 'trade-off parameter alpha.'}),
    ('--beta', {'default': 0.01, 'type': float, 'help': 'trade-off parameter beta.'}),
    ('--gamma', {'default': 0.1, 'type': float, 'help': 'trade-off parameter gamma.'}),
    ('--lam', {'default': 0.01, 'type': float, 'help': 'learning rate lambda.'}),
    ('--max-iter', {'default': 50, 'type': int, 'help': 'maximal number of iterations.'}),
    # evaluation switches and run mode
    ('--top-n', {'default': 10, 'type': int, 'help': 'recommend top-n items for each user.'}),
    ('--rec', {'default': 0, 'type': int, 'help': 'calculate the recommendation metrics.'}),
    ('--lip', {'default': 0, 'type': int, 'help': 'calculate the link prediction metrics.'}),
    ('--large', {'default': 0, 'type': int, 'help': 'for large bipartite, 1 do not generate homogeneous graph file; 2 do not generate homogeneous graph'}),
    ('--mode', {'default': 'hits', 'type': str, 'help': 'metrics of centrality'}),
)
for _flag, _spec in _OPTION_SPECS:
    parser.add_argument(_flag, **_spec)

# An explicit empty argument list is parsed, so every option starts at its default.
args = parser.parse_args(args=[])

# Point all data files at the wiki data set directory.
root = r"C:\Users\sss\Desktop\BiNE/data/wiki/"
for _attr, _filename in (
    ('train_data', 'rating_train.dat'),
    ('test_data', 'rating_test.dat'),
    ('vectors_u', 'vectors_u.dat'),
    ('vectors_v', 'vectors_v.dat'),
    ('case_train', 'case_train.dat'),
    ('case_test', 'case_test.dat'),
):
    setattr(args, _attr, root + _filename)

In [None]:
# Run the sampling-based trainer with the `args` namespace configured above.
train_by_sampling(args)