In [1]:
import os, random, datetime, pickle
from datetime import datetime

import numpy as np
import pandas as pd
import scipy.sparse as sp
import tensorflow as tf

In [2]:
# reader

class Reader(object):
    """Base class that turns raw triple tables into id-indexed DataFrames.

    Subclasses implement :meth:`read_data`, which must set ``_train_data``,
    ``_test_data`` and ``_valid_data`` (DataFrames with ``h``, ``r`` and ``t``
    columns), the ``_e_id`` / ``_r_id`` lookup Series and the
    ``_entity_num`` / ``_relation_num`` counters.
    """

    def read(self, data_path):
        """Load the dataset, preferring the pickle cache under ``data_path``.

        When ``data_path + 'basic_trainer_saved.pkl'`` exists it is loaded;
        otherwise the full preprocessing pipeline runs and its result is
        written to that file. Only the entity/relation counts and the three
        DataFrames are cached, so ``_e_id``, ``_r_id``, ``_e_map`` and
        ``_r_map`` are not available after a cache hit.

        Note that ``read_data`` takes no argument: the subclasses in this file
        read their input from ``self._options.data_path``.
        """
        handled_path = data_path + 'basic_trainer_saved.pkl'

        if os.path.exists(handled_path):
            print('load file from local')
            # NOTE: unpickling executes arbitrary code; only load cache files
            # that were written by this class.
            with open(handled_path, 'rb') as cache_file:
                (self._entity_num, self._relation_num, self._relation_num_for_eval,
                 self._train_data, self._test_data, self._valid_data) = pickle.load(cache_file)
        else:
            self.read_data()
            self.merge_id()
            self.add_reverse()
            self.reindex_kb()
            self.gen_t_label()

            print('start save dfs')
            saved = (
                self._entity_num, self._relation_num, self._relation_num_for_eval,
                self._train_data, self._test_data, self._valid_data)
            with open(handled_path, 'wb') as cache_file:
                pickle.dump(saved, cache_file)

        self.gen_filter_mat()

        # Aliases and empty alignment tables read by the sampler and the model.
        self._ent_num = self._entity_num
        self._rel_num = self._relation_num
        self._ent_mapping = pd.DataFrame({'kb_1': {}, 'kb_2': {}})
        self._rel_mapping = pd.DataFrame({'kb_1': {}, 'kb_2': {}})
        self._ent_testing = pd.DataFrame({'kb_1': {}, 'kb_2': {}})
        self._rel_testing = pd.DataFrame({'kb_1': {}, 'kb_2': {}})

        self._kb = self._train_data

        return

    def read_data(self):
        """Hook for subclasses; the base implementation loads nothing."""
        pass

    def merge_id(self):
        """Add ``h_id`` / ``r_id`` / ``t_id`` columns to the three splits.

        The ids are looked up in ``_e_id`` and ``_r_id`` using the ``h``,
        ``r`` and ``t`` columns; the DataFrames are modified in place.
        """
        for data in (self._train_data, self._test_data, self._valid_data):
            data['h_id'] = self._e_id[data.h].values
            data['r_id'] = self._r_id[data.r].values
            data['t_id'] = self._e_id[data.t].values

    def gen_t_label(self):
        """Attach a ``t_label`` column to the test and validation splits.

        ``t_label`` holds, for each ``(h_id, r_id)`` pair, the unique tail ids
        seen for that pair in train, test and validation together.
        """
        full = pd.concat([self._train_data, self._test_data, self._valid_data], ignore_index=True)
        f_t_labels = full['t_id'].groupby([full['h_id'], full['r_id']]).apply(lambda x: pd.unique(x.values))
        f_t_labels.name = 't_label'

        self._test_data = self._test_data.join(f_t_labels, on=['h_id', 'r_id'])

        self._valid_data = self._valid_data.join(f_t_labels, on=['h_id', 'r_id'])

    def add_reverse(self):
        """Append a reversed copy of every triple to each split.

        The reversed rows swap ``h_id`` and ``t_id`` and shift ``r_id`` by the
        current relation count. ``_relation_num_for_eval`` keeps the original
        count and ``_relation_num`` is doubled.
        """
        def add_reverse_for_data(data):
            reversed_data = data.rename(columns={'h_id': 't_id', 't_id': 'h_id'})
            reversed_data.r_id += self._relation_num
            data = pd.concat(([data, reversed_data]), ignore_index=True)
            return data

        self._train_data = add_reverse_for_data(self._train_data)
        self._test_data = add_reverse_for_data(self._test_data)
        self._valid_data = add_reverse_for_data(self._valid_data)
        self._relation_num_for_eval = self._relation_num
        self._relation_num *= 2

    def reindex_kb(self):
        """Renumber entity and relation ids by descending training frequency.

        Entities that occur only in test/validation are appended after the
        training entities. The new mappings are stored in ``_e_map`` /
        ``_r_map`` and applied to all three splits and to ``_e_id`` / ``_r_id``.

        Returns:
            The array of entity ids that never occur in the training split.
        """
        train_data = self._train_data
        test_data = self._test_data
        valid_data = self._valid_data
        eids = pd.concat([train_data.h_id, train_data.t_id, ], ignore_index=True)

        tv_eids = np.unique(pd.concat([test_data.h_id, test_data.t_id, valid_data.t_id, valid_data.h_id]))
        not_train_eids = tv_eids[~np.isin(tv_eids, eids)]

        rids = pd.concat([train_data.r_id, ], ignore_index=True)

        def gen_map(eids, rids):
            # Most frequent id first; unseen (non-training) entities go last.
            e_num = eids.groupby(eids.values).size().sort_values()[::-1]
            not_train = pd.Series(np.zeros_like(not_train_eids), index=not_train_eids)
            e_num = pd.concat([e_num, not_train])

            r_num = rids.groupby(rids.values).size().sort_values()[::-1]
            e_map = pd.Series(range(e_num.shape[0]), index=e_num.index)
            r_map = pd.Series(range(r_num.shape[0]), index=r_num.index)
            return e_map, r_map

        def remap_kb(kb, e_map, r_map):
            kb.loc[:, 'h_id'] = e_map.loc[kb.h_id.values].values
            kb.loc[:, 'r_id'] = r_map.loc[kb.r_id.values].values
            kb.loc[:, 't_id'] = e_map.loc[kb.t_id.values].values
            return kb

        def remap_id(s, rm):
            # Returns a plain array: the original index of ``s`` is dropped.
            s = rm.loc[s.values].values
            return s

        e_map, r_map = gen_map(eids, rids)
        self._e_map, self._r_map = e_map, r_map

        self._train_data = remap_kb(train_data, e_map, r_map)
        self._valid_data = remap_kb(self._valid_data, e_map, r_map)
        self._test_data = remap_kb(self._test_data, e_map, r_map)

        self._e_id = remap_id(self._e_id, e_map)
        self._r_id = remap_id(self._r_id, r_map)

        return not_train_eids

    def in2d(self, arr1, arr2):
        """Generalisation of numpy.isin to the rows of 2D arrays.

        Returns a 1D boolean array with one entry per row of ``arr1`` that is
        True when the same row occurs in ``arr2``. Both arrays must share a
        dtype.
        """

        assert arr1.dtype == arr2.dtype

        # View each row as one opaque void item so rows compare as a whole.
        arr1_view = np.ascontiguousarray(arr1).view(np.dtype((np.void,
                                                              arr1.dtype.itemsize * arr1.shape[1])))
        arr2_view = np.ascontiguousarray(arr2).view(np.dtype((np.void,
                                                              arr2.dtype.itemsize * arr2.shape[1])))
        intersected = np.isin(arr1_view, arr2_view)
        # ``np.bool`` was removed from NumPy; the builtin ``bool`` is the same dtype.
        return intersected.view(bool).reshape(-1)

    def gen_filter_mat(self):
        """Build the filter matrices used by filtered evaluation.

        For every validation/test row a vector of length ``_entity_num`` is
        created that is ``-1`` at the positions listed in ``t_label`` and ``1``
        elsewhere. The stacked vectors are stored in
        ``_tail_valid_filter_mat`` and ``_tail_test_filter_mat``.
        """
        def gen_filter_vector(r):
            v = np.ones(self._entity_num)
            v[r] = -1
            return v

        print('start gen filter mat')

        self._tail_valid_filter_mat = np.stack(self._valid_data.t_label.apply(gen_filter_vector).values)
        self._tail_test_filter_mat = np.stack(self._test_data.t_label.apply(gen_filter_vector).values)

    def gen_label_mat_for_train(self):
        """Assign a ``label_id`` to each ``(h_id, r_id)`` pair of the training data.

        The training DataFrame gains a ``label_id`` column, and the indices and
        values of a sparse ``(label_id, t_id)`` tensor are stored in
        ``_label_indices`` / ``_label_values``.
        """
        tr = self._train_data
        print('start gen t_label')
        labels = tr['t_id'].groupby([tr['h_id'], tr['r_id']]).size()
        labels = pd.Series(range(labels.shape[0]), index=labels.index)
        labels.name = 'label_id'
        tr = tr.join(labels, on=['h_id', 'r_id'])

        self._train_data = tr
        sp_tr = tf.SparseTensor(tr[['label_id', 't_id']].values, np.ones([len(tr)], dtype=np.float32), dense_shape=[len(tr), self._entity_num])

        self._label_indices, self._label_values = sp_tr.indices[:], sp_tr.values[:]


class FreeBaseReader(Reader):
    """Reader for datasets that ship explicit entity/relation id files."""

    def read_data(self):
        """Read the triple splits and id tables from ``self._options.data_path``.

        The three split files are parsed as tab-separated columns named
        ``h``, ``t``, ``r`` (in that order); ``entity2id.txt`` and
        ``relation2id.txt`` become name -> id Series.
        """
        base = self._options.data_path

        def load_triples(file_name):
            # Header-less, tab-separated triples with the tail in column two.
            return pd.read_csv(base + file_name, header=None, sep='\t', names=['h', 't', 'r'])

        def load_id_table(file_name, name_col, id_col):
            table = pd.read_csv(base + file_name, header=None, sep='\t', names=[name_col, id_col])
            return pd.Series(table[id_col].values, index=table[name_col].values)

        train = load_triples('train.txt')
        test = load_triples('test.txt')
        valid = load_triples('valid.txt')

        entity_ids = load_id_table('entity2id.txt', 'e', 'eid')
        relation_ids = load_id_table('relation2id.txt', 'r', 'rid')

        # Publish everything only after all files were read successfully.
        self._entity_num = entity_ids.shape[0]
        self._relation_num = relation_ids.shape[0]

        self._train_data = train
        self._test_data = test
        self._valid_data = valid

        self._e_id, self._r_id = entity_ids, relation_ids


class WordNetReader(Reader):
    """Reader for datasets without id files; ids are assigned on first sight."""

    def read_data(self):
        """Read the triple splits from ``self._options.data_path`` and number them.

        The split files are parsed as tab-separated ``h``, ``r``, ``t``
        columns. Relation and entity ids follow the order of first appearance
        across the concatenated columns.
        """
        base = self._options.data_path
        columns = ['h', 'r', 't']
        train = pd.read_csv(base + 'train.txt', header=None, sep='\t', names=columns)
        test = pd.read_csv(base + 'test.txt', header=None, sep='\t', names=columns)
        valid = pd.read_csv(base + 'valid.txt', header=None, sep='\t', names=columns)

        def number_by_first_appearance(series_list):
            # position -> name, then inverted into name -> position
            names = pd.unique(pd.concat(series_list))
            ordered = pd.Series(names, index=np.arange(names.shape[0]))
            return pd.Series(ordered.index, index=ordered.values)

        relation_ids = number_by_first_appearance([train.r, test.r, valid.r])
        entity_ids = number_by_first_appearance(
            [train.h, test.h, valid.h, train.t, test.t, valid.t])

        self._entity_num = entity_ids.shape[0]
        self._relation_num = relation_ids.shape[0]

        self._train_data = train
        self._test_data = test
        self._valid_data = valid

        self._e_id, self._r_id = entity_ids, relation_ids

In [3]:
# path sampler


class BasicSampler(object):
    """Mixin that samples biased random-walk paths over ``self._kb``.

    The host object must provide ``_options`` (with ``alpha``, ``beta``,
    ``max_length`` and ``data_path``), ``_kb`` (a DataFrame with ``h_id``,
    ``r_id``, ``t_id`` columns), ``_ent_num`` and ``_ent_mapping``.
    """

    def sample_paths(self, repeat_times=2):
        """Sample paths starting from every triple and store them.

        Each triple of ``self._kb`` is repeated ``repeat_times`` times and
        extended by (relation, tail) hops: one hop unconditionally, then further
        hops while the path is shorter than ``opts.max_length``, so every path
        has at least 5 columns. The result is stored in ``self._train_data``
        and written as CSV to ``'%spaths_%.1f_%.1f' % (data_path, alpha, beta)``.

        Every tail entity must also occur as a head in ``self._kb``; otherwise
        a ``KeyError`` is raised when its outgoing edges are looked up.
        """
        opts = self._options

        kb = self._kb[['h_id', 'r_id', 't_id']].copy()

        # Index every distinct (r_id, t_id) pair; ``tail_id`` is its row in rtlist.
        rtlist = np.unique(kb[['r_id', 't_id']].values, axis=0)
        rtdf = pd.DataFrame(rtlist, columns=['r_id', 't_id'])
        rtdf = rtdf.reset_index().rename({'index': 'tail_id'}, axis='columns')
        rtkb = kb.merge(rtdf, on=['r_id', 't_id'])

        # htailmat[h, tail_id] == 1 when head h has that outgoing (r, t) pair.
        htail = np.unique(rtkb[['h_id', 'tail_id']].values, axis=0)
        htailmat = sp.csr_matrix(
            (np.ones(len(htail)), (htail[:, 0], htail[:, 1])),
            shape=(self._ent_num, rtlist.shape[0]))

        # Cross-KG bias: weight beta for tails listed in the entity mapping,
        # 1 - beta for all other tails.
        em = pd.concat(
            [self._ent_mapping.kb_1, self._ent_mapping.kb_2]).values
        rtkb['across'] = np.where(rtkb.t_id.isin(em), opts.beta, 1 - opts.beta)

        def normalise(weights):
            # Fall back to a uniform distribution when every weight is zero,
            # which would otherwise produce NaN probabilities.
            total = weights.sum()
            if total == 0:
                return np.full(len(weights), 1.0 / len(weights))
            return weights / total

        # head id -> (candidate tail_ids, cross-KG probabilities)
        tails = {
            head: (group.tail_id.values, normalise(group.across.values.astype(float)))
            for head, group in rtkb.groupby('h_id')
        }

        def first_step(entry):
            # Hop directly after the initial triple: cross-KG bias only.
            tail_ids, probs = entry
            return np.random.choice(tail_ids, p=probs)

        def next_step(entry, pre):
            # Later hops additionally apply the depth bias: candidates that
            # ``pre`` does not link to get weight alpha, the others 1 - alpha.
            tail_ids, probs = entry
            linked = np.asarray(
                htailmat[np.repeat(pre, tail_ids.shape[0]), tail_ids]).reshape(-1)
            depth_bias = np.where(linked == 0, opts.alpha, 1 - opts.alpha)
            return np.random.choice(tail_ids, p=normalise(probs * depth_bias))

        # start sampling
        hrt = np.repeat(kb.values, repeat_times, axis=0)

        picked = np.array([first_step(tails[t]) for t in hrt[:, 2]], dtype=np.int64)
        rt_x = rtlist[picked]

        rts = [hrt, rt_x]
        c_length = 5
        while c_length < opts.max_length:
            # NOTE(review): ``pre`` is always the head of the initial triple,
            # not the entity visited one hop earlier - confirm this is intended.
            picked = np.array(
                [next_step(tails[t], pre) for t, pre in zip(rt_x[:, 1], hrt[:, 0])],
                dtype=np.int64)
            rt_x = rtlist[picked]

            rts.append(rt_x)
            c_length += 2

        data = pd.DataFrame(np.concatenate(rts, axis=1))

        self._train_data = data
        data.to_csv('%spaths_%.1f_%.1f' % (opts.data_path, opts.alpha, opts.beta))
        

In [4]:
# model
class RSN4KGC(FreeBaseReader, BasicSampler):
    """Recurrent skipping network for KG completion.

    Combines the FreeBase reader with the path sampler (the driver code calls
    ``model.sample_paths()``, which lives on ``BasicSampler``) and owns the
    TensorFlow variables shared by the training and evaluation graphs.
    """

    def __init__(self, options, session):
        """Store the option object and the TensorFlow session."""
        self._options = options
        self._session = session

    def init_variables(self):
        """Create the embeddings, softmax weights and the Adam optimizer.

        Requires ``_ent_num`` and ``_rel_num``, which ``read`` sets.
        """
        options = self._options
        hidden_size = options.hidden_size

        self._entity_embedding = tf.get_variable(
            'entity_embedding',
            [self._ent_num, hidden_size],
            initializer=tf.contrib.layers.xavier_initializer()
        )

        self._relation_embedding = tf.get_variable(
            'relation_embedding',
            [self._rel_num, hidden_size],
            initializer=tf.contrib.layers.xavier_initializer()
        )

        self._rel_w = tf.get_variable(
            "relation_softmax_w",
            [self._rel_num, hidden_size],
            initializer=tf.contrib.layers.xavier_initializer()
        )
        self._rel_b = tf.get_variable(
            "relation_softmax_b",
            [self._rel_num],
            initializer=tf.constant_initializer(0)
        )
        self._ent_w = tf.get_variable(
            "entity_softmax_w",
            [self._ent_num, hidden_size],
            initializer=tf.contrib.layers.xavier_initializer()
        )
        self._ent_b = tf.get_variable(
            "entity_softmax_b",
            [self._ent_num],
            initializer=tf.constant_initializer(0)
        )

        # NOTE(review): ``_lr`` is created but the optimizer below is built
        # from ``options.learning_rate`` directly, so changing ``_lr`` has no
        # effect on training.
        self._lr = tf.Variable(options.learning_rate, trainable=False)

        self._optimizer = tf.train.AdamOptimizer(options.learning_rate)

    def bn(self, inputs, is_train=True, reuse=True):
        """Apply batch normalisation in the shared variable scope ``'bn'``."""
        return tf.contrib.layers.batch_norm(inputs,
                                            center=True,
                                            scale=True,
                                            is_training=is_train,
                                            reuse=reuse,
                                            scope='bn',
                                            data_format='NCHW'
                                            )

    def lstm_cell(self, drop=True, keep_prob=0.5, num_layers=2, hidden_size=None):
        """Build the recurrent cell.

        Args:
            drop: wrap every layer in a ``DropoutWrapper`` when true.
            keep_prob: output keep probability used by the dropout wrapper.
            num_layers: number of stacked layers; ``0`` returns a single
                unwrapped cell instead of a ``MultiRNNCell``.
            hidden_size: units per layer; defaults to ``options.hidden_size``.
        """
        if not hidden_size:
            hidden_size = self._options.hidden_size

        def basic_lstm_cell():
            return tf.contrib.rnn.LSTMCell(
                num_units=hidden_size,
                initializer=tf.orthogonal_initializer,
                forget_bias=1,
                reuse=tf.get_variable_scope().reuse,
                activation=tf.identity
            )

        def drop_cell():
            return tf.contrib.rnn.DropoutWrapper(
                basic_lstm_cell(),
                output_keep_prob=keep_prob
            )

        if drop:
            gen_cell = drop_cell
        else:
            gen_cell = basic_lstm_cell

        if num_layers == 0:
            return gen_cell()

        cell = tf.contrib.rnn.MultiRNNCell(
            [gen_cell() for _ in range(num_layers)],
            state_is_tuple=True,
        )
        return cell

    def sampled_loss(self, inputs, labels, w, b, weight=1, is_entity=False):
        """Return the sampled-softmax loss of ``inputs`` against ``labels``.

        The number of negative samples is ``options.num_samples`` capped at a
        third of the number of classes (rows of ``w``). ``is_entity`` is
        accepted for interface compatibility but not used.
        """
        num_sampled = min(self._options.num_samples, w.shape[0]//3)

        labels = tf.reshape(labels, [-1, 1])

        losses = tf.nn.sampled_softmax_loss(
            weights=w,
            biases=b,
            labels=labels,
            inputs=tf.reshape(inputs, [-1, int(w.shape[1])]),
            num_sampled=num_sampled,
            num_classes=w.shape[0],
            partition_strategy='div',
        )

        return losses * weight

    def logits(self, input, predict_relation=True):
        """Project ``input`` onto the relation (default) or entity softmax layer."""
        if not predict_relation:
            w = self._ent_w
            b = self._ent_b
        else:
            w = self._rel_w
            b = self._rel_b

        return tf.nn.bias_add(tf.matmul(input, tf.transpose(w)), b)

    def sample(self, data):
        """Return the rows of ``data`` in a random order (no replacement)."""
        choices = np.random.choice(len(data), size=len(data), replace=False)
        return data.iloc[choices]

    def padding_data(self, data):
        """Append all-zero rows so ``len(data)`` is a multiple of the batch size.

        At least one row is always added: when the length is already a
        multiple, a whole extra batch of zeros is appended.

        Returns:
            ``(padded_data, padding_num)``.
        """
        padding_num = self._options.batch_size - len(data) % self._options.batch_size
        data = np.concatenate([data, np.zeros((padding_num, data.shape[1]), dtype=np.int32)])
        return data, padding_num

In [5]:
# build tensorflow graph


# build an RSN of length l
def build_sub_graph(self, length=15, reuse=False):
    """Build the training graph of an RSN over sequences of ``length`` ids.

    The input alternates entity and relation ids (entity first). Every
    position but the last is embedded, batch-normalised and fed through the
    LSTM; at relation positions the LSTM output is combined with the preceding
    entity's normalised embedding (the skipping connection). Each position is
    trained to predict the next id with a sampled softmax.

    Args:
        length: number of ids per sequence (assumed odd - TODO confirm).
        reuse: reuse the existing variables instead of creating them.

    Returns:
        ``(loss, seq)``: the summed loss divided by the batch size, and the
        ``[batch_size, length]`` int32 input placeholder.
    """
    options = self._options
    hidden_size = options.hidden_size
    batch_size = options.batch_size

    seq = tf.placeholder(
        tf.int32, [batch_size, length], name='seq'+str(length))

    e_em, r_em = self._entity_embedding, self._relation_embedding

    # read entities and relations separately, then recover the order
    ent = seq[:, :-1:2]
    rel = seq[:, 1::2]

    ent_em = tf.nn.embedding_lookup(e_em, ent)
    rel_em = tf.nn.embedding_lookup(r_em, rel)

    em_seq = []
    for i in range(length-1):
        if i % 2 == 0:
            em_seq.append(ent_em[:, i//2])
        else:
            em_seq.append(rel_em[:, i//2])

    # batch-normalise every position separately; only the first call creates
    # the variables (compared with ``!=``: ``is not`` on an int literal tests
    # identity and raises a SyntaxWarning on current Python versions)
    with tf.variable_scope('input_bn'):
        if not reuse:
            bn_em_seq = [tf.reshape(self.bn(em_seq[i], reuse=(
                i != 0)), [-1, 1, hidden_size]) for i in range(length-1)]
        else:
            bn_em_seq = [tf.reshape(
                self.bn(em_seq[i], reuse=True), [-1, 1, hidden_size]) for i in range(length-1)]

    bn_em_seq = tf.concat(bn_em_seq, axis=1)

    ent_bn_em = bn_em_seq[:, ::2]

    with tf.variable_scope('rnn', reuse=reuse):

        cell = self.lstm_cell(True, options.keep_prob, options.num_layers)

        outputs, state = tf.nn.dynamic_rnn(cell, bn_em_seq,  dtype=tf.float32)

    rel_outputs = outputs[:, 1::2, :]
    outputs = [outputs[:, i, :] for i in range(length-1)]

    ent_outputs = outputs[::2]

    # RSN: relation-position output plus the projected subject-entity embedding
    with tf.variable_scope('resnet', reuse=reuse):
        res_rel_outputs = tf.contrib.layers.fully_connected(rel_outputs, hidden_size, biases_initializer=None, activation_fn=None) +\
            tf.contrib.layers.fully_connected(
                ent_bn_em, hidden_size, biases_initializer=None, activation_fn=None)

    # recover the order
    res_rel_outputs = [res_rel_outputs[:, i, :] for i in range((length-1)//2)]
    outputs = []
    for i in range(length-1):
        if i % 2 == 0:
            outputs.append(ent_outputs[i//2])
        else:
            outputs.append(res_rel_outputs[i//2])

    # output bn
    with tf.variable_scope('output_bn'):
        if reuse:
            bn_outputs = [tf.reshape(
                self.bn(outputs[i], reuse=True), [-1, 1, hidden_size]) for i in range(length-1)]
        else:
            bn_outputs = [tf.reshape(self.bn(outputs[i], reuse=(
                i != 0)), [-1, 1, hidden_size]) for i in range(length-1)]

    def cal_loss(bn_outputs, seq):
        losses = []

        # the exponent is fixed at 0, so every position currently has weight 1
        decay = 0.8
        for i, output in enumerate(bn_outputs):
            if i % 2 == 0:
                # entity position: predict the following relation
                losses.append(self.sampled_loss(
                    output, seq[:, i+1], self._rel_w, self._rel_b, weight=decay**(0), is_entity=i))
            else:
                # relation position: predict the following entity
                losses.append(self.sampled_loss(
                    output, seq[:, i+1], self._ent_w, self._ent_b, weight=decay**(0), is_entity=i))
        losses = tf.stack(losses, axis=1)
        return losses

    seq_loss = cal_loss(bn_outputs, seq)

    losses = tf.reduce_sum(seq_loss) / batch_size

    return losses, seq


# build the main graph
def build_graph(self):
    """Build the training graph and attach its handles to the model.

    Creates the RSN sub-graph for ``options.max_length``, clips the gradients
    to a global norm of 2.0 and stores the input placeholder, the loss and the
    train op in ``_seq``, ``_loss`` and ``_train_op``.
    """
    max_len = self._options.max_length
    loss, seq = build_sub_graph(self, length=max_len, reuse=False)

    trainable = tf.trainable_variables()
    raw_grads = tf.gradients(loss, trainable)
    clipped_grads, _ = tf.clip_by_global_norm(raw_grads, 2.0)

    # run the collected update ops before each optimisation step
    pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(pending_updates):
        train_op = self._optimizer.apply_gradients(
            zip(clipped_grads, trainable),
            global_step=tf.train.get_or_create_global_step()
        )

    self._seq = seq
    self._loss = loss
    self._train_op = train_op

In [6]:
# training procedure

def seq_train(self, data, choices=None, epoch=None):
    """Run one pass of training over ``data`` and return the mean batch loss.

    Args:
        data: DataFrame of paths; the first ``options.max_length`` columns of
            each row are fed to the graph.
        choices: optional row positions to train on, in order. When omitted,
            ``len(data)`` positions are drawn at random.
        epoch: unused; kept for interface compatibility.

    Returns:
        The mean loss over all full batches (also stored in
        ``_last_mean_loss``), or ``nan`` when there is not a single full
        batch. A trailing partial batch is skipped.
    """
    opts = self._options
    batch_size = opts.batch_size

    if choices is None:
        # NOTE(review): rows are drawn WITH replacement, so one pass repeats
        # some rows and skips others; kept as-is to preserve training
        # behaviour - confirm whether a permutation was intended.
        choices = np.random.choice(len(data), size=len(data), replace=True)

    num_batch = len(choices) // batch_size

    fetches = {
        'loss': self._loss,
        'train_op': self._train_op
        }

    losses = 0
    for i in range(num_batch):
        one_batch_choices = choices[i * batch_size: (i + 1) * batch_size]
        one_batch_data = data.iloc[one_batch_choices]

        seq = one_batch_data.values[:, :opts.max_length]
        vals = self._session.run(fetches, {self._seq: seq})

        del one_batch_data

        loss = vals['loss']
        losses += loss
        print('\r%i/%i, batch_loss:%f' % (i, num_batch, loss), end='')

    # Avoid a ZeroDivisionError when the data is smaller than one batch.
    self._last_mean_loss = losses / num_batch if num_batch else float('nan')

    return self._last_mean_loss

In [7]:
#build the graph for entity prediction (KG completion)

def build_eval_ep(self, length=3, reuse=True):
    """Build the entity-prediction graph used for KG completion.

    Mirrors the training sub-graph for a short ``(h, r, t)`` sequence and
    exposes the softmax over all entities taken at the relation position.
    The input placeholder is stored in ``_eval_seq`` and the probabilities in
    ``_entity_probs``.

    Args:
        length: number of ids per evaluation sequence.
        reuse: reuse the variables created by the training graph.
    """
    options = self._options
    hidden_size = options.hidden_size
    batch_size = options.batch_size

    seq = tf.placeholder(tf.int32, [batch_size, length], name='eval_seq')

    e_em, r_em = self._entity_embedding, self._relation_embedding

    ent = seq[:, :-1:2]
    rel = seq[:, 1::2]

    ent_em = tf.nn.embedding_lookup(e_em, ent)
    rel_em = tf.nn.embedding_lookup(r_em, rel)

    em_seq = []
    for i in range(length-1):
        if i % 2 == 0:
            em_seq.append(ent_em[:, i//2])
        else:
            em_seq.append(rel_em[:, i//2])

    # NOTE(review): ``self.bn`` defaults to ``is_train=True``, so batch norm
    # runs in training mode here as well - confirm this is intended.
    # ``!=`` replaces ``is not``: identity comparison with an int literal
    # raises a SyntaxWarning on current Python versions.
    with tf.variable_scope('input_bn', reuse=reuse):
        if not reuse:
            bn_em_seq = [tf.reshape(self.bn(em_seq[i], reuse=(
                i != 0)), [-1, 1, hidden_size]) for i in range(length-1)]
        else:
            bn_em_seq = [tf.reshape(
                self.bn(em_seq[i], reuse=True), [-1, 1, hidden_size]) for i in range(length-1)]

    bn_em_seq = tf.concat(bn_em_seq, axis=1)

    ent_bn_em = bn_em_seq[:, ::2]

    with tf.variable_scope('rnn', reuse=reuse):

        # NOTE(review): the cell is built with dropout enabled, so evaluation
        # outputs are stochastic - confirm this is intended.
        cell = self.lstm_cell(True, options.keep_prob, options.num_layers)

        outputs, state = tf.nn.dynamic_rnn(cell, bn_em_seq,  dtype=tf.float32)

    rel_outputs = outputs[:, 1::2, :]
    outputs = [outputs[:, i, :] for i in range(length-1)]

    ent_outputs = outputs[::2]

    with tf.variable_scope('resnet', reuse=reuse):
        res_rel_outputs = tf.contrib.layers.fully_connected(rel_outputs, hidden_size, biases_initializer=None, activation_fn=None) +\
                            tf.contrib.layers.fully_connected(ent_bn_em, hidden_size, biases_initializer=None, activation_fn=None)

    res_rel_outputs = [res_rel_outputs[:, i, :] for i in range((length-1)//2)]

    outputs = []

    for i in range(length-1):
        if i % 2 == 0:
            outputs.append(ent_outputs[i//2])
        else:
            outputs.append(res_rel_outputs[i//2])

    with tf.variable_scope('output_bn', reuse=reuse):
        if reuse:
            bn_outputs = [tf.reshape(
                self.bn(outputs[i], reuse=True), [-1, hidden_size]) for i in range(length-1)]
        else:
            bn_outputs = [tf.reshape(self.bn(outputs[i], reuse=(
                i != 0)), [-1, hidden_size]) for i in range(length-1)]

    # position 1 is the relation step, whose output predicts the tail entity
    logits = self.logits(bn_outputs[1], predict_relation=False)

    probs = tf.nn.softmax(logits)

    self._eval_seq = seq

    self._entity_probs = probs


In [8]:
#evaluate the performance on KG completion (entity prediction)

def eval_entity_prediction(model, data, filter_mat, method='min', return_ranks=False, return_probs=False, return_label_probs=False):
    """Score tail-entity prediction for ``data`` with the evaluation graph.

    Args:
        model: object providing ``_options``, ``_session``, ``_eval_seq``,
            ``_entity_probs`` and ``padding_data``.
        data: 2D integer array whose third column holds the true tail ids.
        filter_mat: array multiplied element-wise with the predicted
            probabilities before ranking (unused on the early-return paths).
        method: tie-breaking passed to ``cal_ranks``.
        return_ranks: return the filtered ranks instead of the metrics.
        return_probs: return the raw probability matrix.
        return_label_probs: return only the probability of each true tail.

    Returns:
        ``(hits@1, hits@10, MRR, mean rank)`` unless one of the ``return_*``
        flags selects an earlier result.
    """
    options = model._options
    batch_size = options.batch_size

    label = data[:, 2]

    data, padding_num = model.padding_data(data)

    num_batch = len(data) // batch_size

    eval_seq, fetch_entity_probs = model._eval_seq, model._entity_probs

    probs = []
    for i in range(num_batch):
        feed_dict = {eval_seq: data[i * batch_size:(i + 1) * batch_size]}
        # Use the model's own session rather than a module-level global.
        probs.append(model._session.run(fetch_entity_probs, feed_dict))
    # drop the rows that were only added as padding
    probs = np.concatenate(probs)[:len(data) - padding_num]
    if return_label_probs:
        return probs[range(len(label)), label]

    if return_probs:
        return probs
    filter_probs = probs * filter_mat
    # restore the true tail's own score after filtering
    filter_probs[range(len(label)), label] = probs[range(len(label)), label]
    filter_ranks = cal_ranks(filter_probs, method=method, label=label)
    if return_ranks:
        return filter_ranks
    _, f_h_1, _ = cal_performance(filter_ranks, top=1)
    f_m_r, f_h_10, f_mrr = cal_performance(filter_ranks)

    return (f_h_1, f_h_10, f_mrr, f_m_r)

In [9]:
# some tool functions
def cal_ranks(probs, method, label):
    """Return the rank of each row's labelled column within that row.

    With ``method == 'min'`` the rank is one plus the number of strictly
    larger scores; any other ``method`` is forwarded to
    ``DataFrame.rank(ascending=False)``.
    """
    rows = range(len(label))

    if method != 'min':
        ranked = pd.DataFrame(probs).rank(axis=1, ascending=False, method=method)
        return ranked.values[rows, label]

    label_scores = probs[rows, label].reshape(len(probs), 1)
    margins = probs - label_scores
    return (margins > 0).sum(axis=1) + 1

def cal_performance(ranks, top=10):
    """Return ``(mean rank, hits@top, mean reciprocal rank)`` for ``ranks``."""
    total = len(ranks)
    mean_rank = sum(ranks) * 1.0 / total
    hits_at_top = sum(ranks <= top) * 1.0 / total
    mean_reciprocal = (1. / ranks).sum() / total
    return mean_rank, hits_at_top, mean_reciprocal

def padding_data(data, options, batch_size):
    """Append zero rows to ``data`` up to the next multiple of ``batch_size``.

    At least one row is always appended. ``options`` is not used.

    Returns:
        ``(padded_frame, padding_num)``.
    """
    padding_num = batch_size - len(data) % batch_size
    filler = pd.DataFrame(
        np.zeros((padding_num, data.shape[1])),
        dtype=np.int32,
        columns=data.columns,
    )
    padded = pd.concat([data, filler], ignore_index=True, axis=0)
    return padded, padding_num

def in2d(arr1, arr2):
    """Generalisation of numpy.isin to the rows of 2D arrays.

    Returns a 1D boolean array with one entry per row of ``arr1`` that is True
    when the same row occurs in ``arr2``. Both arrays must share a dtype.
    """

    assert arr1.dtype == arr2.dtype

    # View each row as one opaque void item so rows compare as a whole.
    arr1_view = np.ascontiguousarray(arr1).view(np.dtype((np.void,
                                                          arr1.dtype.itemsize * arr1.shape[1])))
    arr2_view = np.ascontiguousarray(arr2).view(np.dtype((np.void,
                                                          arr2.dtype.itemsize * arr2.shape[1])))
    intersected = np.isin(arr1_view, arr2_view)
    # ``np.bool`` was removed from NumPy; the builtin ``bool`` is the same dtype.
    return intersected.view(bool).reshape(-1)

def write_to_log(path, content):
    """Append ``content`` as one line to the log file at ``path``.

    The parent directory is created when it does not exist yet, so a fresh
    checkout without a ``logs/`` folder does not fail on the first write.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, 'a+') as f:
        print(content, file=f)

In [10]:
#parameter settings

class Options:
    """Plain attribute container for the hyper-parameters set below."""

# Attribute order matters: ``opts.__dict__`` is written to the log as-is.
opts = Options()

# network size
opts.hidden_size = 256
opts.num_layers = 2

# optimisation
opts.batch_size = 2048
opts.learning_rate = 0.0001  # for FB15K-237, the learning rate should decrease to 0.00001
opts.num_samples = 2048 * 5
opts.keep_prob = 0.5

# path sampling
opts.max_length = 7
opts.alpha = 0.7
opts.beta = 0.5

# input data and log file (named after the data path, path length and start time)
opts.data_path = 'data/FB15k/'
_run_tag = opts.data_path.replace('/', '-')
_started_at = datetime.now().strftime('%y-%m-%d-%H-%M')
opts.log_file_path = 'logs/%s%dl_%s.log' % (_run_tag, opts.max_length, _started_at)

# let TensorFlow grow its GPU memory usage on demand
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

In [None]:
#initial model

# Create the session and the model; the session is also installed as the
# default one, which the ``.run()`` call at the end of this cell relies on.
sess = tf.InteractiveSession(config=config)
model = RSN4KGC(options=opts, session=sess)

# Load (or preprocess and cache) the dataset, then create the TF variables,
# whose shapes depend on the entity/relation counts set by ``read``.
model.read(data_path=model._options.data_path)
model.init_variables()

# Sampled paths are cached on disk, keyed by alpha and beta.
sequence_datapath = '%spaths_%.1f_%.1f' % (
    model._options.data_path, model._options.alpha, model._options.beta)

if not os.path.exists(sequence_datapath):
    print('start to sample paths')
    model.sample_paths()
    train_data = model._train_data
else:
    print('load existing training sequences')
    train_data = pd.read_csv(sequence_datapath, index_col=0)



# build tensorflow graph and init all tensors
build_graph(model)
build_eval_ep(model)
tf.global_variables_initializer().run()

In [12]:
# initial training settings

# Record the full option set at the top of the log file.
write_to_log(opts.log_file_path, opts.__dict__)
epoch = 0
last_mean_loss=0

# early-stopping state: best validation Hits@1 so far, the number of
# consecutive evaluations without improvement, and the allowed maximum
max_hits1, times, max_times = 0, 0, 3

# we transform the subject entity prediction (?, r, o) to (o, r-, ?) for convenience
test_data = model._test_data[['h_id', 'r_id', 't_id']].values
filter_mat = model._tail_test_filter_mat

valid_data = model._valid_data[['h_id', 'r_id', 't_id']].values
vfilter_mat = model._tail_valid_filter_mat

In [13]:
# Baseline: evaluate on the validation split before any training step.
r = eval_entity_prediction(model, data=valid_data, filter_mat=vfilter_mat)
msg = 'epoch:%i, Hits@1:%.3f, Hits@10:%.3f, MRR:%.3f, MR:%.3f, mean_loss:%.3f' % (epoch, r[0],r[1],r[2],r[3], last_mean_loss)
print('\n'+msg)
write_to_log(opts.log_file_path, msg)


# Train for at most 200 epochs, continuing from ``epoch``.
for i in range(epoch, 200):
    last_mean_loss = seq_train(model, train_data)
    epoch += 1
    
    # evaluation (every fifth epoch, starting with the first one)
    if i % 5 ==0:
        r = eval_entity_prediction(model, data=valid_data, filter_mat=vfilter_mat)
        msg = 'epoch:%i, Hits@1:%.3f, Hits@10:%.3f, MRR:%.3f, MR:%.3f, mean_loss:%.3f' % (i+1, r[0],r[1],r[2],r[3], last_mean_loss)
        print('\n'+msg)
        write_to_log(opts.log_file_path, msg)
        
        hits1 = r[0]
        
        # early stop: give up after ``max_times`` evaluations in a row
        # without a new best validation Hits@1
        if hits1 > max_hits1:
            max_hits1 = hits1
            times = 0
        else:
            times += 1

        if times >= max_times:
            break
        
#evaluation on testing data (uses the weights of the last epoch; no
#checkpoint of the best validation epoch is restored here)
r = eval_entity_prediction(model, data=test_data, filter_mat=filter_mat, method='average')
msg = 'epoch:%i, Hits@1:%.3f, Hits@10:%.3f, MRR:%.3f, MR:%.3f, mean_loss:%.3f' % (epoch, r[0],r[1],r[2],r[3], last_mean_loss)
print('\n'+msg)
write_to_log(opts.log_file_path, msg)


epoch:0, Hits@1:0.000, Hits@10:0.001, MRR:0.001, MR:7414.918, mean_loss:0.000
942/943, batch_loss:24.576305
epoch:1, Hits@1:0.183, Hits@10:0.382, MRR:0.252, MR:332.066, mean_loss:33.139
942/943, batch_loss:15.754082
epoch:6, Hits@1:0.489, Hits@10:0.762, MRR:0.586, MR:64.169, mean_loss:15.935
942/943, batch_loss:14.618158
epoch:11, Hits@1:0.595, Hits@10:0.822, MRR:0.676, MR:50.366, mean_loss:14.690
942/943, batch_loss:14.370004
epoch:16, Hits@1:0.645, Hits@10:0.850, MRR:0.719, MR:46.891, mean_loss:14.343
942/943, batch_loss:14.146772
epoch:21, Hits@1:0.670, Hits@10:0.862, MRR:0.740, MR:45.969, mean_loss:14.187
942/943, batch_loss:14.086667
epoch:26, Hits@1:0.680, Hits@10:0.865, MRR:0.748, MR:46.136, mean_loss:14.099
942/943, batch_loss:13.916122
epoch:31, Hits@1:0.688, Hits@10:0.868, MRR:0.754, MR:46.376, mean_loss:14.034
942/943, batch_loss:13.882902
epoch:36, Hits@1:0.695, Hits@10:0.869, MRR:0.758, MR:46.269, mean_loss:13.989
942/943, batch_loss:13.909997
epoch:41, Hits@1:0.696, Hits