<a href="https://colab.research.google.com/github/nsstnaka/machine_learning_handson/blob/master/knowledge_graph_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the DBP15K archive; -nc skips the download when the file already exists.
!wget -nc https://github.com/nju-websoft/JAPE/raw/master/data/dbp15k.tar.gz
# Unpack the gzip-compressed tarball into the current working directory.
!tar zxf dbp15k.tar.gz

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import random
import math

In [None]:
# settings
# Width of every entity and relation embedding vector.
EMBEDDING_DIM = 50
# Number of (positive, negative) examples per training batch.
TRAIN_BATCH_SIZE = 256
# Not referenced by any cell visible here; presumably reserved for evaluation.
TEST_BATCH_SIZE = 128
# Selects the dissimilarity used by score_func.
SCORE_FUNC = 'L1'  # 'L1' or 'L2'
# When True, train_step L2-normalises entity vectors (relation vectors are left as is).
NORMALIZE = True  # set True to normalize embedding
# Number of full passes over the training triples.
EPOCHS = 20

In [None]:
# Load the tab-separated triples; the file has no header row, so column
# names are supplied explicitly.
df = pd.read_csv(
    'dbp15k/ja_en/s_triples',
    sep='\t',
    names=['head', 'relation', 'tail'],
)
print(len(df))
df.head(10)

In [None]:
# Keep only the last path segment of every URI (the text after the final '/').
# Done column by column with the vectorised string accessor instead of
# DataFrame.applymap, which is deprecated since pandas 2.1. Splitting once
# from the right yields the same last segment as split('/')[-1].
# Note: missing values propagate as NaN here instead of raising.
df = df.apply(lambda column: column.str.rsplit('/', n=1).str[-1])
df.head(10)

In [None]:
# Entities are every value seen in the head or tail column; relations are
# the distinct values of the relation column.
entity_set = set(df['head']).union(df['tail'])
relation_set = set(df['relation'])
(len(entity_set), len(relation_set))

In [None]:
# Assign a dense integer index (0..n-1) to every entity and relation name.
# The names are sorted first: iteration order of a set of strings changes
# between Python processes (hash randomisation), which would make the
# name -> index mapping, and anything saved against it, irreproducible.
entity_idx_dic = {name: idx for idx, name in enumerate(sorted(entity_set))}
relation_idx_dic = {name: idx for idx, name in enumerate(sorted(relation_set))}

In [None]:
# Translate each name column into its integer index column. A direct dict
# lookup is used so that an unknown name raises KeyError.
df['head_idx'] = df['head'].apply(entity_idx_dic.__getitem__)
df['relation_idx'] = df['relation'].apply(relation_idx_dic.__getitem__)
df['tail_idx'] = df['tail'].apply(entity_idx_dic.__getitem__)
df.head(10)

In [None]:
# Keep only the integer index columns; later cells pass df.values straight
# to the generator as (head_idx, relation_idx, tail_idx) rows.
# Reassigning with errors='ignore' (instead of an in-place drop) lets this
# cell be re-run without a KeyError once the columns are already gone.
df = df.drop(columns=['head', 'relation', 'tail'], errors='ignore')
df.head(5)

In [None]:
# Per relation: number of distinct head entities and distinct tail entities.
head_count = df.groupby('relation_idx')['head_idx'].nunique()
tail_count = df.groupby('relation_idx')['tail_idx'].nunique()
# Share of distinct tails among distinct heads + tails, indexed by relation_idx.
# NOTE(review): data_generator treats this value as the probability of
# corrupting the *tail*. In the TransH "bern" sampling scheme the same
# quantity, tails / (heads + tails), is the probability of corrupting the
# *head* -- confirm which behaviour is intended.
tail_prob = tail_count / (tail_count + head_count)

In [None]:
def data_generator(num_entities, triples, prob):
    """Yield every triple once, in random order, paired with a corrupted triple.

    Args:
        num_entities: corrupted entities are drawn from ``range(num_entities)``.
        triples: sequence of ``(head, relation, tail)`` index rows.
        prob: indexable by relation index. The head is replaced when
            ``random.random() > prob[relation]``; otherwise the tail is.

    Yields:
        ``[head, relation, tail, neg_head, neg_tail]`` where exactly one of
        ``neg_head`` / ``neg_tail`` was resampled and the corrupted triple is
        guaranteed not to appear in ``triples``.
    """
    known_triples = {(h, r, t) for h, r, t in triples}
    for head, relation, tail in np.random.permutation(triples):
        corrupt_head = random.random() > prob[relation]
        # Rejection sampling: redraw until the corrupted triple is not a known fact.
        while True:
            candidate = random.randrange(num_entities)
            if corrupt_head:
                corrupted = (candidate, relation, tail)
            else:
                corrupted = (head, relation, candidate)
            if corrupted not in known_triples:
                break
        if corrupt_head:
            yield [head, relation, tail, candidate, tail]
        else:
            yield [head, relation, tail, head, candidate]

In [None]:
# Stream (head, relation, tail, neg_head, neg_tail) rows from data_generator
# and group them into training batches. The generator is given the entity
# count, the index triples and the per-relation probabilities as arguments.
train_ds = (
    tf.data.Dataset.from_generator(
        data_generator,
        args=[len(entity_idx_dic), df.values, tail_prob],
        output_types=tf.int64,
        output_shapes=(5,),
    )
    .batch(TRAIN_BATCH_SIZE)
)

In [None]:
def score_func(heads, tails, relations):
    """Dissimilarity of ``heads + relations - tails`` along the last axis.

    Note the argument order: tails come before relations.

    Returns the L1 norm when ``SCORE_FUNC == 'L1'`` and the squared L2 norm
    when ``SCORE_FUNC == 'L2'``.

    Raises:
        ValueError: if ``SCORE_FUNC`` is neither ``'L1'`` nor ``'L2'``.
    """
    if SCORE_FUNC not in ('L1', 'L2'):
        raise ValueError('Invalid SCORE_FUNC: {!r}'.format(SCORE_FUNC))
    residual = heads + relations - tails
    if SCORE_FUNC == 'L1':
        return tf.norm(residual, ord=1, axis=-1)
    # Squared L2 as a plain sum of squares: same value as square(norm(...)),
    # but without the intermediate sqrt, whose gradient is undefined (NaN)
    # when the residual is exactly zero.
    return tf.reduce_sum(tf.square(residual), axis=-1)

In [None]:
# Uniform initialisation in [-6/sqrt(k), 6/sqrt(k)] with k = EMBEDDING_DIM
# (the range used by TransE).
bound = 6 / math.sqrt(EMBEDDING_DIM)
initializer = tf.keras.initializers.RandomUniform(minval=-bound, maxval=bound)
# One row per entity.
entity_embeddings = tf.keras.layers.Embedding(len(entity_idx_dic), EMBEDDING_DIM,
                                              embeddings_initializer=initializer)
# One row per relation. The table was previously sized with the entity count,
# which allocated (and exposed to the optimizer) many rows that no relation
# index can ever address.
# A separate initializer instance is used because recent Keras versions warn
# that a reused unseeded initializer may return identical values on each call.
relation_embeddings = tf.keras.layers.Embedding(
    len(relation_idx_dic), EMBEDDING_DIM,
    embeddings_initializer=tf.keras.initializers.RandomUniform(minval=-bound, maxval=bound))

In [None]:
margin = 1.0
optimizer = tf.keras.optimizers.Adam()
# Alternative optimizer: tf.keras.optimizers.SGD(learning_rate=0.01)
@tf.function
def train_step(inputs):
    """Apply one optimizer update of the margin ranking loss for a batch.

    Args:
        inputs: integer batch whose columns 0-4 are read as head, relation,
            tail, corrupted head and corrupted tail indices.

    Returns:
        The hinge loss ``max(pos + margin - neg, 0)`` summed over the batch.
    """
    def embed_entities(indices):
        # Entity vectors, L2-normalised along the last axis when NORMALIZE is set.
        vectors = entity_embeddings(indices)
        return tf.nn.l2_normalize(vectors, axis=-1) if NORMALIZE else vectors

    with tf.GradientTape() as tape:
        heads = embed_entities(inputs[:, 0])
        # Relation vectors are used as looked up; they are not normalised.
        relations = relation_embeddings(inputs[:, 1])
        tails = embed_entities(inputs[:, 2])
        neg_heads = embed_entities(inputs[:, 3])
        neg_tails = embed_entities(inputs[:, 4])
        # The positive triple should score lower than the corrupted one by at least `margin`.
        violation = (score_func(heads, tails, relations)
                     + margin
                     - score_func(neg_heads, neg_tails, relations))
        loss = tf.reduce_sum(tf.maximum(violation, 0.0))
    variables = entity_embeddings.trainable_variables + relation_embeddings.trainable_variables
    grads = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(grads, variables))
    return loss

In [None]:
# Train for EPOCHS passes; each pass re-runs the generator behind train_ds.
for e in range(1, EPOCHS+1):
    # Running sum of the per-batch losses returned by train_step.
    total_loss = 0.0
    for batch_data in train_ds:
        loss = train_step(batch_data)
        total_loss = total_loss + loss.numpy()
    print("Epoch {}: loss={:.6f}".format(e, total_loss))
    # Validation is not wired up: evaluate / valid_ds are not defined in the cells visible here.