# Imports and configs

In [1]:
import tensorflow as tf
import os
import numpy as np
from tqdm import tqdm
from tensorflow import keras
from sklearn.model_selection import train_test_split

In [2]:
# Base data directory (not referenced by the other cells visible in this notebook).
DATA_BASE_PATH = './data'
# Knowledge-base file; each line is parsed as three '|'-separated fields.
KB_PATH = './data/kb.txt'

# Fraction of examples passed as `test_size` to train_test_split.
TEST_SPLIT = 0.15


# Data preparation

## Data loader

In [3]:
class KBManager:
    """Index a knowledge-base file of ``subject|relation|object`` triples.

    On construction the file is scanned once to assign consecutive integer
    ids to every entity (subjects and objects share one id space) and to
    every relation, in order of first appearance.

    Attributes:
        data_dir: Path of the KB file (despite the name, it is opened as a file).
        entity_map: ``{entity string: entity id}``.
        relation_map: ``{relation string: relation id}``.
    """

    def __init__(self, data_dir):
        self.data_dir = data_dir
        self.entity_map = {}
        self.relation_map = {}
        self._build_vocabs()

    @staticmethod
    def extend_vocab(word, vocab):
        """Give ``word`` the next free id in ``vocab`` unless it already has one."""
        if word not in vocab:
            vocab[word] = len(vocab)

    def _iter_triples(self):
        """Yield ``(subject, relation, object)`` for each non-blank line of the file.

        Raises:
            ValueError: If a non-blank line does not have exactly three
                ``|``-separated fields.
        """
        # Explicit encoding so the vocabulary does not depend on the platform's
        # locale default.
        with open(self.data_dir, 'r', encoding='utf-8') as f:
            lines = f.read().strip().split('\n')

        for line in lines:
            # An empty file or a stray blank line would otherwise fail on the
            # three-way unpack below.
            if not line.strip():
                continue
            fields = line.split('|')
            if len(fields) != 3:
                raise ValueError(
                    'expected "subject|relation|object", got {!r}'.format(line)
                )
            yield tuple(fields)

    def _build_vocabs(self):
        """Populate ``entity_map`` and ``relation_map`` from the KB file."""
        for subj, rel, obj in self._iter_triples():
            self.extend_vocab(subj, self.entity_map)
            self.extend_vocab(obj, self.entity_map)
            self.extend_vocab(rel, self.relation_map)

    def load_er_vocab(self):
        """Group object ids by ``(subject id, relation id)``.

        Returns:
            dict mapping ``(subject_id, relation_id)`` to the list of object
            ids that occur with that pair, in file order (duplicates kept).
        """
        result = {}
        for subj, rel, obj in self._iter_triples():
            er_tuple = (self.entity_map[subj], self.relation_map[rel])
            result.setdefault(er_tuple, []).append(self.entity_map[obj])
        return result
                
    

# Constructing the manager reads KB_PATH and builds the entity/relation id maps.
kb_mgr = KBManager(KB_PATH)
# {(subject_id, relation_id): [object_id, ...]} for every pair in the KB.
er_vocab = kb_mgr.load_er_vocab()

print(kb_mgr.relation_map)

{'directed_by': 0, 'written_by': 1, 'starred_actors': 2, 'release_year': 3, 'in_language': 4, 'has_tags': 5, 'has_genre': 6, 'has_imdb_votes': 7, 'has_imdb_rating': 8}


## Dataset construction

### Train/test split (the test split is also used for validation)

In [4]:
# Each example is ((subject_id, relation_id), [object_id, ...]).
# Reuse the mapping built in the loader cell instead of parsing the KB file again.
raw_ds = list(er_vocab.items())

# Fixed seed so the held-out examples are the same on every run; without it
# the split changes per execution and results are not comparable across runs.
SPLIT_SEED = 42

train_raw_ds, test_raw_ds = train_test_split(
    raw_ds,
    test_size=TEST_SPLIT,
    shuffle=True,
    random_state=SPLIT_SEED,
)

print(len(train_raw_ds), len(test_raw_ds))

68556 12099


### dataset generator function

In [5]:
def get_batch(dataset, target_dim, batch_size=64):
    """Yield ``(Xs, Ys)`` mini-batches from a collection of ``(key, targets)`` pairs.

    Args:
        dataset: Sliceable sequence whose items are ``(key, targets)`` pairs,
            e.g. a plain list of tuples or a 2-column object ndarray. ``key``
            is a fixed-length sequence of ids and ``targets`` is a sequence of
            column indices to switch on in the multi-hot label row.
        target_dim: Width of each label row.
        batch_size: Examples per batch. Only full batches are produced; a
            trailing partial batch is dropped.

    Yields:
        Xs: Array of shape ``(len(key), batch_size)`` - one row per key
            position (the keys are transposed).
        Ys: Float array of shape ``(batch_size, target_dim)`` with ones at
            each example's target indices.

    Raises:
        ValueError: If ``batch_size`` is not positive (raised when the
            generator is first advanced).
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')

    # Number of examples that fit into complete batches.
    usable = len(dataset) - len(dataset) % batch_size

    for start in range(0, usable, batch_size):
        # Row-wise access (example[0] / example[1]) works for both lists of
        # pairs and 2-D arrays, unlike 2-D slicing which needs an ndarray.
        batch = dataset[start:start + batch_size]
        Xs = np.array(list(zip(*(example[0] for example in batch))))
        Ys = np.zeros((batch_size, target_dim))
        for idx, example in enumerate(batch):
            # list(...) forces index-array semantics even if targets is a tuple.
            Ys[idx][list(example[1])] = 1
        yield Xs, Ys
                    
# Smoke test of get_batch: one batch of two toy examples with three possible targets.
next(iter(get_batch(np.array([[(2,3), [0,1]], [(3,4), [1,2]]]), target_dim=3, batch_size=2)))


(array([[2, 3],
        [3, 4]]), array([[1., 1., 0.],
        [0., 1., 1.]]))

# Model

## Building model

In [9]:
# Width of every entity/relation embedding (half real part, half imaginary part).
EMBEDDING_DIM = 512

class KBGModel(keras.Model):
    """ComplEx-style link-prediction model over a knowledge graph.

    A (subject, relation) pair is embedded, combined by complex
    multiplication, and scored against every row of the entity embedding
    table, giving one probability per candidate object entity.

    Args:
        entity_dim: Number of rows in the entity embedding table.
        relation_dim: Number of rows in the relation embedding table.
        hidden_dim: Width of each embedding. Must be even: the first half is
            treated as the real part, the second half as the imaginary part.

    Raises:
        ValueError: If ``hidden_dim`` is odd.
    """

    def __init__(self, entity_dim, relation_dim, hidden_dim):
        super(KBGModel, self).__init__()

        # Validate once at construction instead of asserting on every forward
        # pass (asserts are also stripped under ``python -O``).
        if hidden_dim % 2 != 0:
            raise ValueError(
                'hidden_dim must be even, got {}'.format(hidden_dim)
            )

        self.entity_dim = entity_dim
        self.relation_dim = relation_dim
        self.hidden_dim = hidden_dim

        self.entity_encoder = keras.layers.Embedding(
            self.entity_dim,
            self.hidden_dim,
            embeddings_regularizer=keras.regularizers.l2(0.1)
        )

        self.relation_encoder = keras.layers.Embedding(
            self.relation_dim,
            self.hidden_dim,
            input_shape=(),
        )

        self.head_bn = keras.layers.BatchNormalization()
        self.head_drpout = keras.layers.Dropout(0.3)
        self.rel_drpout = keras.layers.Dropout(0.4)
        self.score_bn = keras.layers.BatchNormalization()
        self.output_drpout = keras.layers.Dropout(0.5)

    def _split_halves(self, tensor):
        """Split a ``(rows, hidden_dim)`` tensor into its first and second halves."""
        half = self.hidden_dim // 2
        first = tf.slice(tensor, [0, 0], [-1, half])
        second = tf.slice(tensor, [0, half], [-1, -1])
        return first, second

    def get_score(self, head, relation, entity_encoder):
        """Score every entity as the object of each (head, relation) pair.

        Args:
            head: Subject embeddings; reshaped internally, so any shape whose
                trailing size is ``hidden_dim`` is accepted.
            relation: Relation embeddings, flattened to ``(batch, hidden_dim)``.
            entity_encoder: Embedding layer whose table supplies the candidate
                tail entities. It must already be built.

        Returns:
            Tensor of shape ``(batch, num_entities)`` with unnormalised scores.
        """
        half = self.hidden_dim // 2

        # NOTE(review): this (batch, half, 2) view pairs adjacent features,
        # while the real/imaginary split below takes the first/second half.
        # Left as is so existing checkpoints keep loading - confirm intent.
        head_norm = self.head_bn(tf.reshape(head, (-1, half, 2)))
        head_drp = self.head_drpout(head_norm)
        head_drp = tf.reshape(head_drp, (-1, self.hidden_dim))
        re_head, im_head = self._split_halves(head_drp)

        # Flatten explicitly rather than tf.squeeze(): squeeze with no axis
        # also removes the batch axis when the batch holds a single example,
        # which makes the rank-2 slices below fail.
        relation_2d = tf.reshape(relation, (-1, self.hidden_dim))
        relation_drp = self.rel_drpout(relation_2d)
        re_relation, im_relation = self._split_halves(relation_drp)

        # Read the embedding table directly; squeezing the stacked weights
        # list would also collapse a one-row table.
        re_tail, im_tail = self._split_halves(entity_encoder.embeddings)

        # Complex product head * relation.
        re_score = re_head * re_relation - im_head * im_relation
        im_score = re_head * im_relation + im_head * re_relation

        # Stack to (batch, 2, half) for batch-norm and dropout, then flatten
        # back so the real part is again the first half.
        score = tf.stack([re_score, im_score], axis=1)
        score_bn = self.score_bn(score)
        score_drp = self.output_drpout(score_bn)
        score_drp = tf.reshape(score_drp, (-1, self.hidden_dim))
        re_score, im_score = self._split_halves(score_drp)

        # Dot product of each query with every entity row: (batch, num_entities).
        scores = tf.add(
            tf.matmul(re_score, re_tail, transpose_b=True),
            tf.matmul(im_score, im_tail, transpose_b=True)
        )

        return scores

    def call(self, subj_ids, rel_ids):
        """Return per-entity probabilities for each (subject id, relation id) pair.

        Args:
            subj_ids: Integer subject ids.
            rel_ids: Integer relation ids, aligned with ``subj_ids``.

        Returns:
            Sigmoid of the scores, shape ``(batch, num_entities)``.
        """
        entity_embedding = self.entity_encoder(subj_ids)
        rel_embedding = self.relation_encoder(rel_ids)

        scores = self.get_score(entity_embedding, rel_embedding, self.entity_encoder)
        prediction = tf.sigmoid(scores)

        return prediction

# Vocabulary sizes give the number of rows in each embedding table.
entity_dim = len(kb_mgr.entity_map)
relation_dim = len(kb_mgr.relation_map)
model = KBGModel(entity_dim, relation_dim, EMBEDDING_DIM)

# One forward pass on sample ids; presumably here so the layers create their
# weights before summary() is called.
model(
    np.array([425, 77]),
    np.array([1,2]),
    training=False
)

model.summary()

Model: "kbg_model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      multiple                  22135808  
_________________________________________________________________
embedding_3 (Embedding)      multiple                  4608      
_________________________________________________________________
batch_normalization_2 (Batch multiple                  8         
_________________________________________________________________
dropout_3 (Dropout)          multiple                  0         
_________________________________________________________________
dropout_4 (Dropout)          multiple                  0         
_________________________________________________________________
batch_normalization_3 (Batch multiple                  1024      
_________________________________________________________________
dropout_5 (Dropout)          multiple                  

## Defining loss function and gradient methods

In [10]:
# from_logits=False because KBGModel.call already applies a sigmoid.
loss_fn = keras.losses.BinaryCrossentropy(from_logits=False)
optimizer = tf.keras.optimizers.Adam()
# Running means of the per-batch losses; the training loop resets them each epoch.
train_loss = keras.metrics.Mean(name='train_loss')
validation_loss = keras.metrics.Mean(name='validation_loss')

@tf.function
def train_step(subj_ids, rel_ids, targets):
    """Run one optimiser update on a batch and record its loss in ``train_loss``.

    Args:
        subj_ids: Subject ids for the batch.
        rel_ids: Relation ids for the batch.
        targets: Multi-hot labels compared against the model's predictions.
    """
    # NOTE(review): only loss_fn contributes to the loss here; anything in
    # `model.losses` (e.g. layer regularizers) is not added - confirm intended.
    with tf.GradientTape() as tape:
        batch_loss = loss_fn(
            y_true=targets,
            y_pred=model(subj_ids, rel_ids, training=True),
        )

    weights = model.trainable_variables
    gradients = tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(gradients, weights))

    train_loss(batch_loss)
    
@tf.function
def validation_step(subj_ids, rel_ids, targets):
    """Evaluate one batch in inference mode and record its loss in ``validation_loss``.

    Args:
        subj_ids: Subject ids for the batch.
        rel_ids: Relation ids for the batch.
        targets: Multi-hot labels compared against the model's predictions.
    """
    batch_predictions = model(subj_ids, rel_ids, training=False)
    batch_loss = loss_fn(y_true=targets, y_pred=batch_predictions)
    validation_loss(batch_loss)

## Training

The metrics used to calculate hits and evaluate training are in the source code.

In [None]:
EPOCHS = 10
BATCH_SIZE = 256
VALIDATION_BATCH_SIZE = 2048
TRAIN_LOG_STEP = 20  # print the running training loss every N iterations

# Rows are ((subject_id, relation_id), [object_id, ...]) with target lists of
# varying length. dtype=object is required: recent NumPy releases refuse to
# build an array from such ragged input implicitly.
train_ds = np.array(train_raw_ds, dtype=object)
test_ds = np.array(test_raw_ds, dtype=object)

for epoch in range(EPOCHS):
    # Metrics accumulate across calls, so start each epoch from zero.
    train_loss.reset_states()
    validation_loss.reset_states()

    train_batches = get_batch(train_ds, entity_dim, batch_size=BATCH_SIZE)
    for iteration, (x, y) in enumerate(train_batches):
        # x has shape (2, batch): row 0 holds subject ids, row 1 relation ids.
        subj_ids, rel_ids = x
        train_step(subj_ids, rel_ids, y)

        if iteration % TRAIN_LOG_STEP == 0:
            print('training loss in iteration {}: {}'.format(iteration, train_loss.result()))

    # The held-out split doubles as the validation set.
    for x, y in get_batch(test_ds, entity_dim, batch_size=VALIDATION_BATCH_SIZE):
        subj_ids, rel_ids = x
        validation_step(subj_ids, rel_ids, y)

    print("epoch:{} validation_loss:{}".format(epoch, validation_loss.result()))

# Evaluation

In [14]:
# Rebuild the architecture and restore previously saved weights.
# NOTE(review): this path is relative to a different root than KB_PATH
# ('./data/...') - confirm the working directory this cell expects.
model = KBGModel(entity_dim, relation_dim, EMBEDDING_DIM)
model.load_weights('complex_graph_embedding/data/saved_models/complex')

# Forward pass on sample ids before printing the summary.
model(
    np.array([425, 77]),
    np.array([1,2]),
    training=False
)

model.summary()

Model: "kbg_model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     multiple                  22135808  
_________________________________________________________________
embedding_11 (Embedding)     multiple                  4608      
_________________________________________________________________
batch_normalization_10 (Batc multiple                  8         
_________________________________________________________________
dropout_15 (Dropout)         multiple                  0         
_________________________________________________________________
dropout_16 (Dropout)         multiple                  0         
_________________________________________________________________
batch_normalization_11 (Batc multiple                  1024      
_________________________________________________________________
dropout_17 (Dropout)         multiple                  

In [16]:
# Per-entity probabilities for two (subject_id, relation_id) queries, in inference mode.
model(
    np.array([425, 77]),
    np.array([1,2]),
    training=False
)

<tf.Tensor: shape=(2, 43234), dtype=float32, numpy=
array([[1.2240209e-05, 1.8087108e-04, 7.7390405e-06, ..., 3.5553062e-06,
        3.7217113e-05, 1.1254337e-05],
       [6.3883745e-06, 6.3407322e-05, 7.9821511e-06, ..., 4.7250278e-06,
        1.8054196e-05, 6.6062339e-06]], dtype=float32)>