# Introduction

In this notebook we show how to generate the embedding vectors for the Link Prediction approaches using DGL-KE.

To be able to use DGL-KE, please follow the instructions on their [GitHub page](https://github.com/awslabs/dgl-ke). As of now, it is as easy as running the following cell:

In [None]:
!pip install dgl
!pip install dglke

However, you also have to make sure to install a backend used by DGL-KE. In our case, we use [PyTorch](https://pytorch.org).

# Retrieve DBpedia files

In [None]:
!wget --directory-prefix=data/ http://downloads.dbpedia.org/2016-10/dbpedia_2016-10.nt
!wget --directory-prefix=data/ http://downloads.dbpedia.org/2016-10/core-i18n/en/instance_types_en.ttl.bz2
!wget --directory-prefix=data/ http://downloads.dbpedia.org/2016-10/core-i18n/en/instance_types_transitive_en.ttl.bz2
!wget --directory-prefix=data/ http://downloads.dbpedia.org/2016-10/core-i18n/en/mappingbased_objects_en.ttl.bz2

# Convert DBpedia files to DGL-KE format

In [None]:
import os
import bz2
import re
import pandas as pd

# Load all object triples (subject, predicate and object are all IRIs) from the files in `data`
source_dir = './data'
# Raw bytes literal: no invalid escape sequences. `[^<>]*` keeps every group inside one pair of
# angle brackets, so a literal that happens to contain "> <" cannot be mistaken for an object triple.
object_pattern = re.compile(rb'<([^<>]*)> <([^<>]*)> <([^<>]*)> \.\n')
triples = []
# sorted(): the row order must not depend on the arbitrary order returned by os.listdir
for filename in sorted(os.listdir(source_dir)):
    filepath = os.path.join(source_dir, filename)
    if not os.path.isfile(filepath):
        continue  # ignore sub-directories, they cannot be opened as triple files
    open_func = bz2.open if filename.endswith('.bz2') else open
    with open_func(filepath, mode='rb') as f:
        for line in f:
            object_triple = object_pattern.match(line)
            if object_triple:
                # materialize a tuple: generators are single-use and much heavier than a 3-tuple
                triples.append(tuple(x.decode('utf-8') for x in object_triple.groups()))
df = pd.DataFrame(data=triples, columns=['sub', 'pred', 'obj'])

In [None]:
# Create files in DGL-KE input format in the folder `dglke`
target_dir = './dglke/'
os.makedirs(target_dir, exist_ok=True)

# create ids
entities = set(df['sub'].unique()) | set(df['obj'].unique())
# Enumerate in sorted order: the iteration order of a set of strings changes between kernel
# sessions, which would assign different ids to the same entities on every run.
entity_ids = {e: str(idx) for idx, e in enumerate(sorted(entities))}
relation_ids = {rel: str(idx) for idx, rel in enumerate(df['pred'].unique())}

# create id files
def _write_file(data, separator, filename):
    filepath = os.path.join(target_dir, filename)
    with open(filepath, mode='w') as f:
        for vals in data:
            f.write(f'{separator.join(vals)}\n')

# Pass bare file names: `_write_file` already joins them onto `target_dir`; repeating the folder
# here would point into a `dglke/dglke/` directory that does not exist.
_write_file([(idx, e) for e, idx in entity_ids.items()], '\t', 'entities.dict')
_write_file([(idx, r) for r, idx in relation_ids.items()], '\t', 'relations.dict')

# create training file
# Translate whole columns at once; `iterrows` builds a Series object per triple, which is
# prohibitively slow for a graph with millions of rows.
training_data = list(zip(
    df['sub'].map(entity_ids),
    df['pred'].map(relation_ids),
    df['obj'].map(entity_ids),
))
_write_file(training_data, '\t', 'train.tsv')

# Run approaches and write embedding vectors

In [None]:
# configure the GPU id to use; kept as a string because it is passed verbatim as the value of
# the `--gpu` command-line argument of `dglke_train`
GPU = '0'

import subprocess
import numpy as np

def run_for_approaches(approach_configs):
    """Train every configured approach and export its entity vectors as text.

    The approaches are handled one after another: each embedding is trained first and
    converted to a txt vector file immediately afterwards.

    Args:
        approach_configs: iterable of dicts; each needs a 'name' entry (used for the progress
            messages) and is handed unchanged to `_train_embedding` and `_convert_to_txt_vectors`.
    """
    for approach in approach_configs:
        label = approach['name']
        print(f"Creating embeddings for {label}..")
        _train_embedding(approach)
        print(f"Creating txt vectors for {label}..")
        _convert_to_txt_vectors(approach)

def _train_embedding(config):
    os.environ['DGLBACKEND'] = 'pytorch'
    command = [
        'dglke_train',
        '--model_name', config['model'],
        '--gpu', GPU,
        '--dataset', 'DBpedia',
        '--data_files', 'dglke/entities.dict', 'dglke/relations.dict', 'dglke/train.tsv',
        '--format', 'udd_hrt',
        '--batch_size', '1000',
        '--neg_sample_size', '200',
        '--hidden_dim', '200',
        '--gamma', config['gamma'],
        '--lr', config['lr'],
        '--max_step', '1000000',
        '--log_interval', '1000',
        '-adv',
        '--mix_cpu_gpu'
    ]
    if 'regularization_coef' in config:
        command.extend(['--regularization_coef', config['regularization_coef']])
    if config['model'] == 'RotatE':
        command[20] = '100'
        command.append('-de')
        
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    print(process.communicate()[1])
    

def _convert_to_txt_vectors(config):
    checkpoint_file = f"./ckpts/{config['model']}_DBpedia_0/DBpedia_{config['model']}_entity.npy"
    vector_file = f"./dglke/vectors_dbpedia_{config['name']}.txt"
    
    entity_dict = pd.read_csv('./dglke/entities.dict', index_col=0, sep='\t', header=None, names=['entity'])
    data = pd.DataFrame(data=np.load(checkpoint_file), columns=range(200))
    entity_vectors = pd.merge(entity_dict, data, left_index=True, right_index=True)
    entity_vectors.to_csv(vector_file, sep=' ', header=False, index=False)

In [None]:
# define config for all embedding approaches and run DGL-KE
# One entry per approach. Every value is a string because it ends up on the command line;
# 'regularization_coef' is optional and simply left out for approaches that do not set it.
TRAIN_CONFIG = [
    dict(name='TransE-L1', model='TransE_l1', gamma='12.0', lr='0.007', regularization_coef='2e-07'),
    dict(name='TransE-L2', model='TransE_l2', gamma='10.0', lr='0.1', regularization_coef='1e-9'),
    dict(name='TransR', model='TransR', gamma='8.0', lr='0.015', regularization_coef='5e-8'),
    dict(name='RotatE', model='RotatE', gamma='12.0', lr='0.01', regularization_coef='1e-7'),
    dict(name='DistMult', model='DistMult', gamma='143.0', lr='0.08'),
    dict(name='RESCAL', model='RESCAL', gamma='24.0', lr='0.03'),
    dict(name='ComplEx', model='ComplEx', gamma='143.0', lr='0.1', regularization_coef='2.00E-06'),
]

# train every configured approach and export its entity vectors as a txt file
run_for_approaches(TRAIN_CONFIG)