In [107]:
import os
import shutil
import random
import numpy as np
import pandas as pd

# Synthetic Dataset Generation

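The Synthetic dataset is derived from FB15k-237. It is designed to test whether a model understands both the graph structure and the literal values attached to entities. The dataset is generated as follows:

1. Every entity that occurs in the FB15k-237 training triples receives a randomly sampled numerical literal via the `/m/has_value` relation.
2. The entities labelled as humans in the class mapping are shuffled and split into test (15%), validation (15%), and train (remaining) portions.
3. For each human entity, an `/m/is_a` triple is derived from its literal: the object is `/m/high` if the value is greater than 0.5 and `/m/low` otherwise. The triples of the train portion are appended to the FB15k-237 training triples.
4. For each human entity in the test portion and for a sample of non-human entities, both candidate triples (`/m/is_a /m/low` and `/m/is_a /m/high`) are appended to the FB15k-237 test triples.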

In [108]:
dataset_name = 'Synthetic'

# Fixed seed so that a fresh "Restart & Run All" regenerates the same literal
# values and the same train/valid/test assignment. Both RNGs used by this
# notebook are seeded: `random` (shuffles) and NumPy's legacy global RNG
# (`np.random.rand`).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# should not be modified
literal_relation = '/m/has_value'   # predicate of the numerical-literal triples
relational_relation = '/m/is_a'     # predicate of the derived class triples
class_high = '/m/high'              # object used when the sampled value is > 0.5
class_low = '/m/low'                # object used otherwise

In [109]:
# Create the output directory layout for the synthetic dataset.
#
# The existence check uses the *value* of `dataset_name`; testing for the
# literal string 'dataset_name' never matched, so the warning was never shown
# and re-running the cell crashed with FileExistsError.
dataset_dir = f'data/{dataset_name}'
dataset_dir_existed = os.path.isdir(dataset_dir)

# `exist_ok=True` makes the cell safe to re-run, and also creates the
# `literals` sub-directory when the dataset directory is left over from an
# earlier, partial run.
os.makedirs(f'{dataset_dir}/literals', exist_ok=True)

if dataset_dir_existed:
    print('Attention: the directory is not empty. This will overwrite existing files.')
else:
    print('Directories created.')

Directories created.


In [110]:
# Read the two inputs of the generation pipeline:
#   * the FB15k-237 training triples (tab-separated, no header row)
#   * the entity -> class mapping table (semicolon-separated, first row is the header)
train_triples = pd.read_csv(
    './data/FB15k-237/train.txt',
    sep='\t',
    header=None,
)
class_mapping_df = pd.read_csv(
    'FB15k-237_class_mapping.csv',
    sep=';',
    header=0,
)

In [111]:
# collect relevant URIs: every entity occurring as head (column 0) or tail (column 2)
# of a training triple; an entity occurring in both roles is listed twice
uris = train_triples[0].unique().tolist() + train_triples[2].unique().tolist()

# Membership is tested against a set: scanning the `uris` list once per candidate
# entity is quadratic. The resulting list and its order are unchanged.
known_uris = set(uris)
is_human = class_mapping_df["class_label"] == "human"
human_uris = [x for x in class_mapping_df[is_human]["dataset_entity"].to_list() if
              x in known_uris]

In [112]:
# Display the human entities that occur in the training triples (the full list is shown).
human_uris

['/m/010hn',
 '/m/010p3',
 '/m/010xjr',
 '/m/0113sg',
 '/m/011_3s',
 '/m/011hdn',
 '/m/011k4g',
 '/m/011lpr',
 '/m/011lvx',
 '/m/011s9r',
 '/m/011vx3',
 '/m/011w20',
 '/m/011xjd',
 '/m/011zd3',
 '/m/011zf2',
 '/m/011zwl',
 '/m/01200d',
 '/m/0121rx',
 '/m/012201',
 '/m/0126rp',
 '/m/0126y2',
 '/m/01271h',
 '/m/0127gn',
 '/m/0127m7',
 '/m/0127s7',
 '/m/0127xk',
 '/m/012_53',
 '/m/012bk',
 '/m/012c6j',
 '/m/012c6x',
 '/m/012cj0',
 '/m/012cph',
 '/m/012d40',
 '/m/012dr7',
 '/m/012dtf',
 '/m/012g92',
 '/m/012gbb',
 '/m/012gq6',
 '/m/012gx2',
 '/m/012j5h',
 '/m/012j8z',
 '/m/012ky3',
 '/m/012ljv',
 '/m/012pd4',
 '/m/012q4n',
 '/m/012rng',
 '/m/012s5j',
 '/m/012t1',
 '/m/012v1t',
 '/m/012v9y',
 '/m/012vby',
 '/m/012vct',
 '/m/012vd6',
 '/m/012vf6',
 '/m/012wg',
 '/m/012x2b',
 '/m/012x4t',
 '/m/012xdf',
 '/m/012ycy',
 '/m/012ykt',
 '/m/012z8_',
 '/m/012zng',
 '/m/01304j',
 '/m/01309x',
 '/m/0130sy',
 '/m/0131kb',
 '/m/0132k4',
 '/m/01337_',
 '/m/0133sq',
 '/m/0133x7',
 '/m/013423',
 '/m/0134w7

In [113]:
# Assign a literal value drawn with np.random.rand() to each entry of `uris`.
# Entries are processed in list order; if `uris` contains a URI more than once,
# the value from its last occurrence is the one that is kept.
uri_to_value = {}
for uri in uris:
    uri_to_value[uri] = np.random.rand()

In [114]:
# Partition the human URIs into test, validation, and train portions.
# The partition only determines to which split an entity's /m/is_a triple is added.

num_humans = len(human_uris)
random.shuffle(human_uris)  # shuffles the list in place

# size of the test slice and, equally, of the validation slice (15% each, rounded down)
holdout_size = int(num_humans * 0.15)

uris_split = {
    'test': human_uris[:holdout_size],
    'valid': human_uris[holdout_size:holdout_size * 2],
    'train': human_uris[holdout_size * 2:],
}

In [115]:
# Build two groups of triples:
#   * relational_triples[split]: one /m/is_a triple per human entity of that split; the
#     object is /m/high when the entity's sampled value is > 0.5, otherwise /m/low
#   * attributive_triples: one /m/has_value triple for every entity in `uri_to_value`
#     (all collected entities, not only the human ones)
attributive_triples = []
relational_triples = {}

# The loop variable must not be called `uris`: that would overwrite the global list of
# all entity URIs with the last split's list, and re-running an earlier cell that reads
# `uris` would then silently operate on the wrong entities.
for split, split_uris in uris_split.items():
    relational_triples[split] = [
        (uri, relational_relation, class_high if uri_to_value[uri] > 0.5 else class_low)
        for uri in split_uris
    ]

# the literal is stored as a string so the triple can be written with "\t".join(...)
for uri, value in uri_to_value.items():
    attributive_triples.append((uri, literal_relation, str(value)))

In [116]:
# Display the generated /m/is_a triples for every split.
# (Evaluate `attributive_triples` instead to look at the /m/has_value triples.)
relational_triples

{'test': [('/m/0frnff', '/m/is_a', '/m/high'),
  ('/m/02t1cp', '/m/is_a', '/m/low'),
  ('/m/0d9xq', '/m/is_a', '/m/high'),
  ('/m/034bs', '/m/is_a', '/m/low'),
  ('/m/01kph_c', '/m/is_a', '/m/low'),
  ('/m/065ydwb', '/m/is_a', '/m/high'),
  ('/m/083chw', '/m/is_a', '/m/low'),
  ('/m/0djtky', '/m/is_a', '/m/low'),
  ('/m/0309lm', '/m/is_a', '/m/high'),
  ('/m/08jtv5', '/m/is_a', '/m/low'),
  ('/m/02_pft', '/m/is_a', '/m/low'),
  ('/m/07s3vqk', '/m/is_a', '/m/high'),
  ('/m/0lk90', '/m/is_a', '/m/high'),
  ('/m/073w14', '/m/is_a', '/m/low'),
  ('/m/019fz', '/m/is_a', '/m/high'),
  ('/m/0840vq', '/m/is_a', '/m/low'),
  ('/m/01vyp_', '/m/is_a', '/m/low'),
  ('/m/02qx69', '/m/is_a', '/m/low'),
  ('/m/03t8v3', '/m/is_a', '/m/high'),
  ('/m/06y9c2', '/m/is_a', '/m/high'),
  ('/m/0bqsy', '/m/is_a', '/m/high'),
  ('/m/03rx9', '/m/is_a', '/m/low'),
  ('/m/03f1r6t', '/m/is_a', '/m/high'),
  ('/m/03fqv5', '/m/is_a', '/m/high'),
  ('/m/053vcrp', '/m/is_a', '/m/low'),
  ('/m/0kr7k', '/m/is_a', '/m/h

In [117]:
# Write the /m/has_value triples to the literals file, one tab-separated triple per line.
with open(f'data/{dataset_name}/literals/numerical_literals.txt', 'w') as literals_file:
    literals_file.writelines("\t".join(triple) + "\n" for triple in attributive_triples)

In [118]:
# Write the /m/is_a triples of the train split, one tab-separated triple per line.
with open(f'data/{dataset_name}/train_value.txt', 'w') as out_file:
    out_file.writelines("\t".join(triple) + "\n" for triple in relational_triples['train'])

In [119]:
# Build the relational test triples.
test_relational = []

# Negative samples: entities whose class label is not "human". Neither /m/low nor /m/high
# should be assigned to them. The list is shuffled so the slice below is a random sample.
non_human_rows = class_mapping_df[class_mapping_df["class_label"] != "human"]
neg_samples = non_human_rows['dataset_entity'].unique().tolist()
random.shuffle(neg_samples)

# Entities to evaluate:
#   * human entities of the test split -> is /m/low or /m/high assigned correctly?
#     (probes literal understanding)
#   * the first 675 shuffled non-human entities -> is neither class assigned, given that
#     such triples never occur in the training data?
#     (probes literal and structural understanding)
eval_entities = uris_split['test'] + neg_samples[:675]

# each entity contributes both candidate /m/is_a triples: /m/low first, then /m/high
for entity in eval_entities:
    test_relational.extend([
        (entity, relational_relation, class_low),
        (entity, relational_relation, class_high),
    ])

In [120]:
# Write the candidate /m/is_a test triples, one tab-separated triple per line.
with open(f'data/{dataset_name}/test_value.txt', 'w') as out_file:
    out_file.writelines("\t".join(triple) + "\n" for triple in test_relational)

In [121]:
# add FB15k-237 triples to the dataset
# (copyfile overwrites the target, so re-running this cell does not append the
# /m/is_a triples a second time)
for split in ['train', 'test', 'valid']:
    shutil.copyfile(f'./data/FB15k-237/{split}.txt', f'./data/{dataset_name}/{split}.txt')

# append the generated /m/is_a triples to the copied train and test files
# NOTE(review): only the train and test value files are appended here; the /m/is_a
# triples of the 'valid' split are not added to valid.txt -- confirm this is intended.
for split in ['train', 'test']:
    split_path = f'./data/{dataset_name}/{split}.txt'

    with open(f'./data/{dataset_name}/{split}_value.txt') as value_file:
        value_triples = value_file.read()

    # If the copied file does not end with a line break, the first appended triple
    # would be glued onto its last line and corrupt both triples.
    needs_newline = False
    with open(split_path, 'rb') as split_file:
        split_file.seek(0, os.SEEK_END)
        if split_file.tell() > 0:
            split_file.seek(-1, os.SEEK_END)
            needs_newline = split_file.read(1) != b'\n'

    with open(split_path, 'a') as split_file:
        if needs_newline:
            split_file.write('\n')
        split_file.write(value_triples)