In [36]:
# Load the datasets
import os.path as osp
from dataset import LiteralLinkPredDataset
import torch

DATASET = "Synthetic"

# Single location of the cached, pre-processed dataset for the chosen DATASET.
processed_path = f'data/{DATASET}/processed.pt'

# Build the cache only when it does not exist yet; the dataset is always
# (re)loaded from disk afterwards, so both paths yield the same object type.
cache_is_missing = not osp.isfile(processed_path)
if cache_is_missing:
    print('Process dataset...')
    torch.save(LiteralLinkPredDataset(f'data/{DATASET}'), processed_path)

print('Load processed dataset...')
# NOTE(review): torch.load unpickles arbitrary Python objects - only point
# this at cache files produced by this notebook.
dataset = torch.load(processed_path)

Load processed dataset...


In [37]:
# Report how many triples each split (training / validation / test) contains.
for split_label, split_frame in (
    ("training", dataset.df_triples_train),
    ("validation", dataset.df_triples_val),
    ("test", dataset.df_triples_test),
):
    print(f"Number of {split_label} triples: {len(split_frame)}")

Number of training triples: 275270
Number of validation triples: 17535
Number of test triples: 23166


In [38]:
# Count the numerical literals whose value (column 2) is exactly 0 and report
# their share of all numerical literals in the loaded dataset.
is_zero_literal = dataset.df_literals_num[2] == 0
zero_literal_count = int(is_zero_literal.sum())
literal_count = dataset.df_literals_num.shape[0]
print(f"Number of numerical literals that have 0 as value: {zero_literal_count}")
print(f"Out of {literal_count} numerical literals => {zero_literal_count / literal_count * 100:.2f}%")

Number of numerical literals that have 0 as value: 0
Out of 14505 numerical literals => 0.00%


In [39]:
# Load the LitWD48k numerical literals
import pandas as pd
litwd48k_num_lits = pd.read_csv('data/LitWD48k/numerical_literals_decimal.txt', sep='\t', header=None)

# Select the zero-valued literals (column 2 == 0) once and reuse the subset
# for the count, the percentage and the index listing below.
litwd48k_zero_rows = litwd48k_num_lits[litwd48k_num_lits[2] == 0]
litwd48k_zero_count = litwd48k_zero_rows.shape[0]
litwd48k_total_count = litwd48k_num_lits.shape[0]
print(f"Number of numerical literals that have 0 as value: {litwd48k_zero_count}")
print(f"Out of {litwd48k_total_count} numerical literals => {litwd48k_zero_count / litwd48k_total_count * 100:.2f}%")

# Row positions (DataFrame index) of the zero-valued literals
print(litwd48k_zero_rows.index)

Number of numerical literals that have 0 as value: 2486
Out of 148707 numerical literals => 1.67%
Index([  1551,   1768,   2007,   2052,   2178,   2297,   2961,   3156,   3570,
         3591,
       ...
       148200, 148223, 148265, 148292, 148300, 148354, 148356, 148357, 148380,
       148633],
      dtype='int64', length=2486)


In [50]:
import numpy as np

# Read the pickled LitWD48K entity vocabulary.
# NOTE(review): allow_pickle=True executes pickle on load - only use with
# trusted files.
vocab_e1 = np.load("data/LitWD48K/vocab_e1", allow_pickle=True)

# How many top-level entries does the vocab object hold?
vocab_entry_count = len(vocab_e1)
print(vocab_entry_count)

# Look at the fourth entry (index 3) of the vocab
vocab_fourth_entry = vocab_e1[3]
print(vocab_fourth_entry)

4
{}


In [51]:
import tqdm

def load_literals_and_attr_relations_num(self):
    """Build per-entity numerical-literal features and attribute indicators.

    Reads from ``self``:
        df_literals_num: DataFrame whose column 0 holds the entity, column 1
            the attributive relation and column 2 the literal value.
        entity2id: mapping from entity to integer id; literals whose entity
            is not a key of this mapping are dropped.
        entities: collection of entities; the feature matrices get
            ``len(self.entities) + 2`` rows (presumably two extra reserved
            ids - TODO confirm against the dataset class).

    Side effect:
        ``self.df_literals_num`` is replaced by a filtered copy in which
        column 0 holds entity ids (int), column 1 holds relation ids (int)
        and column 2 holds float values. Calling this twice on the same
        object is therefore not idempotent.

    Returns:
        tuple ``(features_num, features_num_attr)`` of two tensors with shape
        ``(len(self.entities) + 2, R)``, R being the number of unique
        attributive relations seen before filtering:
        * ``features_num``: literal values, min-max scaled per relation
          (entries without a literal are 0 before scaling and take part in
          the min/max).
        * ``features_num_attr``: 1 where the entity has the relation, else 0,
          with the per-relation column mean subtracted.
    """
    # with E = number of embeddings, R = number of attributive relations, V = feature dim
    print('Start loading numerical literals: E x R')
    attr_relations_num_unique = list(self.df_literals_num[1].unique())

    print("Unique numerical attributive relations: ", len(attr_relations_num_unique))

    attr_relation_num_2_id = {rel: idx for idx, rel in enumerate(attr_relations_num_unique)}

    # Drop all literals whose entity is unknown. Work on an explicit copy so
    # the column assignments below never write into a view of the old frame.
    known_entity = self.df_literals_num[0].isin(list(self.entity2id.keys()))
    literals = self.df_literals_num[known_entity].copy()
    # Map entities and attributive relations to ids, literal values to float
    literals[0] = literals[0].map(self.entity2id).astype(int)
    literals[1] = literals[1].map(attr_relation_num_2_id).astype(int)
    literals[2] = literals[2].astype(float)
    self.df_literals_num = literals

    num_rows = len(self.entities) + 2
    num_relations = len(attr_relations_num_unique)
    features_num = torch.zeros(num_rows, num_relations)
    features_num_attr = torch.zeros(num_rows, num_relations)

    # Fill both matrices with one vectorised scatter instead of re-filtering
    # the whole frame once per entity. Ids outside [0, num_rows) are ignored,
    # and for repeated (entity, relation) pairs the last row wins, which is
    # what a sequential row-by-row fill produces.
    in_range = literals[(literals[0] >= 0) & (literals[0] < num_rows)]
    latest = in_range.drop_duplicates(subset=[0, 1], keep="last")
    if len(latest) > 0:
        row_idx = torch.as_tensor(latest[0].to_numpy(dtype="int64", copy=True), dtype=torch.long)
        col_idx = torch.as_tensor(latest[1].to_numpy(dtype="int64", copy=True), dtype=torch.long)
        values = torch.as_tensor(
            latest[2].to_numpy(dtype="float64", copy=True), dtype=features_num.dtype
        )
        features_num[row_idx, col_idx] = values
        # One-hot encoding for attributive relations
        features_num_attr[row_idx, col_idx] = 1

    # Normalize numerical literals (min-max per relation; the epsilon avoids
    # division by zero for constant columns) and center the indicators.
    max_lit, min_lit = torch.max(features_num, dim=0).values, torch.min(features_num, dim=0).values
    features_num = (features_num - min_lit) / (max_lit - min_lit + 1e-8)
    features_num_attr -= features_num_attr.mean(dim=0, keepdim=True)

    return features_num, features_num_attr