# Data handling

In [1]:
import rdflib
import csv
from torch_geometric.data import Data
import torch
import numpy as np

In [2]:
def _store_dict(dict_path, list_to_store):
    with open(dict_path, 'wt') as file:
        writer = csv.writer(file, delimiter='\t')
        for idx, val in enumerate(list_to_store):
            writer.writerow([idx, val])

In [3]:
def _read_dict_as_list(dict_path):
    with open(dict_path) as tsv:
        list_to_store = []
        reader = csv.reader(tsv, delimiter='\t')
        for row in reader:
            list_to_store.append(row[1])
        return list_to_store

In [4]:
def get_relation_type(relation):
    r_split = relation.split("/")
    return r_split[len(r_split)-1]

In [5]:
entities_and_type = {}
relations = []
triples = []
# Process the Knowledge Graph
g = rdflib.Graph()
g.parse('knowledge_graph.ttl', format='turtle')

for s, p, o in g:
    if str(o).find('https://') != -1 or str(o).find('http://') != -1:
        if str(p) != "http://www.w3.org/1999/02/22-rdf-syntax-ns#type":
            if not str(s) in entities_and_type:
                entities_and_type[(str(s))] =""
            if not str(p) in relations:
                relations.append(str(p))
            if not str(o) in entities_and_type:
                entities_and_type[str(o)]=""
            
            triples.append((s,p,o))
        else:
            if str(s) in entities_and_type.keys():
                split_o = str(o).split('/')
                entities_and_type[str(s)]= split_o[len(split_o)-1]

In [6]:
entity_types_count = {}
entities_no_empty = []
for entity in entities_and_type:
    tipo = entities_and_type[entity].lower()
    if tipo != "":
        entity_types_count[tipo] = entity_types_count.get(tipo, 0)+1
        entities_no_empty.append(entity)


# Store entities and relations as dictionaries
_store_dict("entities.dict", entities_no_empty)
_store_dict("relations.dict", relations)

I want to create a dataset with:
- Homo nodes with OHE
- Hetero edges (id for edge types are in a tensor edge_type)

In [7]:
#ONE HOT ENCODER AS DISPATCH TABLE
OHE = {}
for i,key in enumerate(entity_types_count.keys()):
    OHE[key] = [1 if j==i else 0 for j in range(len(entity_types_count.keys()))]

In [8]:
OHE

{'film': [1, 0, 0, 0, 0, 0, 0, 0, 0],
 'director': [0, 1, 0, 0, 0, 0, 0, 0, 0],
 'city': [0, 0, 1, 0, 0, 0, 0, 0, 0],
 'televisionshow': [0, 0, 0, 1, 0, 0, 0, 0, 0],
 'actor': [0, 0, 0, 0, 1, 0, 0, 0, 0],
 'person': [0, 0, 0, 0, 0, 1, 0, 0, 0],
 'productioncompany': [0, 0, 0, 0, 0, 0, 1, 0, 0],
 'country': [0, 0, 0, 0, 0, 0, 0, 1, 0],
 'creativework': [0, 0, 0, 0, 0, 0, 0, 0, 1]}

In [9]:
#entities = _read_dict_as_list("entities.dict")
entities = entities_no_empty

In [10]:
nodes_ohe = {}
for i,entity in enumerate(entities_no_empty):
    tipo = entities_and_type[str(entity)].lower()
    nodes_ohe[i] = OHE[tipo]

In [11]:
subject_dict = {}
object_dict = {}
for triple in triples:
    s = str(triple[0])
    p = str(triple[1])
    o = str(triple[2])

    if s in entities and o in entities:
        p_type = get_relation_type(p)
        s_type = entities_and_type[s].lower()
        o_type = entities_and_type[o].lower()

        if(s_type != "" and o_type != ""):
            key_t = (s_type, p_type, o_type)
            if key_t in list(subject_dict.keys()):
                subject_dict[key_t].append(entities.index(str(s)))
                object_dict[key_t].append(entities.index(str(o)))
            else:
                subject_dict[key_t] = [entities.index(str(s))]
                object_dict[key_t] = [entities.index(str(o))]

         
        #data[s_type, p_type, o_type].edge_index[0].append(entities.index(str(s)))
        #data[s_type, p_type, o_type].edge_index[1].append(entities.index(str(o)))

In [12]:
data = Data()
#data.x = torch.Tensor([[1] for i in range(len(entities_no_empty))])
#data.x = torch.Tensor([node_feature for i,node_feature in sorted(nodes_ohe.items())])
"""Feature strutturali"""
data.x = torch.Tensor([[] for i in range(len(entities_no_empty))])

In [13]:
data

Data(x=[12225, 9])

In [14]:
lol = [[],[]]
edge_type_name = []
edge_type = []
num_rel=0
for triple in subject_dict.keys():
    lol[0].extend(subject_dict[triple])
    lol[1].extend(object_dict[triple])
    edge_type_name.extend([triple[1] for j in range(len(subject_dict[triple]))])
    edge_type.extend([num_rel for j in range(len(subject_dict[triple]))])
    num_rel+=1

In [15]:
data.edge_index = torch.Tensor(lol).long()
data.edge_type = torch.Tensor(edge_type)

In [16]:
data

Data(x=[12225, 9], edge_index=[2, 20573], edge_type=[20573])

In [17]:
from torch_geometric.transforms import LocalDegreeProfile

transform = LocalDegreeProfile()
data = transform(data)

In [18]:
data

Data(x=[12225, 14], edge_index=[2, 20573], edge_type=[20573])

# R-GCN Implementation

In [19]:
from torch_geometric.nn import RGCNConv

class RGCN(torch.nn.Module):
    def __init__(self,num_rel):
        super().__init__()
        self.conv1 = RGCNConv(data.num_features,4,num_rel) #num_rel := number of edge_types
        self.conv2 = RGCNConv(4,2,num_rel)
        
    def reset_parameters(self):
        self.conv1.reset_parameters()
        self.conv2.reset_parameters()
        
    def forward(self, x, edge_index, edge_type):
        x = self.conv1(x,edge_index,edge_type)
        x = x.relu()
        x = self.conv2(x,edge_index,edge_type)
        return x

# Preprocessing

In [20]:
#Perform random link split
from torch_geometric.transforms import RandomLinkSplit

link_split = RandomLinkSplit(num_val=0.0,num_test=0.25)
train_link, val_link, test_link = link_split(data)

In [21]:
from random import randint

In [22]:
edges_in_test = list([(int(z),int(w)) for z, w in 
                zip(test_link.edge_label_index[0][:test_link.edge_label_index.size(1)//2],
                                                  test_link.edge_label_index[1][:test_link.edge_label_index.size(1)//2])])

In [23]:
edges_in_data = [(int(x),int(y)) for x,y in zip(data.edge_index[0],data.edge_index[1])]

In [24]:
index_of_edges_in_test = [edges_in_data.index(edges_in_test[i]) for i in range(len(edges_in_test))]

In [25]:
edge_type = train_link.edge_type
edge_type_neg = torch.Tensor([randint(0,num_rel-1) for i in range(train_link.edge_label.size(0)//2)])
train_link.edge_label_type = torch.cat((edge_type,edge_type_neg))

In [26]:
edge_type = data.edge_type[index_of_edges_in_test]
edge_type_neg = torch.Tensor([randint(0,num_rel-1) for i in range(test_link.edge_label.size(0)//2)])
test_link.edge_label_type = torch.cat((edge_type,edge_type_neg))

In [27]:
from sklearn.metrics import roc_auc_score

In [28]:
rel_weight = torch.nn.Parameter(torch.randn(num_rel, 2))

In [29]:
model = RGCN(num_rel)
model.reset_parameters()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion =  torch.nn.BCEWithLogitsLoss() #change loss function

def train_linkpre():
    
    model.train()
    optimizer.zero_grad()  # Clear gradients.
    out = model(train_link.x, train_link.edge_index, train_link.edge_type)  # Perform a single forward pass.
    
    ### LINK PREDICTION ACTS HERE ###
    
    out_src = out[train_link.edge_label_index[0]] #embedding src nodes
    out_dst = out[train_link.edge_label_index[1]] #embedding dst nodes
    
    # LINK EMBEDDING #
    # 1 - Dot Product
    #out_sim = out_src * out_dst #dotproduct
    #pred = torch.sum(out_sim, dim=-1)
    
    # 2 - Concatenation + linear function
    #out_sim = torch.cat([out_src, out_dst], dim=-1)
    #pred = torch.sum(out_sim,dim=-1)
    
    # 3 - DistMult
    pred = torch.sum(out_src * rel_weight[train_link.edge_label_type.long()]* out_dst, dim=-1)
    
    
    loss = criterion(pred, train_link.edge_label.type_as(pred)) 
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return loss

def test_linkpre(test_link):
    model.eval()
    out = model(test_link.x, test_link.edge_index, test_link.edge_type)
    
    ### LINK PREDICTION ACTS HERE ###
    
    out_src = out[test_link.edge_label_index[0]] #embedding src nodes
    out_dst = out[test_link.edge_label_index[1]] #embedding dst nodes
    
    # LINK EMBEDDING #
    # 1 - Dot Product
    #out_sim = out_src * out_dst
    #h = torch.sum(out_sim, dim=-1)
    
    # 2 - Concatenation + linear function
    #out_sim = torch.cat([out_src, out_dst], dim=-1)
    #h = torch.sum(out_sim,dim=-1)
    
    # 3 - DistMult
    h = torch.sum(out_src * rel_weight[test_link.edge_label_type.long()]* out_dst, dim=-1)
    
    pred_cont = torch.sigmoid(h).cpu().detach().numpy()
    
    # EVALUATION
    test_label = test_link.edge_label.cpu().detach().numpy() #retrieve test set labels
    test_roc_score = roc_auc_score(test_label, pred_cont) #comput AUROC score for test set
    
    return test_roc_score


for epoch in range(1, 801):
    loss = train_linkpre()
    #print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
print('Training end')

Training end


In [30]:
roc_train = test_linkpre(train_link)
roc_test = test_linkpre(test_link)
print(f'Train AUROC: {roc_train:.4f}\nTest AUROC: {roc_test:.4f}')

Train AUROC: 0.9210
Test AUROC: 0.8249
