# Data handling

In [18]:
import rdflib
import csv
from torch_geometric.data import Data
import torch
import numpy as np
from rdflib.namespace import OWL, RDF, RDFS,XSD, Namespace


In [19]:
prefix = "https://dbpedia.org/ontology/"
possible_types = {}

In [20]:
def _store_dict(dict_path, list_to_store):
    with open(dict_path, 'wt') as file:
        writer = csv.writer(file, delimiter='\t')
        for idx, val in enumerate(list_to_store):
            writer.writerow([idx, val])

In [21]:
def _read_dict_as_list(dict_path):
    with open(dict_path) as tsv:
        list_to_store = []
        reader = csv.reader(tsv, delimiter='\t')
        for row in reader:
            list_to_store.append(row[1])
        return list_to_store

In [22]:
def get_relation_type(relation):
    r_split = relation.split("/")
    return r_split[len(r_split)-1]

In [23]:
ontology = rdflib.Graph()
ontology.parse('data/ontologia.ttl', format='ttl')

<Graph identifier=N6d304ea1371f41a0b39f09323e5041d3 (<class 'rdflib.graph.Graph'>)>

In [24]:
ontology.bind("dbo", Namespace("http://dbpedia.org/ontology/"))
ontology.bind("dbr", Namespace("http://dbpedia.org/resource/"))
ontology.bind("rdfs", Namespace("http://www.w3.org/2000/01/rdf-schema#"))
ontology.bind("owl", Namespace("http://www.w3.org/2002/07/owl#"))
ontology.bind("rdf", Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#"))

In [25]:
def get_possible_types(subj_type, obj_type):
    if (subj_type,obj_type) not in possible_types:
        q = "SELECT DISTINCT ?property WHERE {"+\
        "{ ?property rdfs:domain dbo:"+subj_type+". ?property rdfs:range dbo:"+obj_type+\
        " .} UNION {dbo:"+subj_type +" rdfs:subClassOf ?superclass. dbo:"+obj_type +" rdfs:subClassOf  ?superclass2 ."+\
        "  ?property rdfs:domain ?superclass . ?property rdfs:range ?superclass2 "+\
        "} }"
        result = ontology.query(q)
        results = []
        for res in result:
            results.append(str(res[0]))
        
        q2 = "SELECT DISTINCT ?property WHERE {"+\
        "{dbo:"+subj_type +" rdfs:subClassOf ?superclass. "+\
        " ?property rdfs:domain ?superclass . ?property rdfs:range dbo:"+obj_type+\
        " .} UNION {dbo:"+obj_type +" rdfs:subClassOf  ?superclass2 . ?property rdfs:domain dbo:"+\
        subj_type+" . ?property rdfs:range ?superclass2}}"
        
        result = ontology.query(q2)
        for res in result:
            results.append(str(res[0]))
        
        possible_types[(subj_type,obj_type)] = results
        return results
    return possible_types[(subj_type,obj_type)]

In [26]:
entities_and_type = {}
relations = []
triples = []
triple_properties=[]
# Process the Knowledge Graph
g = rdflib.Graph()
g.parse('data/complete.nt', format='nt')
total = []
for s, p, o in g:
    total.append(str(s)+", "+ str(p) + ", "+ str(o))

    if str(p) != "http://www.w3.org/1999/02/22-rdf-syntax-ns#type":
        if not str(s) in entities_and_type.keys():
            entities_and_type[(str(s))] =[]
        if not str(p) in relations:
            relations.append(str(p))

        if str(o).find('^^') == -1:
            if not str(o) in entities_and_type.keys():
                entities_and_type[str(o)]=[]
            triples.append((s,p,o))
        else:
            triple_properties.append((str(s),str(p),str(o)))
        
    else:
        if str(s) not in entities_and_type.keys():
            entities_and_type[str(s)] =[]
        
        split_o = str(o).split('/')
        entities_and_type[str(s)].append(split_o[len(split_o)-1])
for e in entities_and_type:
    entities_and_type[e].sort()

In [27]:
def disambiguate_multiple_types(s,p,o): 
    
    for subtype_subj in entities_and_type[str(s)]:

        if len(entities_and_type[str(o)]) > 1:
            for subtype_obj in entities_and_type[str(o)]:
                possible_rels = get_possible_types( subtype_subj, subtype_obj)
                if len(possible_rels) == 0:
                    return ("","")    
                for rel in possible_rels:
                    if rel == p:
                        return (subtype_subj, subtype_obj)
        else:
            possible_rels = get_possible_types( subtype_subj, entities_and_type[str(o)][0])
            if len(possible_rels) == 0:
                    return ("","")    
            for rel in possible_rels:
                if rel == p:
                    return (subtype_subj, subtype_obj)
        
    return ("","")    

In [32]:
new_triples=[]
added_types = []
triples.sort()
for s,p,o in triples:
    if len(entities_and_type[str(s)]) > 1:
        new_subj_type, new_obj_type = disambiguate_multiple_types(s,p,o)
        if(new_subj_type, new_obj_type) == ("",""):
            continue
        #print("news", new_subj_type, "newo", new_obj_type, "sub", s, "obj", o)
        if new_subj_type != "" and new_obj_type != "":
            if s not in added_types:
                new_triples.append((s, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",prefix+ new_subj_type[0] ))
                added_types.append(s)
            if o not in added_types:
                new_triples.append((o, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",prefix+ new_obj_type[0] ))
                added_types.append(o)
            new_triples.append((new_subj_type, p, new_obj_type))
    else:  
        if s not in added_types:
            new_triples.append((s, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",prefix+entities_and_type[str(s)][0] ))
            added_types.append(s)
        if o not in added_types:
            new_triples.append((o, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",prefix+entities_and_type[str(o)][0] ))
            added_types.append(o)
        new_triples.append((s, p, o))

new_triples.sort()

http://dbpedia.org/resource/Adam_Sandler ['Actor', 'Person']


AttributeError: can't set attribute

In [None]:
entity_types_count = {}
entities_no_empty = []
for entity in entities_and_type:
    tipo = entities_and_type[entity].lower()
    if tipo != "":
        entity_types_count[tipo] = entity_types_count.get(tipo, 0)+1
        entities_no_empty.append(entity)


# Store entities and relations as dictionaries
_store_dict("entities.dict", entities_no_empty)
_store_dict("relations.dict", relations)

I want to create a dataset with:
- Homo nodes with OHE
- Hetero edges (id for edge types are in a tensor edge_type)

In [None]:
#ONE HOT ENCODER AS DISPATCH TABLE
OHE = {}
for i,key in enumerate(entity_types_count.keys()):
    OHE[key] = [1 if j==i else 0 for j in range(len(entity_types_count.keys()))]

In [None]:
OHE

{'film': [1, 0, 0, 0, 0, 0, 0, 0, 0],
 'director': [0, 1, 0, 0, 0, 0, 0, 0, 0],
 'city': [0, 0, 1, 0, 0, 0, 0, 0, 0],
 'televisionshow': [0, 0, 0, 1, 0, 0, 0, 0, 0],
 'actor': [0, 0, 0, 0, 1, 0, 0, 0, 0],
 'person': [0, 0, 0, 0, 0, 1, 0, 0, 0],
 'productioncompany': [0, 0, 0, 0, 0, 0, 1, 0, 0],
 'country': [0, 0, 0, 0, 0, 0, 0, 1, 0],
 'creativework': [0, 0, 0, 0, 0, 0, 0, 0, 1]}

In [None]:
#entities = _read_dict_as_list("entities.dict")
entities = entities_no_empty

In [None]:
nodes_ohe = {}
for i,entity in enumerate(entities):
    tipo = entities_and_type[str(entity)].lower()
    nodes_ohe[i] = OHE[tipo]

In [None]:
subject_dict = {}
object_dict = {}
for triple in triples:
    s = str(triple[0])
    p = str(triple[1])
    o = str(triple[2])

    if s in entities and o in entities:
        p_type = get_relation_type(p)
        s_type = entities_and_type[s].lower()
        o_type = entities_and_type[o].lower()

        if(s_type != "" and o_type != ""):
            key_t = (s_type, p_type, o_type)
            if key_t in list(subject_dict.keys()):
                subject_dict[key_t].append(entities.index(str(s)))
                object_dict[key_t].append(entities.index(str(o)))
            else:
                subject_dict[key_t] = [entities.index(str(s))]
                object_dict[key_t] = [entities.index(str(o))]

         
        #data[s_type, p_type, o_type].edge_index[0].append(entities.index(str(s)))
        #data[s_type, p_type, o_type].edge_index[1].append(entities.index(str(o)))

In [None]:
data = Data()
#data.x = torch.Tensor([[1] for i in range(len(entities_no_empty))])
#data.x = torch.Tensor([node_feature for i,node_feature in sorted(nodes_ohe.items())])
"""Feature strutturali"""
data.x = torch.Tensor([[] for i in range(len(entities_no_empty))])

In [None]:
data

Data(x=[12225, 9])

In [None]:
lol = [[],[]]
edge_type_name = []
edge_type = []
num_rel=0
for triple in subject_dict.keys():
    lol[0].extend(subject_dict[triple])
    lol[1].extend(object_dict[triple])
    edge_type_name.extend([triple[1] for j in range(len(subject_dict[triple]))])
    edge_type.extend([num_rel for j in range(len(subject_dict[triple]))])
    num_rel+=1

In [None]:
data.edge_index = torch.Tensor(lol).long()
data.edge_type = torch.Tensor(edge_type)

In [None]:
data

Data(x=[12225, 9], edge_index=[2, 20573], edge_type=[20573])

In [None]:
from torch_geometric.transforms import LocalDegreeProfile

transform = LocalDegreeProfile()
data = transform(data)

In [None]:
data

Data(x=[12225, 14], edge_index=[2, 20573], edge_type=[20573])

# R-GCN Implementation

In [None]:
from torch_geometric.nn import RGCNConv

class RGCN(torch.nn.Module):
    def __init__(self,num_rel):
        super().__init__()
        self.conv1 = RGCNConv(data.num_features,4,num_rel) #num_rel := number of edge_types
        self.conv2 = RGCNConv(4,2,num_rel)
        
    def reset_parameters(self):
        self.conv1.reset_parameters()
        self.conv2.reset_parameters()
        
    def forward(self, x, edge_index, edge_type):
        x = self.conv1(x,edge_index,edge_type)
        x = x.relu()
        x = self.conv2(x,edge_index,edge_type)
        return x

# Preprocessing

In [None]:
#Perform random link split
from torch_geometric.transforms import RandomLinkSplit

link_split = RandomLinkSplit(num_val=0.0,num_test=0.25)
train_link, val_link, test_link = link_split(data)

In [None]:
from random import randint

In [None]:
edges_in_test = list([(int(z),int(w)) for z, w in 
                zip(test_link.edge_label_index[0][:test_link.edge_label_index.size(1)//2],
                                                  test_link.edge_label_index[1][:test_link.edge_label_index.size(1)//2])])

In [None]:
edges_in_data = [(int(x),int(y)) for x,y in zip(data.edge_index[0],data.edge_index[1])]

In [None]:
index_of_edges_in_test = [edges_in_data.index(edges_in_test[i]) for i in range(len(edges_in_test))]

In [None]:
edge_type = train_link.edge_type
edge_type_neg = torch.Tensor([randint(0,num_rel-1) for i in range(train_link.edge_label.size(0)//2)])
train_link.edge_label_type = torch.cat((edge_type,edge_type_neg))

In [None]:
edge_type = data.edge_type[index_of_edges_in_test]
edge_type_neg = torch.Tensor([randint(0,num_rel-1) for i in range(test_link.edge_label.size(0)//2)])
test_link.edge_label_type = torch.cat((edge_type,edge_type_neg))

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
rel_weight = torch.nn.Parameter(torch.randn(num_rel, 2))

In [None]:
model = RGCN(num_rel)
model.reset_parameters()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion =  torch.nn.BCEWithLogitsLoss() #change loss function

def train_linkpre():
    
    model.train()
    optimizer.zero_grad()  # Clear gradients.
    out = model(train_link.x, train_link.edge_index, train_link.edge_type)  # Perform a single forward pass.
    
    ### LINK PREDICTION ACTS HERE ###
    
    out_src = out[train_link.edge_label_index[0]] #embedding src nodes
    out_dst = out[train_link.edge_label_index[1]] #embedding dst nodes
    
    # LINK EMBEDDING #
    # 1 - Dot Product
    #out_sim = out_src * out_dst #dotproduct
    #pred = torch.sum(out_sim, dim=-1)
    
    # 2 - Concatenation + linear function
    #out_sim = torch.cat([out_src, out_dst], dim=-1)
    #pred = torch.sum(out_sim,dim=-1)
    
    # 3 - DistMult
    pred = torch.sum(out_src * rel_weight[train_link.edge_label_type.long()]* out_dst, dim=-1)
    
    
    loss = criterion(pred, train_link.edge_label.type_as(pred)) 
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return loss

def test_linkpre(test_link):
    model.eval()
    out = model(test_link.x, test_link.edge_index, test_link.edge_type)
    
    ### LINK PREDICTION ACTS HERE ###
    
    out_src = out[test_link.edge_label_index[0]] #embedding src nodes
    out_dst = out[test_link.edge_label_index[1]] #embedding dst nodes
    
    # LINK EMBEDDING #
    # 1 - Dot Product
    #out_sim = out_src * out_dst
    #h = torch.sum(out_sim, dim=-1)
    
    # 2 - Concatenation + linear function
    #out_sim = torch.cat([out_src, out_dst], dim=-1)
    #h = torch.sum(out_sim,dim=-1)
    
    # 3 - DistMult
    h = torch.sum(out_src * rel_weight[test_link.edge_label_type.long()]* out_dst, dim=-1)
    
    pred_cont = torch.sigmoid(h).cpu().detach().numpy()
    
    # EVALUATION
    test_label = test_link.edge_label.cpu().detach().numpy() #retrieve test set labels
    test_roc_score = roc_auc_score(test_label, pred_cont) #comput AUROC score for test set
    
    return test_roc_score


for epoch in range(1, 801):
    loss = train_linkpre()
    #print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
print('Training end')

Training end


In [None]:
roc_train = test_linkpre(train_link)
roc_test = test_linkpre(test_link)
print(f'Train AUROC: {roc_train:.4f}\nTest AUROC: {roc_test:.4f}')

Train AUROC: 0.9210
Test AUROC: 0.8249
