In [46]:
import rdflib
from rdflib import URIRef
from rdflib.namespace import OWL, RDF, RDFS,XSD, Namespace
import csv
from torch_geometric.data import HeteroData
import torch
import numpy as np
#import tensorflow as tf pytorch tensor != tf tensor

In [47]:
prefix = "https://dbpedia.org/ontology/"
possible_types = {}

In [48]:
def _store_dict(dict_path, list_to_store):
    with open(dict_path, 'wt') as file:
        writer = csv.writer(file, delimiter='\t')
        for idx, val in enumerate(list_to_store):
            writer.writerow([idx, val]) 

In [49]:
def get_relation_type(relation):
    r_split = relation.split("/")
    return r_split[len(r_split)-1]

In [50]:
ontology = rdflib.Graph()
ontology.parse('data/external/complete.nt', format='ttl')

<Graph identifier=Nf9f17730b4e84b8aa6f20909adbffcb0 (<class 'rdflib.graph.Graph'>)>

In [51]:
ontology.bind("dbo", Namespace("http://dbpedia.org/ontology/"))
ontology.bind("dbr", Namespace("http://dbpedia.org/resource/"))
ontology.bind("rdfs", Namespace("http://www.w3.org/2000/01/rdf-schema#"))
ontology.bind("owl", Namespace("http://www.w3.org/2002/07/owl#"))
ontology.bind("rdf", Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#"))

In [52]:
def get_possible_types(subj_type, obj_type):
    if (subj_type,obj_type) not in possible_types:
        q = "SELECT DISTINCT ?property WHERE {"+\
        "{ ?property rdfs:domain dbo:"+subj_type+". ?property rdfs:range dbo:"+obj_type+\
        " .} UNION {dbo:"+subj_type +" rdfs:subClassOf ?superclass. dbo:"+obj_type +" rdfs:subClassOf  ?superclass2 ."+\
        "  ?property rdfs:domain ?superclass . ?property rdfs:range ?superclass2 "+\
        "} }"
        result = ontology.query(q)
        results = []
        for res in result:
            results.append(str(res[0]))
        
        q2 = "SELECT DISTINCT ?property WHERE {"+\
        "{dbo:"+subj_type +" rdfs:subClassOf ?superclass. "+\
        " ?property rdfs:domain ?superclass . ?property rdfs:range dbo:"+obj_type+\
        " .} UNION {dbo:"+obj_type +" rdfs:subClassOf  ?superclass2 . ?property rdfs:domain dbo:"+\
        subj_type+" . ?property rdfs:range ?superclass2}}"
        
        result = ontology.query(q2)
        for res in result:
            results.append(str(res[0]))
        
        possible_types[(subj_type,obj_type)] = results
        return results
    return possible_types[(subj_type,obj_type)]

In [53]:
entities_and_type = {}
relations = []
triples = []
triple_properties=[]
# Process the Knowledge Graph
g = rdflib.Graph()
g.parse('data/external/complete.nt', format='nt')
total = []
for s, p, o in g:
    total.append(str(s)+", "+ str(p) + ", "+ str(o))

    if str(p) != "http://www.w3.org/1999/02/22-rdf-syntax-ns#type":
        if not str(s) in entities_and_type.keys():
            entities_and_type[(str(s))] =[]
        if not str(p) in relations:
            relations.append(str(p))

        if str(o).find('^^') == -1:
            if not str(o) in entities_and_type.keys():
                entities_and_type[str(o)]=[]
            triples.append((s,p,o))
        else:
            triple_properties.append((str(s),str(p),str(o)))
        
    else:
        if str(s) not in entities_and_type.keys():
            entities_and_type[str(s)] =[]
        
        split_o = str(o).split('/')
        entities_and_type[str(s)].append(split_o[len(split_o)-1])
for e in entities_and_type:
    entities_and_type[e].sort()

In [54]:
def disambiguate_multiple_types(s,p,o): 
    
    for subtype_subj in entities_and_type[str(s)]:

        if len(entities_and_type[str(o)]) > 1:
            for subtype_obj in entities_and_type[str(o)]:
                possible_rels = get_possible_types( subtype_subj, subtype_obj)
                if len(possible_rels) == 0:
                    return ("","")    
                for rel in possible_rels:
                    if rel == p:
                        return (subtype_subj, subtype_obj)
        else:
            possible_rels = get_possible_types( subtype_subj, entities_and_type[str(o)][0])
            if len(possible_rels) == 0:
                    return ("","")    
            for rel in possible_rels:
                if rel == p:
                    return (subtype_subj, subtype_obj)
        
    return ("","")    

In [55]:
new_triples=[]
added_types = []
triples.sort()
for s,p,o in triples:
    if len(entities_and_type[str(s)]) > 1:
        new_subj_type, new_obj_type = disambiguate_multiple_types(s,p,o)
        if(new_subj_type, new_obj_type) == ("",""):
            continue
        #print("news", new_subj_type, "newo", new_obj_type, "sub", s, "obj", o)
        if new_subj_type != "" and new_obj_type != "":
            if s not in added_types:
                new_triples.append((s, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",prefix+ new_subj_type[0] ))
                added_types.append(s)
            if o not in added_types:
                new_triples.append((o, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",prefix+ new_obj_type[0] ))
                added_types.append(o)
            new_triples.append((new_subj_type, p, new_obj_type))
    else:  
        if s not in added_types:
            new_triples.append((s, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",prefix+entities_and_type[str(s)][0] ))
            added_types.append(s)
        if o not in added_types:
            new_triples.append((o, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",prefix+entities_and_type[str(o)][0] ))
            added_types.append(o)
        new_triples.append((s, p, o))

new_triples.sort()

KeyboardInterrupt: 

In [None]:
entity_types_count = {}
entities = []
for entity in entities_and_type.keys():
    tipo = entities_and_type[entity][0]
    if tipo != "":
        entity_types_count[tipo] = entity_types_count.get(tipo, 0)+1
        entities.append(entity)

#_store_dict("entities.dict", entities)
#_store_dict("relations.dict", relations)

In [None]:
subject_dict = {}
object_dict = {}

index_dict = {t:{'count': 0} for t in entity_types_count.keys()}
new_triples.sort()
for triple in new_triples:
    s = str(triple[0])
    p = str(triple[1])
    o = str(triple[2])

    if s in entities and o in entities:
        p_type = get_relation_type(p)
        s_type = entities_and_type[s][0]
        o_type = entities_and_type[o][0]

        if(s_type != "" and o_type != ""):
            key_t = (s_type, p_type, o_type)
            
            if key_t not in subject_dict.keys():
                subject_dict[key_t] = []
                object_dict[key_t] = []
                
            if str(s) not in index_dict[s_type]:
                index_dict[s_type][str(s)] = index_dict[s_type]['count']
                index_dict[s_type]['count'] = index_dict[s_type]['count']+1
            s_index = index_dict[s_type][str(s)]
                
            if str(o) not in index_dict[o_type]:
                index_dict[o_type][str(o)] = index_dict[o_type]['count']
                index_dict[o_type]['count'] = index_dict[o_type]['count']+1
            o_index = index_dict[o_type][str(o)]
                
            subject_dict[key_t].append(s_index)
            object_dict[key_t].append(o_index)
         
        #data[s_type, p_type, o_type].edge_index[0].append(entities.index(str(s)))
        #data[s_type, p_type, o_type].edge_index[1].append(entities.index(str(o)))

In [None]:
data = HeteroData()
types = list(entity_types_count.keys())
for t in types:
    data[t].x = torch.Tensor([[1] for i in range(entity_types_count[t])])

In [None]:
for triple in subject_dict.keys():
    lol = [subject_dict[triple], object_dict[triple]]
    data[triple[0], triple[1], triple[2]].edge_index = torch.Tensor(lol).long()

In [None]:
data

HeteroData(
  [1mTelevisionShow[0m={ x=[1107, 1] },
  [1mDirector[0m={ x=[2451, 1] },
  [1mProductionCompany[0m={ x=[717, 1] },
  [1mActor[0m={ x=[7930, 1] },
  [1mCity[0m={ x=[1515, 1] },
  [1mFilm[0m={ x=[1660, 1] },
  [1mPerson[0m={ x=[1365, 1] },
  [1mCreativeWork[0m={ x=[2209, 1] },
  [1mCountry[0m={ x=[338, 1] },
  [1mLocation[0m={ x=[2, 1] },
  [1m(Actor, bornIn, City)[0m={ edge_index=[2, 3811] },
  [1m(Actor, starring, Film)[0m={ edge_index=[2, 7245] },
  [1m(TelevisionShow, createdBy, Director)[0m={ edge_index=[2, 1700] },
  [1m(Film, cinematography, Person)[0m={ edge_index=[2, 1025] },
  [1m(Film, directedBy, Director)[0m={ edge_index=[2, 1321] },
  [1m(Film, producedBy, ProductionCompany)[0m={ edge_index=[2, 1840] },
  [1m(Film, editing, Person)[0m={ edge_index=[2, 716] },
  [1m(ProductionCompany, headquarter, Country)[0m={ edge_index=[2, 142] },
  [1m(ProductionCompany, headquarter, City)[0m={ edge_index=[2, 353] },
  [1m(Film, directe

In [None]:
edge_types = list(data.edge_index_dict.keys())

In [None]:
from torch_geometric.nn import SAGEConv, to_hetero, GATConv

class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GATConv((-1, -1), hidden_channels)
        self.conv2 = GATConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


model = GNN(hidden_channels=4, out_channels=2)
model = to_hetero(model, data.metadata(), aggr='sum')

torch.save(model.state_dict(), 'model_weights.pth')

  f"There exist node types ({unused_node_types}) whose "


In [None]:
from torch_geometric.transforms import RandomLinkSplit

link_split = RandomLinkSplit(num_val=0.0,
                             num_test=0.25,
                             edge_types=edge_types,
                             rev_edge_types=[None]*len(edge_types))
train_link, val_link, test_link = link_split(data)

In [None]:
with torch.no_grad():  # Initialize lazy modules.
    out = model(data.x_dict,data.edge_index_dict)

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion =  torch.nn.BCEWithLogitsLoss() #change loss function

def train_hetlinkpre():
    model.train()
    optimizer.zero_grad()  # Clear gradients.
    out = model(train_link.x_dict, train_link.edge_index_dict)  # Perform a single forward pass.
    preds = torch.Tensor()
    edge_labels = torch.Tensor()
    ### LINK PREDICTION ACTS HERE ###
    for edge_t in edge_types:
        #Compute link embedding for each edge type
        #for src in train_link[edge_t].edge_label_index[0]:
        out_src = out[edge_t[0]][train_link[edge_t].edge_label_index[0]]#embedding src nodes
        out_dst = out[edge_t[2]][train_link[edge_t].edge_label_index[1]] #embedding dst nodes
        
        # LINK EMBEDDING #
        # 1 - Dot Product
        out_sim = out_src * out_dst #dotproduct
        pred = torch.sum(out_sim, dim=-1)
        
        preds = torch.cat((preds,pred),-1)
        edge_labels = torch.cat((edge_labels,train_link[edge_t].edge_label.type_as(pred)),-1)
    
        
    #compute loss function based on all edge types
    loss = criterion(preds, edge_labels)
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return loss

In [None]:
from sklearn.metrics import roc_auc_score

def test_hetlinkpre(test_link):
    model.eval()
    out = model(test_link.x_dict, test_link.edge_index_dict)
    
    ### LINK PREDICTION ACTS HERE ###
    
    hs = torch.Tensor()
    edge_labels = np.array([])
    ### LINK PREDICTION ACTS HERE ###
    for edge_t in edge_types:
        #Compute link embedding for each edge type
        #for src in train_link[edge_t].edge_label_index[0]:
        out_src = out[edge_t[0]][test_link[edge_t].edge_label_index[0]]#embedding src nodes
        out_dst = out[edge_t[2]][test_link[edge_t].edge_label_index[1]] #embedding dst nodes
        
        # LINK EMBEDDING #
        # 1 - Dot Product
        out_sim = out_src * out_dst #dotproduct
        h = torch.sum(out_sim, dim=-1)
        
        hs = torch.cat((hs,h),-1)
        edge_labels = np.concatenate((edge_labels,test_link[edge_t].edge_label.cpu().detach().numpy()))
    
    
    pred_cont = torch.sigmoid(hs).cpu().detach().numpy()
    
    # EVALUATION
    test_roc_score = roc_auc_score(edge_labels, pred_cont) #comput AUROC score for test set
    
    return test_roc_score

In [None]:
for epoch in range(1,1001):
    loss = train_hetlinkpre()
    #print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

In [None]:
roc_train = test_hetlinkpre(train_link)
roc_test = test_hetlinkpre(test_link)
print(f'Train AUROC: {roc_train:.4f}\nTest AUROC: {roc_test:.4f}')

Train AUROC: 0.7434
Test AUROC: 0.6782


In [None]:
#torch.save(model.state_dict(), 'model_weights.pt')

In [None]:
def test_hetscores(test_link):
    model.eval()
    out = model(test_link.x_dict, test_link.edge_index_dict)
    
    ### LINK PREDICTION ACTS HERE ###
    
    hs = torch.Tensor()
    ### LINK PREDICTION ACTS HERE ###
    for edge_t in test_link.edge_index_dict.keys():
        #Compute link embedding for each edge type
        #for src in train_link[edge_t].edge_label_index[0]:
        out_src = out[edge_t[0]][test_link[edge_t].edge_index[0]]#embedding src nodes
        out_dst = out[edge_t[2]][test_link[edge_t].edge_index[1]] #embedding dst nodes
        
        # LINK EMBEDDING #
        # 1 - Dot Product
        out_sim = out_src * out_dst #dotproduct
        h = torch.sum(out_sim, dim=-1)
        
        hs = torch.cat((hs,h),-1)
    
    
    pred_cont = torch.sigmoid(hs).cpu().detach().numpy()
    
    return pred_cont

In [None]:
test_data = HeteroData()
relations_weights={}
for triple in data.edge_index_dict.keys():
    for triple2 in data.edge_index_dict.keys():
        test_data[triple2].edge_index = torch.Tensor([[],[]]).long()
        test_data[triple2[0]].x = torch.Tensor([[1]])
        test_data[triple2[2]].x = torch.Tensor([[1]])
    test_data[triple[0]].x = torch.Tensor([[1]])
    test_data[triple[2]].x = torch.Tensor([[1]])
    test_data[triple].edge_index = torch.Tensor([[0],[0]]).long()
    weight = test_hetscores(test_data)[0]
    relations_weights[triple] = weight
    #print(f'{triple}: {relations_weights}')

In [None]:
'''
import networkx as nx
closure = nx.read_edgelist("/home/sara/Desktop/fase2/git_repo/knowledge-graph-learning/closure", data=(('label',str), ('weight', int)))

#semantic_model = nx.read_edgelist("/home/sara/Desktop/fase2/git_repo/knowledge-graph-learning/semantic_model", data=(('label',str),))
leafs = []
with open("/home/sara/Desktop/fase2/git_repo/knowledge-graph-learning/leafs", "r") as f:
    for line in f.readlines():
        leafs.append(line.strip("\n"))


import json
import approximation
from networkx.readwrite import json_graph

def graph_to_json(graph):
    data1 = json_graph.node_link_data(graph)
    s2 = json.dumps(
        data1
    )
    return s2


'''

'\nimport networkx as nx\nclosure = nx.read_edgelist("/home/sara/Desktop/fase2/git_repo/knowledge-graph-learning/closure", data=((\'label\',str), (\'weight\', int)))\n\n#semantic_model = nx.read_edgelist("/home/sara/Desktop/fase2/git_repo/knowledge-graph-learning/semantic_model", data=((\'label\',str),))\nleafs = []\nwith open("/home/sara/Desktop/fase2/git_repo/knowledge-graph-learning/leafs", "r") as f:\n    for line in f.readlines():\n        leafs.append(line.strip("\n"))\n\n\nimport json\nimport approximation\nfrom networkx.readwrite import json_graph\n\ndef graph_to_json(graph):\n    data1 = json_graph.node_link_data(graph)\n    s2 = json.dumps(\n        data1\n    )\n    return s2\n\n\n'