In [1]:
import rdflib
from rdflib import URIRef
from rdflib.namespace import OWL, RDF, RDFS,XSD, Namespace
import csv
from torch_geometric.data import HeteroData
import torch
import numpy as np
#import tensorflow as tf pytorch tensor != tf tensor

In [2]:
prefix = "https://dbpedia.org/ontology/"
possible_types = {}

In [3]:
def get_type(relation):
    r_split = relation.split("/")
    return r_split[len(r_split)-1]

In [4]:
def get_property_type(property):
    split_p = property.split("^^")
    p_type = str(split_p[1].split("#")[1]).lower()
    
    if p_type.startswith("xsd:integer"):
        return("Integer", split_p[0])
    if p_type.startswith("xsd:string"):
        return("String", split_p[0])
    if p_type.startswith("xsd:double"):
        return("Double", split_p[0])
    if p_type.startswith("xsd:gYear"):
        return("Year",split_p[0])
    if p_type.startswith("xsd:date"):
        return("Date",split_p[0])
    return ("","")

In [5]:
ontology = rdflib.Graph()
ontology.parse('/home/sara/Desktop/fase2/git_repo/knowledge-graph-learning/data/external/ontologia.ttl', format='ttl')

<Graph identifier=Nfb07014494254c1b85e8b2816d76338b (<class 'rdflib.graph.Graph'>)>

In [6]:
ontology.bind("dbo", Namespace("http://dbpedia.org/ontology/"))
ontology.bind("dbr", Namespace("http://dbpedia.org/resource/"))
ontology.bind("rdfs", Namespace("http://www.w3.org/2000/01/rdf-schema#"))
ontology.bind("owl", Namespace("http://www.w3.org/2002/07/owl#"))
ontology.bind("rdf", Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#"))

In [15]:
def get_possible_types(subj_type, obj_type):
    if (subj_type,obj_type) not in possible_types:
        q = "SELECT DISTINCT ?property WHERE {"+\
        "{ ?property rdfs:domain dbo:"+subj_type+". ?property rdfs:range dbo:"+obj_type+\
        " .} UNION {dbo:"+subj_type +" rdfs:subClassOf ?superclass. dbo:"+obj_type +" rdfs:subClassOf  ?superclass2 ."+\
        "  ?property rdfs:domain ?superclass . ?property rdfs:range ?superclass2 "+\
        "} }"
        result = ontology.query(q)
        results = []
        for res in result:
            results.append(str(res[0]))
        
        q2 = "SELECT DISTINCT ?property WHERE {"+\
        "{dbo:"+subj_type +" rdfs:subClassOf ?superclass. "+\
        " ?property rdfs:domain ?superclass . ?property rdfs:range dbo:"+obj_type+\
        " .} UNION {dbo:"+obj_type +" rdfs:subClassOf  ?superclass2 . ?property rdfs:domain dbo:"+\
        subj_type+" . ?property rdfs:range ?superclass2}}"
        
        result = ontology.query(q2)
        for res in result:
            results.append(str(res[0]))
        
        possible_types[(subj_type,obj_type)] = results
        return results
    return possible_types[(subj_type,obj_type)]

In [17]:
def disambiguate_multiple_types(entities_and_type, s,p,o): 
    
    for subtype_subj in entities_and_type[str(s)]:

        if len(entities_and_type[str(o)]) > 1:
            for subtype_obj in entities_and_type[str(o)]:
                possible_rels = get_possible_types( subtype_subj, subtype_obj)
                if len(possible_rels) == 0:
                    return ("","")    
                for rel in possible_rels:
                    if rel == p:
                        return (subtype_subj, subtype_obj)
        else:
            possible_rels = get_possible_types( subtype_subj, entities_and_type[str(o)][0])
            if len(possible_rels) == 0:
                    return ("","")    
            for rel in possible_rels:
                if rel == p:
                    return (subtype_subj, subtype_obj)
        
    return ("","")    

In [18]:
def get_class_from_property():
    Q = " SELECT ?property ?class WHERE {?property rdfs:domain ?class; rdf:type owl:DatatypeProperty. }"
    results = {}
    result = ontology.query(Q)
    for res in result:
        results[get_type(str(res[0]))]= get_type(str(res[1]))
    return results

In [19]:
def get_classes_types(prop_and_type):
    class_from_property = get_class_from_property()
    new_properties_and_types = {}
    for s in list(prop_and_type.keys()):
        for element in prop_and_type[s]:
            prop = element[0] 
            p_type = element[1]
            p_value = element[2] 
            
            if prop in class_from_property:
                if s not in new_properties_and_types:
                    new_properties_and_types[s] = []
                new_properties_and_types[s].append((class_from_property[prop], prop, p_type, p_value))
    
    return new_properties_and_types

In [30]:
entities_and_type = {}
properties_and_types ={}
relations = []
triples = []
use_properties = True
# Process the Knowledge Graph
g = rdflib.Graph()
g.parse("/home/sara/Desktop/fase2/git_repo/knowledge-graph-learning/data/external/prova_c.nt", format='nt')
for s, p, o in g:
    str_s = str(s)
    str_p = str(p)
    str_o = str(o)
    if str_p != str(RDF.type):
        if not str_s in entities_and_type.keys():
            entities_and_type[(str_s)] =[]
        if not str_p in relations:
            relations.append(str_p)

        if str_o.find('^^') == -1:
            if not str_o in entities_and_type.keys():
                entities_and_type[str_o]=[]
            triples.append((str_s,str_p,str_o))
        else:
            if use_properties:
                if str_s not in properties_and_types.keys():
                    properties_and_types[str_s] =[]
                p_type, p_value = get_property_type(str_o)
                if (str_s,p_type, p_value) not in properties_and_types[str_s]:
                    properties_and_types[str_s].append((get_type(str_p), p_type, p_value))
                triples.append((str_s,str_p,str_o))
    else:
        if str_s not in entities_and_type.keys():
            entities_and_type[str_s] =[]
        triples.append((str_s,str_p,str_o))
        split_o = str_o.split('/')
        entities_and_type[str_s].append(split_o[len(split_o)-1])

for e in entities_and_type:
    entities_and_type[e].sort()

properties_and_types = get_classes_types(properties_and_types)


In [31]:
new_triples = []
added_types = []

for s,p,o in triples:
    s1 = str(s)
    p1 = str(p)
    o1 = str(o)
    insert_type = True

    if p != str(RDF.type):
        if str(s) in list(properties_and_types.keys()):
            #x = properties_and_type[str(s)]
            new_triples.append((s, p, o))
        if str(s) in list(entities_and_type.keys()) and str(o) in list(entities_and_type.keys()):
            if len(entities_and_type[str(s)]) > 1:
                new_subj_type, new_obj_type = disambiguate_multiple_types(entities_and_type,s,p,o)
                if(new_subj_type, new_obj_type) == ("",""):
                    continue
                #print("news", new_subj_type, "newo", new_obj_type, "sub", s, "obj", o)
                if new_subj_type != "" and new_obj_type != "":
                    if s not in added_types:
                        new_triples.append((s, str(RDF.type),prefix+ new_subj_type[0] ))
                        added_types.append(s)
                    if o not in added_types:
                        new_triples.append((o,str(RDF.type),prefix+ new_obj_type[0] ))
                        added_types.append(o)
                    new_triples.append((new_subj_type, p, new_obj_type))
                insert_type = False
            else: 
                if s not in added_types:
                    new_triples.append((s, str(RDF.type),prefix+entities_and_type[str(s)][0] ))
                    added_types.append(s)
                if o not in added_types and str(o).find("^^") == -1:
                    new_triples.append((o, str(RDF.type),prefix+entities_and_type[str(o)][0] ))
                    added_types.append(o)
                new_triples.append((s, p, o))
            insert_type = False
    else:
        if insert_type:
            new_triples.append((s, p, o))
            insert_type = True

In [33]:
entity_types_count = {}
property_types_count = {}
entities = []
for entity in entities_and_type.keys():
    tipo = entities_and_type[entity][0]
    if tipo != "":
        entity_types_count[tipo] = entity_types_count.get(tipo, 0)+1
        entities.append(entity)

for subj in properties_and_types.keys():
    for class_name, prop_name, prop_type, prop_value in properties_and_types[subj]:
        property_types_count[(class_name, subj, prop_name, prop_type)] = property_types_count.get((class_name, subj, prop_name,prop_type), 0)+1


In [34]:
subject_dict = {}
object_dict = {}

index_dict = {t:{'count': 0} for t in entity_types_count.keys()}

for class_name, subject,rel, p_type in property_types_count.keys():
    index_dict[p_type] = {'count':0}
    if class_name not in index_dict.keys():
        index_dict[class_name] = {'count':0}  
        
new_triples.sort()
for triple in new_triples:
    s = str(triple[0])
    p = str(triple[1])
    o = str(triple[2])
    type_triples = []
    s_type = entities_and_type[s][0] 
    
    if p != str(RDF.type):
        if o.find("^^") == -1:
            p_type = get_type(p)
            o_type = entities_and_type[o][0]
            type_triples.append((s_type,p_type, o_type))

        else: 
            for properties in properties_and_types[s]:
                s_type = properties[0]
                p_type = get_type(properties[1])
                o_type = properties[2]
                type_triples.append((s_type,p_type, o_type))

        for s_type,p_type,o_type in type_triples:
            if(s_type != "" and o_type != ""):
                key_t = (s_type, p_type, o_type)
                
                if key_t not in subject_dict.keys():
                    subject_dict[key_t] = []
                    object_dict[key_t] = []
                    
                if str(s) not in index_dict[s_type]:
                    index_dict[s_type][str(s)] = index_dict[s_type]['count']
                    index_dict[s_type]['count'] = index_dict[s_type]['count']+1
                s_index = index_dict[s_type][str(s)]
                    
                if str(o) not in index_dict[o_type]:
                    index_dict[o_type][str(o)] = index_dict[o_type]['count']
                    index_dict[o_type]['count'] = index_dict[o_type]['count']+1
                o_index = index_dict[o_type][str(o)]
                    
                subject_dict[key_t].append(s_index)
                object_dict[key_t].append(o_index)
        

In [36]:
from langdetect import detect
from dateutil.parser import parse
import datetime, string

def function_build_feature(p_type, value):
    #return [5] così funziona perchè è numerico
    count = lambda l1, l2: len(list(filter(lambda c: c in l2, l1)))

    #aggiungere funzione x riconscere le date
    if p_type == 'Integer':
        try: i = int(value) 
        except: i = 0
        return [i]
    if p_type == 'Double':
        try: d = float(value)
        except: d = float(0.0)
        return [d]
    if p_type == 'gYear':
        return [int(1970-value)]
    if p_type == 'String':
        a_punct = count(value, string.punctuation)
        lang = 0
        try:
            if detect(value) == 'en': lang = 1
        except:
            lang = 0
        return [len(value), value.count(" ") , value.count("(") + value.count(")"), lang, a_punct]
    if p_type == 'Date':
        return [(parse(value) - datetime.datetime(1970,1,1)).days]
    return ""

In [37]:
complete_data = HeteroData()

data_to_insert = {}
for subj in list(properties_and_types.keys()):
    for class_type, prop_name, prop_type, prop_value in properties_and_types[subj]:
        if prop_type not in data_to_insert:
            data_to_insert[prop_type] = []
        else:
            for i in range(property_types_count[(class_type, subj, prop_name, prop_type)]):
                data_to_insert[prop_type].append(function_build_feature(prop_type, prop_value))


types = list(entity_types_count.keys())
for t in types:
    data_to_insert[t] = [[1] for i in range(entity_types_count[t])]

for key in data_to_insert.keys():
    lists = data_to_insert[key]
    if lists != '':
        complete_data[key].x = torch.Tensor(lists)

#property_types_count[(property, prop_name,prop_type)] 
#properties_and_types[str(s)].append((str(p), p_type, p_value))

for triple in subject_dict.keys():
    lol = [subject_dict[triple], object_dict[triple]]
    complete_data[triple[0], triple[1], triple[2]].edge_index = torch.Tensor(lol).long()

In [38]:
from torch_geometric.nn import SAGEConv, to_hetero, GATConv

class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        #Sto definendo quale tipologia di layer voglio usare.
        self.conv1 = GATConv((-1, -1), hidden_channels)
        self.conv2 = GATConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        #descrive la computazione dall'input all'output.
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


model = GNN(hidden_channels=4, out_channels=2)
model = to_hetero(model, complete_data.metadata(), aggr='sum')


In [40]:
edge_types = list(complete_data.edge_index_dict.keys())

In [28]:
torch.save(model.state_dict(), '/home/sara/Desktop/fase2/git_repo/knowledge-graph-learning/src/models/model_notebook.pth')

In [19]:
#model.load_state_dict(torch.load('/home/sara/Desktop/fase2/git_repo/knowledge-graph-learning/model_weights.pth'))


<All keys matched successfully>

In [41]:
from sklearn.metrics import roc_auc_score

def test_hetlinkpre(test_link):
    model.eval()
    out = model(test_link.x_dict, test_link.edge_index_dict)
    
    ### LINK PREDICTION ACTS HERE ###
    
    hs = torch.Tensor()
    edge_labels = np.array([])
    ### LINK PREDICTION ACTS HERE ###
    for edge_t in edge_types:
        #Compute link embedding for each edge type
        #for src in train_link[edge_t].edge_label_index[0]:
        out_src = out[edge_t[0]][test_link[edge_t].edge_label_index[0]]#embedding src nodes
        out_dst = out[edge_t[2]][test_link[edge_t].edge_label_index[1]] #embedding dst nodes
        
        # LINK EMBEDDING #
        # 1 - Dot Product
        out_sim = out_src * out_dst #dotproduct
        h = torch.sum(out_sim, dim=-1)
        hs = torch.cat((hs,h),-1)
        edge_labels = np.concatenate((edge_labels,test_link[edge_t].edge_label.cpu().detach().numpy()))
    
    
    pred_cont = torch.sigmoid(hs).cpu().detach().numpy()
    
    # EVALUATION
    test_roc_score = roc_auc_score(edge_labels, pred_cont) #comput AUROC score for test set
    
    return test_roc_score

In [42]:
pred_percentage = test_hetlinkpre(complete_data)
print("pred_percentage",pred_percentage)

AttributeError: 'NoneType' object has no attribute 'dim'