In [1]:
import rdflib
import csv
from torch_geometric.data import HeteroData
import torch
import numpy as np
#import tensorflow as tf pytorch tensor != tf tensor

In [2]:
def _store_dict(dict_path, list_to_store):
    with open(dict_path, 'wt') as file:
        writer = csv.writer(file, delimiter='\t')
        for idx, val in enumerate(list_to_store):
            writer.writerow([idx, val])

In [3]:
def _read_dict_as_list(dict_path):
    with open(dict_path) as tsv:
        list_to_store = []
        reader = csv.reader(tsv, delimiter='\t')
        for row in reader:
            list_to_store.append(row[1])
        return list_to_store

In [4]:
def get_relation_type(relation):
    r_split = relation.split("/")
    return r_split[len(r_split)-1]

In [5]:
entities_and_type = {}
relations = []
triples = []
triple_properties=[]
# Process the Knowledge Graph
g = rdflib.Graph()
g.parse('knowledge_graph.ttl', format='turtle')

for s, p, o in g:
    if str(p) != "http://www.w3.org/1999/02/22-rdf-syntax-ns#type":
        if not str(s) in entities_and_type:
            entities_and_type[(str(s))] =""
        if not str(p) in relations:
            relations.append(str(p))
        if not str(o) in entities_and_type:
            if str(o).find('https://') != -1 or str(o).find('http://') != -1:
                entities_and_type[str(o)]=""
            else:
                entities_and_type["property"]=""
                triple_properties.append((str(s),str(p),str(o)))
        
        triples.append((s,p,o))
    else:
        if str(s) in entities_and_type.keys():
            split_o = str(o).split('/')
            entities_and_type[str(s)]= split_o[len(split_o)-1]

In [6]:
entity_types_count = {}
entities_no_empty = []
for entity in entities_and_type:
    tipo = entities_and_type[entity].lower()
    if tipo != "":
        entity_types_count[tipo] = entity_types_count.get(tipo, 0)+1
        entities_no_empty.append(entity)


# Store entities and relations as dictionaries
_store_dict("entities.dict", entities_no_empty)
_store_dict("relations.dict", relations)

In [7]:
len(entities_no_empty)

15267

In [8]:
data = HeteroData()
types = list(entity_types_count.keys())
for t in types:
    data[t].x = torch.Tensor([[1] for i in range(entity_types_count[t])])

In [9]:
#entities = _read_dict_as_list("entities.dict")
entities = entities_no_empty

In [10]:
subject_dict = {}
object_dict = {}

index_dict = {t:{'count': 0} for t in entity_types_count.keys()}

for triple in triples:
    s = str(triple[0])
    p = str(triple[1])
    o = str(triple[2])

    if s in entities and o in entities:
        p_type = get_relation_type(p)
        s_type = entities_and_type[s].lower()
        o_type = entities_and_type[o].lower()

        if(s_type != "" and o_type != ""):
            key_t = (s_type, p_type, o_type)
            
            if key_t not in subject_dict.keys():
                subject_dict[key_t] = []
                object_dict[key_t] = []
                
            if str(s) not in index_dict[s_type]:
                index_dict[s_type][str(s)] = index_dict[s_type]['count']
                index_dict[s_type]['count'] = index_dict[s_type]['count']+1
            s_index = index_dict[s_type][str(s)]
                
            if str(o) not in index_dict[o_type]:
                index_dict[o_type][str(o)] = index_dict[o_type]['count']
                index_dict[o_type]['count'] = index_dict[o_type]['count']+1
            o_index = index_dict[o_type][str(o)]
                
            subject_dict[key_t].append(s_index)
            object_dict[key_t].append(o_index)
         
        #data[s_type, p_type, o_type].edge_index[0].append(entities.index(str(s)))
        #data[s_type, p_type, o_type].edge_index[1].append(entities.index(str(o)))

In [11]:
for triple in subject_dict.keys():
    lol = [subject_dict[triple], object_dict[triple]]
    data[triple[0], triple[1], triple[2]].edge_index = torch.Tensor(lol).long()

In [12]:
edge_types = list(data.edge_index_dict.keys())

In [13]:
from torch_geometric.nn import SAGEConv, to_hetero, GATConv

class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GATConv((-1, -1), hidden_channels)
        self.conv2 = GATConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


model = GNN(hidden_channels=4, out_channels=2)
model = to_hetero(model, data.metadata(), aggr='sum')

In [14]:
from torch_geometric.transforms import RandomLinkSplit

link_split = RandomLinkSplit(num_val=0.0,
                             num_test=0.25,
                             edge_types=edge_types,
                             rev_edge_types=[None]*len(edge_types))
train_link, val_link, test_link = link_split(data)

In [15]:
with torch.no_grad():  # Initialize lazy modules.
    out = model(data.x_dict,data.edge_index_dict)

In [16]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion =  torch.nn.BCEWithLogitsLoss() #change loss function

def train_hetlinkpre():
    model.train()
    optimizer.zero_grad()  # Clear gradients.
    out = model(train_link.x_dict, train_link.edge_index_dict)  # Perform a single forward pass.
    preds = torch.Tensor()
    edge_labels = torch.Tensor()
    ### LINK PREDICTION ACTS HERE ###
    for edge_t in edge_types:
        #Compute link embedding for each edge type
        #for src in train_link[edge_t].edge_label_index[0]:
        out_src = out[edge_t[0]][train_link[edge_t].edge_label_index[0]]#embedding src nodes
        out_dst = out[edge_t[2]][train_link[edge_t].edge_label_index[1]] #embedding dst nodes
        
        # LINK EMBEDDING #
        # 1 - Dot Product
        out_sim = out_src * out_dst #dotproduct
        pred = torch.sum(out_sim, dim=-1)
        
        preds = torch.cat((preds,pred),-1)
        edge_labels = torch.cat((edge_labels,train_link[edge_t].edge_label.type_as(pred)),-1)
    
        
    #compute loss function based on all edge types
    loss = criterion(preds, edge_labels)
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return loss

In [17]:
from sklearn.metrics import roc_auc_score

def test_hetlinkpre(test_link):
    model.eval()
    out = model(test_link.x_dict, test_link.edge_index_dict)
    
    ### LINK PREDICTION ACTS HERE ###
    
    hs = torch.Tensor()
    edge_labels = np.array([])
    ### LINK PREDICTION ACTS HERE ###
    for edge_t in edge_types:
        #Compute link embedding for each edge type
        #for src in train_link[edge_t].edge_label_index[0]:
        out_src = out[edge_t[0]][test_link[edge_t].edge_label_index[0]]#embedding src nodes
        out_dst = out[edge_t[2]][test_link[edge_t].edge_label_index[1]] #embedding dst nodes
        
        # LINK EMBEDDING #
        # 1 - Dot Product
        out_sim = out_src * out_dst #dotproduct
        h = torch.sum(out_sim, dim=-1)
        
        hs = torch.cat((hs,h),-1)
        edge_labels = np.concatenate((edge_labels,test_link[edge_t].edge_label.cpu().detach().numpy()))
    
    
    pred_cont = torch.sigmoid(hs).cpu().detach().numpy()
    
    # EVALUATION
    test_roc_score = roc_auc_score(edge_labels, pred_cont) #comput AUROC score for test set
    
    return test_roc_score

In [18]:
for epoch in range(1,1001):
    loss = train_hetlinkpre()
    #print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

In [19]:
roc_train = test_hetlinkpre(train_link)
roc_test = test_hetlinkpre(test_link)
print(f'Train AUROC: {roc_train:.4f}\nTest AUROC: {roc_test:.4f}')

Train AUROC: 0.7537
Test AUROC: 0.7026
