In [5]:
from gensim.models import KeyedVectors
from os import path
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
import numpy as np
from torch_geometric.data import Data

In [6]:
embedded_relations = KeyedVectors.load_word2vec_format(
        "embeddings/relation_embedding_le_FB15k_237_1024.bin",
        binary=True,
    )

In [7]:
sorted_embedding = []
for i in range(0, len(embedded_relations.vectors)):
    sorted_embedding.append(embedded_relations.get_vector(str(i)))
embedded_relations = torch.tensor(sorted_embedding, dtype=torch.float)

In [8]:
le_relation = LabelEncoder()
le_relation.classes_ = np.load(path.join('embeddings','le_relation_classes.npy'), allow_pickle=True)


## Evaluating Graph Star Multi Relational

In [9]:
model = torch.load("output/FB15k_237.pkl")

In [10]:
model

GraphStar(
  (fl): Linear(in_features=16, out_features=1024, bias=True)
  (star_init): StarAttn(
    (Wq): Linear(in_features=1024, out_features=1024, bias=True)
    (Wk): Linear(in_features=1024, out_features=1024, bias=True)
    (Wv): Linear(in_features=1024, out_features=1024, bias=True)
    (sLayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (conv_list): ModuleList(
    (0): GraphStarConv(1024, 1024, heads=4)
    (1): GraphStarConv(1024, 1024, heads=4)
    (2): GraphStarConv(1024, 1024, heads=4)
  )
  (star_attn_list): ModuleList(
    (0): StarAttn(
      (Wq): Linear(in_features=1024, out_features=1024, bias=True)
      (Wk): Linear(in_features=1024, out_features=1024, bias=True)
      (Wv): Linear(in_features=1024, out_features=1024, bias=True)
      (sLayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    )
    (1): StarAttn(
      (Wq): Linear(in_features=1024, out_features=1024, bias=True)
      (Wk): Linear(in_features=1024, out_features

In [11]:
# Read data and make dataframes
name = ['entity', 'id']
entity_id = pd.read_csv('./data/FB15k/entities.txt', sep='\t', header=None, names=name, engine='python')
all_entities = entity_id['entity'].values

name = ['relation', 'id']
relation_id = pd.read_csv('./data/FB15k/relations.txt', sep='\t', header=None, names=name, engine='python')
all_relations = relation_id['relation'].values

# Read RDF Triples
name = ['head', 'relation', 'tail']
# concat all data 
train = pd.read_csv('./data/FB15k_237/train.txt', sep='\t', header=None, names=name, engine='python')
valid = pd.read_csv('./data/FB15k_237/valid.txt', sep='\t', header=None, names=name, engine='python')
test = pd.read_csv('./data/FB15k_237/test.txt', sep='\t', header=None, names=name, engine='python')
data = pd.concat([train, valid])
data = pd.concat([data, test])

head = data['head'].values
tail = data['tail'].values
relations = data['relation'].values

In [12]:
# fit entity encoder
le_entity = LabelEncoder()
le_entity.fit(all_entities)

# string list to int array using LabelEncoder on complete data set
heads = le_entity.transform(head)
tails = le_entity.transform(tail)
relations = le_relation.transform(relations)

edge_attributes = torch.tensor(relations, dtype=torch.long)
edge_index = torch.tensor([heads, tails], dtype=torch.long)
unique_entities = torch.tensor(np.unique(edge_index.reshape(edge_index.shape[-1]*2, 1)), dtype=torch.float)

dataset = Data(x=unique_entities, edge_type=edge_attributes, edge_index=edge_index)

In [13]:
#!pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/

In [316]:
import sys
from SPARQLWrapper import SPARQLWrapper, JSON

def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


def freebase_parser(freebase_id):
    endpoint_url = "https://query.wikidata.org/sparql"

    query = \
    '''SELECT ?sLabel WHERE { 
        ?s wdt:P646 "''' + freebase_id + '''".
        SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
    LIMIT 1'''
    res = get_results(endpoint_url, query)
    if len(res['results']['bindings']) == 0:
        return "No result"
    else:
        return get_results(endpoint_url, query)['results']['bindings'][0]['sLabel']['value']

def common_to_fb(common_name):
    endpoint_url = "https://query.wikidata.org/sparql"

    query = '''
    SELECT ?fbid WHERE { 
        ?s wdt:P373 "''' + common_name + '''".
    OPTIONAL {
        ?s wdt:P646 ?fbid .
        }
    SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
    LIMIT 1'''
    res = get_results(endpoint_url, query)
    try:
        res = res['results']['bindings'][0]['fbid']['value']
        return res
    except:
        return common_name + "-1"
    
def inverse_transform(head, tail, rel):
    head = le_entity.inverse_transform([head])
    tail = le_entity.inverse_transform([tail])
    rel = le_relation.inverse_transform([rel])
    return head[0], tail[0], rel[0]


def rdf2txt(head, tail, rel):
    head, tail, rel = inverse_transform(head, tail, rel)
    head = freebase_parser(head)
    tail = freebase_parser(tail)
    return head, tail, str(rel)

In [180]:
def pred(head=True, index=0, sig=True, emb_sig=True):
    emb = model.z
    edge_index = dataset.edge_index.T[index].T
    edge_type = dataset.edge_type[index]
    
    h, t, r = rdf2txt(edge_index[0], edge_index[1], edge_type)
    relation =  model.RW[edge_type]
    if head:
        out_text = "predicted heads:"
        head = emb
        tail = emb[edge_index[1]]
    else:
        out_text = "predicted tails:"
        head = emb[edge_index[0]]
        tail = emb
    p = head*relation*tail

    score = p.sum(dim=1)
    
    pred = score.detach().numpy().argsort()[-10:][::-1]
    
    print(f"Original h:{h}, r: {r}, t:{t}")
    print(out_text)
    for l in pred:
        h, t, r = rdf2txt(edge_index[0], l, edge_type)
        print (t)


In [181]:
pred(head=True, index=5243, sig=True, emb_sig=False)

Original h:Joe Biden, r: /people/person/profession, t:politician
predicted heads:
Holy Roman Emperor
Hanover
Transformers
The Artist
Tom Wolfe
The Bold and the Beautiful
Jamey Sheridan
Lincoln City F.C.
Grammy Award for Best Score Soundtrack for Visual Media
House of Lords


In [182]:
# Logits embedding [13292, 256] -> [unique nodes x hidden layer]
z = model.z

def experiment(head_pred=False, index=0):
    
    # Edge index between 2 nodes
    edge_index = dataset.edge_index.T[index].T
    # relation type
    edge_type = dataset.edge_type[index]

    h, t, r = rdf2txt(edge_index[0], edge_index[1], edge_type)
    print(
    f"""
    Original data:
    Head: {h}
    Relation: {r}
    Tail: {t}
    """)
    
    
    relation = model.RW[edge_type]
    if head_pred:
        out_text = "Predicted data (head pred)"
        head = z
        tail = z[edge_index[1]]
    else:
        out_text = "Predicted data (tail pred)"
        head = z[edge_index[0]]
        tail = z    
    
    p = head * relation * tail
    summed = p.sum(dim=1).detach().numpy()

    if head_pred:
        target = summed[edge_index[0]]
        raw_rank = np.where(summed == target)
        pred = summed.argsort()[-1]
        h, t, r = rdf2txt(pred, edge_index[1], edge_type)

    else:
        target = summed[edge_index[1]]
        raw_rank = np.where(summed == target)
        pred = summed.argsort()[-1]
        h, t, r = rdf2txt(edge_index[0], pred, edge_type)
    print(
    f""" 
    {out_text}:  
    Head: {h}  
    Relation: {r}  
    Tail: {t}  
    Rank of target: {raw_rank[0][0]}
    """)

In [183]:
experiment(head_pred=True, index=5243)


    Original data:
    Head: Joe Biden
    Relation: /people/person/profession
    Tail: politician
    
 
    Predicted data (head pred):  
    Head: Holy Roman Emperor  
    Relation: /people/person/profession  
    Tail: politician  
    Rank of target: 134
    


# LIAR DATASET CROSS PRED WITH FB15K

In [184]:
from glob import glob
paths = glob("./data/LIAR/*")

In [185]:
train = pd.read_csv(paths[-1])
valid = pd.read_csv(paths[-2])

In [186]:
train = train.dropna()

In [187]:
train = train.triple.values

In [188]:
clean_train = []

for trip in train:
    clean_train.append(eval(trip))

In [310]:
import requests 
import urllib
from bs4 import BeautifulSoup

# desktop user-agent
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
# mobile user-agent
MOBILE_USER_AGENT = "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36"

def google(query):
    query = query.replace(' ', '+')
    URL = f"https://google.com/search?q={query}"
    
    headers = {"user-agent" : USER_AGENT}
    resp = requests.get(URL, headers=headers)
    
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content, "html.parser")
    for g in soup.find_all("div", {"class": "g"}):
        titles = g.find_all("h3")
        if titles:
            text = titles[0].text
            if "Wikipedia" in text:
                return text[:-12]
    return "No result"

In [313]:
google("Gov.")

'.gov'

In [307]:
common_to_fb(google(clean_train[0][0][0]))

'/m/02mjmr'

In [308]:
freebase_parser("/m/02mjmr")

'Barack Obama'

In [319]:
from tqdm import tqdm
df = pd.DataFrame(columns=name)

for trip_list in tqdm(clean_train):
    for trip in trip_list:
        df.append(
            {"head": common_to_fb(google(trip[0])),
             "relation": trip[1],
             "tail" : common_to_fb(google(trip[2]))
                  }, ignore_index=True)

  2%|█▋                                                                                | 7/347 [00:23<19:10,  3.38s/it]


KeyboardInterrupt: 

In [None]:
df