In [1]:
import json, os, requests
import pandas as pd
from collections import defaultdict
from tqdm import tqdm_notebook

In [2]:
def get_wikidata_names(fb_id):
    query = """
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wikibase: <http://wikiba.se/ontology#>

    SELECT  ?s ?sLabel WHERE {
     ?s wdt:P646 "%s" 
    # . ?prop wikibase:directClaim ?p 

       SERVICE wikibase:label {
        bd:serviceParam wikibase:language "en" .
       }
     }
    """ % fb_id
    endpoint_url = "https://query.wikidata.org/sparql"
    r = requests.get(endpoint_url, params = {'format': 'json', 'query': query})
    data = r.json()
    return data['results']['bindings'][0]['sLabel']['value']

In [3]:
# load relations 
id2relations = {}
with open("relation2id.txt", "r") as in_file:
    for line in in_file:
        items = line.strip().split("\t")
        if len(items) ==2:
            id2relations[items[1]] = items[0]

# load entities 
id2entities = {}
with open("entity2id.txt", "r") as in_file:
    for line in in_file:
        items = line.strip().split("\t")
        if len(items) ==2:
            id2entities[items[1]] = items[0]

In [4]:
# load all triples (train, val, test)
triples = defaultdict(list)
fnames = ['train2id.txt', 'valid2id.txt', 'test2id.txt']
for fname in fnames:
    with open(fname, 'r') as in_file:
        for line in in_file:
            items = line.strip().split(" ")
            if len(items) == 3:
                head, tail, arc = items
                triples[arc].append((head, tail))

In [13]:
sorted_ids = id2relations.keys()
for id in sorted_ids: 
    rel_name = id2relations[id]
    head_id, tail_id = triples[id][0]
    head_fb = id2entities[head_id]
    tail_fb = id2entities[tail_id]
    break

In [31]:
def here():
    dups = []
    for rel_id in tqdm_notebook(id2relations): 
        pairs = triples[rel_id]
        for i in range(len(pairs)):
            rev_pair = (pairs[i][1], pairs[i][0])
            for pair in pairs:
                if pair == rev_pair:
                    dups.append((rel_id,pair, pairs))
    return dups
                
dups = here()

KeyboardInterrupt: 

In [29]:
[get_wikidata_names(id2entities[x]) for x in pairs[3]]

['thriller', 'Gone Baby Gone']

In [19]:
id2relations['2']

'/media_common/netflix_genre/titles'

In [20]:
id2entities['6604']

'/m/039cq4'