## Data Re-index

The entity ids in the original data are Wikipedia page ids.

This notebook converts them to re-indexed ids, i.e. each entity's row position in `entity.jsonl`.

In [49]:
import json
from time import time
import random
from tqdm import tqdm
import string
import numpy as np
import os
import torch
import requests

In [3]:
# Read the raw entity file, one line per entity; the `with` block closes the
# file handle as soon as the lines are in memory instead of leaking it.
with open("models/entity.jsonl") as entity_file:
    all_wiki_ents = entity_file.readlines()

In [4]:
# Decode every raw line into its parsed JSON value (the name is rebound from
# a list of strings to a list of decoded records).
all_wiki_ents = list(map(json.loads, all_wiki_ents))

In [5]:
# Report how many entities were loaded, then preview the first three records.
print(f"{len(all_wiki_ents)}")
all_wiki_ents[0:3]

5903527


[{'text': " Anarchism is an anti-authoritarian political philosophy that rejects hierarchies deemed unjust and advocates their replacement with self-managed, self-governed societies based on voluntary, cooperative institutions. These institutions are often described as stateless societies, although several authors have defined them more specifically as distinct institutions based on non-hierarchical or free associations. Anarchism's central disagreement with other ideologies is that it holds the state to be undesirable, unnecessary, and harmful.  Anarchism is usually placed on the far-left of the political spectrum, and much of its economics and legal philosophy reflect anti-authoritarian interpretations of communism, collectivism, syndicalism, mutualism, or participatory economics. As anarchism does not offer a fixed body of doctrine from a single particular worldview, many anarchist types and traditions exist and varieties of anarchy diverge widely. Anarchist schools of thought can d

In [6]:
# Lookup table: entity title -> row position in `all_wiki_ents`
# (if a title occurs more than once, the last position wins).
title2id = dict(
    (ent['title'], pos) for pos, ent in enumerate(all_wiki_ents)
)

In [7]:
# Lookup table: page id (as a string) -> row position in `all_wiki_ents`.
# The key is whatever follows the last '=' in the record's 'idx' field
# (presumably a `...?curid=<page id>` URL; verify against entity.jsonl).
page2id = dict(
    (ent['idx'].rpartition('=')[2], pos)
    for pos, ent in enumerate(all_wiki_ents)
)

In [11]:
# Sanity check of the page-id lookup against a known page:
# https://en.wikipedia.org/wiki?curid=993546 -> Japan national football team
"Japan national football team" == all_wiki_ents[page2id['993546']]['title']

True

### Re-index function

In [50]:
# Probe the Wikipedia API with a fixed page id (5405) to inspect the layout
# of the JSON response.
url = "https://en.wikipedia.org/w/api.php?action=query&pageids=5405&format=json"
# Bound the wait: without a timeout a stalled connection blocks the kernel forever.
r = requests.get(url, timeout=10)

In [51]:
# Decode the body of the probe response as JSON.
json_data = r.json()

In [58]:
# Show the per-page results of the probe query; each entry carries the page's 'title'.
json_data['query']['pages']

{'5405': {'pageid': 5405, 'ns': 0, 'title': 'China'}}

In [55]:
def _get_title_from_api(pageid, client=None):
    """Resolve a Wikipedia page id to its title through the Wikipedia query API.

    Args:
        pageid: Wikipedia page id to look up.
        client: Unused. Kept only so existing call sites that pass it keep working.

    Returns:
        The title reported by the API, or ``None`` when the request fails or
        the response does not contain a title (for example a missing page).
        If the API returns several pages a warning is printed and the title of
        the last one is returned.
    """
    # Bind the result up front so a failed lookup yields None instead of an
    # UnboundLocalError at the return statement.
    title = None
    try:
        r = requests.get(
            "https://en.wikipedia.org/w/api.php",
            params={"action": "query", "pageids": pageid, "format": "json"},
            timeout=10,  # never let a stalled connection hang the notebook
        )
        r.raise_for_status()

        pages = r.json()["query"]["pages"]

        if len(pages) > 1:
            print("WARNING: more than one result returned from wikipedia api")

        for page in pages.values():
            title = page["title"]
    except (requests.RequestException, ValueError, KeyError, TypeError) as err:
        # Best-effort lookup: report the failure and fall through with whatever
        # title (possibly None) was resolved so far. KeyboardInterrupt and
        # other non-Exception signals are deliberately not caught.
        print(f"WARNING: could not resolve title for page id {pageid}: {err!r}")
    return title

In [84]:
def reindex(fpath, split):
    """Rewrite the label ids of one dataset split from page ids to entity rows.

    Relies on these notebook-level names: ``page2id`` (page id string -> row),
    ``all_wiki_ents`` (decoded entity records), ``id2title`` (page id -> title
    cache, updated in place) and ``_get_title_from_api``.

    Args:
        fpath: Directory prefix of the split file; it is concatenated with
            ``split`` as-is, so it must end with a path separator.
        split: File name of the JSONL split, e.g. ``'train.jsonl'``.

    Returns:
        The list of decoded examples. In each one ``'label_id'`` holds the new
        row indices and every ``'entity'`` title that differed from the entity
        file has been replaced by the entity file's title.

    Raises:
        KeyError: if a label's page id is not present in ``page2id``.
    """
    fname = fpath + split
    # Context manager so the input handle is closed once the file is parsed.
    with open(fname) as rf:
        examples = [json.loads(line) for line in rf]

    # tqdm wraps the list (not the enumerate) so the bar knows the total.
    for e, example in enumerate(tqdm(examples)):
        entity = example['entity']

        new_label_id = []
        for i, old_id in enumerate(example['label_id']):
            new_id = page2id[str(old_id)]
            new_label_id.append(new_id)

            new_title = all_wiki_ents[new_id]['title']
            if new_title == entity[i]:
                continue

            # Titles disagree: compare with the title Wikipedia reports for
            # the old page id, consulting the cache before hitting the API.
            old_id = int(old_id)
            title = id2title.get(old_id)
            if title is None:
                title = _get_title_from_api(old_id)
                if title is not None:
                    # Only cache successful lookups so failures are retried.
                    id2title[old_id] = title

            if title != new_title and title != entity[i]:
                # The API title matches neither side: log the row for manual review.
                print(e, ' ', example['id'], ' ', old_id, ' ', new_id, ' ', entity[i], ' ', all_wiki_ents[new_id]['title'])

            # The entity file is the source of truth for the title.
            entity[i] = new_title
        example['label_id'] = new_label_id
        example['entity'] = entity
    return examples

In [61]:
# Page id (int) -> Wikipedia title cache used by `reindex`.
# Initialise it only when it is not already defined, so re-running this cell
# never throws away titles fetched earlier in the session; on a fresh kernel
# it is seeded from the saved cache file when one exists.
if 'id2title' not in globals():
    id2title = {}
    if os.path.exists('models/id2title.json'):
        with open('models/id2title.json') as cache_file:
            # JSON object keys are always strings; restore the integer page
            # ids and skip entries that were stored without a title.
            id2title = {
                int(page_id): page_title
                for page_id, page_title in json.load(cache_file).items()
                if page_title is not None
            }

In [85]:
# Dataset variants and splits to convert. Each split is read from the
# `...-ELQ` directory and written, re-indexed, to the matching `...-NEW` one.
ds = ['en_desc', 'wiki_content']
splits = ['train.jsonl', 'dev.jsonl', 'test.jsonl']

for d in ds:
    inpath = f'AIDA-YAGO2-{d}-ELQ/tokenized/'
    outpath = f'AIDA-YAGO2-{d}-NEW/tokenized/'
    # Create the output directory up front; open(..., 'w') does not create
    # missing parent directories and would raise FileNotFoundError.
    os.makedirs(outpath, exist_ok=True)
    for split in splits:
        examples = reindex(inpath, split)
        with open(outpath + split, 'w') as wf:
            for example in tqdm(examples):
                wf.write(json.dumps(example) + "\n")



0it [00:00, ?it/s][A[A

946it [00:00, 16240.36it/s][A[A

  0%|          | 0/946 [00:00<?, ?it/s][A[A

100%|██████████| 946/946 [00:00<00:00, 11948.50it/s][A[A

0it [00:00, ?it/s][A[A

216it [00:00, 19055.80it/s][A[A

  0%|          | 0/216 [00:00<?, ?it/s][A[A



128   129 Viacom   24580262   2891949   Viacom (1971–2005)   Viacom (original)
235   236 Promodes   2688005   664924   Les Échos (France)   Les Échos (newspaper)
247   248 RUGBY   1196374   372695   Halifax RLFC   Halifax R.L.F.C.
259   260 SOCCER   10410246   1546285   OKS 1945 Olsztyn   Stomil Olsztyn (football)
266   267 SOCCER   1537131   443668   V.C. Eendracht Aalst 2002   SC Eendracht Aalst
322   323 RUGBY   1196374   372695   Halifax RLFC   Halifax R.L.F.C.
322   323 RUGBY   1196374   372695   Halifax RLFC   Halifax R.L.F.C.
341   342 SOCCER   10410246   1546285   OKS 1945 Olsztyn   Stomil Olsztyn (football)
341   342 SOCCER   10410246   1546285   OKS 1945 Olsztyn   Stomil Olsztyn (football)
365   366 SOCCER   1537131   443668   V.C. Eendracht Aalst 2002   SC Eendracht Aalst
365   366 SOCCER   1537131   443668   V.C. Eendracht Aalst 2002   SC Eendracht Aalst
414   415 RUGBY   1196374   372695   Halifax RLFC   Halifax R.L.F.C.
473   474 Senate   403248   171440   Sultan, Crown P

100%|██████████| 216/216 [00:00<00:00, 10868.29it/s][A[A

0it [00:00, ?it/s][A[A

231it [00:00, 7688.64it/s][A[A

  0%|          | 0/231 [00:00<?, ?it/s][A[A

100%|██████████| 231/231 [00:00<00:00, 12061.15it/s][A[A

10   1173testb RUGBY   5746768   1087118   Dan Crowley   Dan Crowley (rugby player)
12   1175testb SOCCER   2384790   610490   AFC Progresul Bucureşti   AS Progresul București
12   1175testb SOCCER   2384790   610490   AFC Progresul Bucureşti   AS Progresul București
39   1202testb SOCCER   616593   235776   Luis Enrique Martínez García   Luis Enrique (footballer)
75   1238testb Wall   1100754   349968   Newmont Mining Corporation   Newmont Goldcorp
75   1238testb Wall   1100754   349968   Newmont Mining Corporation   Newmont Goldcorp
75   1238testb Wall   1100754   349968   Newmont Mining Corporation   Newmont Goldcorp
75   1238testb Wall   1100754   349968   Newmont Mining Corporation   Newmont Goldcorp
75   1238testb Wall   1100754   349968   Newmont Mining Corporation   Newmont Goldcorp
75   1238testb Wall   1100754   349968   Newmont Mining Corporation   Newmont Goldcorp
75   1238testb Wall   1100754   349968   Newmont Mining Corporation   Newmont Goldcorp
75   1238testb Wall   11



0it [00:00, ?it/s][A[A

946it [00:00, 15323.46it/s][A[A

  0%|          | 0/946 [00:00<?, ?it/s][A[A

 67%|██████▋   | 638/946 [00:00<00:00, 6360.96it/s][A[A

43   44 SOCCER   828168   283259   FK Sloga Jugomagnat   FK Shkupi
128   129 Viacom   24580262   2891949   Viacom (1971–2005)   Viacom (original)
235   236 Promodes   2688005   664924   Les Échos (France)   Les Échos (newspaper)
247   248 RUGBY   1196374   372695   Halifax RLFC   Halifax R.L.F.C.
259   260 SOCCER   10410246   1546285   OKS 1945 Olsztyn   Stomil Olsztyn (football)
266   267 SOCCER   1537131   443668   V.C. Eendracht Aalst 2002   SC Eendracht Aalst
322   323 RUGBY   1196374   372695   Halifax RLFC   Halifax R.L.F.C.
322   323 RUGBY   1196374   372695   Halifax RLFC   Halifax R.L.F.C.
341   342 SOCCER   10410246   1546285   OKS 1945 Olsztyn   Stomil Olsztyn (football)
341   342 SOCCER   10410246   1546285   OKS 1945 Olsztyn   Stomil Olsztyn (football)
365   366 SOCCER   1537131   443668   V.C. Eendracht Aalst 2002   SC Eendracht Aalst
365   366 SOCCER   1537131   443668   V.C. Eendracht Aalst 2002   SC Eendracht Aalst
414   415 RUGBY   1196374   372695   Halifax RLFC   Ha



100%|██████████| 946/946 [00:00<00:00, 6644.34it/s][A[A

0it [00:00, ?it/s][A[A

216it [00:00, 18440.25it/s][A[A

  0%|          | 0/216 [00:00<?, ?it/s][A[A

100%|██████████| 216/216 [00:00<00:00, 6026.74it/s][A[A

0it [00:00, ?it/s][A[A

231it [00:00, 7139.06it/s][A[A

  0%|          | 0/231 [00:00<?, ?it/s][A[A

100%|██████████| 231/231 [00:00<00:00, 7282.65it/s][A[A

108   1055testa CYCLING   2354465   604821   Rabobank (cycling team)   Team Jumbo–Visma
108   1055testa CYCLING   2354465   604821   Rabobank (cycling team)   Team Jumbo–Visma
108   1055testa CYCLING   2354465   604821   Rabobank (cycling team)   Team Jumbo–Visma
108   1055testa CYCLING   2354465   604821   Rabobank (cycling team)   Team Jumbo–Visma
10   1173testb RUGBY   5746768   1087118   Dan Crowley   Dan Crowley (rugby player)
12   1175testb SOCCER   2384790   610490   AFC Progresul Bucureşti   AS Progresul București
12   1175testb SOCCER   2384790   610490   AFC Progresul Bucureşti   AS Progresul București
22   1185testb SOCCER   26530226   3064628   Arab Contractors (company)   El-Mokawloon El-Arab
39   1202testb SOCCER   616593   235776   Luis Enrique Martínez García   Luis Enrique (footballer)
75   1238testb Wall   1100754   349968   Newmont Mining Corporation   Newmont Goldcorp
75   1238testb Wall   1100754   349968   Newmont Mining Corporation   Newmont Goldcorp
75   1238tes

In [83]:
# Persist the page-id -> title cache as JSON (integer keys are stored as strings).
with open('models/id2title.json', 'w') as cache_file:
    cache_file.write(json.dumps(id2title))