## Data processing

Code to process the AIDA-CoNLL dataset.

Output JSON file (one object per document):

```
{'doc_id': document id,
 'text': document text,
 'mentions': ['EU', 'German', ...],
 'mention_ranges': [[1,2],[3,4],...],
 'wiki_ids': [0000, 0001, ...]}
```

Reference: https://github.com/facebookresearch/BLINK/blob/master/scripts/create_BLINK_benchmark_data.py

In [71]:
import json
import os
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import torchtext
from bs4 import BeautifulSoup
from tqdm import tqdm

%matplotlib inline

In [106]:
# BEGIN_ENT_TOKEN = "[START_ENT]"
# END_ENT_TOKEN = "[END_ENT]"

# On-disk cache loaded into `url2id_cache` (presumably URL -> Wikipedia page id,
# judging by the name -- confirm against the code that fills it).
URL2ID_CACHE_PATH = 'url2id.json'

if os.path.exists(URL2ID_CACHE_PATH):
    with open(URL2ID_CACHE_PATH, encoding='utf-8') as fin:
        url2id_cache = json.load(fin)
else:
    # First run on a fresh checkout: no cache file yet, so start empty instead
    # of failing with FileNotFoundError.
    url2id_cache = {}

In [5]:
def _read_url(url):
    """Download ``url`` and return the text of its HTML ``<title>`` element.

    The literal suffix " - Wikipedia" is removed and surrounding whitespace
    is stripped.

    Args:
        url: address of the page to fetch.

    Returns:
        The cleaned page title as a string.

    Raises:
        ValueError: if the page has no ``<title>`` element with plain text.
        Errors raised by ``urllib.request.urlopen`` propagate unchanged.
    """
    with urllib.request.urlopen(url) as response:
        html = response.read()

    soup = BeautifulSoup(html, features="html.parser")
    title_tag = soup.title
    # Fail with a clear message rather than an AttributeError on None when the
    # page has no usable title.
    if title_tag is None or title_tag.string is None:
        raise ValueError("no usable <title> found at {}".format(url))
    return title_tag.string.replace(" - Wikipedia", "").strip()

In [4]:
def _get_pageid_from_api(title, client=None):
    """Look up the Wikipedia page id for ``title`` via the English Wikipedia API.

    Args:
        title: page title to query; leading/trailing whitespace is ignored.
        client: unused; kept so existing call sites keep working.

    Returns:
        The ``pageid`` of the last returned page entry that has one, or
        ``None`` if the title is empty, the request fails, or the response
        does not contain a page id.
    """
    if title is None or not title.strip():
        return None

    try:
        # Let requests build the query string so every special character in
        # the title (not just spaces) is escaped correctly.
        r = requests.get(
            "https://en.wikipedia.org/w/api.php",
            params={"action": "query", "titles": title.strip(), "format": "json"},
            timeout=30,
        )

        # Decode the JSON data into a dictionary
        json_data = r.json()
        pages = json_data["query"]["pages"]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Best-effort lookup: network errors and malformed responses yield None.
        # KeyboardInterrupt is deliberately NOT caught here.
        return None

    if len(pages) > 1:
        print("WARNING: more than one result returned from wikipedia api")

    pageid = None
    for page in pages.values():
        # Entries without a "pageid" (e.g. titles the API could not resolve)
        # are skipped instead of aborting the whole lookup.
        pageid = page.get("pageid", pageid)

    return pageid

## Data with the first 128 tokens of Wikipedia content

In [78]:
# In-memory cache: page id -> truncated description text (None when the fetch failed).
id2desc_cache = dict()

In [8]:
# Read in the train/dev/test data files (documents without descriptions).
fpath = 'AIDA-YAGO2'


def _load_split(split_name, data_dir=fpath):
    """Return the parsed contents of ``<data_dir>/<split_name>.json``."""
    fname = os.path.join(data_dir, '{}.json'.format(split_name))
    # Explicit UTF-8: the documents carry non-ASCII Wikipedia titles, so the
    # platform default encoding must not be relied on.
    with open(fname, encoding='utf-8') as fin:
        return json.load(fin)


train = _load_split('train')
dev = _load_split('dev')
test = _load_split('test')

In [9]:
# Number of documents in each split, in (train, dev, test) order.
tuple(len(split) for split in (train, dev, test))

(946, 216, 231)

In [19]:
# Build a lookup from Wikipedia page id to page title across all splits.
datasets = [train, dev, test]
id2title = {}

for dataset in datasets:
    for doc in dataset:
        page_ids, page_titles = doc['wiki_ids'], doc['wiki_titles']
        # ids and titles are parallel lists: one entry per mention
        assert len(page_ids)==len(page_titles)
        # later occurrences of an id overwrite earlier ones
        id2title.update(zip(page_ids, page_titles))

In [21]:
# Number of distinct Wikipedia page ids collected across all splits.
len(id2title)

5598

In [84]:
def get_content_from_api(str_id, tokenizer=None, max_len=128, client=None):
    """Fetch a Wikipedia page's plain-text extract, truncated to ``max_len`` tokens.

    Args:
        str_id: Wikipedia page id; it is converted with ``str()`` to build the
            request and to look the page up in the response.
        tokenizer: callable mapping a string to a list of tokens. Defaults to
            torchtext's "basic_english" tokenizer, resolved at call time so
            this definition does not depend on cell execution order.
        max_len: maximum number of tokens kept from the start of the extract.
        client: unused; kept so existing call sites keep working.

    Returns:
        The first ``max_len`` tokens joined by single spaces, or ``None`` if
        the request fails or the response lacks the page, its title, or its
        extract (the id is printed in that case).
    """
    if tokenizer is None:
        tokenizer = torchtext.data.get_tokenizer("basic_english")

    # The API response keys pages by the string form of the id.
    page_key = str(str_id)
    url2 = "https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&explaintext&redirects=1&pageids={}".format(
        page_key
    )
    try:
        # A timeout keeps one stalled connection from hanging a long batch run.
        r = requests.get(url2, timeout=30)
        json_data = r.json()
        pages = json_data["query"]["pages"]

        if len(pages) > 1:
            print("WARNING: more than one result returned from wikipedia api")

        page = pages[page_key]
        title = page['title']
        content = page['extract']
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Best-effort: report the failing id and carry on. KeyboardInterrupt is
        # deliberately not caught, so the surrounding loop can be stopped.
        print(str_id)
        return None

    # Report pages whose current title differs from the title in the dataset.
    expected_title = id2title.get(str_id)
    if expected_title is not None and title != expected_title:
        print(str_id, '  ', title, '  ', expected_title)

    tokens = tokenizer(content)
    return ' '.join(tokens[:max_len])

In [81]:
# torchtext's built-in "basic_english" tokenizer; it is passed to
# get_content_from_api to split each page extract into tokens.
tokenizer = torchtext.data.get_tokenizer("basic_english")

In [85]:
# Fetch a description for every known page id, skipping ids already present in
# the cache so an interrupted run can be resumed. Failed fetches are stored as None.
for page_id in tqdm(id2title):
    if page_id not in id2desc_cache:
        id2desc_cache[page_id] = get_content_from_api(page_id, tokenizer)

  1%|          | 34/5598 [00:00<00:28, 192.38it/s]

11095


  1%|          | 59/5598 [00:52<2:23:15,  1.55s/it]

3550889    Channel 2 (Israeli TV channel)    Channel 2 (Israel)


  1%|          | 66/5598 [01:02<1:32:56,  1.01s/it]

17068    Kurds    Kurdish people


  1%|▏         | 70/5598 [01:05<1:03:38,  1.45it/s]

24993749


  2%|▏         | 93/5598 [01:52<1:30:32,  1.01it/s]

210589    Emirate of Sharjah    Sharjah (emirate)


  2%|▏         | 97/5598 [01:57<1:20:34,  1.14it/s]

8821238    Emirate of Abu Dhabi    Abu Dhabi (emirate)


  2%|▏         | 113/5598 [02:17<51:39,  1.77it/s]  

1812373


  2%|▏         | 130/5598 [02:48<1:35:33,  1.05s/it]

54314    Orange S.A.    France Télécom


  2%|▏         | 132/5598 [02:51<1:51:43,  1.23s/it]

11866976    Equinor    Statoil


  3%|▎         | 141/5598 [03:06<1:05:37,  1.39it/s]

1536678    World Series of Golf (unofficial event)    World Series of Golf


  3%|▎         | 144/5598 [03:10<1:19:40,  1.14it/s]

1527695    WGC Invitational    WGC-Bridgestone Invitational


  3%|▎         | 188/5598 [04:00<1:20:03,  1.13it/s]

2384775    ACF Gloria Bistrița    ACF Gloria 1922 Bistriţa


  3%|▎         | 189/5598 [04:03<1:51:00,  1.23s/it]

415846    Bistrița    Bistriţa


  4%|▎         | 203/5598 [04:18<28:30,  3.15it/s]  

24238381


  4%|▎         | 206/5598 [04:25<2:56:00,  1.96s/it]

495296    Kimiko Date    Kimiko Date Krumm


  4%|▍         | 212/5598 [04:34<1:45:30,  1.18s/it]

8210131    New York (state)    New York


  4%|▍         | 232/5598 [05:00<1:11:08,  1.26it/s]

18417732


  4%|▍         | 246/5598 [05:17<1:06:35,  1.34it/s]

58681    Worcester, England    Worcester


  5%|▍         | 257/5598 [05:31<1:31:40,  1.03s/it]

896405    Mushtaq Ahmed (cricketer)    Mushtaq Ahmed


  5%|▍         | 258/5598 [05:32<1:13:15,  1.21it/s]

1958830    Mohammad Akram (cricketer)    Mohammad Akram


  5%|▍         | 261/5598 [05:35<1:20:42,  1.10it/s]

938863    Ijaz Ahmed (cricketer, born 1968)    Ijaz Ahmed (cricketer)


  5%|▍         | 276/5598 [06:09<3:30:12,  2.37s/it]

883369    Craig Brown (footballer, born 1940)    Craig Brown (footballer)


  5%|▌         | 283/5598 [06:33<5:09:00,  3.49s/it]

1872219    Canadian Open (tennis)    Rogers Cup (tennis)


  5%|▌         | 307/5598 [06:55<1:49:35,  1.24s/it]

23297    Pat Rafter    Patrick Rafter


  6%|▌         | 323/5598 [07:15<1:01:48,  1.42it/s]

5570494    Boland (cricket team)    Boland cricket team


  6%|▌         | 324/5598 [07:15<52:08,  1.69it/s]  

11402216


  6%|▌         | 327/5598 [07:18<1:23:57,  1.05it/s]

6573050    Flamurtari FC    Flamurtari Vlorë


  6%|▌         | 328/5598 [07:19<1:14:22,  1.18it/s]

6156831


  6%|▌         | 338/5598 [07:37<2:29:04,  1.70s/it]

3550878    AEK Larnaca FC    AEK Larnaca


  6%|▌         | 340/5598 [07:42<2:48:00,  1.92s/it]

11371765


  6%|▋         | 351/5598 [08:09<1:44:41,  1.20s/it]

7010481    FC Nyva Vinnytsia    PFC Nyva Vinnytsia


  6%|▋         | 363/5598 [08:39<4:02:58,  2.78s/it]

27792964    NK Olimpija Ljubljana (1945–2005)    NK Olimpija Ljubljana (1911)


  7%|▋         | 369/5598 [08:57<5:03:45,  3.49s/it]

57035    Luxembourg City    Luxembourg (city)


  7%|▋         | 372/5598 [09:03<3:48:34,  2.62s/it]

1561948    NK Varaždin (2012)    NK Varaždin


  7%|▋         | 380/5598 [09:31<3:38:32,  2.51s/it]

861331    AC Sparta Prague    Sparta Prague


  7%|▋         | 393/5598 [09:53<1:52:03,  1.29s/it]

5897396    Hapoel Rishon LeZion F.C.    Hapoel Ironi Rishon LeZion F.C.


  7%|▋         | 396/5598 [10:00<2:48:41,  1.95s/it]

2079306    Myllykosken Pallo −47    MYPA


  7%|▋         | 399/5598 [10:12<4:12:01,  2.91s/it]

828168    KF Shkupi    FK Sloga Jugomagnat


  7%|▋         | 401/5598 [10:12<2:10:55,  1.51s/it]

23564616    North Macedonia    Republic of Macedonia


  7%|▋         | 405/5598 [10:19<2:51:58,  1.99s/it]

249510    UEFA European Championship    UEFA European Football Championship


  7%|▋         | 410/5598 [10:27<1:40:05,  1.16s/it]

2918991    Andreas Andersson (footballer, born 1974)    Andreas Andersson


  7%|▋         | 413/5598 [10:29<1:35:24,  1.10s/it]

187346    Club Athletico Paranaense    Clube Atlético Paranaense


  7%|▋         | 416/5598 [10:31<1:03:34,  1.36it/s]

1367196    Guarani FC    Guarani Futebol Clube


  8%|▊         | 425/5598 [10:43<1:13:46,  1.17it/s]

1861824


  8%|▊         | 434/5598 [11:10<5:00:31,  3.49s/it]

91195    The Championships, Wimbledon    Wimbledon Championships


  8%|▊         | 437/5598 [11:17<3:35:40,  2.51s/it]

18952408    Split, Croatia    Split (city)
172311    The Daily Telegraph (Sydney)    The Daily Telegraph (Australia)


  8%|▊         | 451/5598 [11:34<56:45,  1.51it/s]  

19927785    Meiluawati    Meluawati


  8%|▊         | 461/5598 [11:44<1:15:47,  1.13it/s]

1856445    Alexander Volkov (tennis)    Alexander Vladimirovich Volkov


  9%|▊         | 478/5598 [12:02<1:48:43,  1.27s/it]

702849    Andrei Medvedev (tennis)    Andriy Medvedev


  9%|▊         | 486/5598 [12:14<2:25:23,  1.71s/it]

3079144    Jonathan Stark (tennis)    Jonathan Stark


  9%|▉         | 495/5598 [12:33<2:23:35,  1.69s/it]

1723559    Félix Mantilla (tennis)    Félix Mantilla Botella


  9%|▉         | 518/5598 [12:59<1:44:37,  1.24s/it]

420527    Javier Sánchez (tennis)    Javier Sánchez


  9%|▉         | 525/5598 [13:14<3:12:24,  2.28s/it]

2785107    Carlos Costa (tennis)    Carlos Costa


 10%|▉         | 543/5598 [13:42<1:50:49,  1.32s/it]

1360083    Los Angeles Angels    Los Angeles Angels of Anaheim


 10%|▉         | 544/5598 [13:42<1:26:06,  1.02s/it]

1035828    NewYork-Presbyterian Hospital    NewYork–Presbyterian Hospital


 10%|█         | 567/5598 [14:15<2:45:11,  1.97s/it]

11278    Miami Marlins    Florida Marlins


 10%|█         | 583/5598 [14:43<1:48:28,  1.30s/it]

1261670    Terry Kennedy (baseball)    Terry Kennedy


 11%|█         | 588/5598 [14:51<2:21:25,  1.69s/it]

153856    Cal Ripken Jr.    Cal Ripken, Jr.


 11%|█         | 613/5598 [15:24<3:17:03,  2.37s/it]

834154    NEC Nijmegen    N.E.C. (football club)


 11%|█         | 620/5598 [15:40<3:11:24,  2.31s/it]

23608452    Galatasaray S.K. (football)    Galatasaray S.K. (football team)


 11%|█         | 622/5598 [15:46<3:40:41,  2.66s/it]

2695399    Artur Jorge (footballer, born 1946)    Artur Jorge (footballer)


 11%|█▏        | 634/5598 [16:01<1:07:52,  1.22it/s]

1173543


 11%|█▏        | 636/5598 [16:03<1:14:44,  1.11it/s]

19302946    Antonio Esposito (footballer, born 1972)    Antonio Esposito


 12%|█▏        | 645/5598 [16:13<1:25:39,  1.04s/it]

15116    Inter Milan    F.C. Internazionale Milano


 12%|█▏        | 655/5598 [16:30<1:40:14,  1.22s/it]

4375906    Raymond Russell (golfer)    Raymond Russell
5340073    David J. Russell    David J Russell


 12%|█▏        | 682/5598 [16:55<1:03:30,  1.29it/s]

4455122    José María Cañizares    José Maria Cañizares


 13%|█▎        | 734/5598 [18:10<1:30:12,  1.11s/it]

2962836    Lynda Chalker    Lynda Chalker, Baroness Chalker of Wallasey


 13%|█▎        | 750/5598 [18:40<2:20:21,  1.74s/it]

1927490


 14%|█▎        | 764/5598 [19:06<1:08:49,  1.17it/s]

2534235


 14%|█▎        | 768/5598 [19:12<1:25:38,  1.06s/it]

47725    Organisation of Islamic Cooperation    Organisation of the Islamic Conference


 14%|█▍        | 770/5598 [19:17<2:04:23,  1.55s/it]

36922    Quran    Qur'an


 14%|█▍        | 778/5598 [19:24<1:18:37,  1.02it/s]

164386    S&P Global Ratings    Standard & Poor's


 14%|█▍        | 783/5598 [19:31<1:38:06,  1.22s/it]

354620    Interior minister    Interior ministry


 14%|█▍        | 791/5598 [19:46<2:31:13,  1.89s/it]

434852    TASS    Information Telegraph Agency of Russia


 14%|█▍        | 793/5598 [19:46<1:24:10,  1.05s/it]

37664


 14%|█▍        | 798/5598 [19:58<3:11:11,  2.39s/it]

751326    Dnevni avaz    Dnevni Avaz


 14%|█▍        | 803/5598 [20:14<3:28:21,  2.61s/it]

5418443    Gorgona Island (Colombia)    Gorgona, Colombia


 15%|█▍        | 835/5598 [21:20<1:43:56,  1.31s/it]

319357    Egyptair    EgyptAir


 15%|█▌        | 841/5598 [21:31<2:51:29,  2.16s/it]

1088372    Hassan Al-Turabi    Hassan al-Turabi


 15%|█▌        | 848/5598 [21:41<1:41:27,  1.28s/it]

47737    Las Vegas    Las Vegas, Nevada


 16%|█▌        | 868/5598 [22:15<1:51:19,  1.41s/it]

18404036


 16%|█▌        | 869/5598 [22:16<1:43:32,  1.31s/it]

2570087    NSDAP/AO    AO


 16%|█▌        | 885/5598 [22:36<1:10:07,  1.12it/s]

5454926    Leipzig/Halle Airport    Halle Airport


 16%|█▌        | 890/5598 [22:45<2:11:51,  1.68s/it]

2186512    Aegon N.V.    Aegon


 16%|█▌        | 897/5598 [22:56<2:35:58,  1.99s/it]

2253457    Federal Foreign Office    Foreign Office (Germany)


 16%|█▋        | 917/5598 [23:26<1:36:37,  1.24s/it]

15156528    National Bank Limited    National Bank (Bangladesh)


 16%|█▋        | 919/5598 [23:29<1:22:36,  1.06s/it]

8312860


 17%|█▋        | 933/5598 [23:56<2:21:17,  1.82s/it]

52389    Independence Day (1996 film)    Independence Day (film)


 17%|█▋        | 949/5598 [24:21<1:31:30,  1.18s/it]

11197621    Australians    Australian people


 17%|█▋        | 963/5598 [24:40<58:20,  1.32it/s]  

585629    Kyiv    Kiev


 17%|█▋        | 976/5598 [24:59<43:57,  1.75it/s]  

24580262    Viacom (1952–2006)    Viacom (1971–2005)


 18%|█▊        | 981/5598 [25:06<1:59:06,  1.55s/it]

324    Academy Awards    Academy Award


 18%|█▊        | 993/5598 [25:27<3:00:04,  2.35s/it]

34104    Williams Grand Prix Engineering    WilliamsF1


 18%|█▊        | 1004/5598 [25:51<2:01:24,  1.59s/it]

3158734    Sauber Motorsport    Sauber


 19%|█▊        | 1038/5598 [26:48<1:19:40,  1.05s/it]

8904211    Northerns (cricket team)    Northerns cricket team


 19%|█▊        | 1039/5598 [26:49<1:24:50,  1.12s/it]

8357907    André Snyman    Andre Snyman


 19%|█▉        | 1051/5598 [27:08<1:55:09,  1.52s/it]

63493    Ken Griffey Jr.    Ken Griffey, Jr.


 19%|█▉        | 1068/5598 [27:26<1:09:17,  1.09it/s]

2747849    Justin Thompson (baseball)    Justin Thompson


 19%|█▉        | 1072/5598 [27:29<49:03,  1.54it/s]  

2372135    Erik Hanson (baseball)    Erik Hanson
2375798


 19%|█▉        | 1076/5598 [27:34<1:23:50,  1.11s/it]

769618    Sporting CP    Sporting Clube de Portugal


 19%|█▉        | 1082/5598 [27:41<1:41:10,  1.34s/it]

172589    FC Porto    F.C. Porto


 19%|█▉        | 1083/5598 [27:43<1:56:38,  1.55s/it]

909545    Primeira Liga    Portuguese Liga


 19%|█▉        | 1086/5598 [27:49<1:50:43,  1.47s/it]

686142    Bundesliga    Fußball-Bundesliga


 19%|█▉        | 1090/5598 [27:54<1:52:30,  1.50s/it]

599978    FC Hansa Rostock    F.C. Hansa Rostock


 20%|█▉        | 1103/5598 [28:06<53:34,  1.40it/s]  

34453


 20%|█▉        | 1114/5598 [28:18<1:16:26,  1.02s/it]

4375908    Steve Webster (golfer)    Steve Webster


 20%|██        | 1125/5598 [28:29<44:38,  1.67it/s]  

923379


 20%|██        | 1146/5598 [28:52<51:16,  1.45it/s]  

7885102    Steve Brown (athlete)    Stephen Brown (athlete)


 21%|██        | 1164/5598 [29:08<55:38,  1.33it/s]  

2573684


 21%|██        | 1176/5598 [29:20<1:29:02,  1.21s/it]

653621    Michael Johnson (sprinter)    Michael Johnson (athlete)


 21%|██        | 1184/5598 [29:29<1:14:10,  1.01s/it]

7652893    Mark Richardson (sprinter)    Mark Richardson (athlete)


 21%|██        | 1188/5598 [29:32<46:35,  1.58it/s]  

27446348    Jeff Williams (sprinter)    Jeff Williams (athlete)


 21%|██▏       | 1190/5598 [29:34<1:04:49,  1.13it/s]

681338    Michael Marsh (sprinter)    Michael Marsh (athlete)


 21%|██▏       | 1191/5598 [29:35<1:01:52,  1.19it/s]

2688112    Iván García (athlete)    Iván García


 22%|██▏       | 1204/5598 [29:44<41:37,  1.76it/s]  

3058737    Bob Kennedy (runner)    Bob Kennedy (athlete)


 22%|██▏       | 1212/5598 [29:52<1:18:09,  1.07s/it]

52056    Jonathan Edwards (triple jumper)    Jonathan Edwards (athlete)


 22%|██▏       | 1244/5598 [30:40<4:20:47,  3.59s/it]

1965301    António Oliveira (footballer, born 1952)    António Luís Alves Ribeiro Oliveira


 22%|██▏       | 1249/5598 [30:47<2:06:10,  1.74s/it]

4935997    Hélder (footballer)    Hélder Cristóvão


 22%|██▏       | 1251/5598 [30:49<1:46:00,  1.46s/it]

2135282    Dimas (footballer)    Dimas Teixeira


 22%|██▏       | 1254/5598 [30:51<59:13,  1.22it/s]  

353781


 22%|██▏       | 1256/5598 [30:55<1:38:39,  1.36s/it]

3304564    Oceano Cruz    Oceano da Cruz


 22%|██▏       | 1258/5598 [30:59<2:02:57,  1.70s/it]

1254376    João Pinto    João Vieira Pinto


 23%|██▎       | 1292/5598 [31:58<2:50:37,  2.38s/it]

677536    Legia Warsaw    Legia Warszawa


 23%|██▎       | 1297/5598 [32:05<1:43:49,  1.45s/it]

1167720    FC Spartak Vladikavkaz    FC Alania Vladikavkaz


 23%|██▎       | 1301/5598 [32:12<1:30:37,  1.27s/it]

23538966    World Series Cricket World XI    WSC World XI


 23%|██▎       | 1306/5598 [32:20<1:38:00,  1.37s/it]

4014249    National Assembly (Angola)    National Assembly of Angola


 24%|██▎       | 1322/5598 [32:34<1:04:26,  1.11it/s]

7971634


 24%|██▎       | 1328/5598 [32:45<2:41:58,  2.28s/it]

265716    Constanța    Constanţa


 24%|██▍       | 1332/5598 [32:50<1:51:02,  1.56s/it]

229431    Saint Croix    Saint Croix, U.S. Virgin Islands


 24%|██▍       | 1341/5598 [33:03<1:17:52,  1.10s/it]

80628    Eleusis    Eleusina


 24%|██▍       | 1343/5598 [33:08<1:54:53,  1.62s/it]

159433    Arab world    Arab World


 24%|██▍       | 1349/5598 [33:18<1:46:48,  1.51s/it]

2185    Arabs    Arab people


 24%|██▍       | 1354/5598 [33:25<1:29:10,  1.26s/it]

3567921    Mark Kennedy (footballer, born 1976)    Mark Kennedy (footballer)


 25%|██▍       | 1373/5598 [33:51<1:00:44,  1.16it/s]

10601955    Croatian News Agency    HINA


 25%|██▌       | 1402/5598 [34:45<1:21:28,  1.17s/it]

307576    Al-Karak    Al Karak


 25%|██▌       | 1414/5598 [35:07<1:35:05,  1.36s/it]

8122677    Masoud Barzani    Massoud Barzani


 25%|██▌       | 1420/5598 [35:12<40:40,  1.71it/s]  

9696755


 25%|██▌       | 1425/5598 [35:16<1:02:56,  1.10it/s]

505437    Bill Weld    William Weld


 25%|██▌       | 1426/5598 [35:17<1:07:50,  1.03it/s]

843793    WBUR-FM    WBUR


 26%|██▌       | 1433/5598 [35:33<3:13:53,  2.79s/it]

4096004    Sri Lankan Tamils    Sri Lankan Tamil people


 26%|██▌       | 1449/5598 [36:03<43:59,  1.57it/s]  

566384    Duhok    Duhok, Iraq


 26%|██▌       | 1459/5598 [36:12<1:10:01,  1.02s/it]

26213969


 26%|██▌       | 1462/5598 [36:15<1:02:33,  1.10it/s]

217560    Chihuahua City    Chihuahua, Chihuahua


 27%|██▋       | 1496/5598 [36:47<32:38,  2.09it/s]  

15842094    Boskalis    Royal Boskalis Westminster


 27%|██▋       | 1524/5598 [37:26<1:02:09,  1.09it/s]

370229    SoftBank Group    SoftBank


 27%|██▋       | 1527/5598 [37:28<48:34,  1.40it/s]  

2688005    Les Echos (France)    Les Échos (France)


 27%|██▋       | 1529/5598 [37:28<31:10,  2.18it/s]

8240811    DIA (supermarket chain)    Dia (supermarket chain)


 27%|██▋       | 1537/5598 [37:36<53:06,  1.27it/s]  

243710


 28%|██▊       | 1547/5598 [37:49<1:06:47,  1.01it/s]

1085647    David Richards (motorsport)    David Richards (racing)


 28%|██▊       | 1558/5598 [38:05<1:13:43,  1.09s/it]

800001    Arrows Grand Prix International    Arrows


 28%|██▊       | 1567/5598 [38:14<58:05,  1.16it/s]  

5690457    Paris Saint-Germain Rugby League    Paris Saint Germain (rugby league team)


 28%|██▊       | 1570/5598 [38:21<2:08:22,  1.91s/it]

1095032    St Helens R.F.C.    St Helens RLFC


 28%|██▊       | 1572/5598 [38:23<1:33:35,  1.39s/it]

629563    Wasps RFC    London Wasps


 28%|██▊       | 1574/5598 [38:26<1:33:23,  1.39s/it]

1196374    Halifax Panthers    Halifax RLFC


 28%|██▊       | 1577/5598 [38:28<1:06:47,  1.00it/s]

1098719    Oldham R.L.F.C.    Oldham Roughyeds


 28%|██▊       | 1581/5598 [38:31<44:02,  1.52it/s]  

6136803    Matt Windows    Matthew Windows


 28%|██▊       | 1588/5598 [38:36<50:34,  1.32it/s]

24080381    Gordon Parsons (cricketer)    Gordon Parsons


 29%|██▊       | 1603/5598 [38:59<2:15:20,  2.03s/it]

1216952    Airdrieonians F.C. (1878)    Airdrieonians F.C.


 29%|██▉       | 1611/5598 [39:12<1:42:40,  1.55s/it]

783089    St Mirren F.C.    St. Mirren F.C.


 29%|██▉       | 1612/5598 [39:12<1:19:22,  1.19s/it]

393756    St Johnstone F.C.    St. Johnstone F.C.


 29%|██▉       | 1637/5598 [39:41<48:05,  1.37it/s]  

4249924    Peter Atherton (footballer)    Peter Atherton


 30%|██▉       | 1652/5598 [40:10<2:24:15,  2.19s/it]

7609513    Leeds United F.C.    Leeds United A.F.C.


 30%|██▉       | 1673/5598 [40:44<51:54,  1.26it/s]  

236211    Huddersfield Town A.F.C.    Huddersfield Town F.C.


 30%|███       | 1693/5598 [41:18<2:58:39,  2.75s/it]

413286    Wrexham A.F.C.    Wrexham F.C.


 30%|███       | 1698/5598 [41:21<1:02:17,  1.04it/s]

451121    AFC Bournemouth    A.F.C. Bournemouth


 31%|███       | 1739/5598 [42:17<1:18:48,  1.23s/it]

17634183    OFK Kikinda    FK Kikinda


 31%|███       | 1740/5598 [42:19<1:33:41,  1.46s/it]

20517361    FK Spartak Subotica    FK Spartak Zlatibor Voda


 31%|███       | 1748/5598 [42:32<1:42:10,  1.59s/it]

10410246    OKS Stomil Olsztyn    OKS 1945 Olsztyn


 31%|███▏      | 1754/5598 [42:41<1:52:46,  1.76s/it]

677603    Polonia Warsaw    Polonia Warszawa


 31%|███▏      | 1762/5598 [42:50<1:19:13,  1.24s/it]

3647082    Barangay Ginebra San Miguel    Barangay Ginebra Kings


 32%|███▏      | 1772/5598 [43:02<1:30:52,  1.43s/it]

5960297    Márcio Santos (footballer, born 1969)    Márcio Roberto dos Santos


 32%|███▏      | 1783/5598 [43:22<1:05:00,  1.02s/it]

1520840


 32%|███▏      | 1786/5598 [43:26<1:14:25,  1.17s/it]

1537131    S.C. Eendracht Aalst    V.C. Eendracht Aalst 2002


 32%|███▏      | 1789/5598 [43:29<1:15:48,  1.19s/it]

1437246    Sint-Truidense V.V.    K. Sint-Truidense V.V.


 32%|███▏      | 1801/5598 [43:44<49:40,  1.27it/s]  

1693070    En Avant Guingamp    En Avant de Guingamp


 32%|███▏      | 1810/5598 [43:59<1:46:37,  1.69s/it]

1256446    RC Strasbourg Alsace    RC Strasbourg


 33%|███▎      | 1825/5598 [44:22<2:41:36,  2.57s/it]

834256    SBV Vitesse    Vitesse


 33%|███▎      | 1828/5598 [44:30<2:52:55,  2.75s/it]

834196    Roda JC Kerkrade    Roda JC


 33%|███▎      | 1830/5598 [44:32<1:50:36,  1.76s/it]

2289    AZ Alkmaar    AZ (football club)


 33%|███▎      | 1840/5598 [44:43<1:21:46,  1.31s/it]

1007441    TSV 1860 Munich    TSV 1860 München


 33%|███▎      | 1858/5598 [45:10<1:23:42,  1.34s/it]

5644175    LASK    LASK Linz


 33%|███▎      | 1870/5598 [45:31<1:53:07,  1.82s/it]

642604    Blantyre    Blantyre, Malawi


 34%|███▎      | 1880/5598 [45:50<1:05:23,  1.06s/it]

420869    Léon Kengo wa Dondo    Kengo Wa Dondo


 34%|███▎      | 1884/5598 [45:52<34:25,  1.80it/s]  

19997199    Xhosa people    Xhosa


 34%|███▍      | 1911/5598 [46:40<1:09:39,  1.13s/it]

24534207    Erbil    Arbil


 34%|███▍      | 1912/5598 [46:40<53:47,  1.14it/s]  

270067


 34%|███▍      | 1927/5598 [47:04<1:39:14,  1.62s/it]

9926    ETA (separatist group)    ETA


 34%|███▍      | 1930/5598 [47:10<1:50:01,  1.80s/it]

374695    Implementation Force    IFOR


 35%|███▍      | 1932/5598 [47:12<1:10:29,  1.15s/it]

842061


 35%|███▍      | 1953/5598 [47:41<1:02:58,  1.04s/it]

18642444


 35%|███▍      | 1959/5598 [47:47<1:23:31,  1.38s/it]

676299    Jamaat-e-Islami Pakistan    Jamaat-e-Islami


 35%|███▌      | 1968/5598 [47:55<49:23,  1.22it/s]  

319632    Verisign    VeriSign


 35%|███▌      | 1973/5598 [47:58<29:14,  2.07it/s]  

299967    Kleiner Perkins    Kleiner Perkins Caufield & Byers
18622259


 35%|███▌      | 1974/5598 [48:01<1:10:51,  1.17s/it]

30107


 36%|███▌      | 1996/5598 [48:19<49:47,  1.21it/s]  

4632360    Sébastien Tortelli    Sebastien Tortelli


 36%|███▌      | 2000/5598 [48:24<1:14:08,  1.24s/it]

748984    Suzuka International Racing Course    Suzuka Circuit


 36%|███▌      | 2005/5598 [48:28<57:28,  1.04it/s]  

1233974    Jean-Denis Delétraz    Jean-Denis Délétraz


 36%|███▌      | 2014/5598 [48:36<1:02:00,  1.04s/it]

5166840    Franz Konrad (racing driver)    Franz Konrad


 36%|███▌      | 2015/5598 [48:37<55:55,  1.07it/s]  

23507562    Cor Euser    Cornelius Euser


 36%|███▌      | 2027/5598 [48:50<48:37,  1.22it/s]  

24234976    Margaret Crowley (runner)    Margaret Crowley (athlete)


 37%|███▋      | 2049/5598 [49:08<47:44,  1.24it/s]  

8653    Ducati Motor Holding S.p.A.    Ducati


 37%|███▋      | 2063/5598 [49:20<1:09:02,  1.17s/it]

1042270    London Broncos    Harlequins Rugby League


 37%|███▋      | 2073/5598 [49:32<58:54,  1.00s/it]  

159024    Michael Atherton    Mike Atherton


 37%|███▋      | 2075/5598 [49:33<44:13,  1.33it/s]

994718    Nick Knight (cricketer)    Nick Knight


 38%|███▊      | 2100/5598 [50:04<2:03:38,  2.12s/it]

2958305    CSM Ceahlăul Piatra Neamț    FC Ceahlăul Piatra Neamţ


 38%|███▊      | 2101/5598 [50:07<2:15:35,  2.33s/it]

3569554


 38%|███▊      | 2102/5598 [50:08<1:52:41,  1.93s/it]

2498018    FC Sportul Studențesc București    FC Sportul Studenţesc Bucureşti


 38%|███▊      | 2103/5598 [50:10<1:56:50,  2.01s/it]

2384859    FC U Craiova 1948    FC Universitatea Craiova


 38%|███▊      | 2104/5598 [50:11<1:48:40,  1.87s/it]

3574206    CSM Jiul Petroșani    CS Jiul Petroşani


 38%|███▊      | 2105/5598 [50:12<1:21:55,  1.41s/it]

861358    FC Dinamo București    FC Dinamo Bucureşti


 38%|███▊      | 2107/5598 [50:15<1:21:44,  1.40s/it]

1240318    FC Rapid București    FC Rapid Bucureşti


 38%|███▊      | 2108/5598 [50:15<1:00:55,  1.05s/it]

893411    FCSB    FC Steaua Bucureşti


 38%|███▊      | 2118/5598 [50:38<1:31:26,  1.58s/it]

5826257    FC Lada-Tolyatti    FC Lada Togliatti


 38%|███▊      | 2120/5598 [50:45<2:28:38,  2.56s/it]

1343724    PFC Krylia Sovetov Samara    FC Krylia Sovetov Samara


 38%|███▊      | 2122/5598 [50:48<2:00:00,  2.07s/it]

5683235    FC Ural Yekaterinburg    FC Ural Sverdlovsk Oblast


 38%|███▊      | 2151/5598 [51:25<54:23,  1.06it/s]  

885301    Manly Warringah Sea Eagles    Manly-Warringah Sea Eagles


 39%|███▊      | 2163/5598 [51:47<1:31:42,  1.60s/it]

11545810    Lisbet Stuer-Lauridsen    Lisbeth Stuer-Lauridsen


 39%|███▊      | 2169/5598 [51:58<1:25:53,  1.50s/it]

15475685    Kyle Abbott (baseball)    Kyle Abbott


 39%|███▉      | 2172/5598 [51:59<48:53,  1.17it/s]  

18940106    David Hulse    David Hulse (baseball)


 39%|███▉      | 2207/5598 [52:41<1:11:49,  1.27s/it]

18884152    Paulão (footballer, born 1985)    Paulo Roberto do Carmo


 39%|███▉      | 2208/5598 [52:42<1:12:01,  1.27s/it]

1445352    João Pinto (footballer, born 1961)    João Domingos Pinto


 40%|███▉      | 2230/5598 [53:19<1:36:06,  1.71s/it]

1171960    Kongsvinger IL Toppfotball    KIL Toppfotball


 41%|████      | 2279/5598 [54:12<1:10:45,  1.28s/it]

9420663    Vitamina Sánchez    Pablo Sánchez


 41%|████      | 2281/5598 [54:16<1:35:54,  1.73s/it]

563623    Club Brugge KV    Club Brugge K.V.


 41%|████      | 2283/5598 [54:19<1:26:45,  1.57s/it]

564514    K.V. Mechelen    KV Mechelen


 41%|████      | 2302/5598 [54:56<1:30:41,  1.65s/it]

1724515


 42%|████▏     | 2326/5598 [55:29<1:00:29,  1.11s/it]

21710997    NATO intervention in Bosnia and Herzegovina    NATO intervention in Bosnia


 42%|████▏     | 2342/5598 [56:02<1:47:19,  1.98s/it]

19457    Myanmar    Burma


 42%|████▏     | 2343/5598 [56:04<1:34:22,  1.74s/it]

1723280    Mae Sot District    Mae Sot


 42%|████▏     | 2348/5598 [56:09<49:38,  1.09it/s]  

63698


 42%|████▏     | 2355/5598 [56:19<51:57,  1.04it/s]  

18968435    Al Rai (Kuwaiti newspaper)    Al Rai


 42%|████▏     | 2359/5598 [56:25<1:24:34,  1.57s/it]

1114732    Palestine (region)    Palestine


 42%|████▏     | 2361/5598 [56:27<55:19,  1.03s/it]  

55387    Gaza City    Gaza


 42%|████▏     | 2368/5598 [56:38<1:13:40,  1.37s/it]

368881


 42%|████▏     | 2378/5598 [56:48<54:51,  1.02s/it]  

18037690    North Salang    Salang, Afghanistan


 43%|████▎     | 2386/5598 [57:05<1:57:36,  2.20s/it]

404861    Awami League    Bangladesh Awami League


 43%|████▎     | 2390/5598 [57:11<1:40:22,  1.88s/it]

432397    John Deere    Deere & Company


 43%|████▎     | 2409/5598 [57:40<56:02,  1.05s/it]  

1427983


 44%|████▍     | 2453/5598 [58:56<3:00:34,  3.45s/it]

1030243    DR Congo national football team    Congo DR national football team


 44%|████▍     | 2465/5598 [59:25<56:50,  1.09s/it]  

7011579    FC Torpedo Zaporizhzhia    FC Torpedo Zaporizhya


 44%|████▍     | 2467/5598 [59:30<1:16:52,  1.47s/it]

11225904


 44%|████▍     | 2470/5598 [59:35<1:27:39,  1.68s/it]

895551    FC Dnipro    FC Dnipro Dnipropetrovsk


 44%|████▍     | 2483/5598 [59:55<1:24:04,  1.62s/it]

895533
6157701    FC DAC 1904 Dunajská Streda    FK DAC 1904 Dunajská Streda


 44%|████▍     | 2488/5598 [1:00:06<1:50:26,  2.13s/it]

5508993    FC VSS Košice    MFK Košice


 44%|████▍     | 2491/5598 [1:00:10<1:19:17,  1.53s/it]

4414792    FK Dubnica    MFK Dubnica


 45%|████▍     | 2493/5598 [1:00:12<1:00:08,  1.16s/it]

3205346    MTK Budapest FC    MTK Hungária FC
5798281    Budapesti VSC    BVSC Budapest


 45%|████▍     | 2496/5598 [1:00:14<45:30,  1.14it/s]  

2139341


 45%|████▍     | 2499/5598 [1:00:18<48:40,  1.06it/s]  

2235024


 45%|████▍     | 2501/5598 [1:00:19<40:36,  1.27it/s]

1650918    1. FK Drnovice    FK Drnovice


 45%|████▍     | 2503/5598 [1:00:22<44:21,  1.16it/s]

896034    SK Slavia Prague    Slavia Prague


 45%|████▍     | 2505/5598 [1:00:24<1:00:39,  1.18s/it]

4991040    FK Jablonec    FK Baumit Jablonec


 45%|████▍     | 2516/5598 [1:00:41<1:07:43,  1.32s/it]

859774    Atlas F.C.    Club Atlas


 45%|████▌     | 2523/5598 [1:00:55<1:22:33,  1.61s/it]

1193316    C.D. Veracruz    Tiburones Rojos de Veracruz


 45%|████▌     | 2526/5598 [1:00:56<35:05,  1.46it/s]  

1511135    Club Puebla    Puebla F.C.


 45%|████▌     | 2527/5598 [1:00:56<26:32,  1.93it/s]

1217587    Santos Laguna    Club Santos Laguna


 45%|████▌     | 2528/5598 [1:00:56<21:58,  2.33it/s]

1035876    Atlético Morelia    Monarcas Morelia


 45%|████▌     | 2530/5598 [1:00:58<30:01,  1.70it/s]

1511138    Tecos F.C.    Estudiantes Tecos


 45%|████▌     | 2533/5598 [1:01:02<1:01:03,  1.20s/it]

425696    Celaya F.C.    Club Celaya


 45%|████▌     | 2544/5598 [1:01:23<1:28:10,  1.73s/it]

2335808    Túlio Maravilha    Túlio Costa


 46%|████▌     | 2552/5598 [1:01:35<1:46:07,  2.09s/it]

1482035    Club Atlético Vélez Sarsfield    Club Atlético Vélez Sársfield


 46%|████▌     | 2565/5598 [1:01:53<45:52,  1.10it/s]  

187313    Fluminense FC    Fluminense Football Club


 46%|████▌     | 2569/5598 [1:02:04<2:00:18,  2.38s/it]

1689806    Raja CA    Raja Casablanca


 46%|████▌     | 2570/5598 [1:02:05<1:40:35,  1.99s/it]

6230254    JS Massira    Chabab Massira


 46%|████▌     | 2571/5598 [1:02:06<1:15:26,  1.50s/it]

1690796


 46%|████▌     | 2572/5598 [1:02:06<1:00:35,  1.20s/it]

6230182    Olympique Club de Khouribga    Olympique Khouribga


 46%|████▌     | 2577/5598 [1:02:16<1:30:35,  1.80s/it]

963152    Spaniards    Spanish people


 46%|████▋     | 2590/5598 [1:02:35<1:31:30,  1.83s/it]

15107593


 47%|████▋     | 2619/5598 [1:03:12<42:43,  1.16it/s]  

293444


 47%|████▋     | 2632/5598 [1:03:26<52:22,  1.06s/it]  

849629    Édgar Rentería    Edgar Rentería


 48%|████▊     | 2661/5598 [1:04:08<52:23,  1.07s/it]  

26413    Real Madrid CF    Real Madrid C.F.


 48%|████▊     | 2664/5598 [1:04:13<1:40:42,  2.06s/it]

66164    Ronaldo (Brazilian footballer)    Ronaldo


 48%|████▊     | 2667/5598 [1:04:16<1:07:11,  1.38s/it]

28357832    1996 Singer World Series    Singer World Series


 48%|████▊     | 2685/5598 [1:04:32<30:11,  1.61it/s]  

2877538    Craig Evans (Zimbabwean sportsman)    Craig Evans


 48%|████▊     | 2692/5598 [1:04:34<14:04,  3.44it/s]

13015878    Washington (state)    Washington (U.S. state)


 48%|████▊     | 2699/5598 [1:04:48<1:04:28,  1.33s/it]

19792942    Americans    People of the United States


 49%|████▊     | 2723/5598 [1:05:23<43:01,  1.11it/s]  

88918    Preston, Lancashire    Preston


 49%|████▊     | 2725/5598 [1:05:28<1:18:11,  1.63s/it]

350800    North Shore, New Zealand    North Shore City


 49%|████▊     | 2727/5598 [1:05:30<1:00:06,  1.26s/it]

1049625    Intercontinental Exchange Futures    International Petroleum Exchange


 49%|████▊     | 2728/5598 [1:05:32<1:03:24,  1.33s/it]

840431


 49%|████▉     | 2737/5598 [1:05:45<1:04:32,  1.35s/it]

403248    Sultan bin Abdulaziz, Crown Prince of Saudi Arabia    Sultan, Crown Prince of Saudi Arabia


 49%|████▉     | 2744/5598 [1:05:55<1:04:45,  1.36s/it]

4063907    Rawandiz    Rowanduz


 49%|████▉     | 2750/5598 [1:06:06<1:30:04,  1.90s/it]

323202    Aluminium division of Rio Tinto    Rio Tinto Alcan


 49%|████▉     | 2753/5598 [1:06:07<37:44,  1.26it/s]  

541026    Asarco    ASARCO


 49%|████▉     | 2755/5598 [1:06:08<38:03,  1.24it/s]

3037636    Georgiy Mamedov    Georgiy Mammadov


 49%|████▉     | 2770/5598 [1:06:28<42:12,  1.12it/s]  

222294    John Hinckley Jr.    John Hinckley, Jr.
362719    Superman (1978 film)    Superman (film)


 50%|████▉     | 2773/5598 [1:06:29<23:58,  1.96it/s]

27687    St. Louis    St. Louis, Missouri


 50%|████▉     | 2783/5598 [1:06:45<55:12,  1.18s/it]  

1076420    Repsol    Repsol YPF


 50%|████▉     | 2785/5598 [1:06:47<41:58,  1.12it/s]

58185


 50%|████▉     | 2789/5598 [1:06:51<50:35,  1.08s/it]

272329    Environment and Climate Change Canada    Environment Canada


 50%|████▉     | 2794/5598 [1:06:55<43:49,  1.07it/s]

749287    Peace River    Peace River (Canada)


 50%|████▉     | 2796/5598 [1:06:56<29:01,  1.61it/s]

8372532    Materion    Brush Engineered Materials


 50%|████▉     | 2798/5598 [1:06:58<39:28,  1.18it/s]

306926    Valero Energy    Valero Energy Corporation


 50%|█████     | 2806/5598 [1:07:06<36:53,  1.26it/s]  

5043192    Hillary Clinton    Hillary Rodham Clinton


 50%|█████     | 2811/5598 [1:07:08<21:23,  2.17it/s]

2186586


 50%|█████     | 2818/5598 [1:07:13<23:29,  1.97it/s]

71486


 50%|█████     | 2820/5598 [1:07:15<32:28,  1.43it/s]

749262    Bank of France    Banque de France


 51%|█████     | 2843/5598 [1:07:40<45:05,  1.02it/s]  

300548    Tamils    Tamil people


 51%|█████     | 2848/5598 [1:07:42<22:39,  2.02it/s]

1786946    Hanson (company)    Hanson plc


 51%|█████     | 2855/5598 [1:07:49<37:50,  1.21it/s]

9058746    Tarmac Group    Tarmac (company)


 51%|█████     | 2860/5598 [1:07:51<15:44,  2.90it/s]

740993    Seve Ballesteros    Severiano Ballesteros


 51%|█████     | 2867/5598 [1:08:02<45:54,  1.01s/it]  

23213228    Jimmy Thomson (footballer, born 1937)    Jimmy Thomson (footballer)


 51%|█████▏    | 2871/5598 [1:08:06<40:05,  1.13it/s]  

899013    Alan Ball Jr.    Alan Ball, Jr.


 51%|█████▏    | 2879/5598 [1:08:12<30:15,  1.50it/s]

3976751    Ata-ur-Rehman (cricketer)    Ata-ur-Rehman
9568


 51%|█████▏    | 2882/5598 [1:08:14<29:02,  1.56it/s]

5210442    Dănuț Lupu    Dănuţ Lupu


 52%|█████▏    | 2891/5598 [1:08:21<31:39,  1.43it/s]

6849383    Joseph Kneipp    Joe Kneipp


 53%|█████▎    | 2948/5598 [1:09:37<39:35,  1.12it/s]  

18939866    Ángel Miranda    Angel Miranda


 53%|█████▎    | 2954/5598 [1:09:46<1:12:03,  1.64s/it]

978947    Mary Joe Fernández    Mary Joe Fernandez


 53%|█████▎    | 2958/5598 [1:09:52<1:07:35,  1.54s/it]

1728044    Krasimir Balakov    Krassimir Balakov


 53%|█████▎    | 2978/5598 [1:10:18<1:21:29,  1.87s/it]

2228737    Neuchâtel Xamax FCS    Neuchâtel Xamax


 53%|█████▎    | 2981/5598 [1:10:23<1:01:12,  1.40s/it]

2043091


 53%|█████▎    | 2986/5598 [1:10:28<40:04,  1.09it/s]  

11065220    Tour of the Netherlands    Ronde van Nederland
102081    Gouda, South Holland    Gouda


 53%|█████▎    | 2988/5598 [1:10:29<31:38,  1.37it/s]

2354465    Team Jumbo–Visma (men's team)    Rabobank (cycling team)


 54%|█████▎    | 3005/5598 [1:10:43<43:47,  1.01s/it]

1488    NYSE American    American Stock Exchange


 54%|█████▍    | 3015/5598 [1:10:53<34:57,  1.23it/s]  

2767206    Gary Kelly (footballer, born 1974)    Gary Kelly (footballer born 1974)


 54%|█████▍    | 3018/5598 [1:11:00<1:10:24,  1.64s/it]

8298264    Alan Kelly Sr.    Alan Kelly, Sr.


 54%|█████▍    | 3035/5598 [1:11:16<38:08,  1.12it/s]  

3589053    David Kelly (association footballer)    David Kelly (footballer)


 54%|█████▍    | 3041/5598 [1:11:25<46:27,  1.09s/it]  

2711386


 55%|█████▍    | 3057/5598 [1:11:56<1:22:03,  1.94s/it]

389777    Perm, Russia    Perm


 55%|█████▍    | 3061/5598 [1:11:58<36:21,  1.16it/s]  

4315911    Enel Distribution Sao Paulo    AES Eletropaulo


 55%|█████▍    | 3063/5598 [1:12:05<1:18:54,  1.87s/it]

4277114


 55%|█████▍    | 3067/5598 [1:12:07<47:22,  1.12s/it]  

2226978    Matagalpa, Nicaragua    Matagalpa


 55%|█████▍    | 3075/5598 [1:12:17<38:14,  1.10it/s]  

3179999


 55%|█████▌    | 3095/5598 [1:12:44<58:16,  1.40s/it]  

619184    Bob Casey Sr.    Robert P. Casey


 56%|█████▌    | 3117/5598 [1:13:09<37:30,  1.10it/s]  

380769    Rambo (franchise)    Rambo (film series)


 56%|█████▌    | 3133/5598 [1:13:22<34:34,  1.19it/s]

355334    Getlink    Eurotunnel


 56%|█████▌    | 3141/5598 [1:13:30<48:29,  1.18s/it]

24586489    Chavakachcheri    Chavakacheri


 56%|█████▌    | 3144/5598 [1:13:36<1:02:55,  1.54s/it]

504790    New York Daily News    Daily News (New York)


 56%|█████▋    | 3150/5598 [1:13:40<23:28,  1.74it/s]  

1150534    The Statesman (India)    The Statesman


 56%|█████▋    | 3159/5598 [1:13:53<1:07:59,  1.67s/it]

403685    Communist Party of Nepal (Maoist Centre)    Unified Communist Party of Nepal (Maoist)


 57%|█████▋    | 3168/5598 [1:13:58<20:59,  1.93it/s]  

740431    Dr. Ambedkar Nagar    Mhow


 57%|█████▋    | 3170/5598 [1:13:59<24:10,  1.67it/s]

5706727    Ashta, Madhya Pradesh    Ashta


 57%|█████▋    | 3180/5598 [1:14:10<55:44,  1.38s/it]  

1165422    VTM (TV channel)    Vtm


 57%|█████▋    | 3188/5598 [1:14:25<54:49,  1.36s/it]  

4526656


 57%|█████▋    | 3198/5598 [1:14:45<1:55:09,  2.88s/it]

12246    Alliance 90/The Greens    The Greens


 57%|█████▋    | 3200/5598 [1:14:47<1:16:17,  1.91s/it]

19902641    Forsa Institute    Forsa institute


 57%|█████▋    | 3203/5598 [1:14:52<1:00:37,  1.52s/it]

13595    Heathrow Airport    London Heathrow Airport


 57%|█████▋    | 3205/5598 [1:14:52<33:33,  1.19it/s]  

24355    Perth    Perth, Western Australia


 57%|█████▋    | 3208/5598 [1:14:54<28:56,  1.38it/s]

264528


 57%|█████▋    | 3209/5598 [1:14:56<41:08,  1.03s/it]

466743    Sakai    Sakai, Osaka


 58%|█████▊    | 3232/5598 [1:15:31<46:27,  1.18s/it]  

2450892    Sergey Makarov (javelin thrower)    Sergey Alexandrovich Makarov


 58%|█████▊    | 3237/5598 [1:15:33<18:33,  2.12it/s]

2466432


 58%|█████▊    | 3261/5598 [1:15:54<37:07,  1.05it/s]

2054489    Jean van de Velde (golfer)    Jean Van de Velde


 58%|█████▊    | 3267/5598 [1:16:01<53:54,  1.39s/it]

796027    U.S. Postal Service Pro Cycling Team    Discovery Channel Pro Cycling Team


 59%|█████▊    | 3280/5598 [1:16:15<35:23,  1.09it/s]  

16745041    Pavel Buráň    Pavel Buran


 59%|█████▉    | 3301/5598 [1:16:37<1:03:06,  1.65s/it]

56635    Chișinău    Chişinău


 59%|█████▉    | 3306/5598 [1:16:42<31:58,  1.19it/s]  

68032    St Leger Stakes    St. Leger Stakes


 59%|█████▉    | 3308/5598 [1:16:44<29:11,  1.31it/s]

68030    2000 Guineas Stakes    2,000 Guineas Stakes


 60%|█████▉    | 3339/5598 [1:17:18<33:20,  1.13it/s]  

4359711    John Smiley (baseball)    John Smiley


 60%|█████▉    | 3355/5598 [1:17:35<38:16,  1.02s/it]

18523612    South Africa men's national field hockey team    South Africa national field hockey team


 60%|█████▉    | 3357/5598 [1:17:37<34:24,  1.09it/s]

6904363    Western Province cricket team (South Africa)    Western Province cricket team


 60%|██████    | 3368/5598 [1:17:46<20:40,  1.80it/s]

3182138    Dave Richardson (South African cricketer)    Dave Richardson


 60%|██████    | 3383/5598 [1:18:03<42:39,  1.16s/it]  

24449519    Derek Ryan (squash player)    Derek Ryan


 61%|██████    | 3400/5598 [1:18:23<44:06,  1.20s/it]  

2809984    Connecticut Open (tennis)    Pilot Pen Tennis


 61%|██████    | 3402/5598 [1:18:26<48:16,  1.32s/it]

16001137    Marcelo Ramos (footballer, born 1973)    Marcelo Silva Ramos


 61%|██████    | 3410/5598 [1:18:38<28:54,  1.26it/s]  

570925


 61%|██████    | 3417/5598 [1:18:48<43:07,  1.19s/it]  

3521904    Dani (footballer, born 1976)    Daniel da Cruz Carvalho


 61%|██████    | 3418/5598 [1:18:48<34:46,  1.04it/s]

62611    Parma Calcio 1913    Parma F.C.


 61%|██████    | 3426/5598 [1:19:12<1:03:56,  1.77s/it]

520651    Palermo F.C.    U.S. Città di Palermo


 61%|██████▏   | 3432/5598 [1:19:20<44:53,  1.24s/it]  

3073616    S.S.D. Fidelis Andria 2018    A.S. Andria BAT


 61%|██████▏   | 3433/5598 [1:19:22<47:54,  1.33s/it]

1136763    Piacenza Calcio 1919    Piacenza Calcio


 61%|██████▏   | 3436/5598 [1:19:25<37:40,  1.05s/it]  

3149150


 61%|██████▏   | 3439/5598 [1:19:30<51:30,  1.43s/it]

2592452    A.C. Perugia Calcio    Perugia Calcio


 61%|██████▏   | 3440/5598 [1:19:32<49:53,  1.39s/it]

5071886    Paulo Sérgio (footballer, born 1969)    Paulo Sérgio Silvestre do Nascimento


 62%|██████▏   | 3444/5598 [1:19:35<30:46,  1.17it/s]

3071399


 62%|██████▏   | 3446/5598 [1:19:38<35:46,  1.00it/s]

2883907


 62%|██████▏   | 3447/5598 [1:19:39<45:09,  1.26s/it]

1136820    L.R. Vicenza Virtus    Vicenza Calcio


 62%|██████▏   | 3451/5598 [1:19:44<45:27,  1.27s/it]

1138693    S.S.C. Bari    A.S. Bari


 62%|██████▏   | 3454/5598 [1:19:49<48:15,  1.35s/it]  

2870603    A.C. Monza    A.C. Monza Brianza 1912


 62%|██████▏   | 3457/5598 [1:19:56<1:03:05,  1.77s/it]

2876568    Ravenna F.C.    Ravenna Calcio


 62%|██████▏   | 3460/5598 [1:20:00<58:19,  1.64s/it]  

553772    Robert Pires    Robert Pirès


 62%|██████▏   | 3470/5598 [1:20:34<1:57:01,  3.30s/it]

13121884    Abdelkader Ferhaoui    Kader Ferhaoui


 62%|██████▏   | 3487/5598 [1:20:55<38:21,  1.09s/it]  

2138844


 63%|██████▎   | 3502/5598 [1:21:08<18:11,  1.92it/s]  

18298102    Princeton Lyman    Princeton N. Lyman


 63%|██████▎   | 3508/5598 [1:21:16<40:40,  1.17s/it]

2302768    Morinville    Morinville, Alberta


 63%|██████▎   | 3509/5598 [1:21:17<40:34,  1.17s/it]

5103900    Nordegg    Nordegg, Alberta


 63%|██████▎   | 3510/5598 [1:21:17<32:12,  1.08it/s]

380532    Biogen    Biogen Idec


 63%|██████▎   | 3511/5598 [1:21:22<1:17:23,  2.23s/it]

4383256


 63%|██████▎   | 3516/5598 [1:21:27<40:48,  1.18s/it]  

21835440    Mark L. Wolf    Mark Lawrence Wolf


 63%|██████▎   | 3520/5598 [1:21:33<46:52,  1.35s/it]  

10819    Federal Reserve    Federal Reserve System


 64%|██████▎   | 3560/5598 [1:22:28<50:35,  1.49s/it]  

3523938    World Conference on Women, 1995    Fourth World Conference on Women


 64%|██████▎   | 3562/5598 [1:22:30<37:07,  1.09s/it]

94089    Heathrow (hamlet)    Heathrow, London


 64%|██████▎   | 3563/5598 [1:22:31<32:52,  1.03it/s]

26308489


 64%|██████▎   | 3564/5598 [1:22:31<26:29,  1.28it/s]

321943    Sky UK    Sky Digital (UK & Ireland)


 64%|██████▍   | 3571/5598 [1:22:48<1:23:52,  2.48s/it]

15674115    Sharjah    Sharjah (city)


 64%|██████▍   | 3577/5598 [1:23:08<1:34:00,  2.79s/it]

233978    Vancouver Sun    The Vancouver Sun


 64%|██████▍   | 3578/5598 [1:23:11<1:35:26,  2.84s/it]

201835    Chris Patten    Chris Patten, Baron Patten of Barnes


 64%|██████▍   | 3579/5598 [1:23:15<1:51:32,  3.31s/it]

57744    Ivory Coast    Côte d'Ivoire


 64%|██████▍   | 3583/5598 [1:23:20<52:00,  1.55s/it]  

334800    Orlando Pirates F.C.    Orlando Pirates FC


 64%|██████▍   | 3592/5598 [1:23:28<13:07,  2.55it/s]  

241458    Charles Taylor (Liberian politician)    Charles Taylor (Liberia)


 64%|██████▍   | 3602/5598 [1:23:38<25:30,  1.30it/s]  

412602    American depositary receipt    American Depositary Receipt


 65%|██████▍   | 3617/5598 [1:24:01<36:50,  1.12s/it]  

1873300    Vale S.A.    Vale (mining company)


 65%|██████▍   | 3619/5598 [1:24:04<38:13,  1.16s/it]

1157753


 65%|██████▍   | 3623/5598 [1:24:05<16:47,  1.96it/s]

37988    United States National Guard    National Guard of the United States


 65%|██████▍   | 3626/5598 [1:24:09<23:30,  1.40it/s]

2154    African Americans    African American


 65%|██████▌   | 3659/5598 [1:25:00<47:36,  1.47s/it]  

2395785    ACC (company)    ACC Limited


 65%|██████▌   | 3662/5598 [1:25:01<24:51,  1.30it/s]

236852    Libor    London Interbank Offered Rate


 66%|██████▌   | 3686/5598 [1:25:41<22:15,  1.43it/s]  

63861    Valencia    Valencia, Spain
803    Arabic    Arabic language


 66%|██████▌   | 3696/5598 [1:25:59<53:19,  1.68s/it]  

291275


 66%|██████▋   | 3711/5598 [1:26:11<30:42,  1.02it/s]

2865375    Theodoros Pangalos (politician)    Theodoros Pangalos


 67%|██████▋   | 3728/5598 [1:26:39<1:06:08,  2.12s/it]

405559    PASOK    Panhellenic Socialist Movement


 67%|██████▋   | 3751/5598 [1:27:06<30:15,  1.02it/s]  

6658821    Stung Treng City    Stung Treng


 67%|██████▋   | 3763/5598 [1:27:20<33:59,  1.11s/it]

175780


 67%|██████▋   | 3766/5598 [1:27:22<24:37,  1.24it/s]

2286075    Legal Department (Hong Kong)    Legal Department


 67%|██████▋   | 3770/5598 [1:27:24<14:32,  2.10it/s]

2629368    Banco Santander    Grupo Santander


 67%|██████▋   | 3771/5598 [1:27:25<16:16,  1.87it/s]

6517516    Expansión (Spanish newspaper)    Expansión


 67%|██████▋   | 3772/5598 [1:27:27<29:20,  1.04it/s]

399571    Abolhassan Banisadr    Abulhassan Banisadr


 68%|██████▊   | 3793/5598 [1:27:51<24:24,  1.23it/s]

11100004    Michael Andersson (cyclist)    Michael Andersson


 68%|██████▊   | 3803/5598 [1:28:04<32:30,  1.09s/it]

6691515    Oksana Grishina (cyclist)    Oksana Grishina


 68%|██████▊   | 3817/5598 [1:28:16<16:42,  1.78it/s]

2642664    Bristol County Ground    County Cricket Ground, Bristol


 68%|██████▊   | 3829/5598 [1:28:36<36:13,  1.23s/it]  

15820924


 69%|██████▊   | 3847/5598 [1:28:59<57:31,  1.97s/it]

89050    Flushing Meadows–Corona Park    Flushing Meadows – Corona Park


 69%|██████▉   | 3875/5598 [1:29:38<25:43,  1.12it/s]  

1917423    Bandundu (city)    Bandundu


 69%|██████▉   | 3890/5598 [1:29:56<33:56,  1.19s/it]

14431526    Čelopek, Brvenica    Čelopek


 70%|██████▉   | 3891/5598 [1:29:57<31:28,  1.11s/it]

495612    Victor Babeș    Victor Babeş


 70%|██████▉   | 3900/5598 [1:30:06<16:19,  1.73it/s]

26099


 70%|██████▉   | 3918/5598 [1:30:34<40:36,  1.45s/it]  

2136403    Piper Sandler Companies    Piper Jaffray


 70%|███████   | 3925/5598 [1:30:44<52:03,  1.87s/it]

920048    West South Central states    West South Central States


 70%|███████   | 3926/5598 [1:30:45<44:43,  1.60s/it]

920073    East North Central states    East North Central States


 70%|███████   | 3931/5598 [1:30:55<38:07,  1.37s/it]  

430438    Ron Goldman    Ronald Goldman


 70%|███████   | 3936/5598 [1:30:59<17:12,  1.61it/s]

607797    Miami Herald    The Miami Herald


 71%|███████   | 3952/5598 [1:31:19<15:43,  1.74it/s]  

252976
4665846    MCB Bank    Muslim Commercial Bank


 71%|███████   | 3962/5598 [1:31:30<23:56,  1.14it/s]

50413    Sierra Nevada    Sierra Nevada (U.S.)


 71%|███████   | 3971/5598 [1:31:39<27:54,  1.03s/it]

7028227    Segro    SEGRO


 71%|███████   | 3973/5598 [1:31:41<26:19,  1.03it/s]

2049638    Landsec    Land Securities


 71%|███████   | 3978/5598 [1:31:45<19:48,  1.36it/s]

6226592    Jordan News Agency    Petra News Agency
530357


 71%|███████   | 3983/5598 [1:31:48<14:07,  1.91it/s]

19623898    Islamic Museum, Jerusalem    Islamic Museum


 72%|███████▏  | 4009/5598 [1:32:17<28:15,  1.07s/it]

3017887    France-Soir    France Soir


 72%|███████▏  | 4018/5598 [1:32:27<31:12,  1.18s/it]

66513


 72%|███████▏  | 4019/5598 [1:32:28<24:30,  1.07it/s]

5069516    HIV/AIDS    AIDS


 72%|███████▏  | 4031/5598 [1:32:43<29:12,  1.12s/it]

369460    TVNZ    Television New Zealand


 72%|███████▏  | 4035/5598 [1:32:45<12:38,  2.06it/s]

2823064    MUFG Bank    The Bank of Tokyo-Mitsubishi UFJ


 72%|███████▏  | 4037/5598 [1:32:47<19:08,  1.36it/s]

3109012    LG Uplus    LG Telecom


 72%|███████▏  | 4042/5598 [1:32:55<33:53,  1.31s/it]

24086495    Itami Airport    Osaka International Airport


 72%|███████▏  | 4047/5598 [1:33:02<30:04,  1.16s/it]

4199619    Industrial production index    Industrial Production Index


 72%|███████▏  | 4048/5598 [1:33:02<28:00,  1.08s/it]

539739    GAL (paramilitary group)    Grupos Antiterroristas de Liberación


 72%|███████▏  | 4055/5598 [1:33:10<20:10,  1.27it/s]

54537


 73%|███████▎  | 4066/5598 [1:33:30<44:33,  1.75s/it]  

8908579    Scott Young (Welsh footballer)    Scott Young (footballer)


 73%|███████▎  | 4068/5598 [1:33:31<31:22,  1.23s/it]

18078979    Tetyana Styazhkina    Tatiana Stiajkina


 73%|███████▎  | 4078/5598 [1:33:44<38:12,  1.51s/it]

20108    Mick Doohan    Michael Doohan


 73%|███████▎  | 4086/5598 [1:33:53<28:11,  1.12s/it]

12185077    Shinichi Ito    Shinichi Itoh


 73%|███████▎  | 4107/5598 [1:34:10<28:01,  1.13s/it]

88261    Tony Adams    Tony Adams (footballer)


 74%|███████▎  | 4128/5598 [1:34:40<31:44,  1.30s/it]

5215321    Greater Milwaukee Open    U.S. Bank Championship in Milwaukee


 74%|███████▍  | 4149/5598 [1:35:00<28:15,  1.17s/it]

5208246    Atlanta Classic    AT&T Classic


 74%|███████▍  | 4151/5598 [1:35:02<25:44,  1.07s/it]

5221418    Tucson Open    Chrysler Classic of Tucson


 74%|███████▍  | 4154/5598 [1:35:07<27:29,  1.14s/it]

700197    Mario Silva (politician)    Mario Silva


 74%|███████▍  | 4161/5598 [1:35:12<22:59,  1.04it/s]

6311518    Jack Pierce (hurdler)    Jack Pierce (athlete)


 75%|███████▍  | 4179/5598 [1:35:26<14:22,  1.65it/s]

25826152    Eric Thomas (hurdler)    Eric Thomas (athlete)


 75%|███████▌  | 4199/5598 [1:35:45<25:22,  1.09s/it]

972840    Mike Conley Sr.    Mike Conley, Sr.


 75%|███████▌  | 4209/5598 [1:35:57<33:50,  1.46s/it]

86444    World Athletics    International Association of Athletics Federations


 75%|███████▌  | 4217/5598 [1:36:05<19:22,  1.19it/s]

22829445


 76%|███████▌  | 4234/5598 [1:36:24<31:50,  1.40s/it]

230102    NYSE Chicago    Chicago Stock Exchange


 76%|███████▌  | 4259/5598 [1:37:03<39:32,  1.77s/it]

622772    Chechens    Chechen people


 76%|███████▋  | 4269/5598 [1:37:20<40:32,  1.83s/it]

53029    Muammar Gaddafi    Muammar al-Gaddafi
81024    Pan Am    Pan American World Airways


 76%|███████▋  | 4273/5598 [1:37:23<28:31,  1.29s/it]

317442    Idris of Libya    Idris I of Libya


 77%|███████▋  | 4287/5598 [1:37:35<12:52,  1.70it/s]

3346024


 77%|███████▋  | 4296/5598 [1:37:44<18:07,  1.20it/s]

1429506    Carroll A. Campbell Jr.    Carroll A. Campbell, Jr.


 77%|███████▋  | 4299/5598 [1:37:46<17:08,  1.26it/s]

19285924    Titanic    RMS Titanic


 77%|███████▋  | 4302/5598 [1:37:52<27:19,  1.26s/it]

105927    Charles de Gaulle Airport    Paris-Charles de Gaulle Airport


 77%|███████▋  | 4305/5598 [1:37:58<42:49,  1.99s/it]

50118    Groningen    Groningen (city)


 77%|███████▋  | 4308/5598 [1:38:03<37:44,  1.76s/it]

4757052    Al Akhbar (Egypt)    Elakhbar


 77%|███████▋  | 4310/5598 [1:38:06<34:45,  1.62s/it]

11311204


 77%|███████▋  | 4312/5598 [1:38:07<21:27,  1.00s/it]

1922153    Raymond James Financial    Raymond James


 77%|███████▋  | 4316/5598 [1:38:13<22:35,  1.06s/it]

3615109    CITIC Limited    CITIC Pacific


 77%|███████▋  | 4323/5598 [1:38:19<15:16,  1.39it/s]

1000530    Republic of Artsakh    Nagorno-Karabakh Republic


 77%|███████▋  | 4328/5598 [1:38:22<08:58,  2.36it/s]

41268    Intelligent Network    Intelligent network


 78%|███████▊  | 4340/5598 [1:38:38<45:39,  2.18s/it]

454123    Roberto Carlos    Roberto Carlos (footballer)


 78%|███████▊  | 4342/5598 [1:38:42<42:26,  2.03s/it]

6384006    Donato (footballer)    Donato Gama da Silva


 78%|███████▊  | 4343/5598 [1:38:42<31:05,  1.49s/it]

12345223    Luis Milla (footballer, born 1966)    Luis Milla


 78%|███████▊  | 4358/5598 [1:39:14<38:24,  1.86s/it]  

3501496    Peter Dubovský (footballer)    Peter Dubovský


 78%|███████▊  | 4394/5598 [1:39:50<25:20,  1.26s/it]

372723    Frank Thomas (designated hitter)    Frank Thomas (baseball, born 1968)


 79%|███████▉  | 4414/5598 [1:40:09<11:18,  1.75it/s]

1463014    Bristol Bears    Bristol Rugby


 79%|███████▉  | 4435/5598 [1:40:32<18:11,  1.07it/s]

3019555    Watsonian FC    Watsonians RFC


 79%|███████▉  | 4441/5598 [1:40:39<17:40,  1.09it/s]

4410583    John Robinson (footballer, born 1971)    John Robinson (footballer)


 80%|███████▉  | 4460/5598 [1:41:03<13:52,  1.37it/s]

768194    Al Unser Jr.    Al Unser, Jr.


 80%|███████▉  | 4461/5598 [1:41:03<13:21,  1.42it/s]

5421344    André Ribeiro (racing driver)    André Ribeiro


 80%|███████▉  | 4476/5598 [1:41:19<17:39,  1.06it/s]

21851679    James Dalton (rugby union)    James Dalton (rugby player)


 80%|████████  | 4492/5598 [1:41:47<35:14,  1.91s/it]  

2840429    Daniel Andersson (footballer, born 1977)    Daniel Andersson (footballer born 1977)


 80%|████████  | 4501/5598 [1:42:02<34:54,  1.91s/it]

1108893    Billie Jean King Cup    Fed Cup


 81%|████████▏ | 4560/5598 [1:43:29<17:59,  1.04s/it]  

961517    Andi Herzog    Andreas Herzog


 82%|████████▏ | 4563/5598 [1:43:32<16:43,  1.03it/s]

835173    Tom Boyd (Scottish footballer)    Tom Boyd (footballer)


 82%|████████▏ | 4568/5598 [1:43:38<23:02,  1.34s/it]

1082043    John Collins (footballer, born 1968)    John Collins (footballer)


 82%|████████▏ | 4570/5598 [1:43:40<18:29,  1.08s/it]

8484278    International Boxing Association (amateur)    International Boxing Association


 82%|████████▏ | 4575/5598 [1:43:50<33:18,  1.95s/it]

6583316


 82%|████████▏ | 4593/5598 [1:44:18<29:35,  1.77s/it]

1360541    Ramón Ramírez (footballer)    Ramón Ramírez


 82%|████████▏ | 4595/5598 [1:44:21<23:30,  1.41s/it]

6987514    Enrique Alfaro Rojas    Enrique Alfaro


 82%|████████▏ | 4604/5598 [1:44:48<26:59,  1.63s/it]  

27565952    Luís Oliveira    Luis Oliveira


 82%|████████▏ | 4612/5598 [1:44:54<11:06,  1.48it/s]

3044729    Nico Van Kerckhoven    Nico van Kerckhoven


 83%|████████▎ | 4628/5598 [1:45:12<19:36,  1.21s/it]

6647379    Nate Miller (boxer)    Nate Miller


 83%|████████▎ | 4629/5598 [1:45:12<16:39,  1.03s/it]

28148156    James Heath (boxer)    James Heath (Boxer)


 83%|████████▎ | 4631/5598 [1:45:13<11:39,  1.38it/s]

11047345    Richard Hannon Sr.    Richard Hannon


 83%|████████▎ | 4633/5598 [1:45:14<07:37,  2.11it/s]

884389


 83%|████████▎ | 4641/5598 [1:45:32<28:34,  1.79s/it]  

10128    Elizabeth I    Elizabeth I of England


 83%|████████▎ | 4642/5598 [1:45:36<36:59,  2.32s/it]

14187    Henry VIII    Henry VIII of England


 83%|████████▎ | 4648/5598 [1:45:47<26:49,  1.69s/it]

2843060    Treaty of Baden (1714)    Treaty of Baden


 83%|████████▎ | 4659/5598 [1:45:56<12:11,  1.28it/s]

69880    Napoleon    Napoleon I


 83%|████████▎ | 4666/5598 [1:46:14<32:18,  2.08s/it]

240868    Transvaal (province)    Transvaal Province


 83%|████████▎ | 4670/5598 [1:46:23<38:18,  2.48s/it]

25310    Qing dynasty    Qing Dynasty


 84%|████████▎ | 4676/5598 [1:46:31<21:01,  1.37s/it]

639729    Ice Cold in Alex    Ice-Cold in Alex


 84%|████████▎ | 4687/5598 [1:46:55<31:52,  2.10s/it]

2701987    Scottish Labour    Scottish Labour Party


 84%|████████▍ | 4696/5598 [1:47:00<08:22,  1.79it/s]

18947965    Afrikaners    Afrikaner


 84%|████████▍ | 4697/5598 [1:47:01<10:13,  1.47it/s]

15608060


 84%|████████▍ | 4715/5598 [1:47:26<20:08,  1.37s/it]

51572    Santiago    Santiago, Chile


 84%|████████▍ | 4723/5598 [1:47:38<31:53,  2.19s/it]

7175    Chinese Communist Party    Communist Party of China


 85%|████████▍ | 4736/5598 [1:47:58<17:15,  1.20s/it]

174467    Yonhap News Agency    Yonhap


 85%|████████▍ | 4754/5598 [1:48:18<21:12,  1.51s/it]

197352    USS Carl Vinson    USS Carl Vinson (CVN-70)


 85%|████████▌ | 4766/5598 [1:48:30<07:47,  1.78it/s]

14732    Irish Republican Army (1919–1922)    Irish Republican Army


 85%|████████▌ | 4784/5598 [1:49:00<26:03,  1.92s/it]

887850    China national football team    China PR national football team


 86%|████████▌ | 4798/5598 [1:49:25<14:32,  1.09s/it]

1018627    South Korea national football team    Korea Republic national football team


 86%|████████▌ | 4806/5598 [1:49:38<14:40,  1.11s/it]

795068    Diego Domínguez (rugby union)    Diego Domínguez


 86%|████████▌ | 4827/5598 [1:49:59<14:13,  1.11s/it]

25450443    FIS Freestyle Ski World Cup    FIS Freestyle Skiing World Cup


 87%|████████▋ | 4845/5598 [1:50:17<09:33,  1.31it/s]

540056    EuroLeague    Euroleague Basketball


 87%|████████▋ | 4848/5598 [1:50:28<32:30,  2.60s/it]

1973041    Limoges CSP    CSP Limoges


 87%|████████▋ | 4852/5598 [1:50:38<31:53,  2.56s/it]

4739242    Alba Berlin    ALBA Berlin


 87%|████████▋ | 4855/5598 [1:50:47<38:40,  3.12s/it]

7116249    Panathinaikos B.C.    Panathinaikos BC


 87%|████████▋ | 4857/5598 [1:50:50<25:55,  2.10s/it]

1969800    ASVEL Basket    ASVEL Lyon-Villeurbanne
3313915


 87%|████████▋ | 4860/5598 [1:50:57<28:55,  2.35s/it]

1396057    Anadolu Efes S.K.    Efes Pilsen S.K.


 87%|████████▋ | 4861/5598 [1:51:00<28:26,  2.32s/it]

3830347    Élan Béarnais    Élan Béarnais Pau-Orthez


 87%|████████▋ | 4863/5598 [1:51:06<33:16,  2.72s/it]

4127601    Real Betis Baloncesto    CB Sevilla


 87%|████████▋ | 4870/5598 [1:51:14<11:47,  1.03it/s]

2417447


 87%|████████▋ | 4891/5598 [1:51:35<11:21,  1.04it/s]

15154926    David Wilson (rugby union, born 1985)    Dave Wilson (rugby union)


 87%|████████▋ | 4893/5598 [1:51:37<10:03,  1.17it/s]

5746768    Dan Crowley (rugby union)    Dan Crowley


 88%|████████▊ | 4905/5598 [1:51:52<25:12,  2.18s/it]

679799    North Macedonia national football team    Macedonia national football team


 88%|████████▊ | 4909/5598 [1:51:58<18:58,  1.65s/it]

2384790    FC Progresul București    AFC Progresul Bucureşti


 88%|████████▊ | 4914/5598 [1:52:03<13:01,  1.14s/it]

5212074    Anton Doboș    Anton Doboş


 88%|████████▊ | 4920/5598 [1:52:14<17:51,  1.58s/it]

5997909    Ovidiu Stîngă    Ovidiu Stângă


 88%|████████▊ | 4935/5598 [1:52:28<06:57,  1.59it/s]

613023    Tasmania cricket team    Tasmanian Tigers
1147963    Victoria cricket team    Victorian Bushrangers


 88%|████████▊ | 4947/5598 [1:52:38<07:37,  1.42it/s]

3099485    Kenny Benjamin    Kenneth Benjamin


 88%|████████▊ | 4949/5598 [1:52:40<08:31,  1.27it/s]

8735759    World Badminton Grand Prix Finals    World Badminton Grand Prix


 89%|████████▊ | 4961/5598 [1:53:00<17:41,  1.67s/it]

919728    List of national stadiums    National stadium


 89%|████████▊ | 4962/5598 [1:53:01<15:44,  1.48s/it]

26530226    Arab Contractors    Arab Contractors (company)


 89%|████████▉ | 4981/5598 [1:53:34<09:13,  1.11it/s]

26722293


 89%|████████▉ | 4983/5598 [1:53:39<15:48,  1.54s/it]

73138    Arizona Coyotes    Phoenix Coyotes


 89%|████████▉ | 4999/5598 [1:54:05<11:30,  1.15s/it]

33673    Washington Football Team    Washington Redskins


 90%|████████▉ | 5017/5598 [1:54:33<25:45,  2.66s/it]

72852    Brooklyn Nets    New Jersey Nets


 90%|████████▉ | 5022/5598 [1:54:42<12:01,  1.25s/it]

72862    New Orleans Pelicans    New Orleans Hornets


 90%|█████████ | 5047/5598 [1:55:23<14:00,  1.52s/it]

27170    Los Angeles Chargers    San Diego Chargers


 90%|█████████ | 5048/5598 [1:55:23<10:45,  1.17s/it]

22312    Las Vegas Raiders    Oakland Raiders


 91%|█████████ | 5094/5598 [1:56:22<12:21,  1.47s/it]

9993504    Paulo Alves (footballer, born 1969)    Paulo Alves


 91%|█████████ | 5097/5598 [1:56:24<06:26,  1.30it/s]

347480    Raúl (footballer)    Raúl González


 91%|█████████ | 5099/5598 [1:56:27<09:58,  1.20s/it]

5529728    Víctor Sánchez (footballer, born 1976)    Víctor Sánchez


 91%|█████████ | 5101/5598 [1:56:30<10:43,  1.29s/it]

5586995    José Amavisca    José Emilio Amavisca


 91%|█████████ | 5102/5598 [1:56:31<09:59,  1.21s/it]

6096292    Roger García    Roger García Junyent


 91%|█████████ | 5104/5598 [1:56:36<15:19,  1.86s/it]

616593    Luis Enrique    Luis Enrique Martínez García


 91%|█████████▏| 5116/5598 [1:56:56<07:52,  1.02it/s]

24119395    RC Celta de Vigo    Celta de Vigo


 91%|█████████▏| 5121/5598 [1:57:04<10:14,  1.29s/it]

2413021


 92%|█████████▏| 5153/5598 [1:57:57<06:22,  1.16it/s]

466604    World Squash Championships    World Open


 92%|█████████▏| 5163/5598 [1:58:11<11:36,  1.60s/it]

1203555    Democratic Alliance of Hungarians in Romania    Democratic Union of Hungarians in Romania


 92%|█████████▏| 5166/5598 [1:58:18<17:32,  2.44s/it]

60817    Timișoara    Timişoara


 92%|█████████▏| 5172/5598 [1:58:27<10:24,  1.47s/it]

23346    Post-communism    Post-Communism


 92%|█████████▏| 5175/5598 [1:58:30<10:03,  1.43s/it]

150133    Heineken N.V.    Heineken International


 93%|█████████▎| 5182/5598 [1:58:37<06:12,  1.12it/s]

4793292    Heineken    Heineken Pilsener


 93%|█████████▎| 5183/5598 [1:58:37<04:39,  1.48it/s]

941690    Competition Commission    Competition Commission (United Kingdom)


 93%|█████████▎| 5189/5598 [1:58:44<05:56,  1.15it/s]

5533386    John Gorst (Hendon North MP)    John Michael Gorst


 93%|█████████▎| 5192/5598 [1:58:46<04:35,  1.48it/s]

10052


 93%|█████████▎| 5194/5598 [1:58:48<04:26,  1.52it/s]

423831    NATS Holdings    National Air Traffic Services


 93%|█████████▎| 5204/5598 [1:59:01<05:42,  1.15it/s]

1100754    Newmont Corporation    Newmont Mining Corporation


 93%|█████████▎| 5211/5598 [1:59:06<05:52,  1.10it/s]

906034    Doug Young (politician)    Doug Young


 93%|█████████▎| 5216/5598 [1:59:13<07:54,  1.24s/it]

20246549    Walikale Territory    Walikale


 93%|█████████▎| 5228/5598 [1:59:22<04:31,  1.36it/s]

44514    Mahikeng    Mafikeng


 93%|█████████▎| 5232/5598 [1:59:32<11:55,  1.96s/it]

7183054    Nornickel    MMC Norilsk Nickel


 94%|█████████▎| 5248/5598 [1:59:48<03:27,  1.69it/s]

224093    Tampico    Tampico, Tamaulipas


 94%|█████████▍| 5266/5598 [2:00:04<06:13,  1.12s/it]

7760633    Bill Jordan, Baron Jordan    William Jordan, Baron Jordan


 94%|█████████▍| 5273/5598 [2:00:13<05:37,  1.04s/it]

193064    The Nikkei    Nihon Keizai Shimbun


 94%|█████████▍| 5277/5598 [2:00:18<07:15,  1.36s/it]

4559718    Catholicity    Catholicism


 95%|█████████▍| 5295/5598 [2:00:44<05:45,  1.14s/it]

79289    German Shepherd    German Shepherd Dog


 95%|█████████▍| 5298/5598 [2:00:45<02:45,  1.81it/s]

2139688    Dobermann    Doberman Pinscher


 95%|█████████▍| 5316/5598 [2:01:09<03:44,  1.26it/s]

319038


 95%|█████████▌| 5335/5598 [2:01:39<06:19,  1.44s/it]

4649203    James Love (NGO director)    James Love


 95%|█████████▌| 5336/5598 [2:01:43<08:51,  2.03s/it]

67397    Ulaanbaatar    Ulan Bator


 96%|█████████▌| 5351/5598 [2:01:58<03:51,  1.07it/s]

5571242    Tony Marlow    Antony Marlow


 96%|█████████▌| 5362/5598 [2:02:16<08:11,  2.08s/it]

3112720    Radio and Television of Bosnia and Herzegovina    Radiotelevision of Bosnia-Herzegovina


 96%|█████████▌| 5370/5598 [2:02:25<03:53,  1.02s/it]

10948329    Wenhui Bao    Wen Hui Bao


 96%|█████████▌| 5372/5598 [2:02:26<02:13,  1.70it/s]

183525


 96%|█████████▌| 5382/5598 [2:02:38<05:17,  1.47s/it]

2395764    Tops Friendly Markets    Tops Markets LLC


 97%|█████████▋| 5406/5598 [2:02:58<02:29,  1.28it/s]

229059    Women's PGA Championship    LPGA Championship


 97%|█████████▋| 5437/5598 [2:03:36<01:49,  1.47it/s]

5800960    Altenberg, Saxony    Altenberg, Germany


 97%|█████████▋| 5447/5598 [2:03:42<01:31,  1.65it/s]

13187649    Dirk Wiese (bobsleigh)    Dirk Wiese


 98%|█████████▊| 5477/5598 [2:04:07<02:45,  1.37s/it]

4364061    ISU Speed Skating World Cup    Speed Skating World Cup


 98%|█████████▊| 5503/5598 [2:04:39<02:39,  1.68s/it]

406831    Matt Le Tissier    Matthew Le Tissier


 99%|█████████▊| 5521/5598 [2:05:02<01:59,  1.55s/it]

872781    Dragons (rugby union)    Newport Gwent Dragons


 99%|█████████▊| 5528/5598 [2:05:11<01:03,  1.09it/s]

3483682    Ian Ferguson (footballer, born 1967)    Ian Ferguson (footballer born 1967)


 99%|█████████▉| 5532/5598 [2:05:13<00:35,  1.86it/s]

1044419    Francois Pienaar    François Pienaar


 99%|█████████▉| 5538/5598 [2:05:23<01:36,  1.60s/it]

2575478    Michael Branch (footballer)    Michael Branch


 99%|█████████▉| 5557/5598 [2:05:37<00:33,  1.22it/s]

646709    Ashford United F.C.    Ashford Town F.C. (Kent)


100%|█████████▉| 5577/5598 [2:05:53<00:22,  1.08s/it]

3565374    Hwang Sun-hong    Hwang Sun-Hong


100%|█████████▉| 5583/5598 [2:06:03<00:28,  1.91s/it]

5834903    Hapoel Be'er Sheva F.C.    Hapoel Be'er Sheva A.F.C.


100%|█████████▉| 5587/5598 [2:06:09<00:16,  1.47s/it]

243389    Charlotte Hornets    Charlotte Bobcats


100%|██████████| 5598/5598 [2:06:20<00:00,  1.35s/it]


In [99]:
# 'China', 'Taiwan', 'Frankfurt'

In [86]:
# Serialize the in-memory id2desc_cache to id2desc.json (opened in 'w' mode,
# so any existing file with that name is overwritten).
with open('id2desc.json', mode='w') as cache_file:
    json.dump(id2desc_cache, fp=cache_file)

In [92]:
# Count the entries in id2desc_cache whose stored value is None.
sum_none = sum(1 for cached_value in id2desc_cache.values() if cached_value is None)

print(sum_none)

83


In [96]:
def process_data_wiki_content(filename, id2desc, debug=False):
    """
    Convert the AIDA-YAGO2 TSV into one dictionary per document, attaching
    Wikipedia page content to every linked mention.

    Input: AIDA-YAGO2-dataset.tsv, columns: token, B_I tag, mention,
    YAGO2 entity, wiki url, wiki id.
    Output: processed dictionary of document, context of labels from the
    first 128 tokens of wikipedia content.

    Args:
        filename: path of the TSV file.
        id2desc: mapping from wiki id to the page content used as context.
            A value of None (or a missing id) means no content is available
            and the mention is ignored.
        debug: when True only the first 100 lines of the file are processed.

    Returns:
        dict mapping doc_id to a dict with the keys 'doc_id', 'text',
        'mentions', 'start_idxs', 'end_idxs', 'wiki_ids', 'wiki_titles' and
        'wiki_contexts'. 'start_idxs' / 'end_idxs' hold one 0/1 flag per
        token; a 1 marks the first / last token of a kept mention.

    Raises:
        ValueError: if an annotated line carries a tag other than 'B' or 'I'.

    Side effects:
        Page ids resolved for the placeholder id '000' are stored in the
        module-level url2id_cache.
    """

    def _lookup_context(wid):
        # Ids read from the TSV are strings, ids resolved through the cache
        # or the API may be ints, so also try the string form.
        if wid is None:
            return None
        if wid in id2desc:
            return id2desc[wid]
        return id2desc.get(str(wid))

    processed = {}
    doc_id = None
    open_entity = False

    with open(filename) as fin:
        lines = fin.readlines()
    if debug:
        lines = lines[:100]

    for line in tqdm(lines):
        if '-DOCSTART-' in line:
            # A mention still open here ended on the last token of the
            # previous document.
            if open_entity:
                processed[doc_id]['end_idxs'][-1] = 1
                open_entity = False

            # new document; [:-2] drops the trailing ")\n"
            doc_id = line.split("(")[-1][:-2]
            processed[doc_id] = {
                'doc_id': doc_id,
                'text': [],
                'mentions': [],
                'start_idxs': [],
                'end_idxs': [],
                'wiki_ids': [],
                'wiki_titles': [],
                'wiki_contexts': []
            }
            continue

        split = line.split('\t')
        token = split[0].strip()

        if len(token) == 0:  # blank separator lines are dropped
            continue

        doc = processed[doc_id]
        doc['text'].append(token)

        if len(split) < 5:
            # Plain token: it terminates any mention that is still open.
            # [-1] is the previous token, since this token's flags are
            # appended only below.
            if open_entity:
                doc['end_idxs'][-1] = 1
                open_entity = False
            doc['start_idxs'].append(0)
            doc['end_idxs'].append(0)
            continue

        B_I = split[1]
        mention = split[2]
        wiki_url = split[4]
        wiki_id = split[5].split('\n')[0]

        if B_I == 'I':
            # Continuation token; the end flag is set once the mention closes.
            doc['start_idxs'].append(0)
            doc['end_idxs'].append(0)
            continue

        if B_I != 'B':
            raise ValueError('Invalid B_I {}'.format(B_I))

        # check for consecutive mentions, such as B I B
        if open_entity:
            doc['end_idxs'][-1] = 1
            open_entity = False

        wiki_title = wiki_url.split('/')[-1].replace('_', ' ')

        # '000' is a placeholder id: resolve the real page id BEFORE looking
        # up the content, otherwise the lookup is done with the placeholder.
        if wiki_id == '000':
            if wiki_url in url2id_cache:
                wiki_id = url2id_cache[wiki_url]
            else:
                wiki_id = _get_pageid_from_api(wiki_title)
                url2id_cache[wiki_url] = wiki_id

        wiki_context = _lookup_context(wiki_id)

        if wiki_context is None:
            # No wikipedia content for this entity: ignore the mention.
            doc['start_idxs'].append(0)
            doc['end_idxs'].append(0)
            continue

        doc['mentions'].append(mention)
        doc['start_idxs'].append(1)
        doc['end_idxs'].append(0)
        doc['wiki_ids'].append(wiki_id)
        doc['wiki_titles'].append(wiki_title)
        doc['wiki_contexts'].append(wiki_context)
        open_entity = True

    # A mention ending on the very last token of the file has no following
    # line to close it.
    if open_entity:
        processed[doc_id]['end_idxs'][-1] = 1

    return processed
            

In [97]:
# Build the per-document dictionaries, using the in-memory id2desc_cache as
# the source of the Wikipedia content attached to each mention.
filename = 'AIDA-YAGO2-dataset.tsv'
processed_aida = process_data_wiki_content(filename, id2desc_cache)

100%|██████████| 323395/323395 [00:05<00:00, 56623.98it/s] 


In [98]:
# sanity check of processed data
# Every document must have one start/end flag per token and exactly one
# start flag, end flag, id, title and context per kept mention.
for key, d in tqdm(processed_aida.items()):
    try:
        assert len(d['text'])==len(d['start_idxs'])
        assert len(d['start_idxs'])==len(d['end_idxs'])
        assert len(d['mentions'])==sum(d['start_idxs'])
        assert len(d['mentions'])==sum(d['end_idxs'])
        assert len(d['wiki_ids'])==len(d['mentions'])
        assert len(d['wiki_titles'])==len(d['wiki_ids'])
        assert len(d['wiki_ids'])==len(d['wiki_contexts'])
    except AssertionError:
        # Report the offending document id and keep scanning. Only failed
        # checks are caught, so real errors (missing key, interrupt) surface.
        print(key)

100%|██████████| 1393/1393 [00:00<00:00, 126063.51it/s]


In [100]:
train = []
testa = []
testb = []

# Route each document by the split marker contained in its id; ids without
# a marker go to the training split.
for doc_key, doc in tqdm(processed_aida.items()):
    if 'testa' in doc_key:
        bucket = testa
    elif 'testb' in doc_key:
        bucket = testb
    else:
        bucket = train
    bucket.append(doc)

print("train: {}".format(len(train)))
print("testa: {}".format(len(testa)))
print("testb: {}".format(len(testb)))

100%|██████████| 1393/1393 [00:00<00:00, 312826.76it/s]

train: 946
testa: 216
testb: 231





In [101]:
def mean(l):
    """Return the arithmetic mean of the numbers in `l`.

    Raises ZeroDivisionError when `l` is empty.
    """
    total = sum(l)
    count = len(l)
    return total / count

# Average number of tokens per document in each split.
avg_token_train, avg_token_testa, avg_token_testb = (
    mean([len(doc['text']) for doc in split_docs])
    for split_docs in (train, testa, testb)
)

print(avg_token_train, avg_token_testa, avg_token_testb)

215.24418604651163 237.78703703703704 201.017316017316


In [102]:
def add_mention_ranges(items):
    """Add token positions of every mention to each document, in place.

    For each document the indices flagged with 1 in 'start_idxs' and
    'end_idxs' are stored under 'start_position' and 'end_position', and
    paired in order as [start, end] under 'span_position'.

    Returns the same `items` list that was passed in.
    Raises AssertionError when a document has a different number of start
    and end flags.
    """
    for doc in tqdm(items):
        starts = [pos for pos, flag in enumerate(doc['start_idxs']) if flag == 1]
        ends = [pos for pos, flag in enumerate(doc['end_idxs']) if flag == 1]
        assert len(starts) == len(ends)
        doc['start_position'] = starts
        doc['end_position'] = ends
        doc['span_position'] = [[first, last] for first, last in zip(starts, ends)]
    return items

In [103]:
# add_mention_ranges mutates the documents in place and returns the same list.
train = add_mention_ranges(train)

100%|██████████| 946/946 [00:00<00:00, 9556.71it/s]


In [104]:
# dev / test are the same list objects as testa / testb (updated in place).
dev = add_mention_ranges(testa)
test = add_mention_ranges(testb)

100%|██████████| 216/216 [00:00<00:00, 15476.34it/s]
100%|██████████| 231/231 [00:00<00:00, 21717.04it/s]


In [105]:
fpath = 'AIDA-YAGO2-wiki_content'

# Write each split as one JSON file inside fpath (train, dev, test order).
for split_file, split_docs in (('train.json', train), ('dev.json', dev), ('test.json', test)):
    fname = os.path.join(fpath, split_file)
    with open(fname, 'w') as fout:
        json.dump(split_docs, fout)

## Data with en_description from Kensho wiki data

In [7]:
# import wiki knowledge (en_description)
all_df = pd.read_csv('wiki/all_df.csv')
all_df

Unnamed: 0.1,Unnamed: 0,page_id,item_id,title,views,en_label,en_description
0,0,12,6199,Anarchism,31335,anarchism,political ideology that holds all government –...
1,1,25,38404,Autism,49693,autism,neurodevelopmental condition
2,2,39,101038,Albedo,14573,albedo,ratio of reflected radiation to incident radia...
3,3,290,9659,A,25859,A,letter of the Latin alphabet
4,4,303,173,Alabama,52765,Alabama,state of the United States of America
...,...,...,...,...,...,...,...
5362169,5362169,62470350,76894635,Daming Zhu,16,Daming Zhu,
5362170,5362170,62470423,76894633,Tony Dews,7,Tony Dews,American football coach
5362171,5362171,62470432,76896959,Samsung PL20,9,Samsung PL20,
5362172,5362172,62470465,6034153,Nils-Fredrik Palmstierna,8,Nils-Fredrik Palmstierna,


In [11]:
# All page ids of the knowledge base as a numpy array.
page_ids = all_df.page_id.values
page_ids

array([      12,       25,       39, ..., 62470432, 62470465, 62473330])

In [18]:
# Keep only the pages that have an English description.
all_df_no_na = all_df.dropna(subset=['en_description'])
all_df_no_na

Unnamed: 0.1,Unnamed: 0,page_id,item_id,title,views,en_label,en_description
0,0,12,6199,Anarchism,31335,anarchism,political ideology that holds all government –...
1,1,25,38404,Autism,49693,autism,neurodevelopmental condition
2,2,39,101038,Albedo,14573,albedo,ratio of reflected radiation to incident radia...
3,3,290,9659,A,25859,A,letter of the Latin alphabet
4,4,303,173,Alabama,52765,Alabama,state of the United States of America
...,...,...,...,...,...,...,...
5362162,5362162,62470001,54862857,James Daniel Collins,6,James Daniel Collins,1917-1985
5362164,5362164,62470047,76852005,Tie Vapauteen (album),12,Tie vapauteen,album by Paleface & Laulava unioni
5362165,5362165,62470119,76891681,Sumner Heights and Hazelwood Valley Railroad,16,Sumner Heights and Hazelwood Valley Railroad,minimum gauge railway near Boston
5362170,5362170,62470423,76894633,Tony Dews,7,Tony Dews,American football coach


In [16]:
all_df.loc[all_df.page_id==25]['en_description'].values[0]

'neurodevelopmental condition'

In [17]:
sum(pd.isna(all_df['en_description']))

1491956

## todo:
### For a single-word mention, should the start idx and the end idx be the same?

#### currently: the same

In [47]:
def process_data_en_description(filename, all_df, debug=False):
    """
    Convert the AIDA-YAGO2 TSV into one dictionary per document, attaching
    the knowledge-base en_description to every linked mention.

    Input: AIDA-YAGO2-dataset.tsv, columns: token, B_I tag, mention,
    YAGO2 entity, wiki url, wiki id.
    Output: processed dictionary of document, context of labels from
    en_description.

    Args:
        filename: path of the TSV file.
        all_df: DataFrame with at least the columns 'page_id' and
            'en_description'. Mentions whose page id does not occur in
            'page_id' are ignored, so pass a frame without N/A descriptions
            to drop entities that have no description.
        debug: when True only the first 100 lines of the file are processed.

    Returns:
        dict mapping doc_id to a dict with the keys 'doc_id', 'text',
        'mentions', 'start_idxs', 'end_idxs', 'wiki_ids', 'wiki_titles' and
        'wiki_contexts'. 'start_idxs' / 'end_idxs' hold one 0/1 flag per
        token; a 1 marks the first / last token of a kept mention.

    Raises:
        ValueError: if an annotated line carries a tag other than 'B' or
            'I', or if a wiki id is not an integer.

    Side effects:
        Page ids resolved for the placeholder id '000' are stored in the
        module-level url2id_cache.
    """
    # page_id -> en_description, built once so each mention costs a dict
    # lookup instead of a scan over the whole frame. setdefault keeps the
    # first row for a duplicated page id.
    desc_by_id = {}
    for page_id, desc in zip(all_df['page_id'].tolist(),
                             all_df['en_description'].tolist()):
        desc_by_id.setdefault(page_id, desc)

    processed = {}
    doc_id = None
    open_entity = False

    with open(filename) as fin:
        lines = fin.readlines()
    if debug:
        lines = lines[:100]

    for line in tqdm(lines):
        if '-DOCSTART-' in line:
            # A mention still open here ended on the last token of the
            # previous document.
            if open_entity:
                processed[doc_id]['end_idxs'][-1] = 1
                open_entity = False

            # new document; [:-2] drops the trailing ")\n"
            doc_id = line.split("(")[-1][:-2]
            processed[doc_id] = {
                'doc_id': doc_id,
                'text': [],
                'mentions': [],
                'start_idxs': [],
                'end_idxs': [],
                'wiki_ids': [],
                'wiki_titles': [],
                'wiki_contexts': []
            }
            continue

        split = line.split('\t')
        token = split[0].strip()

        if len(token) == 0:  # blank separator lines are dropped
            continue

        doc = processed[doc_id]
        doc['text'].append(token)

        if len(split) < 5:
            # Plain token: it terminates any mention that is still open.
            # [-1] is the previous token, since this token's flags are
            # appended only below.
            if open_entity:
                doc['end_idxs'][-1] = 1
                open_entity = False
            doc['start_idxs'].append(0)
            doc['end_idxs'].append(0)
            continue

        B_I = split[1]
        mention = split[2]
        wiki_url = split[4]
        raw_id = split[5].split('\n')[0]

        if B_I == 'I':
            # Continuation token; the end flag is set once the mention closes.
            doc['start_idxs'].append(0)
            doc['end_idxs'].append(0)
            continue

        if B_I != 'B':
            raise ValueError('Invalid B_I {}'.format(B_I))

        # check for consecutive mentions, such as B I B
        if open_entity:
            doc['end_idxs'][-1] = 1
            open_entity = False

        wiki_title = wiki_url.split('/')[-1].replace('_', ' ')

        # '000' is a placeholder id. It has to be recognised on the raw
        # string (int('000') is 0) and resolved BEFORE the knowledge-base
        # membership test.
        if raw_id == '000':
            if wiki_url in url2id_cache:
                resolved = url2id_cache[wiki_url]
            else:
                resolved = _get_pageid_from_api(wiki_title)
                url2id_cache[wiki_url] = resolved
            wiki_id = None if resolved is None else int(resolved)
        else:
            wiki_id = int(raw_id)

        if wiki_id not in desc_by_id:
            # Entity not in the knowledge base (or id unresolved): ignore
            # this mention.
            doc['start_idxs'].append(0)
            doc['end_idxs'].append(0)
            continue

        doc['mentions'].append(mention)
        doc['start_idxs'].append(1)
        doc['end_idxs'].append(0)
        doc['wiki_ids'].append(wiki_id)
        doc['wiki_titles'].append(wiki_title)
        doc['wiki_contexts'].append(desc_by_id[wiki_id])
        open_entity = True

    # A mention ending on the very last token of the file has no following
    # line to close it.
    if open_entity:
        processed[doc_id]['end_idxs'][-1] = 1

    return processed
            

In [48]:
filename = 'AIDA-YAGO2-dataset.tsv'
# Earlier experiments (no function named process_data is defined in the visible cells):
#processed_aida = process_data(filename, debug=True)
#processed_aida = process_data(filename)
# Use the frame without N/A descriptions so entities lacking an
# en_description are dropped; this rebinds processed_aida.
processed_aida = process_data_en_description(filename, all_df_no_na)

100%|██████████| 323395/323395 [04:05<00:00, 1316.89it/s]


In [49]:
# sanity check of processed data
# Every document must have one start/end flag per token and exactly one
# start flag, end flag, id, title and context per kept mention.
for key, d in tqdm(processed_aida.items()):
    try:
        assert len(d['text'])==len(d['start_idxs'])
        assert len(d['start_idxs'])==len(d['end_idxs'])
        assert len(d['mentions'])==sum(d['start_idxs'])
        assert len(d['mentions'])==sum(d['end_idxs'])
        assert len(d['wiki_ids'])==len(d['mentions'])
        assert len(d['wiki_titles'])==len(d['wiki_ids'])
        assert len(d['wiki_ids'])==len(d['wiki_contexts'])
    except AssertionError:
        # Report the offending document id and keep scanning. Only failed
        # checks are caught, so real errors (missing key, interrupt) surface.
        print(key)

100%|██████████| 1393/1393 [00:00<00:00, 56534.45it/s]


In [50]:
train = []
testa = []
testb = []

# Route each document by the split marker contained in its id; ids without
# a marker go to the training split.
for doc_key, doc in tqdm(processed_aida.items()):
    if 'testa' in doc_key:
        bucket = testa
    elif 'testb' in doc_key:
        bucket = testb
    else:
        bucket = train
    bucket.append(doc)

print("train: {}".format(len(train)))
print("testa: {}".format(len(testa)))
print("testb: {}".format(len(testb)))

100%|██████████| 1393/1393 [00:00<00:00, 269123.24it/s]

train: 946
testa: 216
testb: 231





In [51]:
def mean(l):
    """Return the arithmetic mean of the numbers in `l`.

    Raises ZeroDivisionError when `l` is empty.
    """
    total = sum(l)
    count = len(l)
    return total / count

In [52]:
# Average number of tokens per document in each split.
avg_token_train, avg_token_testa, avg_token_testb = (
    mean([len(doc['text']) for doc in split_docs])
    for split_docs in (train, testa, testb)
)

In [53]:
print(avg_token_train, avg_token_testa, avg_token_testb)

215.24418604651163 237.78703703703704 201.017316017316


In [54]:
def add_mention_ranges(items):
    """Add token positions of every mention to each document, in place.

    The indices flagged with 1 in 'start_idxs' / 'end_idxs' become
    'start_position' / 'end_position'; pairing them in order gives the
    [start, end] entries of 'span_position'.

    Returns the same `items` list that was passed in.
    Raises AssertionError when a document has a different number of start
    and end flags.
    """
    def _flagged(flags):
        # Positions whose flag equals 1.
        return [pos for pos, flag in enumerate(flags) if flag == 1]

    for doc in tqdm(items):
        starts = _flagged(doc['start_idxs'])
        ends = _flagged(doc['end_idxs'])
        assert len(starts) == len(ends)
        doc['start_position'] = starts
        doc['end_position'] = ends
        doc['span_position'] = [list(pair) for pair in zip(starts, ends)]
    return items

In [55]:
# add_mention_ranges mutates the documents in place and returns the same list.
train = add_mention_ranges(train)

100%|██████████| 946/946 [00:00<00:00, 17302.28it/s]


In [56]:
# dev is the same list object as testa (updated in place).
dev = add_mention_ranges(testa)

100%|██████████| 216/216 [00:00<00:00, 15258.69it/s]


In [57]:
# test is the same list object as testb (updated in place).
test = add_mention_ranges(testb)

100%|██████████| 231/231 [00:00<00:00, 11137.88it/s]


In [58]:
fpath = 'AIDA-YAGO2-en_desc'

# Write each split as one JSON file inside fpath (train, dev, test order).
for split_file, split_docs in (('train.json', train), ('dev.json', dev), ('test.json', test)):
    fname = os.path.join(fpath, split_file)
    with open(fname, 'w') as fout:
        json.dump(split_docs, fout)

### Check mention counts before and after adding en_description / first 128 tokens

In [107]:
# Reload the three versions of the training split from disk: the original,
# the en_description variant and the wiki-content variant.
with open('AIDA-YAGO2/train.json') as f:
    train_orig = json.loads(f.read())

with open('AIDA-YAGO2-en_desc/train.json') as f:
    train_en = json.loads(f.read())

with open('AIDA-YAGO2-wiki_content/train.json') as f:
    train_wiki = json.loads(f.read())

In [108]:
# Number of mentions per document, for each version of the training split.
train_orig_mentions, train_en_mentions, train_wiki_mentions = (
    [len(doc['mentions']) for doc in variant]
    for variant in (train_orig, train_en, train_wiki)
)

In [109]:
# One row per training document, one column per dataset version.
# NOTE(review): rows are aligned by position, so this assumes the three files
# list the same documents in the same order -- confirm.
comp_df = pd.DataFrame({'orig':train_orig_mentions, 'en_desc':train_en_mentions, 'wiki':train_wiki_mentions})
comp_df

Unnamed: 0,orig,en_desc,wiki
0,30,30,30
1,11,11,11
2,24,24,24
3,7,7,7
4,12,11,11
...,...,...,...
941,27,27,27
942,27,26,27
943,33,33,33
944,47,45,47


In [110]:
comp_df.describe()

Unnamed: 0,orig,en_desc,wiki
count,946.0,946.0,946.0
mean,19.599366,19.079281,19.35518
std,21.674191,21.528843,21.558648
min,0.0,0.0,0.0
25%,7.0,7.0,7.0
50%,13.0,13.0,13.0
75%,26.0,25.0,25.0
max,288.0,286.0,286.0


In [111]:
# Mentions lost per document relative to the original data.
for diff_col, variant_col in (('en_diff', 'en_desc'), ('wiki_diff', 'wiki')):
    comp_df[diff_col] = comp_df['orig'] - comp_df[variant_col]
comp_df

Unnamed: 0,orig,en_desc,wiki,en_diff,wiki_diff
0,30,30,30,0,0
1,11,11,11,0,0
2,24,24,24,0,0
3,7,7,7,0,0
4,12,11,11,1,1
...,...,...,...,...,...
941,27,27,27,0,0
942,27,26,27,1,0
943,33,33,33,0,0
944,47,45,47,2,0


In [112]:
comp_df['en_diff'].value_counts()

0     711
1     120
2      63
3      19
4      16
8       4
5       4
6       3
9       2
7       2
12      1
11      1
Name: en_diff, dtype: int64

In [114]:
comp_df['wiki_diff'].value_counts()

0     838
1      57
2      29
4       7
3       5
9       3
7       2
6       2
5       2
10      1
Name: wiki_diff, dtype: int64

### Manual check

In [115]:
list(train[0].keys())

['doc_id',
 'text',
 'mentions',
 'start_idxs',
 'end_idxs',
 'wiki_ids',
 'wiki_titles',
 'wiki_contexts',
 'start_position',
 'end_position',
 'span_position']

In [116]:
# Rebuild each mention string of the first training document from its token
# span; span ends are inclusive, hence the + 1.
text = train[0]['text']
spans = train[0]['span_position']
mention = [' '.join(text[first:last + 1]) for first, last in spans]

In [117]:
train[0]['mentions']

['German',
 'British',
 'BRUSSELS',
 'European Commission',
 'German',
 'British',
 'Germany',
 'European Union',
 'Britain',
 'Commission',
 'European Union',
 'Franz Fischler',
 'Britain',
 'France',
 'BSE',
 'Spanish',
 'Loyola de Palacio',
 'France',
 'Britain',
 'BSE',
 'British',
 'German',
 'British',
 'Europe',
 'Germany',
 'Bonn',
 'British',
 'Germany',
 'Britain',
 'British']

In [118]:
mention

['German',
 'British',
 'BRUSSELS',
 'European Commission',
 'German',
 'British',
 'Germany',
 'European Union',
 'Britain',
 'Commission',
 'European Union',
 'Franz Fischler',
 'Britain',
 'France',
 'BSE',
 'Spanish',
 'Loyola de Palacio',
 'France',
 'Britain',
 'BSE',
 'British',
 'German',
 'British',
 'Europe',
 'Germany',
 'Bonn',
 'British',
 'Germany',
 'Britain',
 'British']

In [119]:
train[0]['mentions'] == mention

True

In [120]:
train[0]['span_position']

[[2, 2],
 [6, 6],
 [11, 11],
 [14, 15],
 [22, 22],
 [28, 28],
 [43, 43],
 [48, 49],
 [66, 66],
 [94, 94],
 [129, 130],
 [142, 143],
 [179, 179],
 [181, 181],
 [193, 193],
 [237, 237],
 [240, 242],
 [266, 266],
 [268, 268],
 [314, 314],
 [330, 330],
 [350, 350],
 [357, 357],
 [363, 363],
 [382, 382],
 [404, 404],
 [420, 420],
 [438, 438],
 [443, 443],
 [459, 459]]

In [123]:
train[0]['wiki_contexts']

['germany ( german deutschland , german pronunciation [ˈdɔʏtʃlant] ) , officially the federal republic of germany ( german bundesrepublik deutschland , listen ) , is a country at the intersection of central and western europe . it is situated between the baltic and north seas to the north , and the alps to the south . it borders denmark to the north , poland and the czech republic to the east , austria and switzerland to the south , and france , luxembourg , belgium , and the netherlands to the west , and covers an area of 357 , 022 square kilometres ( 137 , 847 sq mi ) . various germanic tribes have inhabited the northern parts of modern germany since classical antiquity . a region',
 'the united kingdom of great britain and northern ireland , commonly known as the united kingdom ( uk or u . k . ) , or britain , is a sovereign country in north-western europe , off the north-\xadwestern coast of the european mainland . the united kingdom includes the island of great britain , the north

# Draft

In [87]:
# store on file
def store_data(processed_data, OUT_FILENAME):
    """Write every element of `processed_data` to OUT_FILENAME as JSON lines.

    The file is opened in "w+" mode, so existing content is replaced; each
    element is serialised on its own line.
    """
    with open(OUT_FILENAME, "w+") as fout:
        for record in processed_data:
            fout.write(json.dumps(record))
            fout.write("\n")

In [89]:
# Write the three splits as JSON-lines files in the working directory.
out_train_aida_filename = "AIDA-YAGO2_train.jsonl"
store_data(train, out_train_aida_filename)
out_testa_aida_filename = "AIDA-YAGO2_testa.jsonl"
store_data(testa, out_testa_aida_filename)
out_testb_aida_filename = "AIDA-YAGO2_testb.jsonl"
store_data(testb, out_testb_aida_filename)

In [85]:
# check = processed_aida['51 BASEBALL']
# check

In [52]:
sum(check['start_idxs'])

57

In [53]:
sum(check['end_idxs'])

56

In [54]:
len(check['text'])

266

In [55]:
len(check['end_idxs'])

266

In [56]:
import numpy as np

In [57]:
# NOTE(review): `check` is only assigned in a commented-out cell of this
# draft section, so this cell depends on leftover kernel state.
start = np.array(check['start_idxs'])
end = np.array(check['end_idxs'])

In [79]:
mention_ranges = []

# Pair every start flag with the first end flag at or after it; the stored
# end is exclusive (index + 1). A start without a later end stays a
# one-element range.
for i in range(len(start)):
    if start[i] != 1:
        continue
    span = [i]
    closing = next((j for j in range(i, len(end)) if end[j] == 1), None)
    if closing is not None:
        span.append(closing + 1)
    mention_ranges.append(span)

In [80]:
mention_ranges

[[2, 4],
 [10, 12],
 [13, 16],
 [36, 40],
 [44, 46],
 [50, 51],
 [55, 56],
 [60, 61],
 [65, 66],
 [71, 73],
 [73, 74],
 [78, 79],
 [84, 85],
 [90, 91],
 [96, 98],
 [103, 105],
 [105, 106],
 [110, 111],
 [115, 116],
 [120, 121],
 [131, 132],
 [133, 134],
 [134, 135],
 [136, 137],
 [137, 138],
 [139, 141],
 [141, 142],
 [143, 144],
 [144, 145],
 [146, 148],
 [148, 149],
 [150, 151],
 [151, 155],
 [159, 160],
 [164, 165],
 [169, 171],
 [176, 177],
 [181, 182],
 [186, 188],
 [188, 189],
 [199, 200],
 [204, 205],
 [210, 211],
 [216, 218],
 [218, 220],
 [224, 226],
 [231, 232],
 [236, 238],
 [251, 252],
 [252, 253],
 [254, 255],
 [255, 256],
 [257, 258],
 [258, 259],
 [260, 262],
 [262, 263],
 [264]]

In [81]:
# Print the tokens of every range; a range holding only a start index (no
# end was found) prints that single token.
for x in mention_ranges:
    if len(x) > 1:
        print(check['text'][x[0]:x[1]])
    else:
        print(check['text'][x[0]])

['MAJOR', 'LEAGUE']
['NEW', 'YORK']
['Major', 'League', 'Baseball']
['AMERICAN', 'LEAGUE', 'EASTERN', 'DIVISION']
['NEW', 'YORK']
['BALTIMORE']
['BOSTON']
['TORONTO']
['DETROIT']
['CENTRAL', 'DIVISION']
['CLEVELAND']
['CHICAGO']
['MINNESOTA']
['MILWAUKEE']
['KANSAS', 'CITY']
['WESTERN', 'DIVISION']
['TEXAS']
['SEATTLE']
['OAKLAND']
['CALIFORNIA']
['OAKLAND']
['BOSTON']
['SEATTLE']
['BALTIMORE']
['CALIFORNIA']
['NEW', 'YORK']
['TORONTO']
['CHICAGO']
['DETROIT']
['KANSAS', 'CITY']
['TEXAS']
['MINNESOTA']
['NATIONAL', 'LEAGUE', 'EASTERN', 'DIVISION']
['ATLANTA']
['MONTREAL']
['NEW', 'YORK']
['FLORIDA']
['PHILADELPHIA']
['CENTRAL', 'DIVISION']
['HOUSTON']
['CHICAGO']
['CINCINNATI']
['PITTSBURGH']
['WESTERN', 'DIVISION']
['SAN', 'DIEGO']
['LOS', 'ANGELES']
['COLORADO']
['SAN', 'FRANCISCO']
['COLORADO']
['CINCINNATI']
['ATLANTA']
['PITTSBURGH']
['HOUSTON']
['PHILADELPHIA']
['LOS', 'ANGELES']
['MONTREAL']
SAN


In [72]:
#ranges = start+end

In [68]:
check['start_idxs'][50]

1

In [70]:
check['text'][40:55]

['W',
 'L',
 'PCT',
 'GB',
 'NEW',
 'YORK',
 '72',
 '53',
 '.576',
 '-',
 'BALTIMORE',
 '67',
 '58',
 '.536',
 '5']

In [59]:
ranges

array([0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2,
       0, 0, 0, 0, 0, 1, 1, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 2, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 2, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
       0, 2, 2, 0, 2, 2, 0, 1, 1, 2, 0, 2, 2, 0, 1, 1, 2, 0, 2, 1, 0, 0,
       1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 2, 2, 0, 2, 2, 0, 1, 1, 2, 0,
       1, 0])

In [40]:
mention_ranges = []
start = False

# Entries equal to 1 alternately open and close a range; all other values
# are skipped.
for i, flag in enumerate(ranges):
    if flag != 1:
        continue
    if not start:
        mention_ranges.append([i])
        start = True
    else:
        mention_ranges[-1].append(i)
        start = False
        
    

In [47]:
# Print the tokens of every range; a range holding only a start index (no
# end was found) prints that single token.
for x in mention_ranges:
    if len(x) > 1:
        print(check['text'][x[0]:x[1]])
    else:
        print(check['text'][x[0]])

['Israel']
['Syria']
['JERUSALEM']
['Israel']
['Syria']
['Itamar', 'Rabinovich']
['Israel']
['Washington']
['Syria']
['Israel', 'Radio']
['Damascus']
['Syrian']
['Syrians']
['Washington']
['Rabinovich']
['Eliahu', 'Ben-Elissar']
['Israeli']
['Egypt']
['Likud']
['Israel']
['Syria']
['Washington']
['Damascus']
['Syria']
['Israel']
['Israeli']
['Damascus']
['Syrian']
['Assad']
['Israel']
['Israeli']
['David', 'Levy']
['Israel', 'Radio']
['Israeli']
['Benjamin', 'Netanyahu']
['Golan', 'Heights']
['captured', 'from']
['in', 'the', '1967']
['war', '.', 'Israeli-Syrian', 'peace', 'talks', 'have', 'been', 'deadlocked', 'over', 'the']
['since', '1991', 'despite', 'the', 'previous', 'government', "'s", 'willingness', 'to', 'make']
['concessions', '.', 'Peace', 'talks', 'between', 'the', 'two', 'sides', 'were', 'last', 'held', 'in', 'February', '.', '"', 'The', 'voices', 'coming', 'out', 'of']
['are', 'bad', ',', 'not', 'good', '.', 'The', 'media', '...', 'are', 'full', 'of', 'expressions', 'and'

In [41]:
mention_ranges

[[0, 1],
 [7, 8],
 [11, 12],
 [13, 14],
 [19, 20],
 [38, 40],
 [43, 44],
 [47, 48],
 [52, 53],
 [55, 57],
 [60, 61],
 [74, 75],
 [82, 83],
 [97, 98],
 [113, 114],
 [127, 129],
 [132, 133],
 [135, 136],
 [138, 139],
 [142, 143],
 [146, 147],
 [151, 152],
 [170, 171],
 [180, 181],
 [182, 183],
 [193, 194],
 [197, 198],
 [223, 224],
 [228, 229],
 [231, 232],
 [245, 246],
 [248, 250],
 [251, 253],
 [261, 262],
 [264, 266],
 [274, 276],
 [277, 279],
 [280, 283],
 [285, 295],
 [296, 306],
 [307, 327],
 [328, 372],
 [373, 385],
 [386, 410],
 [411, 420],
 [421, 422],
 [424, 426],
 [427, 435],
 [436, 445],
 [446, 462],
 [463, 468],
 [469, 473],
 [475, 476],
 [477]]

In [29]:
check['start_idxs'][:21]

[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0]

In [30]:
check['end_idxs'][:21]

[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1]

In [32]:
check['start_idxs'][21:42]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]

In [33]:
check['end_idxs'][21:42]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]

In [27]:
check['mentions']

['Israel',
 'Syria',
 'JERUSALEM',
 'Israel',
 'Syria',
 'Itamar Rabinovich',
 'Israel',
 'Washington',
 'Syria',
 'Israel Radio',
 'Damascus',
 'Syrian',
 'Syrians',
 'Washington',
 'Rabinovich',
 'Eliahu Ben-Elissar',
 'Israeli',
 'Egypt',
 'Likud',
 'Israel',
 'Syria',
 'Washington',
 'Damascus',
 'Syria',
 'Israel',
 'Israeli',
 'Damascus',
 'Syrian',
 'Assad',
 'Israel',
 'Israeli',
 'David Levy',
 'Israel Radio',
 'Israeli',
 'Benjamin Netanyahu',
 'Golan Heights',
 'Israel',
 'Syria',
 'Middle East',
 'Golan',
 'Golan',
 'Damascus',
 'Syria',
 'Israel',
 'God',
 'Israel',
 'Channel Two',
 'Damascus',
 'Israel',
 'Netanyahu',
 'Netanyahu',
 'Syria',
 'United States',
 'Moscow']

In [35]:
# # note: question here means mention in documents that have linked wikipedia page
# def extract_questions(filename):

#     # all the datapoints
#     global_questions = []

#     # left context so far in the document
#     left_context = []

#     # working datapoints for the document
#     document_questions = []

#     # is the entity open
#     open_entity = False

#     # question id in the document
#     question_i = 0

#     with open(filename) as fin:
#         lines = fin.readlines()

#         for line in tqdm(lines):

#             if "-DOCSTART-" in line:
#                 # new document is starting
#                 doc_id = line.split("(")[-1][:-2]

#                 # END DOCUMENT
#                 # check end of entity
#                 if open_entity:
#                     document_questions[-1]["input"].append(END_ENT_TOKEN)
#                     open_entity = False

#                 """
#                 #DEBUG
#                 for q in document_questions:
#                     pp.pprint(q)
#                     input("...")
#                 """

#                 # add sentence_questions to global_questions
#                 global_questions.extend(document_questions)

#                 # reset
#                 left_context = []
#                 document_questions = []
#                 question_i = 0

#             else:
#                 split = line.split("\t")
#                 token = split[0].strip()

#                 if len(split) >= 5:
#                     B_I = split[1]
#                     mention = split[2]
#                     #  YAGO2_entity = split[3]
#                     Wikipedia_URL = split[4]
#                     Wikipedia_ID = split[5].split('\n')[0]
#                     # Freee_base_id = split[6]

#                     if B_I == "I":
#                         pass
#                     elif B_I == "B":

#                         title = Wikipedia_URL.split("/")[-1].replace("_", " ")

#                         if Wikipedia_ID == "000":

#                             if Wikipedia_URL in url2id_cache:
#                                 pageid = url2id_cache[Wikipedia_URL]
#                             else:

#                                 pageid = _get_pageid_from_api(title)
#                                 url2id_cache[Wikipedia_URL] = pageid
#                             Wikipedia_ID = pageid

#                         q = {
#                             "id": "{}:{}".format(doc_id, question_i),
#                             "input": left_context.copy() + [BEGIN_ENT_TOKEN],
#                             "mention": mention,
#                             "Wikipedia_title": title,
#                             "Wikipedia_URL": Wikipedia_URL,
#                             "Wikipedia_ID": Wikipedia_ID,
#                             "left_context": left_context.copy(),
#                             "right_context": [],
#                         }
#                         document_questions.append(q)
#                         open_entity = True
#                         question_i += 1

#                     else:
#                         print("Invalid B_I {}", format(B_I))
#                         sys.exit(-1)
#                 else:
#                     if open_entity:
#                         document_questions[-1]["input"].append(END_ENT_TOKEN)
#                         open_entity = False

#                 left_context.append(token)
#                 for q in document_questions:
#                     q["input"].append(token)

#                 for q in document_questions[:-1]:
#                     q["right_context"].append(token)

#                 if len(document_questions) > 0 and not open_entity:
#                     document_questions[-1]["right_context"].append(token)

#     # FINAL SENTENCE
#     if open_entity:
#         document_questions[-1]["input"].append(END_ENT_TOKEN)
#         open_entity = False

#     # add sentence_questions to global_questions
#     global_questions.extend(document_questions)

#     return global_questions

In [6]:
# def convert_to_BLINK_format(questions):
#     data = []
#     for q in questions:
#         datapoint = {
#             "context_left": " ".join(q["left_context"]).strip(),
#             "mention": q["mention"],
#             "context_right": " ".join(q["right_context"]).strip(),
#             "query_id": q["id"],
#             "label_id": q["Wikipedia_ID"],
#             "Wikipedia_ID": q["Wikipedia_ID"],
#             "Wikipedia_URL": q["Wikipedia_URL"],
#             "Wikipedia_title": q["Wikipedia_title"],
#         }
#         data.append(datapoint)
#     return data

In [7]:
# # store on file
# def store_questions(questions, OUT_FILENAME):
# #     if not os.path.exists(os.path.dirname(OUT_FILENAME)):
# #         try:
# #             os.makedirs(os.path.dirname(OUT_FILENAME))
# #         except OSError as exc:  # Guard against race condition
# #             if exc.errno != errno.EEXIST:
# #                 raise
#     with open(OUT_FILENAME, "w+") as fout:
#         for q in questions:
#             json.dump(q, fout)
#             fout.write("\n")

In [8]:
# print("AIDA-YAGO2")
# in_aida_fname = 'AIDA-YAGO2-dataset.tsv'
# aida_questions = extract_questions(in_aida_fname)

  3%|▎         | 10191/323395 [00:00<00:03, 101852.06it/s]

AIDA-YAGO2


100%|██████████| 323395/323395 [00:02<00:00, 139971.91it/s]


In [9]:
train = []
testa = []
testb = []
# Route each question by the split marker contained in its id; ids without
# a marker go to the training split.
for element in aida_questions:
    question_id = element["id"]
    if "testa" in question_id:
        bucket = testa
    elif "testb" in question_id:
        bucket = testb
    else:
        bucket = train
    bucket.append(element)
print("train: {}".format(len(train)))
print("testa: {}".format(len(testa)))
print("testb: {}".format(len(testb)))

train: 18541
testa: 4791
testb: 4485


In [10]:
# NOTE(review): convert_to_BLINK_format only exists as commented-out code in
# this draft section, so this cell depends on leftover kernel state.
train_blink = convert_to_BLINK_format(train)
testa_blink = convert_to_BLINK_format(testa)
testb_blink = convert_to_BLINK_format(testb)

In [11]:
# NOTE(review): store_questions only exists as commented-out code in this
# draft section, so this cell depends on leftover kernel state.
out_train_aida_filename = "AIDA-YAGO2_train.jsonl"
store_questions(train_blink, out_train_aida_filename)
out_testa_aida_filename = "AIDA-YAGO2_testa.jsonl"
store_questions(testa_blink, out_testa_aida_filename)
out_testb_aida_filename = "AIDA-YAGO2_testb.jsonl"
store_questions(testb_blink, out_testb_aida_filename)