In [12]:
import numpy as np
from tqdm import tqdm
import os
import pickle
import re
from pathlib import Path
from collections import defaultdict
import ujson

Load alias map to filter.

In [13]:
# NOTE: the module is `entity_profile`; the previous spelling `entity_profle`
# cannot be imported on a fresh kernel.
from bootleg.symbols.entity_profile import EntityProfile

# Directory holding the entity database ("entity_db") and the "stats" count files
# read further down in this notebook.
root_dir = Path("train_data_dir")
# Entity profile loaded from its cached dump (no edit_mode argument is passed here).
entity_dump = EntityProfile.load_from_cache(load_dir=root_dir / "entity_db")

In [15]:
# Mapping keyed by alias; the filter below iterates its keys and uses
# len(curr_aliases[alias]) as the number of candidates for that alias.
# NOTE(review): this reaches into the private `_entity_symbols` attribute of the profile.
curr_aliases = entity_dump._entity_symbols.get_alias2qids_dict()

Load the count files for all of Wikipedia. These were computed with `compute_statistics.py` (in utils/preprocessing) over the merged test, dev, and train data file.

In [16]:
# Both count files are read inside `with` blocks so the file handles are closed
# as soon as parsing finishes, and decoded explicitly as UTF-8 (aliases contain
# non-ASCII text) instead of relying on the platform's default encoding.

# number of times alias phrase occurs in the text across ALL of wikipedia
with open(root_dir / 'stats/alias_text_counts.json', encoding='utf-8') as in_f:
    alias_text_counts = ujson.load(in_f)

# number of times alias occurs as an alias across ALL of wikipedia
with open(root_dir / 'stats/alias_counts.json', encoding='utf-8') as in_f:
    alias_counts = ujson.load(in_f)

Simple function to find aliases to remove based on the count files above.

In [17]:
def get_norm_value(alias, verbose=False):
    """
    Ratio of how often ``alias`` is labeled as an alias to how often it appears in text.

    Reads the notebook-level ``alias_counts`` and ``alias_text_counts`` mappings.

    Args:
        alias: alias string to look up
        verbose: if True, print both raw counts (0 when the alias is missing)

    Returns:
        ``alias_counts[alias] / alias_text_counts[alias]``, treating a missing
        alias count as 0, or -1 when the alias has no entry in ``alias_text_counts``.
    """
    times_as_alias = alias_counts.get(alias, 0)
    if verbose:
        print('# times occurs as alias:', times_as_alias)
        print('# times occurs in text:', alias_text_counts.get(alias, 0))
    # Without a text count there is nothing to normalize by; -1 marks that case.
    if alias not in alias_text_counts:
        return -1
    return times_as_alias / alias_text_counts[alias]


def get_aliases_to_remove(curr_aliases, keep_wikidata=False, norm_threshold=0.017, min_seen=500, min_alias_count=10000):
    """
    Select aliases that are common phrases in text but rarely used as aliases
    (e.g. the band "themselves"), so they can be dropped from the alias map.

    Relies on the notebook-level ``alias_counts`` / ``alias_text_counts`` mappings
    and on ``get_norm_value``.

    Args:
        curr_aliases: mapping from alias to its candidates; only the number of
            candidates per alias is used
        keep_wikidata: if True, never remove an alias just because it has no
            entry in ``alias_counts``
        norm_threshold: aliases whose ``get_norm_value`` is below this are
            considered for removal
        min_seen: text-count cutoff separating "too rare to judge" from "common"
        min_alias_count: aliases with an alias count at or above this are kept
            even if their norm value is low

    Returns:
        Tuple ``(aliases_to_remove, cnts, grps)``: the set of aliases to remove,
        a defaultdict of per-bucket tallies, and a defaultdict listing the
        aliases that fell into each bucket. Buckets are "not_in_wikipedia",
        "removed_filter", "grt_min_alias_cnt" and "lt_min_seen".
    """
    aliases_to_remove = set()
    cnts = defaultdict(int)
    grps = defaultdict(list)

    def _tally(bucket, alias, remove=False):
        # Single place for the bookkeeping: bump the tally, remember the alias,
        # and optionally mark it for removal.
        cnts[bucket] += 1
        grps[bucket].append(alias)
        if remove:
            aliases_to_remove.add(alias)

    for alias in tqdm(curr_aliases):
        has_text_count = alias in alias_text_counts

        # Case 1: the alias has no entry in alias_counts.
        if alias not in alias_counts:
            # Seen in text, but fewer than min_seen times: too little evidence to decide.
            too_rare_to_judge = has_text_count and alias_text_counts[alias] < min_seen
            # Otherwise it is kept when it has exactly one candidate, or when the
            # caller asked to keep such aliases. Note this branch also removes
            # multi-candidate aliases that have no text count at all.
            if too_rare_to_judge or len(curr_aliases[alias]) == 1 or keep_wikidata:
                continue
            _tally("not_in_wikipedia", alias, remove=True)
            continue

        # Case 2: counted as an alias but never counted in text - nothing to normalize by.
        if not has_text_count:
            continue

        # Case 3: both counts exist; only aliases with a low norm value are examined.
        if not (get_norm_value(alias) < norm_threshold):
            continue
        if not (alias_text_counts[alias] > min_seen):
            # The phrase is not common enough in text.
            _tally("lt_min_seen", alias)
        elif alias_counts[alias] < min_alias_count:
            # Common phrase, rarely an alias, and not frequent enough as an alias to keep.
            _tally("removed_filter", alias, remove=True)
        else:
            # Low ratio, but used as an alias often enough in absolute terms to keep.
            _tally("grt_min_alias_cnt", alias)

    return aliases_to_remove, cnts, grps

In [9]:
# Run the filter with its default thresholds over the full alias map.
aliases_to_remove, cnts, grps = get_aliases_to_remove(curr_aliases)
# Per-bucket tallies returned by the filter, pretty-printed as JSON.
print(ujson.dumps(cnts, indent=4))
print(f"Will remove {len(aliases_to_remove)} out of {len(curr_aliases)}")

  0%|          | 26217/15290555 [00:00<00:58, 262167.12it/s]

Using stats to filter


100%|██████████| 15290555/15290555 [00:29<00:00, 521540.39it/s]
  0%|          | 57444/15290555 [00:00<00:26, 574436.44it/s]

FILT REM 89363
{
    "removed_filter": 45632,
    "lt_min_seen": 54010,
    "grt_min_alias_cnt": 3,
    "not_in_wikipedia": 43731
}
Using Wikidata to filter


100%|██████████| 15290555/15290555 [00:19<00:00, 784472.70it/s]


WIKI REM 8024947
Will remove 8081077 out of 15290555


Sanity checks on the filter step. 

In [20]:
# sample what aliases are getting removed
num_to_sample = 50
# Sample positions instead of the strings themselves: handing the alias list to
# np.random.choice would first copy every alias into a fixed-width unicode array.
removed_alias_list = list(aliases_to_remove)
# Cap the sample size so the cell still runs when fewer than num_to_sample
# aliases were removed (including none at all).
sample_size = min(num_to_sample, len(removed_alias_list))
# replace=False so the same alias is never printed twice.
for alias_idx in np.random.choice(len(removed_alias_list), size=sample_size, replace=False):
    print(removed_alias_list[alias_idx])

цезиас мец
10 xronia mazi
rockfunk
ustajikistan relations
ak47su
body transistor
gare dolten
henry richardson cricketer born 1846
justice william o douglas
woollcott alexander
mtv swedish tv channel
sir michael atiyah
west los angeles ca
kfar sava
hunters island
french concession of shanghai
roadshow films
2015 yale bulldogs football
black sea region turkey
henry gilroy baseball
beckham putra nugraha
korbr
anne dormer lady hungerford
saint torpes of pisa
iso 639zir
ballets by marius petipa
marine cadets
pacific northwest bell telephone company
avid d weinberger
5 x 5 cube
draftzayn africa
daphne anne caruana galizia
national bishop for torres strait people
catchment water
notre dame fighting irish football 1985
lockheed hudson iva
englishborn
united statesman
gadaræ
trygve martin bratteli
jakobstadt
albanian national liberation front
sirkesh
cuisine of boston
still standing tv series
norske skogindustrier asa
klein charles
mpeg2 layer ii
poaching of white rhinoceroses
the henegar cente

In [110]:
# check for existence of certain words in aliases_to_remove
# Each entry is (alias, expected): expected is True when the alias must be in
# aliases_to_remove and False when it must not be.
sanity_checks = [('themselves', True), 
                 ('dolittle', False),
                 ('us', False),
                 ('s', True),
                 ('is', True),
                 ('also', True),
                 ('in a world', True), 
                 ('of', True),
                 ('the', True),
                 ('by year', True),
                 ('apoptosis', False),
                 ('england', False)]
# Stops at the first mismatch; the assertion message shows the alias, the expected
# membership, and the actual membership.
for s, bool_val in sanity_checks: 
    assert (s in aliases_to_remove) is bool_val, f'{s} {bool_val} {s in aliases_to_remove}'

Remove aliases and save new candidate mapping.

In [None]:
# Load the same entity_db a second time, this time with edit_mode=True, into a
# separate object; the removal cell below calls remove_mention on it.
print("Loading edit mode, may take some minutes")
entity_dump_edit = EntityProfile.load_from_cache(load_dir=root_dir / "entity_db", edit_mode=True)

In [18]:
# Remove every (qid, alias) pair for each alias selected by the filter.
for alias in tqdm(aliases_to_remove):
    # Iterate over a list() copy of the candidates.
    # NOTE(review): presumably because remove_mention changes the candidates of
    # this alias while we loop - confirm against the EntityProfile implementation.
    for qid in list(entity_dump_edit.get_qid_cands(alias)):
        entity_dump_edit.remove_mention(qid, alias)

7206525 VS 15202497


In [19]:
# Save the edited profile to a new "entity_db_filt" directory next to the
# original "entity_db" rather than saving over it.
new_dir = root_dir / 'entity_db_filt'
entity_dump_edit.save(new_dir)

Saved alias mapping at /dfs/scratch0/lorr1/projects/bootleg-data/data/wiki_title_0122/entity_db/entity_mappings/alias2qids_wikidata.json and id to /dfs/scratch0/lorr1/projects/bootleg-data/data/wiki_title_0122/entity_db/entity_mappings/alias2id_wikidata.json
