In [2]:
import os
import sys
import time
import gzip
import json
import marisa_trie

In [6]:
# inspect entity mentions for the corpus to find a good cutoff
with gzip.open('../data/wikidata5m/wikidata5m_entity_mentions.json.gz', 'rt') as f:
    mentions = json.load(f)
    sorted_mentions = sorted(mentions.items(), key=lambda x: x[1], reverse=True)
    print(sorted_mentions[:100])

[('Q1142885', 4618643), ('Q10802388', 3971598), ('Q668', 3888098), ('Q577', 2441043), ('Q678', 2256476), ('Q189', 1218531), ('Q16641', 1117499), ('Q9288', 971308), ('Q184', 925012), ('Q1904', 848437), ('Q1196074', 794585), ('Q40', 728232), ('Q8765', 515346), ('Q38', 422724), ('Q7949021', 315755), ('Q6091500', 282013), ('Q824', 275737), ('Q7783619', 266394), ('Q444353', 257491), ('Q31', 237433), ('Q199', 231632), ('Q1153959', 230557), ('Q1954619', 227307), ('Q200', 207309), ('Q5727902', 181795), ('Q1270787', 176861), ('Q1059536', 164594), ('Q9081', 148385), ('Q1881711', 145854), ('Q18035109', 142465), ('Q1046315', 139058), ('Q16158056', 137625), ('Q10853543', 137058), ('Q13219273', 134932), ('Q3918', 120050), ('Q30', 118727), ('Q4', 115345), ('Q1185341', 109646), ('Q7560553', 101912), ('Q201', 100929), ('Q16', 99917), ('Q163838', 96846), ('Q7993606', 96802), ('Q183', 95400), ('Q2037045', 95303), ('Q510472', 90053), ('Q7704489', 89641), ('Q203872', 88772), ('Q82799', 85800), ('Q43483', 8

In [None]:
# count number of documents in the corpus
corpus_path = '../data/wikidata5m/wikidata5m_text.txt.gz'

num_docs = 0
num_tokens = 0
num_long_docs = 0
min_doc_len = 128

with gzip.open(corpus_path, 'rt') as f:
    for line in f:
        tokens_per_doc = len(line.strip().split('\t')[1].split())
        num_docs += 1
        num_tokens += tokens_per_doc
        if tokens_per_doc > min_doc_len:
            num_long_docs += 1
            
print('Number of documents: {:,}'.format(num_docs))
print('Number of tokens: {:,}'.format(num_tokens))
print('Number of documents above {:,} tokens: {:,}'.format(min_doc_len, num_long_docs))

In [9]:
with gzip.open('../data/wikidata5m/wikidata5m_alias.tar.gz', 'rt', encoding='latin-1') as file:
    lines = file.readlines()[1:4813490]
    for line in lines:
        parts = line.strip().split('\t')
        entity_id, aliases = parts[0], parts[1:]
        if len(aliases) == 0:
            print(entity_id)

In [2]:
def build_marisa_trie_from_aliases(alias_file):
    # Read the aliases file
    with gzip.open(alias_file, 'rt', encoding='latin-1') as file:
        lines = file.readlines()[1:4813490]
    
    alias_to_id = {}
    for line in lines:
        aliases = line.strip().split('\t')
        for alias in aliases[1:]:
            alias_to_id[alias] = aliases[0]

    # Build the marisa-trie
    start_time = time.time()
    trie = marisa_trie.Trie(alias_to_id.keys())
    end_time = time.time()
    build_time = end_time - start_time

    # Get memory usage (rough estimate)
    memory_usage = sys.getsizeof(trie)

    return trie, alias_to_id, build_time, memory_usage

def get_id_from_marisa_trie(trie, alias_to_id, alias):
    start_time = time.time()
    if alias in trie:
        entity_id = alias_to_id[alias]
    else:
        entity_id = None
    end_time = time.time()
    lookup_time = end_time - start_time

    return entity_id, lookup_time

In [3]:
alias_filepath = os.path.join(os.getcwd(), '../data/wikidata5m/wikidata5m_alias.tar.gz')
alias = 'Barack Obama'

In [5]:
with gzip.open(alias_filepath, 'rt', encoding='latin-1') as file:
    lines = file.readlines()[1:4813490]

In [6]:
lines[0].split('\t')

['Q912600',
 'Straumur-BurÃ°arÃ¡s',
 'Straumur',
 'straumurâ\x80\x93burÃ°arÃ¡s investment bank',
 'straumur',
 'Straumur-BurÃ°arÃ¡s Investment Bank',
 'straumur-burÃ°arÃ¡s investment bank',
 'straumur investment bank',
 'straumur-burÃ°arÃ¡s fjÃ¡rf.banki',
 'Straumur-BurÃ°arÃ¡s FjÃ¡rf.banki',
 'straumur-burÃ°arÃ¡s',
 'Straumur Investment Bank',
 'Straumurâ\x80\x93BurÃ°arÃ¡s Investment Bank\n']

In [7]:
marisa_trie, alias_to_id_dict, marisa_build_time, marisa_memory = build_marisa_trie_from_aliases(alias_filepath)
entity_id, marisa_lookup_time = get_id_from_marisa_trie(marisa_trie, alias_to_id_dict, alias)

print('Marisa-trie build time: ', marisa_build_time)
print('Marisa-trie memory usage: ', marisa_memory, ' bytes')
print('Marisa-trie lookup time: ', marisa_lookup_time)
print('Marisa-trie entity id: ', entity_id)

Marisa-trie build time:  51.67834234237671
Marisa-trie memory usage:  32  bytes
Marisa-trie lookup time:  1.2159347534179688e-05
Marisa-trie entity id:  Q76
