In [1]:
import os
import sys
import time
import gzip
import marisa_trie

In [2]:
def build_marisa_trie_from_aliases(alias_file):
    # Read the aliases file
    with gzip.open(alias_file, 'rt', encoding='latin-1') as file:
        lines = file.readlines()[1:4813490]
    
    alias_to_id = {}
    for line in lines:
        aliases = line.strip().split('\t')
        for alias in aliases[1:]:
            alias_to_id[alias] = aliases[0]

    # Build the marisa-trie
    start_time = time.time()
    trie = marisa_trie.Trie(alias_to_id.keys())
    end_time = time.time()
    build_time = end_time - start_time

    # Get memory usage (rough estimate)
    memory_usage = sys.getsizeof(trie)

    return trie, alias_to_id, build_time, memory_usage

def get_id_from_marisa_trie(trie, alias_to_id, alias):
    start_time = time.time()
    if alias in trie:
        entity_id = alias_to_id[alias]
    else:
        entity_id = None
    end_time = time.time()
    lookup_time = end_time - start_time

    return entity_id, lookup_time

In [3]:
alias_filepath = os.path.join(os.getcwd(), '../data/wikidata5m/wikidata5m_alias.tar.gz')
alias = 'Barack Obama'

In [5]:
with gzip.open(alias_filepath, 'rt', encoding='latin-1') as file:
    lines = file.readlines()[1:4813490]

In [6]:
lines[0].split('\t')

['Q912600',
 'Straumur-BurÃ°arÃ¡s',
 'Straumur',
 'straumurâ\x80\x93burÃ°arÃ¡s investment bank',
 'straumur',
 'Straumur-BurÃ°arÃ¡s Investment Bank',
 'straumur-burÃ°arÃ¡s investment bank',
 'straumur investment bank',
 'straumur-burÃ°arÃ¡s fjÃ¡rf.banki',
 'Straumur-BurÃ°arÃ¡s FjÃ¡rf.banki',
 'straumur-burÃ°arÃ¡s',
 'Straumur Investment Bank',
 'Straumurâ\x80\x93BurÃ°arÃ¡s Investment Bank\n']

In [7]:
marisa_trie, alias_to_id_dict, marisa_build_time, marisa_memory = build_marisa_trie_from_aliases(alias_filepath)
entity_id, marisa_lookup_time = get_id_from_marisa_trie(marisa_trie, alias_to_id_dict, alias)

print('Marisa-trie build time: ', marisa_build_time)
print('Marisa-trie memory usage: ', marisa_memory, ' bytes')
print('Marisa-trie lookup time: ', marisa_lookup_time)
print('Marisa-trie entity id: ', entity_id)

Marisa-trie build time:  51.67834234237671
Marisa-trie memory usage:  32  bytes
Marisa-trie lookup time:  1.2159347534179688e-05
Marisa-trie entity id:  Q76
