In [74]:
import requests
import spacy
from bs4 import BeautifulSoup, SoupStrainer
from spacy.tokens import DocBin
from spacy import displacy

In [69]:
from src.scraper import IcijScraper, SPACY_MODEL

## load docBin with default NER entities

In [70]:
doc_bin = DocBin().from_disk(path="data/dataset.spacy") 

In [71]:
len(doc_bin)

2

In [80]:
scrape_nlp: spacy.Language = spacy.load(SPACY_MODEL)

In [81]:
docs = list(doc_bin.get_docs(scrape_nlp.vocab))

In [82]:
displacy.render(docs[0], style="ent")

## apply a different NER model on DocBin

In [72]:
import spacy
from gliner_spacy.pipeline import (  # noqa: F401 because we need to register the factory with spacy
    GlinerSpacy,
)

candidate_labels = [
    "persons",
    "address",
    "shell companies",
    "banks or law firms",
]  # NuZero requires labels to be lower-cased

model_name = "numind/NuZero_token"

nlp = spacy.load("en_core_web_md", disable=["ner"])
# nlp.add_pipe("span_marker", config={"model": "tomaarsen/span-marker-mbert-base-multinerd"})
nlp.add_pipe("gliner_spacy")
#     # config={
#     #     "gliner_model": model_name,
#     #     "chunk_size": 250,
#     #     "labels": candidate_labels,
#     #     "style": "ent",
#     #     "threshold": 0.3,
#     # },


Fetching 4 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 23109.11it/s]
  state_dict = torch.load(model_file, map_location=torch.device(map_location))


<gliner_spacy.pipeline.GlinerSpacy at 0x7faf2f0a0>

In [75]:
doc = nlp(docs[0])
displacy.render(doc, style="ent")

## using a dbpedia_trie as input for the zshot entity linker

In [13]:
import spacy
import zshot
from zshot import PipelineConfig, displacy
from zshot.linker import LinkerRegen
from zshot.linker.linker_regen.utils import load_dbpedia_trie, load_wikipedia_trie
from zshot.mentions_extractor import MentionsExtractorSpacy
from zshot.utils.mappings import spans_to_dbpedia, spans_to_wikipedia

dbpedia_trie = load_dbpedia_trie()

In [None]:
nlp_dbpedia = spacy.load("en_core_web_md")
nlp_config = PipelineConfig(
    mentions_extractor=MentionsExtractorSpacy(), linker=LinkerRegen(trie=dbpedia_trie)
)
nlp_dbpedia.add_pipe("zshot", config=nlp_config, last=True)

In [15]:
doc = nlp_dbpedia(
    "CH2O2 is a chemical compound similar to Acetamide used in International Business "
    "Machines Corporation (IBM)."
)
displacy.render(doc, style="ent")
print(list(zip(doc.ents, spans_to_dbpedia(doc._.spans))))

  return self._call_impl(*args, **kwargs)


[(Acetamide, 'http://dbpedia.org/resource/Acetamide'), (International Business Machines Corporation, 'http://dbpedia.org/resource/IBM'), (IBM, 'http://dbpedia.org/resource/IBM')]


In [23]:
docs[0][:50]

Secret real estate purchases are a driving force behind the offshore economy.
No longer content with Miami condos and London townhouses, investors are pouring money into properties in all corners of the world, fueling inequality and driving up prices, Pandora Papers investigation reveals.

In [27]:
doc = nlp_dbpedia(docs[0][:100].text)

In [28]:
displacy.render(doc, style="ent")

In [None]:
doc = nlp_dbpedia(docs[0])

In [32]:
e = doc.ents[0]

In [38]:
e.label_

'Miami'

## Understanding dbpedia_map

In [40]:
from huggingface_hub import hf_hub_download
from zshot.config import MODELS_CACHE_PATH


In [77]:
REPO_ID = "ibm/regen-disambiguation"
WIKIPEDIA_MAP = "wikipedia_map_id.json"
DBPEDIA_MAP = "dbpedia_map_id.json"

dbpedia_map = hf_hub_download(repo_id=REPO_ID,
                              repo_type='model',
                              filename=DBPEDIA_MAP,
                              cache_dir=MODELS_CACHE_PATH)

In [45]:
import json

In [46]:
    with open(dbpedia_map, "r") as f:
        dbpedia_map = json.load(f)

In [50]:
spans = doc._.spans

In [51]:
links = [dbpedia_map[s.label] for s in spans if s.label in dbpedia_map]


In [54]:
extract = {k: v for k,v in dbpedia_map.items() if k in [s.label for s in spans]}

In [55]:
extract

{'Island': 'http://dbpedia.org/resource/Island',
 'London': 'http://dbpedia.org/resource/London',
 'Spain': 'http://dbpedia.org/resource/Spain',
 'The Beatles': 'http://dbpedia.org/resource/The_Beatles',
 'Miami': 'http://dbpedia.org/resource/Miami',
 'Catholic Church': 'http://dbpedia.org/resource/Catholic_Church',
 'WikiLeaks': 'http://dbpedia.org/resource/WikiLeaks',
 'Julio Iglesias': 'http://dbpedia.org/resource/Julio_Iglesias',
 'Indian Creek Village': 'http://dbpedia.org/resource/Indian_Creek_Village',
 'The Panama Papers': 'http://dbpedia.org/resource/The_Panama_Papers'}

In [52]:
links

['http://dbpedia.org/resource/Miami',
 'http://dbpedia.org/resource/London',
 'http://dbpedia.org/resource/The_Panama_Papers',
 'http://dbpedia.org/resource/WikiLeaks',
 'http://dbpedia.org/resource/Spain',
 'http://dbpedia.org/resource/Julio_Iglesias',
 'http://dbpedia.org/resource/Miami',
 'http://dbpedia.org/resource/Indian_Creek_Village',
 'http://dbpedia.org/resource/Indian_Creek_Village',
 'http://dbpedia.org/resource/Island',
 'http://dbpedia.org/resource/The_Beatles',
 'http://dbpedia.org/resource/Catholic_Church']

In [65]:
ents = list(zip(doc.ents, spans_to_dbpedia(doc._.spans)))

In [66]:
[(ent, ent.label, link) for ent,link in ents]

[(Miami, 3790800195458886901, 'http://dbpedia.org/resource/Miami'),
 (London, 5392354317538386956, 'http://dbpedia.org/resource/London'),
 (Pandora Papers,
  10110184371756829612,
  'http://dbpedia.org/resource/The_Panama_Papers'),
 (Leaked, 2688538597917308737, 'http://dbpedia.org/resource/WikiLeaks'),
 (Spanish, 17549558730983322913, 'http://dbpedia.org/resource/Spain'),
 (Julio Iglesias,
  9906224227672722029,
  'http://dbpedia.org/resource/Julio_Iglesias'),
 (Miami, 3790800195458886901, 'http://dbpedia.org/resource/Miami'),
 (Indian Creek Village,
  13920639356163037993,
  'http://dbpedia.org/resource/Indian_Creek_Village'),
 (Billionaire’s Bunker,
  13920639356163037993,
  'http://dbpedia.org/resource/Indian_Creek_Village'),
 (Island, 13646871543569613410, 'http://dbpedia.org/resource/Island'),
 (Beatle, 16563320307198691821, 'http://dbpedia.org/resource/The_Beatles'),
 (Catholic,
  1395476781778719295,
  'http://dbpedia.org/resource/Catholic_Church')]

In [76]:
dbpedia_map

{'AccessibleComputing': 'http://dbpedia.org/resource/AccessibleComputing',
 'AfghanistanHistory': 'http://dbpedia.org/resource/AfghanistanHistory',
 'AfghanistanGeography': 'http://dbpedia.org/resource/AfghanistanGeography',
 'AfghanistanPeople': 'http://dbpedia.org/resource/AfghanistanPeople',
 'AfghanistanCommunications': 'http://dbpedia.org/resource/AfghanistanCommunications',
 'AfghanistanTransportations': 'http://dbpedia.org/resource/AfghanistanTransportations',
 'AfghanistanMilitary': 'http://dbpedia.org/resource/AfghanistanMilitary',
 'AfghanistanTransnationalIssues': 'http://dbpedia.org/resource/AfghanistanTransnationalIssues',
 'AssistiveTechnology': 'http://dbpedia.org/resource/AssistiveTechnology',
 'AmoeboidTaxa': 'http://dbpedia.org/resource/AmoeboidTaxa',
 'AlbaniaPeople': 'http://dbpedia.org/resource/AlbaniaPeople',
 'AlbaniaHistory': 'http://dbpedia.org/resource/AlbaniaHistory',
 'AsWeMayThink': 'http://dbpedia.org/resource/AsWeMayThink',
 'AlbaniaEconomy': 'http://dbpe

In [59]:
len(dbpedia_trie.trie_dict)

7156

## Understanding dbpedia_trie_file

In [57]:
REPO_ID = "ibm/regen-disambiguation"
WIKIPEDIA_TRIE_FILE_NAME = "wikipedia_trie.pkl"
DBPEDIA_TRIE_FILE_NAME = "dbpedia_trie.pkl"
dbpedia_trie_file = hf_hub_download(repo_id=REPO_ID,
                                    repo_type='model',
                                    filename=DBPEDIA_TRIE_FILE_NAME,
                                    cache_dir=MODELS_CACHE_PATH)

In [58]:
dbpedia_trie_file

'/Users/louis.guitton/.cache/zshot/models--ibm--regen-disambiguation/snapshots/0cca8285d14ad119b842ba389948b36f6ada2786/dbpedia_trie.pkl'

In [61]:
import pickle

with open(dbpedia_trie_file, "rb") as f:
    dbpedia_trie = pickle.load(f)

In [68]:
dbpedia_trie.trie_dict.keys()

dict_keys([5164, 13542, 282, 736, 29316, 891, 71, 1261, 21635, 24714, 3850, 1980, 26583, 28192, 13962, 4702, 10135, 25874, 389, 9652, 16682, 797, 7588, 275, 1983, 2040, 1957, 6776, 3, 14217, 30805, 18059, 1761, 1331, 10089, 5429, 6792, 901, 432, 15001, 7153, 8513, 11375, 17690, 30794, 20114, 11801, 24224, 25835, 1533, 19111, 12503, 18802, 1371, 10582, 6157, 13050, 18240, 20875, 20474, 12316, 12821, 30523, 11551, 11957, 16943, 3826, 27777, 14984, 7608, 9318, 10007, 22077, 7491, 9640, 3054, 19014, 4066, 4821, 21329, 7389, 5085, 19850, 13640, 5331, 3312, 22239, 12611, 19248, 4588, 11154, 486, 26819, 6288, 8590, 9835, 2184, 24064, 13131, 20606, 10236, 12838, 5083, 14305, 23672, 37, 5422, 28508, 1660, 1521, 5964, 1186, 1347, 14411, 10618, 21891, 8787, 332, 5104, 5961, 24547, 961, 3671, 17946, 14630, 2051, 18251, 13661, 26815, 621, 24462, 24272, 22635, 12453, 26519, 31614, 5690, 8559, 9152, 11993, 20355, 15345, 7833, 14407, 3141, 17190, 8761, 26461, 3892, 12923, 5452, 24263, 24632, 1061, 242

In [83]:
dbpedia_trie.trie_dict[11401]

{3608: {21230: {1531: {1: {}}}},
 1: {},
 180: {8834: {1: {}}, 29: {23: {2462: {1: {}}}}},
 3: {20317: {53: {1: {}}},
  13658: {1: {}},
  10917: {1: {}},
  11039: {1: {}},
  12734: {1: {}},
  19814: {1: {}, 7: {1: {}}},
  14489: {1: {}, 1768: {262: {3913: {1: {}}}}},
  15021: {1: {}},
  208: {15021: {1: {}},
   14489: {1220: {2326: {448: {1: {}}}}, 1: {}},
   19765: {1: {}}},
  4212: {8241: {1: {}}},
  19708: {1: {}},
  19765: {1: {}}},
 17202: {21720: {1: {}}},
 209: {115: {1: {}}, 279: {1: {}}, 1: {}},
 204: {1: {}},
 6864: {1: {}, 1220: {2326: {448: {1: {}}}}, 1768: {262: {3913: {1: {}}}}},
 30438: {1: {}},
 7344: {8231: {1: {}}},
 9013: {1: {}, 7: {1: {}}},
 2305: {1: {}},
 3554: {1: {}},
 2931: {1: {}},
 10015: {1: {}, 9013: {1: {}}},
 21720: {1: {}, 7: {1: {}}},
 10958: {1: {}},
 4432: {1: {}, 7: {1: {}}},
 9679: {1: {}},
 28992: {1: {}},
 16726: {1: {}},
 3662: {1: {}},
 3669: {1: {}},
 1070: {1: {}},
 412: {6892: {1: {}}},
 748: {1: {}},
 731: {827: {1: {}, 27834: {1: {}, 7: {1

the Trie is built with this code
```python
self.trie = Trie(
    [
        self.tokenizer(e.name, return_tensors="pt")['input_ids'][0].tolist()
        for e in entities
    ]
)
```

## making microsoft ann_linker work on demo data
I can't install the package, and the package was last updated 4 years ago. I skip.

## Spacy entity linker
ref: https://github.com/explosion/projects/blob/v3/tutorials/nel_emerson/notebooks/notebook_video.ipynb

In [3]:
import spacy
nlp = spacy.load("en_core_web_md")
text = "Tennis champion Emerson was expected to win Wimbledon."
doc = nlp(text)
for ent in doc.ents:
    print(f"Named Entity '{ent.text}' with label '{ent.label_}'")

  from .autonotebook import tqdm as notebook_tqdm


Named Entity 'Emerson' with label 'PERSON'
Named Entity 'Wimbledon' with label 'DATE'


In [4]:
import csv
from pathlib import Path

def load_entities():
    entities_loc = Path.cwd().parent / "data" / "test-spacy" / "entities.csv"  # distributed alongside this notebook

    names = dict()
    descriptions = dict()
    with entities_loc.open("r", encoding="utf8") as csvfile:
        csvreader = csv.reader(csvfile, delimiter=",")
        for row in csvreader:
            qid = row[0]
            name = row[1]
            desc = row[2]
            names[qid] = name
            descriptions[qid] = desc
    return names, descriptions

In [6]:
name_dict, desc_dict = load_entities()
for QID in name_dict.keys():
    print(f"{QID}, name={name_dict[QID]}, desc={desc_dict[QID]}")

Q312545, name=Roy Stanley Emerson, desc=Australian tennis player
Q48226, name=Ralph Waldo Emerson, desc=American philosopher, essayist, and poet
Q215952, name=Emerson Ferreira da Rosa, desc=Brazilian footballer


In [8]:
from spacy.kb import InMemoryLookupKB
kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=300)

In [None]:
# entities.jsonl
# entity_id, optional:entity_name, entity_description, corpus_frequency, optional:label
# {"id":"a6","name":"Statistics","description":"Statistics deals with all aspects of data collection, organization, analysis, interpretation, and presentation.","label":"SKILL"}

# aliases.jsonl
# name, entities, probabilities
# {"alias": "ML", "entities": ["a1", "a2"], "probabilities": [0.5, 0.5]}

In [10]:
for qid, desc in desc_dict.items():
    desc_doc = nlp(desc)
    desc_enc = desc_doc.vector
    kb.add_entity(entity=qid, entity_vector=desc_enc, freq=342)   # 342 is an arbitrary value here

In [11]:
for qid, name in name_dict.items():
    kb.add_alias(alias=name, entities=[qid], probabilities=[1])   # 100% prior probability P(entity|alias)

In [12]:
kb.add_alias(alias="Emerson", entities=name_dict.keys(), probabilities=[0.3, 0.3, 0.3])  # sum([probs]) should be <= 1 !

4831166512461469197

In [13]:
print(f"Entities in the KB: {kb.get_entity_strings()}")
print(f"Aliases in the KB: {kb.get_alias_strings()}")

Entities in the KB: ['Q215952', 'Q312545', 'Q48226']
Aliases in the KB: ['Roy Stanley Emerson', 'Emerson Ferreira da Rosa', 'Ralph Waldo Emerson', 'Emerson']


In [14]:
print(f"Candidates for 'Roy Stanley Emerson': {[c.entity_ for c in kb.get_alias_candidates('Roy Stanley Emerson')]}")
print(f"Candidates for 'Emerson': {[c.entity_ for c in kb.get_alias_candidates('Emerson')]}")
print(f"Candidates for 'Sofie': {[c.entity_ for c in kb.get_alias_candidates('Sofie')]}")

Candidates for 'Roy Stanley Emerson': ['Q312545']
Candidates for 'Emerson': ['Q312545', 'Q48226', 'Q215952']
Candidates for 'Sofie': []


In [15]:
# change the directory and file names to whatever you like
import os
output_dir = Path.cwd().parent / "data" / "spacy_el_output"
if not os.path.exists(output_dir):
    os.mkdir(output_dir) 
kb.to_disk(output_dir / "my_kb")

In [16]:
nlp.to_disk(output_dir / "my_nlp")

In [17]:
from spacy.pipeline.entity_linker import DEFAULT_NEL_MODEL


In [None]:
def create_kb(vocab):
    kb = InMemoryLookupKB(vocab, entity_vector_length=128)
    kb.add_entity(...)
    kb.add_alias(...)
    return kb

In [18]:
from spacy.pipeline.entity_linker import DEFAULT_NEL_MODEL
config = {
   "labels_discard": [],
   "n_sents": 1,
   "incl_prior": True,
   "incl_context": True,
   "model": DEFAULT_NEL_MODEL,
   "entity_vector_length": 300,
   "get_candidates": {'@misc': 'spacy.CandidateGenerator.v1'},
   "threshold": None,
}
entity_linker = nlp.add_pipe("entity_linker", config=config)

In [20]:
from spacy.ml.models import load_kb

entity_linker.set_kb(lambda vocab: load_kb(output_dir / "my_kb"))
# entity_linker.initialize(lambda: examples, nlp=nlp, kb_loader=my_kb)

In [None]:
from spacy.pipeline import EntityLinker
entity_linker = EntityLinker(nlp.vocab, DEFAULT_NEL_MODEL, name="entity_linker", cds)

In [27]:
from spacy.pipeline import EntityLinker
from spacy.kb import Candidate

# candidate = Candidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)

entity_linker = EntityLinker(
    nlp.vocab,
    DEFAULT_NEL_MODEL,
    entity_vector_length=300,
    get_candidates=lambda kb, span: None,
)

TypeError: EntityLinker.__init__() missing 8 required keyword-only arguments: 'labels_discard', 'n_sents', 'incl_prior', 'incl_context', 'get_candidates_batch', 'generate_empty_kb', 'use_gold_ents', and 'candidates_batch_size'

In [26]:


nlp = spacy.load("en_core_web_md")
doc = nlp("Tennis champion Emerson was expected to win Wimbledon.")
entity_linker = nlp.add_pipe("entity_linker")
entity_linker.set_kb(lambda vocab: load_kb(output_dir / "my_kb"))
# This usually happens under the hood
processed = entity_linker(doc)


AttributeError: 'function' object has no attribute 'get_candidates'

In [25]:
processed

This is a sentence.

In [21]:
text = "Tennis champion Emerson was expected to win Wimbledon."
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_, ent.kb_id_)

AttributeError: 'function' object has no attribute 'get_candidates'

## implementing my own entity linker based on microsoft/spacy_ann_linker

In [1]:
from src.ann_linker.dag import entities, aliases, nlp, kb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from src.ann_linker.linker import AnnLinker

In [3]:
entities = entities()
aliases = aliases()
nlp = nlp()

In [4]:
kb = kb(entities, aliases)



In [5]:
kb.get_alias_candidates("ML")

[Alias(alias='ML', entities=['a1', 'a2'], probabilities=[0.5, 0.5])]

In [6]:
candidate_entities = kb.get_entity_candidates("ML")
candidate_entities

['a2', 'a1']

In [7]:
[e for e in entities if e.entity_id in candidate_entities]

[Entity(entity_id='a1', name='Machine learning (ML)', description='Machine learning (ML) is the scientific study of algorithms and statistical models...', label=None),
 Entity(entity_id='a2', name='ML ("Meta Language")', description='ML ("Meta Language") is a general-purpose functional programming language. It has roots in Lisp, and has been characterized as "Lisp with types".', label=None)]

In [8]:
doc_embedding = kb._embed("Linear regression is one of the first statistical models used by students of ML")

In [9]:
kb.disambiguate(candidate_entities, doc_embedding)

[(Entity(entity_id='a1', name='Machine learning (ML)', description='Machine learning (ML) is the scientific study of algorithms and statistical models...', label=None),
  0.4473797082901001),
 (Entity(entity_id='a2', name='ML ("Meta Language")', description='ML ("Meta Language") is a general-purpose functional programming language. It has roots in Lisp, and has been characterized as "Lisp with types".', label=None),
  0.6272382736206055)]

In [10]:
kb.get_alias_candidates("learning")

[Alias(alias='Machine learning', entities=['a1'], probabilities=[1.0])]

In [11]:
ruler = nlp.add_pipe('entity_ruler')
patterns = [
    {"label": "SKILL", "pattern": alias}
    for alias in [a.alias for a in aliases] + ['machine learn']
]
ruler.add_patterns(patterns)

In [12]:
ann_linker = nlp.add_pipe("ann_linker", last=True)
ann_linker.set_kb(kb)

In [13]:
doc = nlp("NLP is a subset of machine learn.")

In [14]:
doc.ents

(NLP, machine learn)

In [15]:
kb.get_candidates_batch(doc.ents)

[[Alias(alias='NLP', entities=['a3', 'a4'], probabilities=[0.5, 0.5]),
  Alias(alias='Natural language processing', entities=['a3'], probabilities=[1.0])],
 [Alias(alias='Machine learning', entities=['a1'], probabilities=[1.0])]]

In [16]:
for ent in doc.ents:
    print(ent.kb_id_)

a3



In [17]:
kb.get_alias_candidates("machine learn")

[Alias(alias='Machine learning', entities=['a1'], probabilities=[1.0])]

In [18]:
kb.get_entity_candidates("machine learn")

['a1']

In [19]:
doc_embedding = kb._embed(doc.text)
kb.disambiguate(kb.get_entity_candidates("machine learn"), doc_embedding)

[(Entity(entity_id='a1', name='Machine learning (ML)', description='Machine learning (ML) is the scientific study of algorithms and statistical models...', label=None),
  0.5660956501960754)]

In [20]:
doc.ents[0]._.alias_candidates

[Alias(alias='NLP', entities=['a3', 'a4'], probabilities=[0.5, 0.5]),
 Alias(alias='Natural language processing', entities=['a3'], probabilities=[1.0])]

In [21]:
doc.ents[0]._.kb_candidates

[(Entity(entity_id='a3', name='Natural language processing (NLP)', description='Natural language processing (NLP) is a subfield of linguistics, computer science, information engineering, and artificial intelligence concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of natural language data.', label=None),
  0.2833211421966553),
 (Entity(entity_id='a4', name='Neuro-linguistic programming (NLP)', description='Neuro-linguistic programming (NLP) is a pseudoscientific approach to communication, personal development, and psychotherapy created by Richard Bandler and John Grinder in California, United States in the 1970s.', label=None),
  0.3160281181335449)]

In [22]:
doc.ents[1]._.alias_candidates

[Alias(alias='Machine learning', entities=['a1'], probabilities=[1.0])]

In [23]:
doc.ents[1]._.kb_candidates

[(Entity(entity_id='a1', name='Machine learning (ML)', description='Machine learning (ML) is the scientific study of algorithms and statistical models...', label=None),
  0.5660956501960754)]

In [32]:
scanner = tbl._dataset.scanner(columns=["alias.alias"])

In [34]:
unique_labels = set()
for batch in scanner.to_batches():
    unique_labels.update(batch.column("alias.alias").to_pylist())

len(unique_labels)

In [11]:
alias_records

[{'alias': 'Ivan Davydzenka', 'entities': 1, 'probabilities': 1},
 {'alias': 'Yauheni Vaitovich', 'entities': 2, 'probabilities': 1},
 {'alias': 'Matvey Datskevich', 'entities': 3, 'probabilities': 1},
 {'alias': 'Aliaksandr Harbunou', 'entities': 4, 'probabilities': 1},
 {'alias': 'Yeufrasiniya Dankova', 'entities': 5, 'probabilities': 1},
 {'alias': 'Mikalai Adzintsou', 'entities': 6, 'probabilities': 1},
 {'alias': 'Maryia Skvarcheuskaya', 'entities': 7, 'probabilities': 1},
 {'alias': 'Katsiaryna Fralova', 'entities': 8, 'probabilities': 1},
 {'alias': 'Alexander Medved', 'entities': 9, 'probabilities': 1},
 {'alias': 'Alexander Medved', 'entities': 9, 'probabilities': 1},
 {'alias': 'Ozgur Yalcinkaya', 'entities': 10, 'probabilities': 1}]

In [14]:
import json

In [29]:
import pandas as pd

In [32]:
import numpy as np

In [29]:
class AliasRawData(TypedDict):
    alias: str
    entity: int


def load_aliases(
    icij_path: str | pathlib.Path = "data/ICIJ-entity-report-2024-06-21_12-04-57-std.json",
) -> list[AliasRawData]:
    alias_records: list[AliasRawData] = []

    with open(icij_path, "r", encoding="utf-8") as fp:
        while line := fp.readline():
            dat = json.loads(line.strip())

            # add aliases from resolved entities
            entity: dict = dat["RESOLVED_ENTITY"]
            if not entity["ENTITY_NAME"]:
                continue
            for record in entity["RECORDS"]:
                alias_records.append(
                    {"alias": entity["ENTITY_NAME"], "entity": record["INTERNAL_ID"]}
                )

            # add aliases from related entities
            related_entities: dict = dat["RELATED_ENTITIES"]
            for record in related_entities:
                # MATCH_LEVEL_CODE is either POSSIBLY_SAME or POSSIBLY_RELATED or RESOLVED or DISCLOSED
                # we choose to add an alias record if POSSIBLY_SAME
                if record["MATCH_LEVEL_CODE"] in ["POSSIBLY_SAME", "RESOLVED", "DISCLOSED"]:
                    alias_records.append(
                        {"alias": entity["ENTITY_NAME"], "entity": record["ENTITY_ID"]}
                    )
                # and discard if POSSIBLY_RELATED
                elif record["MATCH_LEVEL_CODE"] == "POSSIBLY_RELATED":
                    continue

    return alias_records

def generate_aliases(raw_aliases: list[AliasRawData]) -> pd.DataFrame:
    df = (
        pd.DataFrame.from_records(raw_aliases)
        .groupby("alias")
        .agg(counts=("entity", Counter))
        .assign(entities=lambda d: d.counts.apply(list))
        .assign(
            probabilites=lambda d: d.counts.apply(
                lambda x: [count / x.total() for k, count in x.items()]
            )
        )
        .drop(columns="counts")
        .reset_index()
    )
    return df


def write_aliases(
    aliases: pd.DataFrame, filepath: str | pathlib.Path = "data/senzing/aliases.jsonl"
):
    aliases.to_json(filepath, orient="records", lines=True)

In [30]:
raw_aliases = load_aliases()

In [31]:
aliases = generate_aliases(raw_aliases)

In [32]:
aliases.head()

Unnamed: 0,alias,entities,probabilites
0,"""A T L I "" ARUBAS TRAINING AND LANGUAGE INSTITUTE","[227026, 505055, 1038409, 1186466, 1391404]","[0.2857142857142857, 0.14285714285714285, 0.28..."
1,"""A"" COMPANY INTERNATIONAL EXPORT INC","[745555, 498873, 1041481, 1185934, 1321928, 14...","[0.16666666666666666, 0.16666666666666666, 0.1..."
2,"""ACRYLIC PRODUCTS LIMITED""",[1050403],[1.0]
3,"""AEROCOSTA"" EXPRESS","[646300, 504170, 1311910, 1585724]","[0.25, 0.25, 0.25, 0.25]"
4,"""AHI""AIRPORT HOSPITALITY INDUSTRY N V","[223104, 1279980]","[0.5, 0.5]"


In [12]:
aliases.head()

Unnamed: 0,alias,entities,probabilites
0,"""A T L I "" ARUBAS TRAINING AND LANGUAGE INSTITUTE","[227026, 1038409]","[0.5, 0.5]"
1,"""A"" COMPANY INTERNATIONAL EXPORT INC",[745555],[1.0]
2,"""ACRYLIC PRODUCTS LIMITED""",[1050403],[1.0]
3,"""AEROCOSTA"" EXPRESS",[646300],[1.0]
4,"""AHI""AIRPORT HOSPITALITY INDUSTRY N V",[223104],[1.0]


In [33]:
write_aliases(aliases)

## Cherry pick the entities 

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import spacy
from spacy.tokens import DocBin
from spacy import displacy
import srsly

from spacy_lancedb_linker.kb import AnnKnowledgeBase
from spacy_lancedb_linker.linker import AnnLinker  # noqa
from spacy_lancedb_linker.types import Alias, Entity
from src.scraper import SPACY_MODEL

In [3]:
from src.senzing_pipeline import load_aliases, load_countries, load_entities

Pipeline is:
- take docbin of articles
- collect NER and noun chunks
- pre-filter senzing results for that
- build entity summaries and aliases for that pre-filtered set
- load the spacy model for EL and do EL

In [4]:
countries = load_countries()
raw_entities = load_entities()

[32m2024-10-13 12:27:08.346[0m | [1mINFO    [0m | [36msrc.senzing_pipeline[0m:[36mload_countries[0m:[36m24[0m - [1mLoading country codes from data/senzing/country.tsv[0m
[32m2024-10-13 12:27:08.348[0m | [1mINFO    [0m | [36msrc.senzing_pipeline[0m:[36mload_entities[0m:[36m59[0m - [1mParsing Senzing results: data/ICIJ-entity-report-2024-06-21_12-04-57-std.json[0m
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1547418/1547418 [00:34<00:00, 45043.62it/s]


In [7]:
raw_aliases = load_aliases()

[32m2024-10-13 12:29:58.771[0m | [1mINFO    [0m | [36msrc.senzing_pipeline[0m:[36mload_aliases[0m:[36m223[0m - [1mParsing Senzing results: data/ICIJ-entity-report-2024-06-21_12-04-57-std.json[0m
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉| 1547265/1547418 [00:31<00:00, 49192.01it/s]


In [8]:
set(a["type"] for a in raw_aliases)

{'ORG', 'PER'}

In [9]:
from src.senzing_pipeline import generate_patterns

In [10]:
patterns = generate_patterns(raw_aliases)

In [11]:
nlp = spacy.load(SPACY_MODEL, exclude=["ner"])

In [9]:
# disabled = nlp.select_pipes(disable="ner")
# doc = nlp("I won't have named entities")
# disabled.restore()

In [13]:
ruler = nlp.add_pipe("entity_ruler")
with nlp.select_pipes(enable="tagger"):
    ruler.add_patterns(patterns)

In [14]:
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'entity_ruler']

In [15]:
doc_bin = DocBin().from_disk(path="data/dataset.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))

In [25]:
matched = set(ent.text for doc in nlp.pipe(docs) for ent in doc.ents)

- use aliases.jsonl to define initial group
- filter aliases.jsonl for entities in group
- filter entities.jsonl for entities in this group and friend of and friend of friend

In [18]:
displacy.render(docs[1], style="ent")

In [19]:
[p for p in patterns if p['id'] in set(ent.ent_id_ for doc in nlp.pipe(docs) for ent in doc.ents)]

KeyboardInterrupt: 

In [29]:
matched_ids = set(p['id'] for p in patterns if p['pattern'] in matched)

In [33]:
filtered_entities = {k: v for k, v in raw_entities.items() if str(k) in matched_ids}

In [37]:
filtered_aliases = [alias for alias in raw_aliases if str(alias["entity"]) in matched_ids]

In [38]:
filtered_aliases

[{'alias': 'Circle Trust Limited as Trustee of the Intrepid Settlement',
  'entity': 1470056,
  'type': 'ORG'},
 {'alias': 'Heydar Aliyev', 'entity': 918573, 'type': 'PER'},
 {'alias': 'LEYLA ILHAM QIZI ALIYEVA', 'entity': 1722271, 'type': 'PER'},
 {'alias': 'Arzu Aliyeva', 'entity': 281073, 'type': 'PER'},
 {'alias': 'Arzu Aliyeva', 'entity': 918573, 'type': 'PER'},
 {'alias': 'Arzu Aliyeva', 'entity': 1470056, 'type': 'PER'},
 {'alias': 'Arzu Aliyeva', 'entity': 1722271, 'type': 'PER'},
 {'alias': 'MOSSACK FONSECA & CO U K LIMITED',
  'entity': 918573,
  'type': 'ORG'},
 {'alias': 'Mehriban Aliyeva', 'entity': 918573, 'type': 'PER'},
 {'alias': 'Andrej Babis', 'entity': 565180, 'type': 'PER'},
 {'alias': 'Andrej Babis', 'entity': 609907, 'type': 'PER'},
 {'alias': 'Andrej Babis', 'entity': 960146, 'type': 'PER'},
 {'alias': 'Andrej Babis', 'entity': 1644643, 'type': 'PER'},
 {'alias': 'Leyla Aliyeva', 'entity': 918573, 'type': 'PER'},
 {'alias': 'Leyla Aliyeva', 'entity': 1470056, 't

In [22]:
raw_entities[918573]

{<EntityFeature.ADDRESS: 'ADDRESS'>: 'MOSSACK FONSECA & CO. (U.K.) LIMITED INVISION HOUSE WILBURY WAY, HITCHIN HERTFORDSHIRE SG4 OTW UNITED KINGDOM',
 <EntityFeature.COUNTRY_OF_ASSOCIATION: 'COUNTRY_OF_ASSOCIATION'>: 'GBR',
 <EntityFeature.NAME: 'NAME'>: 'UF UNIVERSE FOUNDATION',
 <EntityFeature.RECORD_TYPE: 'RECORD_TYPE'>: 'ORGANIZATION',
 <EntityFeature.REL_ANCHOR: 'REL_ANCHOR'>: 'ICIJ_ID 10162180'}

In [20]:
len(patterns)

4949263