## Load ICIJ dataset


In [1]:
import spacy
from spacy.tokens import DocBin
from spacy import displacy
import srsly

from spacy_lancedb_linker.kb import AnnKnowledgeBase
from spacy_lancedb_linker.linker import AnnLinker  # noqa
from spacy_lancedb_linker.types import Alias, Entity
from src.scraper import SPACY_MODEL

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
doc_bin = DocBin().from_disk(path="data/dataset.spacy")
len(doc_bin)

2

In [3]:
scrape_nlp: spacy.Language = spacy.load(SPACY_MODEL)
docs = list(doc_bin.get_docs(scrape_nlp.vocab))

In [4]:
entities = [Entity(**entity) for entity in srsly.read_jsonl("data/icij-example/entities.jsonl")]

In [5]:
aliases = [Alias(**alias) for alias in srsly.read_jsonl("data/icij-example/aliases.jsonl")] + [
    Alias(alias=entity.name, entities=[entity.entity_id], probabilities=[1]) for entity in entities
]

In [6]:
uri = "data/sample-lancedb"
ann_kb = AnnKnowledgeBase(uri=uri)
ann_kb.add_entities(entities)
ann_kb.add_aliases(aliases)



In [7]:
ann_linker = scrape_nlp.add_pipe("ann_linker", last=True)
ann_linker.set_kb(ann_kb)

In [8]:
scrape_nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'ann_linker']

In [9]:
doc = scrape_nlp(docs[0])

In [10]:
displacy.render(doc, style="ent")

## Entity linking from Senzing data


In [20]:
import pandas as pd

from src.senzing_parser import extract_senzing_results

In [16]:
entities = extract_senzing_results("data/ICIJ-entity-report-2024-06-21_12-04-57-std.json.zip")

read JSON: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1547418/1547418 [00:31<00:00, 49734.10it/s]


In [18]:
len(entities)

1547418

In [21]:
df_ent: pd.DataFrame = pd.DataFrame(
    [
        {
            "uid": entity.entity_uid,
            "name": entity.name,
            "has_ref": entity.has_ref,
        }
        for entity in entities.values()
    ]
)

In [22]:
df_ent.head()

Unnamed: 0,uid,name,has_ref
0,1,Ivan Davydzenka,True
1,2,Yauheni Vaitovich,True
2,3,Matvey Datskevich,True
3,4,Aliaksandr Harbunou,True
4,5,Yeufrasiniya Dankova,True


In [23]:
entities[1]

Entity(entity_uid=1, name='Ivan Davydzenka', num_recs=1, records={'BELARUS OO UBO.#2': 'INITIAL'}, related={517: {'ENTITY_ID': 517, 'MATCH_LEVEL': 2, 'MATCH_LEVEL_CODE': 'POSSIBLY_SAME', 'MATCH_KEY': '+NAME+ADDRESS-DOB', 'ERRULE_CODE': 'CNAME_CFF_DEXCL', 'IS_DISCLOSED': 0, 'IS_AMBIGUOUS': 0}, 664: {'ENTITY_ID': 664, 'MATCH_LEVEL': 3, 'MATCH_LEVEL_CODE': 'POSSIBLY_RELATED', 'MATCH_KEY': '+ADDRESS-DOB', 'ERRULE_CODE': 'SFF', 'IS_DISCLOSED': 0, 'IS_AMBIGUOUS': 0}, 742: {'ENTITY_ID': 742, 'MATCH_LEVEL': 3, 'MATCH_LEVEL_CODE': 'POSSIBLY_RELATED', 'MATCH_KEY': '+ADDRESS-DOB', 'ERRULE_CODE': 'SFF', 'IS_DISCLOSED': 0, 'IS_AMBIGUOUS': 0}, 112570: {'ENTITY_ID': 112570, 'MATCH_LEVEL': 3, 'MATCH_LEVEL_CODE': 'POSSIBLY_RELATED', 'MATCH_KEY': '+ADDRESS-DOB', 'ERRULE_CODE': 'SFF', 'IS_DISCLOSED': 0, 'IS_AMBIGUOUS': 0}, 112760: {'ENTITY_ID': 112760, 'MATCH_LEVEL': 3, 'MATCH_LEVEL_CODE': 'POSSIBLY_RELATED', 'MATCH_KEY': '+ADDRESS-DOB', 'ERRULE_CODE': 'SFF', 'IS_DISCLOSED': 0, 'IS_AMBIGUOUS': 0}}, has_r

In [24]:
df_rec: pd.DataFrame = pd.DataFrame(
    [
        {
            "entity_uid": entity.entity_uid,
            "record_uid": record_uid,
            "match_key": match_key,
        }
        for entity in entities.values()
        for record_uid, match_key in entity.records.items()
    ]
)

In [25]:
df_rec.head()

Unnamed: 0,entity_uid,record_uid,match_key
0,1,BELARUS OO UBO.#2,INITIAL
1,2,BELARUS OO UBO.#3,INITIAL
2,3,BELARUS OO UBO.#4,INITIAL
3,4,BELARUS OO UBO.#5,INITIAL
4,5,BELARUS OO UBO.#6,INITIAL


In [26]:
df_rel: pd.DataFrame = pd.DataFrame(
    [
        {
            "entity_uid": entity.entity_uid,
            "rel_ent": rel_ent["ENTITY_ID"],
            "ambiguous": (rel_ent["IS_AMBIGUOUS"] == 0),
            "disclosed": (rel_ent["IS_DISCLOSED"] == 0),
            "match_level": rel_ent["MATCH_LEVEL"],
            "match_level_code": rel_ent["MATCH_LEVEL_CODE"],
        }
        for entity in entities.values()
        for rel_key, rel_ent in entity.related.items()
    ]
)

In [27]:
df_rel.head()

Unnamed: 0,entity_uid,rel_ent,ambiguous,disclosed,match_level,match_level_code
0,1,517,True,True,2,POSSIBLY_SAME
1,1,664,True,True,3,POSSIBLY_RELATED
2,1,742,True,True,3,POSSIBLY_RELATED
3,1,112570,True,True,3,POSSIBLY_RELATED
4,1,112760,True,True,3,POSSIBLY_RELATED


In [29]:
df_rel.match_level_code.unique()

array(['POSSIBLY_SAME', 'POSSIBLY_RELATED', 'DISCLOSED', 'RESOLVED'],
      dtype=object)