## Load ICIJ dataset


In [1]:
import spacy
from spacy.tokens import DocBin
from spacy import displacy
import srsly

from spacy_lancedb_linker.kb import AnnKnowledgeBase
from spacy_lancedb_linker.linker import AnnLinker  # noqa
from spacy_lancedb_linker.types import Alias, Entity
from src.scraper import SPACY_MODEL

In [2]:
# Read the serialized DocBin from disk, then display how many docs it contains.
doc_bin = DocBin()
doc_bin.from_disk(path="data/dataset.spacy")
len(doc_bin)

2

In [3]:
# Load the pipeline named by SPACY_MODEL and materialize the stored docs
# against that pipeline's vocab.
scrape_nlp: spacy.Language = spacy.load(SPACY_MODEL)
docs = [*doc_bin.get_docs(scrape_nlp.vocab)]

## Load example Wikidata KB (manual input)

In [4]:
# Build one Entity per JSONL record of the example entities file.
entities = [
    Entity(**record)
    for record in srsly.read_jsonl("data/icij-example/entities.jsonl")
]

In [5]:
# Aliases listed in the JSONL file ...
aliases = [
    Alias(**record)
    for record in srsly.read_jsonl("data/icij-example/aliases.jsonl")
]
# ... plus one alias per entity mapping its own name to itself with probability 1.
aliases.extend(
    Alias(alias=ent.name, entities=[ent.entity_id], probabilities=[1])
    for ent in entities
)

In [6]:
# Create the knowledge base at the given LanceDB uri and populate it with the
# entities and aliases loaded in the previous cells.
uri = "data/sample-lancedb"
ann_kb = AnnKnowledgeBase(uri=uri)
ann_kb.add_entities(entities)
ann_kb.add_aliases(aliases)



In [7]:
# Attach the "ann_linker" component at the end of the pipeline and give it the KB.
# add_pipe raises ValueError when a component with that name is already present
# (e.g. when this cell is re-run), so reuse the existing component in that case.
if scrape_nlp.has_pipe("ann_linker"):
    ann_linker = scrape_nlp.get_pipe("ann_linker")
else:
    ann_linker = scrape_nlp.add_pipe("ann_linker", last=True)
ann_linker.set_kb(ann_kb)

In [8]:
# Display the pipeline's component names to confirm "ann_linker" is last.
scrape_nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'ann_linker']

In [9]:
# Run the full pipeline (including the linker) on the first loaded doc.
doc = scrape_nlp(docs[0])

In [10]:
# Visualize the entities of the processed doc with displaCy's "ent" style.
displacy.render(doc, style="ent")

## Load custom KB built from Senzing results

In [6]:
# Build one Entity per JSONL record of the Senzing entities file.
entities = [
    Entity(**record)
    for record in srsly.read_jsonl("data/senzing/entities.jsonl")
]

In [8]:
# Aliases listed in the Senzing JSONL file ...
aliases = [
    Alias(**record)
    for record in srsly.read_jsonl("data/senzing/aliases.jsonl")
]
# ... plus one alias per entity mapping its own name to itself with probability 1.
aliases.extend(
    Alias(alias=ent.name, entities=[ent.entity_id], probabilities=[1])
    for ent in entities
)

In [9]:
# Create the knowledge base at the given LanceDB uri and populate it with the
# Senzing-derived entities and aliases loaded in the previous cells.
# NOTE(review): this is the same uri as the example Wikidata KB cell earlier in
# the notebook — confirm whether both KBs are meant to share one LanceDB
# location or whether this one should use its own path.
uri = "data/sample-lancedb"
ann_kb = AnnKnowledgeBase(uri=uri)
ann_kb.add_entities(entities)
ann_kb.add_aliases(aliases)



KeyboardInterrupt: 

In [7]:
# Make sure the pipeline ends with an "ann_linker" component and point it at the
# KB built in the previous cell. When the notebook is run top to bottom the
# component was already added in the example-KB section, and add_pipe raises
# ValueError for a duplicate component name, so reuse the existing component
# and only swap its knowledge base.
if scrape_nlp.has_pipe("ann_linker"):
    ann_linker = scrape_nlp.get_pipe("ann_linker")
else:
    ann_linker = scrape_nlp.add_pipe("ann_linker", last=True)
ann_linker.set_kb(ann_kb)

In [8]:
# Display the pipeline's component names to confirm "ann_linker" is last.
scrape_nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'ann_linker']

In [9]:
# Run the full pipeline (including the linker) on the first loaded doc.
doc = scrape_nlp(docs[0])

In [10]:
# Visualize the entities of the processed doc with displaCy's "ent" style.
displacy.render(doc, style="ent")