## Load ICIJ dataset


In [1]:
import spacy
from spacy.tokens import DocBin
from spacy import displacy
import srsly

from spacy_lancedb_linker.kb import AnnKnowledgeBase
from spacy_lancedb_linker.linker import AnnLinker  # noqa
from spacy_lancedb_linker.types import Alias, Entity
from src.scraper import SPACY_MODEL

In [2]:
# Load the spaCy pipeline named by SPACY_MODEL (imported from src.scraper).
scrape_nlp: spacy.Language = spacy.load(SPACY_MODEL)

In [3]:
# Read the serialized DocBin from disk; the trailing expression displays
# how many docs it contains.
doc_bin = DocBin().from_disk(path="data/dataset.spacy")
len(doc_bin)

2

In [4]:
# Materialize the stored docs as a list, using the loaded pipeline's vocab.
docs = list(doc_bin.get_docs(scrape_nlp.vocab))

## Load example Wikidata KB (manual input)

In [13]:
# Turn every row of the example entities file into an Entity.
entity_rows = srsly.read_jsonl("data/icij-example/entities.jsonl")
entities = [Entity(**row) for row in entity_rows]

In [14]:
# Start from the aliases listed in the example aliases file ...
aliases = [Alias(**row) for row in srsly.read_jsonl("data/icij-example/aliases.jsonl")]
# ... then append one alias per entity that maps the entity's own name to
# that entity alone, with probability 1.
aliases.extend(
    Alias(alias=ent.name, entities=[ent.entity_id], probabilities=[1]) for ent in entities
)

In [15]:
# Build the AnnKnowledgeBase at `uri` and register the example entities
# and aliases in it (entities first, then aliases).
uri = "data/sample-lancedb"
ann_kb = AnnKnowledgeBase(uri=uri)
ann_kb.add_entities(entities)
ann_kb.add_aliases(aliases)

In [16]:
# Append the "ann_linker" component at the end of the pipeline and give it
# the knowledge base built above.
# NOTE(review): the "ann_linker" factory is presumably registered by the
# `AnnLinker` import at the top of the notebook — confirm.
ann_linker = scrape_nlp.add_pipe("ann_linker", last=True)
ann_linker.set_kb(ann_kb)

In [17]:
# Display the component names to check that ann_linker is in the pipeline.
scrape_nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'ann_linker']

In [18]:
# Run the pipeline (now including ann_linker) on the first loaded doc.
doc = scrape_nlp(docs[0])

In [19]:
# Render the doc with displaCy's entity style.
displacy.render(doc, style="ent")

## Load custom KB built from Senzing results

In [17]:
import pytextrank

/Users/louis.guitton/Library/Caches/pypoetry/virtualenvs/erkg-tutorials-graphgeeks-4I9UJMJy-py3.12/lib/python3.12/site-packages


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:
# Turn every row of the Senzing entities file into an Entity, then show
# how many were loaded.
entity_rows = srsly.read_jsonl("data/senzing/entities.jsonl")
entities = [Entity(**row) for row in entity_rows]
len(entities)

29

In [10]:
# Start from the aliases listed in the Senzing aliases file ...
aliases = [Alias(**row) for row in srsly.read_jsonl("data/senzing/aliases.jsonl")]
# ... then append one alias per entity that maps the entity's own name to
# that entity alone, with probability 1.
aliases.extend(
    Alias(alias=ent.name, entities=[ent.entity_id], probabilities=[1]) for ent in entities
)
len(aliases)

2502

In [11]:
# Build the AnnKnowledgeBase at `uri` and register the Senzing entities and
# aliases in it (entities first, then aliases).
# NOTE(review): this is the same URI the example KB cell uses
# ("data/sample-lancedb"). Confirm whether add_entities/add_aliases replace
# or append to what is already stored there; if they append, this KB would
# also contain the example entries.
uri = "data/sample-lancedb"
ann_kb = AnnKnowledgeBase(uri=uri)
ann_kb.add_entities(entities)
ann_kb.add_aliases(aliases)



In [19]:
# Reload the pipeline from scratch, attach ann_linker backed by the KB built
# in the previous cell, then add the "textrank" component after it.
# NOTE(review): the "textrank" factory is presumably registered by
# `import pytextrank` — confirm that cell has run first.
scrape_nlp: spacy.Language = spacy.load(SPACY_MODEL)
ann_linker = scrape_nlp.add_pipe("ann_linker", last=True)
ann_linker.set_kb(ann_kb)
scrape_nlp.add_pipe("textrank")
# Trailing expression displays the resulting component order.
scrape_nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'ann_linker',
 'textrank']

In [55]:
# Run the rebuilt pipeline (ann_linker + textrank) on the first loaded doc.
doc = scrape_nlp(docs[0])

In [56]:
# Render the doc with displaCy's entity style.
displacy.render(doc, style="ent")

In [57]:
# Collect the first 30 entries of doc._.phrases together with the entities
# found inside each phrase's chunks.
# Each record is (phrase text, rank, count, [{"text": ..., "kb_id": ...}, ...]).
records = []
for phrase in doc._.phrases[:30]:
    # De-duplicate (text, kb_id) pairs while keeping first-seen order.
    # A plain set would also de-duplicate, but its iteration order depends on
    # string hash randomization, so the row order would change between
    # kernel restarts.
    mentions = dict.fromkeys(
        (ent.text, ent.kb_id_) for chunk in phrase.chunks for ent in chunk.ents
    )
    records.append(
        (
            phrase.text,
            phrase.rank,
            phrase.count,
            [{"text": text, "kb_id": kb_id} for text, kb_id in mentions],
        )
    )

In [48]:
import pandas as pd

In [58]:
# One row per (phrase, entity): explode() repeats the phrase columns for
# every entity dict in the "entities" list.
raw_entities = pd.DataFrame.from_records(
    records, columns=["phrase", "rank", "count", "entities"]
).explode("entities")

# Expand the entity dicts into their own columns and put them back on the
# exploded (repeated) index so they line up with the phrase columns.
entity_columns = pd.json_normalize(raw_entities["entities"]).set_index(raw_entities.index)  # type: ignore
phrase_columns = raw_entities.drop(columns="entities")

df = pd.concat([phrase_columns, entity_columns], axis=1)  # type: ignore

In [59]:
# Display the flattened phrase/entity table.
df

Unnamed: 0,phrase,rank,count,text,kb_id
0,Former Czech prime minister Andrej Babis,0.085349,1,Andrej Babis,960146.0
0,Former Czech prime minister Andrej Babis,0.085349,1,Czech,
1,Andrej Babis,0.078636,5,Andrej Babis,960146.0
2,Babis,0.072237,14,Babis,960146.0
3,former Czech prime minister,0.071176,1,Czech,
4,secret offshore deals,0.069871,1,,
5,Czech law,0.061685,1,Czech,
6,ICIJ,0.060946,11,ICIJ,
7,Czech,0.058808,8,Czech,
8,Former Czech leader’s secret French estate,0.057126,1,French,388148.0


In [60]:
# Keep rows that have entity text but whose kb_id is the empty string,
# i.e. mentions that carry no KB id.
entities_to_review = df[df["text"].notna() & df["kb_id"].eq("")]
entities_to_review

Unnamed: 0,phrase,rank,count,text,kb_id
0,Former Czech prime minister Andrej Babis,0.085349,1,Czech,
3,former Czech prime minister,0.071176,1,Czech,
5,Czech law,0.061685,1,Czech,
6,ICIJ,0.060946,11,ICIJ,
7,Czech,0.058808,8,Czech,
8,Former Czech leader’s secret French estate,0.057126,1,Czech,
9,Hungarian Prime Minister Victor Orbán,0.0569,1,Victor Orbán,
9,Hungarian Prime Minister Victor Orbán,0.0569,1,Hungarian,
12,Freedom Party leader Herbert Kickl,0.050929,1,Freedom Party,
12,Freedom Party leader Herbert Kickl,0.050929,1,Herbert Kickl,
