In [23]:
import spacy
from spacy_ann import AnnLinker

# Directory holding the spaCy model; it must be the output_dir that was passed
# to the create_index command.
model_dir = "models/ann_linker/"
nlp = spacy.load(model_dir)

# en_core_web_md's NER component does not tag the knowledge-base aliases as
# entities, so for now an EntityRuler is placed ahead of the linker to extract them.
ruler = nlp.add_pipe("entity_ruler", before="ann_linker")

# One SKILL pattern per alias string exposed by the linker's knowledge base ...
kb_aliases = nlp.get_pipe("ann_linker").kb.get_alias_strings()
patterns = [{"label": "SKILL", "pattern": alias} for alias in kb_aliases]
# ... plus one hand-written pattern for the literal text 'AI2'.
patterns.append({"label": "SKILL", "pattern": "AI2"})
ruler.add_patterns(patterns)

In [24]:
# Run the pipeline on a sample sentence and keep the result in `doc`; later
# cells reuse it. The cell output lists (text, label, KB id) for each entity.
doc = nlp("NLP is a highly researched subset of AI2 learn.")
[(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents]

[('NLP', 'ORG', 'a3'), ('AI2', 'SKILL', '')]

In [26]:
import srsly
import numpy as np
# Read every record of the entities JSONL file into memory.
entities = [record for record in srsly.read_jsonl('data/entities.jsonl')]

# Build Docs from the 'description' field of the records at index 2 and 3.
# nlp.make_doc is used here rather than calling nlp(...) directly.
natl_description = entities[2]['description']
natl_doc = nlp.make_doc(natl_description)
neur_description = entities[3]['description']
neur_doc = nlp.make_doc(neur_description)

In [27]:
# Stack the two description vectors into one array, one row per description.
entity_encodings = np.asarray((natl_doc.vector, neur_doc.vector))

# Norm of each row (axis=1), i.e. one value per description vector.
entity_norm = np.linalg.norm(entity_encodings, axis=1)

# Bare last expression so the notebook displays the norms.
entity_norm

array([3.2457936, 2.6232092], dtype=float32)

In [28]:
# Dot product of every entity encoding with the vector of `doc` ...
dot_products = np.dot(entity_encodings, doc.vector.T)
# ... divided by the product of the two norms. This is cosine similarity,
# assuming doc.vector_norm is the L2 norm of doc.vector (TODO confirm).
norm_products = doc.vector_norm * entity_norm
sims = dot_products / norm_products

# Index of the entity encoding with the highest similarity to `doc`.
sims.argmax()

0

In [29]:
# Rebind `patterns` to one SKILL pattern per alias string in the ann_linker's
# knowledge base. This cell only builds the list; it does not add it to a ruler.
ann_linker_kb = nlp.get_pipe('ann_linker').kb
patterns = [
    dict(label="SKILL", pattern=alias)
    for alias in ann_linker_kb.get_alias_strings()
]

In [30]:
# Print (text, label, KB id) for every entity on the existing `doc`; the doc is
# not re-processed in this cell.
print([(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])

[('NLP', 'ORG', 'a3'), ('AI2', 'SKILL', '')]


In [31]:
# Run the pipeline on a second sample text; the result is not assigned, so the
# notebook just displays the returned Doc.
# NOTE(review): "nlpe" looks like a deliberate near-miss of an alias, perhaps to
# exercise approximate matching in the linker — confirm the intent.
nlp("More text about nlpe")

More text about nlpe

In [32]:
# Take the first entity of `doc` for closer inspection.
# Raises IndexError if `doc` has no entities.
ent = list(doc.ents)[0]