# SpanMarker NER

SpanMarker is a framework for training powerful Named Entity Recognition models using familiar encoders such as BERT, RoBERTa and ELECTRA.

[More information is available in the SpanMarker GitHub repository](https://github.com/tomaarsen/SpanMarkerNER)

In [None]:
import json

# Load the dataset into `data`. The encoding is pinned to UTF-8 so the result
# does not depend on the platform's default locale encoding.
# NOTE(review): later cells index `data` by integer and call `.strip()` on the
# items, so it is presumably a list of strings - confirm against the file.
with open('../../../data/llm_dataset.json', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
from span_marker import SpanMarkerModel

# Run SpanMarker NER over a hand-picked sample of records and store, for each
# record index, the stripped text together with the predicted entities.
patient_dict = {}
patient_nums = [0, 15, 30, 78, 165, 276, 345, 428, 567, 735, 852, 961]

# Load the checkpoint from the Hugging Face Hub once, before the loop: the
# call does not depend on the record being processed, so repeating it for
# every record only adds load time.
#
# Other checkpoints that can be swapped in for comparison:
#   tomaarsen/span-marker-bert-base-fewnerd-fine-super
#   tomaarsen/span-marker-roberta-large-fewnerd-fine-super
#   tomaarsen/span-marker-xlm-roberta-base-fewnerd-fine-super
#   tomaarsen/span-marker-roberta-large-ontonotes5
#   tomaarsen/span-marker-xlm-roberta-large-conll03
#   tomaarsen/span-marker-xlm-roberta-large-conllpp-doc-context
model = SpanMarkerModel.from_pretrained("tomaarsen/span-marker-xlm-roberta-large-conll03-doc-context")

for patient_num in patient_nums:
    text = data[patient_num].strip()

    # Run inference on this record and keep input and output side by side.
    entities = model.predict(text)
    patient_dict[patient_num] = {
        "text": text,
        "entities": entities,
    }


In [None]:
# Inspect one processed record: the entry at position 8 of the sample list.
# Print its text first, then the entities predicted for it.
num = patient_nums[8]
selected = patient_dict[num]
for field in ("text", "entities"):
    print(selected[field])

## With spaCy integration

In [None]:
import spacy
from span_marker import SpanMarkerModel

# Build a spaCy pipeline from en_core_web_md with its own "ner" component
# excluded, then add a "span_marker" component configured with the
# checkpoint named below.
span_marker_config = {"model": "tomaarsen/span-marker-roberta-large-ontonotes5"}
nlp = spacy.load("en_core_web_md", exclude=["ner"])
nlp.add_pipe("span_marker", config=span_marker_config)

# Example patient note used as input for the pipeline.
text = "Mr. McLaughlin presents with acute bronchitis, diagnosed based on symptoms of cough and fever. His medical history includes asthma and allergies. He is a single male, 58 years old, and his address is 649 Schaden Estate Suite 18, Southampton, SO15 9UN. His NHS number is 568 968 0803. The care plan includes respiratory therapy to help manage symptoms."

doc = nlp(text)

# Pair every entity span on the Doc with its label and print the pairs.
found = []
for span in doc.ents:
    found.append((span, span.label_))
print(found)

In [None]:
# Bare expression as the last statement of a notebook cell: shows the first
# record of `data` as the cell's output so its raw content can be inspected.
data[0]

In [None]:
import spacy
# spacy.cli.download('en_ner_bc5cdr_md')
# nlp = spacy.load("en_core_sci_scibert")

# Build a pipeline from en_core_sci_scibert with its own "ner" component
# excluded, then add a "span_marker" component configured with the
# checkpoint named below.
nlp = spacy.load("en_core_sci_scibert", exclude=["ner"])
nlp.add_pipe(
    "span_marker",
    config={"model": "tomaarsen/span-marker-roberta-large-ontonotes5"},
)

# Run the first dataset record through the pipeline to obtain a spaCy Doc.
doc = nlp(data[0])

# Pair every entity span on the Doc with its label and print the pairs.
labelled_entities = [(span, span.label_) for span in doc.ents]
print(labelled_entities)