# Named Entity Recognition for Process Extraction with GoLLIE

## Import requirements

In [1]:
import sys
sys.path.append("../")

In [2]:
import rich
import logging
from src.model.load_model import load_model
import black
import inspect
from jinja2 import Template
import tempfile
from src.tasks.utils_typing import AnnotationList
logging.basicConfig(level=logging.INFO)
from typing import Dict, List, Type

### Load Model from HuggingFace

In [4]:
# Loader settings for running GoLLIE-34B in inference mode with 4-bit
# quantization, an automatically chosen device map and flash attention.
load_kwargs = dict(
    inference=True,
    model_weights_name_or_path="HiTZ/GoLLIE-34B",
    quantization=4,
    cache_dir="/work3/s213709",
    use_lora=False,
    force_auto_device_map=True,
    use_flash_attention=True,
    torch_dtype="bfloat16",
)
model, tokenizer = load_model(**load_kwargs)

INFO:root:Loading model model from HiTZ/GoLLIE-34B
INFO:root:We will load the model using the following device map: auto and max_memory: None
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
INFO:root:Bits and Bytes config: {
    "quant_method": "bitsandbytes",
    "load_in_8bit": false,
    "load_in_4bit": true,
    "llm_int8_threshold": 6.0,
    "llm_int8_skip_modules": null,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "bnb_4bit_compute_dtype": "bfloat16"
}
INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

INFO:root:Model dtype: torch.bfloat16
INFO:root:Total model memory footprint: 17659.621666 MB


### Define Guidelines for the Extraction of Process Elements from Medical Guidelines 

#### Simple Annotation Schema

In [5]:
from typing import List

from src.tasks.utils_typing import Relation, dataclass

# Relation definitions
#
# Each dataclass below describes one relation type of the process-extraction
# schema. The class source (docstring, fields and field comments) is what gets
# rendered into the prompt, so the wording of each docstring matters.

@dataclass
class ActivityActorPerformerRelation(Relation):
    """The ActivityActorPerformer Relation captures the relationship between an Activity entity and an Actor entity that performs the activity.
    The ActivityActorPerformer relation can be, for example, "performed by", as in "the surgery was performed by the surgeon",
    or "administered by", as in "the medication was administered by the nurse".
    The Activity can be performed either by medical staff (doctors, nurses, etc.) or by the patients themselves.
    """

    arg1: str
    arg2: str

@dataclass
class ActivityActorReceiverRelation(Relation):
    """The ActivityActorReceiver Relation captures the relationship between an Activity entity and an Actor entity.
    The ActivityActorReceiver relation describes the relation between an activity and the actor who is the receiver of the activity.
    Most often, activities are performed on the patient.
    """

    arg1: str
    arg2: str


@dataclass
class ActivityActivityDataRelation(Relation):
    """The ActivityActivityData Relation captures the relationship between an Activity entity and an ActivityData entity, i.e. the object which is used by the Activity.
    """

    arg1: str
    arg2: str


@dataclass
class ActivitySpecificationRelation(Relation):
    """The ActivitySpecification Relation captures the relationship between an Activity entity and a Specification entity which further describes the Activity.
    The ActivitySpecification relation can be, for example, "the surgery was performed at the hospital", so we have a relation between the Activity "performed" and the Specification "at the hospital".
    The type of the ActivitySpecificationRelation depends on what the Specification entity describes.
    """

    arg1: str
    arg2: str
    type: str  # One of: Reason, State, Identification, Frequency, Duration, Location, Time, Description, Including, Additional


@dataclass
class ActivityParentRelation(Relation):
    """The ActivityParent Relation captures the relationship between two activities where one activity is the parent of the other activity,
    i.e. the other Activity is a subactivity of the first activity. For example, "# Setting Up for a Sterile Surgical Procedure #" is the parent of "### Preparation of the Surgical Environment ###".
    So the parent Activity is "Setting Up" and the subactivity is "Preparation".
    """

    arg1: str
    arg2: str

@dataclass
class ActivityGuardRelation(Relation):
    """The ActivityGuard Relation captures the relationship between an Activity and a Guard entity.
    An Activity can be guarded by a Guard entity. The Guard entity can be, for example, "> 6h", as in "if the stroke happened already > 6 hours ago perform SOFA-Test".
    The Guard entity would be "> 6h", the Activity would be "perform" and the type would be "Larger".
    """

    arg1: str
    arg2: str
    type: str  # One of: Deadline, Exact, Larger, Smaller or Valid


@dataclass
class ActivityPurposeOutcomeRelation(Relation):
    """The ActivityPurposeOutcome Relation captures the relationship between an Activity and a PurposeOutcome entity,
    i.e. the relation between an action and the reason or the goal for which the action is performed.
    """

    arg1: str
    arg2: str


@dataclass
class ConnectRelation(Relation):
    """A Connect Relation captures the relationship between two activities and an AND, XOR or OR entity.
    An example of a Connect Relation is "If the patient has sepsis, use antibiotics to cover likely pathogens and initiate aggressive fluid resuscitation to support hemodynamic stability.",
    where we would have a Connect Relation between the activity "use", the activity "initiate" and the AND entity."""

    arg1: str  # Activity, Observation, Input or Output Entity
    arg2: str  # Activity, Observation, Input or Output Entity
    arg3: str  # Either AND, OR or XOR


@dataclass
class ConditionRelation(Relation):
    """A Condition Relation captures the relationship between two activities, or an observation and an activity, which need to be executed in a specific order. So activity B can only be executed after activity A.
    Activity B could, for instance, be "Prescribe medicine". For that to happen, a medical examination has to take place, which could be activity A.
    The Condition Relation could be, for example, "use infusion fluid only in cases where the patient is dehydrated", so we would have a relation between the activity "use" and the observation "dehydration".
    Or, for example, in "before prescribing medication, perform a medical examination" we would have the activity "prescribe" and the activity "examination" connected with a ConditionRelation.
    """

    _from: str  # Activity, Observation, Input or Output Entity
    _to: str  # Activity, Observation, Input or Output Entity
    arg3: str  # Condition Entity, such as: and finally, when, followed by, after, during this period, :, before, until


@dataclass
class ResponseRelation(Relation):
    """A Response Relation captures the relationship between two activities, or an observation and an activity. This relation applies if, after executing the first activity, the second activity must be executed.
    The Response Relation could be, for example, "if the patient is dehydrated, use infusion fluid", so we would have a relation between the observation "dehydration" and the activity "use".
    """

    _from: str  # Activity, Observation, Input or Output Entity
    _to: str  # Activity, Observation, Input or Output Entity
    arg3: str  # Response Entity, such as: can be repeated, and requires, whether, during this period, in case of, must

@dataclass
class ConditionResponseRelation(Relation):
    """A ConditionResponse Relation captures the relationship between two activities, or an observation and an activity, and applies if a Response Relation is combined with a Condition Relation. This relation applies if, after executing the first activity, the second activity must be executed and
    it is only allowed to happen after activity A. In the following example sentence, "Following the collection of blood cultures, administer broad-spectrum intravenous antibiotics within one hour of recognition of sepsis.",
    we would have a ConditionResponse Relation between the activity "collection" and the activity "administer".
    """

    _from: str  # Activity, Observation, Input or Output Entity
    _to: str  # Activity, Observation, Input or Output Entity
    # NOTE(review): the wording below is identical to ResponseRelation.arg3 — confirm it is intended here.
    arg3: str  # Response Entity, such as: can be repeated, and requires, whether, during this period, in case of, must

@dataclass
class ExclusionRelation(Relation):
    """An Exclusion Relation captures the relationship between two activities, or an observation and an activity, where one is excluding the other.
    Examples of the Exclusion entity are "not routinely recommended", "should not be", or "if observation activity is not possible"."""

    _from: str  # Activity, Observation, Input or Output Entity
    _to: str  # Activity, Observation, Input or Output Entity
    arg3: str  # Exclusion Entity, such as: must not, not suitable, should not be, not recommended, not routinely recommended

@dataclass
class InclusionRelation(Relation):
    """An Inclusion Relation captures the relationship between two activities, or an observation and an activity, where one is including the other. For example, blood tests are not required unless
    observation A is true. The Inclusion entity can therefore be, for example, "unless", as in "blood tests are not required unless observation A is true",
    or "if", as in "if observation A is true, then blood tests are required"."""

    _from: str  # Activity, Observation, Input or Output Entity
    _to: str  # Activity, Observation, Input or Output Entity
    arg3: str  # Inclusion Entity, such as: unless, if



# Relation classes (the classes themselves, not instances) whose source code
# is turned into the prompt guidelines. Despite the name, every entry is a
# Relation subclass. Each class is listed exactly once: a repeated entry would
# put the same guideline into the `guidelines` list twice.
ENTITY_DEFINITIONS: List[type] = [
    ActivityActorPerformerRelation,
    ActivityActorReceiverRelation,
    ActivityActivityDataRelation,
    ActivitySpecificationRelation,
    ActivityParentRelation,
    ActivityGuardRelation,
    ActivityPurposeOutcomeRelation,
    ConnectRelation,
    ConditionRelation,
    ResponseRelation,
    ConditionResponseRelation,
    ExclusionRelation,
    InclusionRelation,
]

# Capture the source text of this cell so that a later cell can write it to
# guidelines.py. `In` is IPython's input history (the last entry is the cell
# currently being executed). The __main__ guard keeps this line from running
# when the saved file is imported as a module, where `In` is not defined.
if __name__ == "__main__":
    cell_text = In[-1]


Due to IPython limitations, we must write the content of the previous cell to a file and then import the content from that file.

In [6]:
import importlib
import sys

# Write the guideline cell's source to a module file so that its definitions
# can be imported from that file.
with open("guidelines.py", "w", encoding="utf8") as python_guidelines:
    print(cell_text, file=python_guidelines)

# Forget any copy of the module imported by an earlier run of this cell.
# Without this, re-running after editing the guideline cell would silently
# keep the old classes, because Python re-uses the cached module.
importlib.invalidate_caches()
sys.modules.pop("guidelines", None)

from guidelines import *

We use `inspect.getsource` to get the source code of each guideline class as a string.

In [7]:
# One source-code string per guideline class, in the order they are listed.
guidelines = list(map(inspect.getsource, ENTITY_DEFINITIONS))

### Load input sentences

In [8]:
# Input sentence to annotate (adjacent literals are concatenated into one
# string; the content is unchanged).
text = (
    "During this period, the patient should only drink tap water "
    "Non-insulin-dependent diabetes: * The patient must not take their "
    "antidiabetic medication in the morning and follow the usual guidelines, "
    "i.e. fasting for 6 hours before the start of the scan, after the scan "
    "the patient can eat and take their antidiabetic medication as usual"
)

# Gold annotations for `text`, in reading order.
# NOTE(review): the span classes used here (Actor, Activity, ActivityData, ...)
# are not defined in the cells shown in this notebook — confirm where they are
# expected to come from before running this cell in a fresh kernel.
gold = [
    RelationResponse(span="During this period"),
    Actor(span="patient"),
    Activity(span="drink"),
    ActivityData(span="tap water"),
    Observation(span="Non-insulin-dependent diabetes"),
    Actor(span="patient"),
    RelationExclusion(span="must not"),
    Activity(span="take"),
    ActivityData(span="antidiabetic medication"),
    Specification(span="in the morning"),
    Activity(span="follow"),
    ActivityData(span="usual guidelines"),
    Activity(span="fasting"),
    Guard(span="for 6 hours"),
    Activity(span="start"),
    ActivityData(span="scan"),
    RelationCondition(span="after"),
    Actor(span="patient"),
    Activity(span="scan"),
    Activity(span="eat"),
    Activity(span="take"),
    ActivityData(span="antidiabetic medication"),
    Specification(span="as usual"),
]


In [9]:
import os

# Show the current working directory; the relative paths used in this notebook
# ("guidelines.py", "./templates/prompt.txt") are resolved against it.
print(os.getcwd())

/zhome/06/4/166098/GoLLIEProcessExtraction


#### Filling a template

In [27]:
# Read the Jinja2 prompt template. The encoding is passed explicitly so that
# reading does not depend on the platform's default locale encoding.
with open("./templates/prompt.txt", "rt", encoding="utf8") as f:
    template = Template(f.read())
# Fill the template with the guideline sources, the input text and the gold
# annotations (passed under both the `annotations` and `gold` names).
formated_text = template.render(guidelines=guidelines, text=text, annotations=gold, gold=gold)


### Black Code Formatter

In [28]:
# Format the rendered prompt as Python source using Black's default mode.
black_mode = black.Mode()
formated_text = black.format_str(formated_text, mode=black_mode)

#### Print the filled and formatted template

In [29]:
# Display the complete prompt, including the gold `result = ...` part.
rich.print(formated_text)

### Prepare model inputs

In [30]:
# Cut the gold answer off the rendered text: keep everything before the
# "result =" marker and put the marker back so the model completes it.
# The two-name unpacking requires the marker to occur exactly once.
prompt_head, _ = formated_text.split("result =")
prompt = f"{prompt_head}result ="

Tokenize the input sentences

In [31]:
# Encode the prompt with the tokenizer's special tokens added and return the
# result as PyTorch tensors (return_tensors="pt").
model_input = tokenizer(prompt, add_special_tokens=True, return_tensors="pt")

Remove the eos token from the input

In [32]:
# Drop the trailing EOS token so the model continues the prompt rather than
# seeing a sequence that is already terminated. The check guards against
# tokenizer configurations that do not append EOS: slicing unconditionally
# would then cut off the last real prompt token.
if model_input["input_ids"][0, -1].item() == tokenizer.eos_token_id:
    model_input["input_ids"] = model_input["input_ids"][:, :-1]
    model_input["attention_mask"] = model_input["attention_mask"][:, :-1]

## Run GoLLIE

Now we generate the predictions with GoLLIE.
We use `num_beams=1` and `do_sample=False` in our experiments.

In [40]:
%%time

model_ouput = model.generate(
    **model_input.to(model.device),
    max_new_tokens=128,
    do_sample=True,
    min_new_tokens=0,
    num_beams=2,
    num_return_sequences=2,
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


OutOfMemoryError: CUDA out of memory. Tried to allocate 142.00 MiB. GPU 0 has a total capacty of 39.39 GiB of which 94.38 MiB is free. Including non-PyTorch memory, this process has 39.29 GiB memory in use. Of the allocated memory 34.38 GiB is allocated by PyTorch, and 4.41 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

### Print the results

In [34]:
# Print every returned sequence, showing only the text generated after the
# final "result = " marker.
for answer_idx, sequence in enumerate(model_ouput):
    print(f"Answer {answer_idx}")
    decoded = tokenizer.decode(sequence, skip_special_tokens=True)
    rich.print(decoded.split("result = ")[-1])

Answer 0


### Parse the output

In [35]:
# Decode the first returned sequence and parse the part after the final
# "result = " marker into annotation objects, resolving class names against
# the "guidelines" module.
decoded_answer = tokenizer.decode(model_ouput[0], skip_special_tokens=True)
result = AnnotationList.from_output(
    decoded_answer.split("result = ")[-1],
    task_module="guidelines",
)
rich.print(result)

## Evaluate the results
First, we define a scorer. For Named Entity Recognition, we will use the `SpanScorer` class.

We need to define the valid_types for the scorer, which will be the labels that we have defined

In [36]:
from src.tasks.utils_scorer import SpanScorer
from src.tasks.utils_typing import Entity  # used in the annotations below


class MyEntityScorer(SpanScorer):
    """Compute the F1 score for Named Entity Recognition tasks.

    Thin wrapper around ``SpanScorer`` that restricts scoring to the label
    classes in ``ENTITY_DEFINITIONS`` and reports the span scores under the
    key ``"entities"``.
    """

    # NOTE(review): ENTITY_DEFINITIONS lists Relation classes in this notebook;
    # confirm that a span-based scorer is the intended choice for them.
    valid_types: List[Type] = ENTITY_DEFINITIONS

    def __call__(self, reference: List[Entity], predictions: List[Entity]) -> Dict[str, Dict[str, float]]:
        """Score ``predictions`` against ``reference``.

        Delegates to ``SpanScorer.__call__`` and returns its ``"spans"`` entry
        re-keyed as ``"entities"``.
        """
        output = super().__call__(reference, predictions)
        return {"entities": output["spans"]}

#### Initialize the scorer

In [37]:
# Create the scorer instance used in the evaluation cell.
scorer = MyEntityScorer()

#### Compute F1

In [38]:

# Score the parsed prediction against the gold annotations. Both are wrapped
# in a one-element list (presumably one entry per input sentence — confirm
# against the scorer's expected input).
scorer_results = scorer(reference=[gold],predictions=[result])
rich.print(scorer_results)