In [42]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, pipeline
from datasets import ClassLabel, Sequence, load_dataset
import evaluate
import numpy as np
import pandas as pd
from spacy import displacy

Load pre-trained SciBERT model and tokenizer

In [43]:
# Tokenizer for the SciBERT (scientific vocabulary, uncased) checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "allenai/scibert_scivocab_uncased",
)
# Same checkpoint with a token-classification head sized for three labels
# (the notebook's tag set is O / B-DRUG / I-DRUG).
model = AutoModelForTokenClassification.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    num_labels=3,
)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at C:\Users\albbl/.cache\huggingface\hub\models--allenai--scibert_scivocab_uncased\snapshots\24f92d32b1bfb0bcaf9ab193ff3ad01e87732fc1\config.json
Model config BertConfig {
  "_name_or_path": "allenai/scibert_scivocab_uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31090
}

loading file vocab.txt from cache at C:\Users\albbl/.cache\huggingface\hub\models--allenai--scibert_scivocab_uncased\s

Load the dataset and split it into training and validation sets

In [44]:
# Load the drug / adverse-effect relation configuration of the ADE corpus.
# NOTE(review): this variable shares its name with the `datasets` package that
# the import cell pulls names from; a more specific name would avoid confusion.
datasets = load_dataset("ade_corpus_v2", "Ade_corpus_v2_drug_ade_relation")

Found cached dataset ade_corpus_v2 (C:/Users/albbl/.cache/huggingface/datasets/ade_corpus_v2/Ade_corpus_v2_drug_ade_relation/1.0.0/940d61334dbfac6b01ac5d00286a2122608b8dc79706ee7e9206a1edb172c559)


  0%|          | 0/1 [00:00<?, ?it/s]

In [45]:
# Show the DatasetDict summary (available splits, feature names, row counts).
datasets    

DatasetDict({
    train: Dataset({
        features: ['text', 'drug', 'effect', 'indexes'],
        num_rows: 6821
    })
})

In [46]:
# Inspect the first training record to see the raw layout: text, drug, effect,
# and the character offsets of each mention under "indexes".
datasets["train"][0]

{'text': 'Intravenous azithromycin-induced ototoxicity.',
 'drug': 'azithromycin',
 'effect': 'ototoxicity',
 'indexes': {'drug': {'start_char': [12], 'end_char': [24]},
  'effect': {'start_char': [33], 'end_char': [44]}}}

In [47]:
# Collapse the per-relation rows into one record per unique sentence, so that
# every drug mention of a sentence ends up on the same training example.
consolidated_dataset = {}

for row in datasets["train"]:
    text = row["text"]
    entry = consolidated_dataset.get(text)

    if entry is None:
        entry = {
            "text": text,
            "drug": [],
            # use sets because the indices can repeat for various reasons
            "drug_indices_start": set(),
            "drug_indices_end": set(),
        }
        consolidated_dataset[text] = entry

    # Keep every distinct drug name of the sentence (first-seen order).
    # Previously only the drug from the first row of a sentence was recorded,
    # even though the offsets of later drugs were merged in.
    if row["drug"] not in entry["drug"]:
        entry["drug"].append(row["drug"])

    entry["drug_indices_start"].update(row["indexes"]["drug"]["start_char"])
    entry["drug_indices_end"].update(row["indexes"]["drug"]["end_char"])

df = pd.DataFrame(list(consolidated_dataset.values()))
# for this trial use small subset
df = df[:500]
df.head()

Unnamed: 0,text,drug,drug_indices_start,drug_indices_end
0,Intravenous azithromycin-induced ototoxicity.,[azithromycin],{12},{24}
1,"Immobilization, while Paget's bone disease was...",[dihydrotachysterol],{91},{109}
2,Unaccountable severe hypercalcemia in a patien...,[dihydrotachysterol],{84},{102}
3,METHODS: We report two cases of pseudoporphyri...,[naproxen],"{58, 71}","{80, 66}"
4,"Naproxen, the most common offender, has been a...",[Naproxen],{0},{8}


In [48]:
# Replace each set of offsets with an ascending list: `sorted` accepts the set
# directly and returns a list, so the i-th start lines up with the i-th end.
df["drug_indices_start"] = df["drug_indices_start"].map(sorted)
df["drug_indices_end"] = df["drug_indices_end"].map(sorted)
df.head()

Unnamed: 0,text,drug,drug_indices_start,drug_indices_end
0,Intravenous azithromycin-induced ototoxicity.,[azithromycin],[12],[24]
1,"Immobilization, while Paget's bone disease was...",[dihydrotachysterol],[91],[109]
2,Unaccountable severe hypercalcemia in a patien...,[dihydrotachysterol],[84],[102]
3,METHODS: We report two cases of pseudoporphyri...,[naproxen],"[58, 71]","[66, 80]"
4,"Naproxen, the most common offender, has been a...",[Naproxen],[0],[8]


In [49]:
# save to JSON to then import into Dataset object
df.to_json("dataset.jsonl", orient="records", lines=True)

cons_dataset = load_dataset("json", data_files="dataset.jsonl")
# Seed the shuffle so the same sentences land in train/test on every
# Restart & Run All; without it the split (and every row index used in later
# cells) changes from run to run.
cons_dataset = cons_dataset["train"].train_test_split(seed=42)
cons_dataset

Downloading and preparing dataset json/default to C:/Users/albbl/.cache/huggingface/datasets/json/default-eb20db107d0bce16/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to C:/Users/albbl/.cache/huggingface/datasets/json/default-eb20db107d0bce16/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'drug', 'drug_indices_start', 'drug_indices_end'],
        num_rows: 375
    })
    test: Dataset({
        features: ['text', 'drug', 'drug_indices_start', 'drug_indices_end'],
        num_rows: 125
    })
})

Token Labeling

O - outside any entity we care about

B-DRUG - the beginning of a DRUG entity

I-DRUG - inside a DRUG entity

In [50]:
# BIO tag vocabulary; a tag's position in this list is its integer label id.
label_list = ["O", "B-DRUG", "I-DRUG"]

# Variable-length sequence feature whose items are class labels from `label_list`.
custom_seq = Sequence(
    feature=ClassLabel(
        num_classes=len(label_list),
        names=label_list,
        names_file=None,
        id=None,
    ),
    length=-1,
    id=None,
)

# Register the tag feature under "ner_tags" on both splits.
cons_dataset["test"].features["ner_tags"] = custom_seq
cons_dataset["train"].features["ner_tags"] = custom_seq

In [51]:
def generate_row_labels(row, verbose=False):
    """Tokenize one consolidated row and attach BIO drug tags to its tokens.

    Only DRUG entities are tagged (effect spans are not part of the
    consolidated rows). A token is considered part of a drug mention when its
    character range overlaps the mention's range: the first overlapping token
    gets ``B-DRUG`` and the following ones ``I-DRUG``. Matching on overlap
    rather than on exact boundary equality keeps the labelling in sync when a
    token boundary does not coincide with an annotated offset; with equality
    checks a missed start silently dropped that drug and every later one in
    the sentence, and a missed end tagged all remaining tokens as ``I-DRUG``.

    Args:
        row: mapping with ``"text"`` plus ``"drug_indices_start"`` and
            ``"drug_indices_end"`` (character offsets). After sorting, the
            i-th start is paired with the i-th end; surplus values in the
            longer list are ignored.
        verbose: when True, print the row followed by each token and its label.

    Returns:
        The tokenizer output for ``row["text"]`` (including
        ``offset_mapping``) with an added ``"labels"`` list: ``-100`` for
        zero-width tokens, otherwise an index into ``label_list``.
    """
    outside_id = label_list.index("O")
    begin_id = label_list.index("B-DRUG")
    inside_id = label_list.index("I-DRUG")

    # (start, end) character spans of the drug mentions, in reading order.
    spans = list(zip(sorted(row["drug_indices_start"]), sorted(row["drug_indices_end"])))

    tokens = tokenizer(row["text"], return_offsets_mapping=True)

    labels = []
    span_index = 0      # first span that has not yet been fully passed
    opened_span = None  # span that already received its B- tag

    for offset_start, offset_end in tokens["offset_mapping"]:
        # should only happen for [CLS] and [SEP]
        if offset_end - offset_start == 0:
            labels.append(-100)
            continue

        # Drop spans that end at or before the start of this token.
        while span_index < len(spans) and spans[span_index][1] <= offset_start:
            span_index += 1

        overlaps = span_index < len(spans) and spans[span_index][0] < offset_end
        if not overlaps:
            labels.append(outside_id)
        elif opened_span == span_index:
            labels.append(inside_id)
        else:
            labels.append(begin_id)
            opened_span = span_index

    if verbose:
        print(f"{row}\n")
        orig = tokenizer.convert_ids_to_tokens(tokens["input_ids"])
        for n in range(len(labels)):
            print(orig[n], labels[n])
    tokens["labels"] = labels

    return tokens

In [52]:
# Sanity-check the labelling on one training example: verbose mode prints the
# row, then every token next to the label id it received.
generate_row_labels(cons_dataset["train"][2], verbose=True)

{'text': 'OBJECTIVE: The aim of this paper is to describe a case of increased libido during fluvoxamine therapy.', 'drug': ['fluvoxamine'], 'drug_indices_start': [82], 'drug_indices_end': [93]}

[CLS] -100
objective 0
: 0
the 0
aim 0
of 0
this 0
paper 0
is 0
to 0
describe 0
a 0
case 0
of 0
increased 0
lib 0
##ido 0
during 0
flu 1
##vo 2
##xa 2
##mine 2
therapy 0
. 0
[SEP] -100


{'input_ids': [102, 3201, 862, 111, 2579, 131, 238, 1203, 165, 147, 3401, 106, 820, 131, 1175, 8147, 10612, 781, 1441, 9496, 14301, 22229, 2223, 205, 103], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 9), (9, 10), (11, 14), (15, 18), (19, 21), (22, 26), (27, 32), (33, 35), (36, 38), (39, 47), (48, 49), (50, 54), (55, 57), (58, 67), (68, 71), (71, 74), (75, 81), (82, 85), (85, 87), (87, 89), (89, 93), (94, 101), (101, 102), (0, 0)], 'labels': [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, -100]}

In [53]:
# Tokenize and label every example of both splits (row by row).
labeled_dataset = cons_dataset.map(generate_row_labels)

Map:   0%|          | 0/375 [00:00<?, ? examples/s]

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Fine-tuning

In [54]:
# Task tag; only used below to build the name of the training output directory.
task = "ner" # Should be one of "ner", "pos" or "chunk"
# Hub id of the base checkpoint (same id the tokenizer/model were loaded from).
model_checkpoint = "allenai/scibert_scivocab_uncased"
# Per-device batch size, applied to both training and evaluation.
batch_size = 16

In [55]:
# Last path component of the checkpoint id, e.g. "scibert_scivocab_uncased".
model_name = model_checkpoint.split("/")[-1]
# Training hyper-parameters; the first argument names the output directory.
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    # run an evaluation pass at the end of every epoch
    evaluation_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.05,
    # log the training loss after every optimisation step (verbose output)
    logging_steps=1
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [56]:
# Batch collator for token classification, built from the tokenizer and handed
# to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [57]:
# seqeval metric; used below for entity-level precision / recall / F1 and
# overall accuracy.
metric = evaluate.load("seqeval")

In [58]:
def compute_metrics(p):
    """Score a batch of token-level predictions with the seqeval metric.

    Args:
        p: pair ``(predictions, labels)`` where ``predictions`` holds per-token
            class scores (arg-maxed over axis 2) and ``labels`` holds the gold
            label ids, with ``-100`` marking positions to ignore.

    Returns:
        Dict with the overall ``precision``, ``recall``, ``f1`` and
        ``accuracy`` reported by ``metric.compute``.
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold):
        # Keep only positions carrying a real label (drops special tokens).
        kept = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    reported = (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    )
    return {name: results[key] for name, key in reported}

In [59]:
# Wire the model, hyper-parameters, labelled splits, collator and metric
# callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=labeled_dataset["train"],
    eval_dataset=labeled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [60]:
# Fine-tune; with the arguments above this logs the loss every step and
# evaluates on the test split after each epoch.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: drug, offset_mapping, drug_indices_end, drug_indices_start, text. If drug, offset_mapping, drug_indices_end, drug_indices_start, text are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 375
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 120
  Number of trainable parameters = 109330179


  0%|          | 0/120 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.9882, 'learning_rate': 9.916666666666668e-06, 'epoch': 0.04}
{'loss': 0.8601, 'learning_rate': 9.833333333333333e-06, 'epoch': 0.08}
{'loss': 0.6449, 'learning_rate': 9.75e-06, 'epoch': 0.12}
{'loss': 0.5542, 'learning_rate': 9.666666666666667e-06, 'epoch': 0.17}
{'loss': 0.5019, 'learning_rate': 9.583333333333335e-06, 'epoch': 0.21}
{'loss': 0.4092, 'learning_rate': 9.5e-06, 'epoch': 0.25}
{'loss': 0.4348, 'learning_rate': 9.416666666666667e-06, 'epoch': 0.29}
{'loss': 0.4177, 'learning_rate': 9.333333333333334e-06, 'epoch': 0.33}
{'loss': 0.406, 'learning_rate': 9.250000000000001e-06, 'epoch': 0.38}
{'loss': 0.4054, 'learning_rate': 9.166666666666666e-06, 'epoch': 0.42}
{'loss': 0.3839, 'learning_rate': 9.083333333333333e-06, 'epoch': 0.46}
{'loss': 0.4031, 'learning_rate': 9e-06, 'epoch': 0.5}
{'loss': 0.343, 'learning_rate': 8.916666666666667e-06, 'epoch': 0.54}
{'loss': 0.2639, 'learning_rate': 8.833333333333334e-06, 'epoch': 0.58}
{'loss': 0.2125, 'learning_rate': 8.75

The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: drug, offset_mapping, drug_indices_end, drug_indices_start, text. If drug, offset_mapping, drug_indices_end, drug_indices_start, text are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 125
  Batch size = 16


{'loss': 0.2328, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.12218962609767914, 'eval_precision': 0.7313432835820896, 'eval_recall': 0.7050359712230215, 'eval_f1': 0.717948717948718, 'eval_accuracy': 0.966295609152752, 'eval_runtime': 13.9843, 'eval_samples_per_second': 8.939, 'eval_steps_per_second': 0.572, 'epoch': 1.0}
{'loss': 0.1721, 'learning_rate': 7.916666666666667e-06, 'epoch': 1.04}
{'loss': 0.1019, 'learning_rate': 7.833333333333333e-06, 'epoch': 1.08}
{'loss': 0.1116, 'learning_rate': 7.75e-06, 'epoch': 1.12}
{'loss': 0.1194, 'learning_rate': 7.666666666666667e-06, 'epoch': 1.17}
{'loss': 0.1484, 'learning_rate': 7.583333333333333e-06, 'epoch': 1.21}
{'loss': 0.1403, 'learning_rate': 7.500000000000001e-06, 'epoch': 1.25}
{'loss': 0.0915, 'learning_rate': 7.416666666666668e-06, 'epoch': 1.29}
{'loss': 0.1428, 'learning_rate': 7.333333333333333e-06, 'epoch': 1.33}
{'loss': 0.108, 'learning_rate': 7.25e-06, 'epoch': 1.38}
{'loss': 0.088, 'learning_rate': 7.166666666666667e-06, 'epoch': 1.42}
{'loss': 0.1369, 'learning_ra

The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: drug, offset_mapping, drug_indices_end, drug_indices_start, text. If drug, offset_mapping, drug_indices_end, drug_indices_start, text are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 125
  Batch size = 16


{'loss': 0.027, 'learning_rate': 6e-06, 'epoch': 2.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.05824517458677292, 'eval_precision': 0.8428571428571429, 'eval_recall': 0.8489208633093526, 'eval_f1': 0.8458781362007167, 'eval_accuracy': 0.9826839826839827, 'eval_runtime': 13.585, 'eval_samples_per_second': 9.201, 'eval_steps_per_second': 0.589, 'epoch': 2.0}
{'loss': 0.0849, 'learning_rate': 5.916666666666667e-06, 'epoch': 2.04}
{'loss': 0.0772, 'learning_rate': 5.833333333333334e-06, 'epoch': 2.08}
{'loss': 0.0722, 'learning_rate': 5.75e-06, 'epoch': 2.12}
{'loss': 0.0815, 'learning_rate': 5.666666666666667e-06, 'epoch': 2.17}
{'loss': 0.079, 'learning_rate': 5.583333333333334e-06, 'epoch': 2.21}
{'loss': 0.0432, 'learning_rate': 5.500000000000001e-06, 'epoch': 2.25}
{'loss': 0.0527, 'learning_rate': 5.416666666666667e-06, 'epoch': 2.29}
{'loss': 0.0714, 'learning_rate': 5.333333333333334e-06, 'epoch': 2.33}
{'loss': 0.0524, 'learning_rate': 5.2500000000000006e-06, 'epoch': 2.38}
{'loss': 0.0513, 'learning_rate': 5.1666666666666675e-06, 'epoch': 2.42}
{'loss': 0.0

The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: drug, offset_mapping, drug_indices_end, drug_indices_start, text. If drug, offset_mapping, drug_indices_end, drug_indices_start, text are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 125
  Batch size = 16


{'loss': 0.0307, 'learning_rate': 4.000000000000001e-06, 'epoch': 3.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.041552767157554626, 'eval_precision': 0.8775510204081632, 'eval_recall': 0.9280575539568345, 'eval_f1': 0.9020979020979022, 'eval_accuracy': 0.987012987012987, 'eval_runtime': 14.4186, 'eval_samples_per_second': 8.669, 'eval_steps_per_second': 0.555, 'epoch': 3.0}
{'loss': 0.0432, 'learning_rate': 3.916666666666667e-06, 'epoch': 3.04}
{'loss': 0.0623, 'learning_rate': 3.833333333333334e-06, 'epoch': 3.08}
{'loss': 0.0235, 'learning_rate': 3.7500000000000005e-06, 'epoch': 3.12}
{'loss': 0.0209, 'learning_rate': 3.6666666666666666e-06, 'epoch': 3.17}
{'loss': 0.058, 'learning_rate': 3.5833333333333335e-06, 'epoch': 3.21}
{'loss': 0.026, 'learning_rate': 3.5e-06, 'epoch': 3.25}
{'loss': 0.0472, 'learning_rate': 3.416666666666667e-06, 'epoch': 3.29}
{'loss': 0.0405, 'learning_rate': 3.3333333333333333e-06, 'epoch': 3.33}
{'loss': 0.0523, 'learning_rate': 3.2500000000000002e-06, 'epoch': 3.38}
{'loss': 0.019, 'learning_rate': 3.1666666666666667e-06, 'epoch': 3.42}
{'loss': 0

The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: drug, offset_mapping, drug_indices_end, drug_indices_start, text. If drug, offset_mapping, drug_indices_end, drug_indices_start, text are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 125
  Batch size = 16


{'loss': 0.0397, 'learning_rate': 2.0000000000000003e-06, 'epoch': 4.0}


  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.037170976400375366, 'eval_precision': 0.896551724137931, 'eval_recall': 0.935251798561151, 'eval_f1': 0.9154929577464789, 'eval_accuracy': 0.9882498453927026, 'eval_runtime': 12.1115, 'eval_samples_per_second': 10.321, 'eval_steps_per_second': 0.661, 'epoch': 4.0}
{'loss': 0.0383, 'learning_rate': 1.916666666666667e-06, 'epoch': 4.04}
{'loss': 0.0364, 'learning_rate': 1.8333333333333333e-06, 'epoch': 4.08}
{'loss': 0.0143, 'learning_rate': 1.75e-06, 'epoch': 4.12}
{'loss': 0.0187, 'learning_rate': 1.6666666666666667e-06, 'epoch': 4.17}
{'loss': 0.0333, 'learning_rate': 1.5833333333333333e-06, 'epoch': 4.21}
{'loss': 0.0391, 'learning_rate': 1.5e-06, 'epoch': 4.25}
{'loss': 0.0465, 'learning_rate': 1.4166666666666667e-06, 'epoch': 4.29}
{'loss': 0.0203, 'learning_rate': 1.3333333333333334e-06, 'epoch': 4.33}
{'loss': 0.0494, 'learning_rate': 1.25e-06, 'epoch': 4.38}
{'loss': 0.02, 'learning_rate': 1.1666666666666668e-06, 'epoch': 4.42}
{'loss': 0.0234, 'learning_rate': 1

The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: drug, offset_mapping, drug_indices_end, drug_indices_start, text. If drug, offset_mapping, drug_indices_end, drug_indices_start, text are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 125
  Batch size = 16


{'loss': 0.0104, 'learning_rate': 0.0, 'epoch': 5.0}


  0%|          | 0/8 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.036115240305662155, 'eval_precision': 0.8904109589041096, 'eval_recall': 0.935251798561151, 'eval_f1': 0.9122807017543859, 'eval_accuracy': 0.9891774891774892, 'eval_runtime': 12.1696, 'eval_samples_per_second': 10.272, 'eval_steps_per_second': 0.657, 'epoch': 5.0}
{'train_runtime': 804.8272, 'train_samples_per_second': 2.33, 'train_steps_per_second': 0.149, 'train_loss': 0.12576109563621382, 'epoch': 5.0}


TrainOutput(global_step=120, training_loss=0.12576109563621382, metrics={'train_runtime': 804.8272, 'train_samples_per_second': 2.33, 'train_steps_per_second': 0.149, 'train_loss': 0.12576109563621382, 'epoch': 5.0})

In [61]:
# Score the held-out split once more, this time keeping seqeval's full report
# (per-entity figures as well as the overall ones).
predictions, labels, _ = trainer.predict(labeled_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [
        label_list[predicted_id]
        for predicted_id, gold_id in zip(predicted_row, gold_row)
        if gold_id != -100
    ]
    for predicted_row, gold_row in zip(predictions, labels)
]
true_labels = [
    [
        label_list[gold_id]
        for _predicted_id, gold_id in zip(predicted_row, gold_row)
        if gold_id != -100
    ]
    for predicted_row, gold_row in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: drug, offset_mapping, drug_indices_end, drug_indices_start, text. If drug, offset_mapping, drug_indices_end, drug_indices_start, text are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 125
  Batch size = 16


  0%|          | 0/8 [00:00<?, ?it/s]

{'DRUG': {'precision': 0.8904109589041096,
  'recall': 0.935251798561151,
  'f1': 0.9122807017543859,
  'number': 139},
 'overall_precision': 0.8904109589041096,
 'overall_recall': 0.935251798561151,
 'overall_f1': 0.9122807017543859,
 'overall_accuracy': 0.9891774891774892}

In [62]:
# Wrap the fine-tuned model and tokenizer in a token-classification pipeline.
# NOTE(review): despite the "effect" in its name, this model was trained on the
# DRUG tags only; device=-1 presumably selects the CPU — confirm against the
# transformers pipeline docs.
effect_ner_model = pipeline(task="ner", model=model, tokenizer=tokenizer, device=-1)

In [63]:
def visualize_entities(sentence):
    """Highlight the drug tokens the NER pipeline finds in ``sentence``.

    Each token the pipeline tags as something other than the outside label
    (``label_list[0]``) becomes one displaCy entity span. The pipeline label
    may be either a tag that already appears in ``label_list`` or a generic
    ``"<prefix>_<id>"`` name (e.g. ``"LABEL_2"``); in the latter case the whole
    numeric suffix is used as the index into ``label_list``, so ids above 9
    are handled correctly.

    Args:
        sentence: raw text to run through ``effect_ner_model``.

    Returns:
        The value returned by ``displacy.render`` (previously it was stored in
        an unused local and dropped). NOTE(review): presumably the markup
        string outside a notebook and ``None`` when displaCy displays inline —
        confirm against the spaCy docs.
    """
    outside_tag = label_list[0]
    entities = []

    for token in effect_ner_model(sentence):
        raw_label = token["entity"]
        if raw_label in label_list:
            tag = raw_label
        else:
            tag = label_list[int(raw_label.rsplit("_", 1)[-1])]

        if tag == outside_tag:
            continue

        # Build a fresh span dict instead of mutating the pipeline's output.
        entities.append({"start": token["start"], "end": token["end"], "label": tag})

    params = [{"text": sentence,
               "ents": entities,
               "title": None}]

    return displacy.render(params, style="ent", manual=True, options={
        "colors": {
                   "B-DRUG": "#f08080",
                   "I-DRUG": "#f08080",
               },
    })

In [64]:
# Hand-written sentences (not taken from the corpus) for a qualitative look at
# what the fine-tuned model highlights.
examples = [
    "Abortion, miscarriage or uterine hemorrhage associated with misoprostol (Cytotec), a labor-inducing drug.",
    "Addiction to many sedatives and analgesics, such as diazepam, morphine, etc.",
    "Birth defects associated with thalidomide",
    "Bleeding of the intestine associated with aspirin therapy",
    "Cardiovascular disease associated with COX-2 inhibitors (i.e. Vioxx)",
    "Deafness and kidney failure associated with gentamicin (an antibiotic)"
]

# Render each sentence, printing a row of asterisks as a separator.
for example in examples:
    visualize_entities(example)
    print(f"{'*' * 50}\n")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


**************************************************



**************************************************



**************************************************



**************************************************



**************************************************



**************************************************

