In [1]:
import os
from pprint import pprint
from typing import Union

import pandas as pd
from mrq.data import load_data
from mrq.logger import get_logger
from mrq.models import NERModel
from tqdm import tqdm

# Notebook-wide logger, used by the helper functions defined further down.
log = get_logger(__name__)

# Some tokenizers require parallelism to be switched off.
os.environ.update(TOKENIZERS_PARALLELISM="false")

# Enable the tqdm pandas integration (the progress_* methods used below).
tqdm.pandas()

## Data loading

In [2]:
# Dataset and sample size shared by every model comparison below.
DATASET_NAME = "medmcqa"
SAMPLE_SIZE = 100

data = load_data(DATASET_NAME, sample_size=SAMPLE_SIZE)



## Model comparison

Models to check:
- `jarvisx17/medicine-ner` - doesn't work: it predicted no entities for any of the 100 sampled records
- `ukkendane/bert-medical-ner` - works quite well
- `samrawal/bert-large-uncased_med-ner` - too heavy and results are questionable
- `samrawal/bert-base-uncased_clinical-ner` - better than the previous one, but some entities are still broken (possibly fixed by the "first" aggregation strategy?)
- `reginaboateng/clinical_bert_adapter_ner_pico_for_classification_task` - requires the adapter-transformers library, so let's skip it, although it's probably an OK model
- `kamalkraj/BioELECTRA-PICO` - the tokenizer is broken; fixing it needs more time

In [3]:
def test_model(
    model_name: str, data: pd.DataFrame, col="QA", aggregation_strategy="first"
) -> pd.DataFrame:
    model = NERModel(model_name, agg=aggregation_strategy)
    data["res"] = data[col].progress_map(model)
    # at least a single entity is predicted
    data_with_res = data[data["res"].map(bool)]
    log.info("Data len: {0}".format(len(data)))
    log.info("Number or records with entities: {0}".format(len(data_with_res)))
    return data_with_res


def print_some_results(
    data_with_res: pd.DataFrame, col="QA", sample: Union[int, float] = 5
):
    """Print a random selection of rows together with their predicted entities.

    Args:
        data_with_res: Frame with the input text in ``col`` and the model
            output in a ``"res"`` column.
        col: Name of the column holding the input text to print.
        sample: How many rows to show. An ``int`` is a row count (capped at
            the frame length); a ``float`` is a fraction of the rows,
            clamped to the range [0, 1].
    """
    if isinstance(sample, float):
        # DataFrame.sample rejects fractional ``n``; a float is a fraction.
        picked = data_with_res.sample(frac=min(max(sample, 0.0), 1.0))
    else:
        picked = data_with_res.sample(n=min(sample, len(data_with_res)))

    for i, row in picked.iterrows():
        print("Id:", i, "\n")
        # "\n" (not the backspace "\b") so a blank line follows the text,
        # mirroring the "Id:" line above.
        print("Input text:", row[col], "\n")
        print("Entities:")
        pprint(row["res"])
        print("-" * 100)

In [4]:
model_name = "jarvisx17/medicine-ner"

test_jarvisx17 = test_model(model_name, data, col="A")

# Print the same column the model was run on (col="A"), as the other model
# cells do; the default col="QA" would show a different text column.
print_some_results(test_jarvisx17, col="A")

100%|██████████| 100/100 [00:18<00:00,  5.53it/s]

2023-05-27 19:29:07,361 - root - INFO - Data len: 100
2023-05-27 19:29:07,362 - root - INFO - Number or records with entities: 0





In [5]:
# Run the model on the "A" column and print a few of its predictions.
model_name = "ukkendane/bert-medical-ner"

test_ukkendane = test_model(model_name=model_name, data=data, col="A")

print_some_results(data_with_res=test_ukkendane, col="A")

100%|██████████| 100/100 [00:35<00:00,  2.78it/s]

2023-05-27 19:29:44,937 - root - INFO - Data len: 100
2023-05-27 19:29:44,938 - root - INFO - Number or records with entities: 99
Id: 34313 

Input text: Blow out fracture. Tear drop sign: -On Water's view radiograph, polypoid mass can be observed hanging from the floor into the maxillary antrum - Seen in the Blowout fracture of the orbit. Blowout fracture of the orbit: -The Diplopia is due to Inferior Rectus muscle entrapment. - Forced duction test is done to differentiate.
Entities:
[{'end': 17, 'start': 0, 'tag': 'problem', 'text': 'blow out fracture'},
 {'end': 33, 'start': 19, 'tag': 'problem', 'text': 'tear drop sign'},
 {'end': 44, 'start': 39, 'tag': 'test', 'text': 'water'},
 {'end': 77, 'start': 64, 'tag': 'problem', 'text': 'polypoid mass'},
 {'end': 186,
  'start': 153,
  'tag': 'problem',
  'text': 'the blowout fracture of the orbit'},
 {'end': 217,
  'start': 188,
  'tag': 'problem',
  'text': 'blowout fracture of the orbit'},
 {'end': 232, 'start': 220, 'tag': 'problem',




In [7]:
# Run the model on the "A" column and print a few of its predictions.
model_name = "samrawal/bert-base-uncased_clinical-ner"

# Disabled experiment, kept for reference: an EntityDecoder with tag_sep="-"
# built from this model's AutoTokenizer.

test_samrawal_base = test_model(model_name=model_name, data=data, col="A")

print_some_results(data_with_res=test_samrawal_base, col="A")

Id: 149060 

Input text: Cell wall. Ref Katzung 10/e p 741 Bacitracin acts by inhibiting the synthesis of cell wall Other polypeptide antibiotics like Polymyxin B, colistin and tyrothricin act by affecting membranes
Entities:
[{'end': 44, 'start': 34, 'tag': 'treatment', 'text': 'bacitracin'},
 {'end': 90, 'start': 81, 'tag': 'treatment', 'text': 'cell wall'},
 {'end': 120,
  'start': 97,
  'tag': 'treatment',
  'text': 'polypeptide antibiotics'},
 {'end': 137, 'start': 126, 'tag': 'treatment', 'text': 'polymyxin b'},
 {'end': 147, 'start': 139, 'tag': 'treatment', 'text': 'colistin'},
 {'end': 163, 'start': 152, 'tag': 'treatment', 'text': 'tyrothricin'}]
----------------------------------------------------------------------------------------------------
Id: 141016 

Input text: Protein. Bilirubin formed by the destruction of RBCs is free or unconjugated bilirubin.It is lipid soluble and bound to albumin(protein conjugation),which prevents it's excretion by the kidneys in urine. Ref.T

In [8]:
# Run the model on the "A" column with the "first" aggregation strategy
# spelled out, then print a few of its predictions.
model_name = "samrawal/bert-large-uncased_med-ner"

test_samrawal = test_model(
    model_name=model_name,
    data=data,
    col="A",
    aggregation_strategy="first",
)

print_some_results(data_with_res=test_samrawal, col="A")

100%|██████████| 100/100 [02:02<00:00,  1.22s/it]

2023-05-27 19:33:24,334 - root - INFO - Data len: 100
2023-05-27 19:33:24,335 - root - INFO - Number or records with entities: 75
Id: 124291 

Input text: Valproate. Valproic acid is the drug of choice for absence seizures, myoclonic epilepsy, LG syndrome and infantile spasms. For women of child bearing age lamotrigine and levitriacetam can be a 2nd line drug. Ref: HL Sharma 3rd ed.Pg: 534
Entities:
[{'end': 9, 'start': 0, 'tag': 'm', 'text': 'valproate'},
 {'end': 24, 'start': 11, 'tag': 'm', 'text': 'valproic acid'},
 {'end': 67, 'start': 51, 'tag': 'r', 'text': 'absence seizures'},
 {'end': 87, 'start': 69, 'tag': 'r', 'text': 'myoclonic epilepsy'},
 {'end': 121, 'start': 115, 'tag': 'r', 'text': 'spasms'},
 {'end': 165, 'start': 154, 'tag': 'm', 'text': 'lamotrigine'},
 {'end': 183, 'start': 170, 'tag': 'm', 'text': 'levitriacetam'}]
----------------------------------------------------------------------------------------------------
Id: 159902 

Input text: Cervix is equal in size 


