In [1]:
import pandas as pd
import spacy
import medspacy
from medspacy.visualization import visualize_ent
from medspacy.ner import TargetRule
import scispacy
from spacy import displacy



In [2]:
# Load the medical notes from disk, then report the frame's dimensions
# and preview its first rows.
med_notes = pd.read_csv("./data/med_notes.csv")
print(med_notes.shape)
med_notes.head()

(31, 2)


Unnamed: 0,content,word_count
0,While bismuth compounds (Pepto-Bismol) decreas...,123
1,"Diarrhea, also spelled diarrhoea, is the condi...",90
2,Antiretroviral therapy (ART) is recommended fo...,80
3,The following drugs are considered as DMARDs: ...,223
4,"The goals of treatment are to reduce pain, dec...",113


##### Loading the spaCy core English model
spaCy handles preprocessing tasks such as tokenization and lemmatization. It also has a pretrained NER pipeline.

In [3]:
# Load the spaCy English pipeline `en_core_web_sm` and show the names of
# its pipeline components as the cell output.
nlp = spacy.load("en_core_web_sm")
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [6]:
# Sanity-check the general-purpose pipeline on a short, non-medical sentence.
# A longer alternative sample is kept here for reference:
#   The Fred Hutchinson Cancer Center, formerly known as the Fred Hutchinson
#   Cancer Research Center and also known as Fred Hutch or The Hutch, is a
#   cancer research institute established in 1975 in Seattle, Washington
text = "Fred Hutchinson Cancer Center was established in 1975 in Seattle, Washington"
doc = nlp(text)

In [7]:
# Highlight the named entities found in the sample sentence.
displacy.render(doc, style="ent")

In [84]:
# The general-purpose model copes well with the sample sentence.
# Now run it on the first medical note and highlight what it finds.
doc = nlp(med_notes.loc[0, "content"])
displacy.render(doc, style="ent")

In [85]:
# The medical note is handled poorly, which is expected: this model was
# not trained on medical terminology. List the entity labels that its
# NER component can emit.
ner = nlp.get_pipe("ner")
ner.labels

('CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART')

In [95]:
# The labels listed above are general-purpose, not medically relevant.
# For a small dataset, a handful of hand-written rules can fill the gap,
# and medspacy provides the tooling for that.
# Load the default medspacy pipeline and print its component names.
med_nlp = medspacy.load()
print(med_nlp.pipe_names)

['medspacy_pyrush', 'medspacy_target_matcher', 'medspacy_context']


In [88]:
# The context component is not needed for this walkthrough, so drop it
# from the pipeline.
# The removal is guarded so that this cell can be re-run safely:
# remove_pipe raises if the named component is no longer in the pipeline.
if "medspacy_context" in med_nlp.pipe_names:
    med_nlp.remove_pipe("medspacy_context")

In [89]:
# Run the medspacy pipeline on the first note and display its entities.
doc = med_nlp(med_notes.loc[0, "content"])
visualize_ent(doc)

In [90]:
# adding a few rules

In [91]:

# Register a few literal -> label rules with the medspacy target matcher.
target_matcher = med_nlp.get_pipe("medspacy_target_matcher")
target_rules = [
    TargetRule(literal, category)
    for literal, category in (
        ("bismuth", "AGENT"),
        ("diarrhea", "MEDICAL_CONDITION"),
        ("loperamide", "AGENT"),
    )
]
target_matcher.add(target_rules)


In [92]:
# Re-run the first note now that the target rules have been added.
doc = med_nlp(med_notes.loc[0, "content"])
visualize_ent(doc)

In [93]:
# Try the same pipeline on a different note (row 3).
doc = med_nlp(med_notes.loc[3, "content"])
visualize_ent(doc)


> The earlier rules did not apply to this note; we might consider adding new rules.


### Using pre-trained model

> Pretrained models can be very useful because they are trained on medical corpora for a similar task. The advantage is that we don't have to add each rule manually. Here I am loading a pretrained model from scispacy. Information about the model can be found here: https://allenai.github.io/scispacy/

In [76]:
# Load the pretrained `en_ner_bc5cdr_md` pipeline and show its component names.
# NOTE(review): this rebinds `med_nlp`, which earlier held the medspacy
# pipeline. Cells that expect the medspacy components (e.g. the target
# matcher) should not be re-run after this one without reloading medspacy.
med_nlp = spacy.load("en_ner_bc5cdr_md")
med_nlp.component_names

['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer', 'parser', 'ner']

In [81]:
# List the entity labels that this model's NER component can emit.
ner = med_nlp.get_pipe("ner")
ner.labels

('CHEMICAL', 'DISEASE')

In [79]:
# Apply the pretrained model to the first note and highlight its entities.
doc = med_nlp(med_notes.loc[0, "content"])
displacy.render(doc, style="ent")

In [80]:
# Apply the pretrained model to the note in row 3 and highlight its entities.
doc = med_nlp(med_notes.loc[3, "content"])
displacy.render(doc, style="ent")

> This model is helpful if we want to identify diseases and chemicals. If we want to add other labels, we have to update and train the model with the new labels.