In [1]:
##############################################
## Import dependencies
##############################################
import spacy
import medspacy
import pandas as pd

from medspacy.visualization import visualize_dep, visualize_ent

from medspacy.context import ConTextRule, ConTextComponent
from medspacy.ner import TargetRule

In [2]:
from IPython.core.display import display, HTML

  from IPython.core.display import display, HTML


In [3]:
##############################################
## Instantiate MedSpacy NLP pipeline
##############################################

In [4]:
## Some example housing/homelessness data which comes from Chapman et al
## https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8608249/

#texts = ['The veteran is doing well in her new apartment.',
  #      'He is current on the rent.',
  #      'Veteran admitted from the streets.',
  #      'The patient is currently literally homeless.',
  #      'Spent last night at the Mission.',
  #      'Got a bed at a shelter downtown.',
  #      'His mother let him stay with her.',
  #      'She crashed at a friend’s house.',
  #      'Cannot pay the upcoming rent.',
  #      'Got an eviction notice.']

# texts

In [5]:
## Some simpler text documents meant to illustrate diversity of context rules which can be written re: housing/homelessness
texts = [
    "the patient is homeless",
    "my brother is homeless",
    "I used to be homeless",
    "I suspect the patient is homeless",
    "the patient is not homeless"
]

In [6]:
texts

['the patient is homeless',
 'my brother is homeless',
 'I used to be homeless',
 'I suspect the patient is homeless',
 'the patient is not homeless']

In [7]:
##
## One of the pre-loaded spacy (not medspacy yet) opinionated NLP pipelines
##
nlp_spacy = spacy.load('en_core_web_sm')

In [8]:
nlp_spacy.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [9]:
## Run the docs through the pipeline
docs_spacy = nlp_spacy.pipe(texts) 

In [10]:
##
## Some helper functions below which will transform spacy NLP pipeline outputs to pandas data.frame
## https://towardsdatascience.com/structured-natural-language-processing-with-pandas-and-spacy-7089e66d2b10
##

In [11]:
def extract_tokens_plus_meta(doc:spacy.tokens.doc.Doc):
    """Extract tokens and metadata from individual spaCy doc."""
    return [
        (i.text, i.i, i.lemma_, i.ent_type_, i.tag_, 
         i.dep_, i.pos_, i.is_stop, i.is_alpha, 
         i.is_digit, i.is_punct) for i in doc
    ]

In [12]:
def tidy_tokens(docs):
    """Extract tokens and metadata from list of spaCy docs."""
    
    cols = [
        "doc_id", "token", "token_order", "lemma", 
        "ent_type", "tag", "dep", "pos", "is_stop", 
        "is_alpha", "is_digit", "is_punct"
    ]
    
    meta_df = []
    for ix, doc in enumerate(docs):
        meta = extract_tokens_plus_meta(doc)
        meta = pd.DataFrame(meta)
        meta.columns = cols[1:]
        meta = meta.assign(doc_id = ix).loc[:, cols]
        meta_df.append(meta)
        
    return pd.concat(meta_df)

In [13]:
## Spacy processed documents to pandas dataframe using UDFs from above
tidy_tokens(docs=docs_spacy)

Unnamed: 0,doc_id,token,token_order,lemma,ent_type,tag,dep,pos,is_stop,is_alpha,is_digit,is_punct
0,0,the,0,the,,DT,det,DET,True,True,False,False
1,0,patient,1,patient,,NN,nsubj,NOUN,False,True,False,False
2,0,is,2,be,,VBZ,ROOT,AUX,True,True,False,False
3,0,homeless,3,homeless,,JJ,acomp,ADJ,False,True,False,False
0,1,my,0,my,,PRP$,poss,PRON,True,True,False,False
1,1,brother,1,brother,,NN,nsubj,NOUN,False,True,False,False
2,1,is,2,be,,VBZ,ROOT,AUX,True,True,False,False
3,1,homeless,3,homeless,,JJ,acomp,ADJ,False,True,False,False
0,2,I,0,I,,PRP,nsubj,PRON,True,True,False,False
1,2,used,1,use,,VBD,ROOT,VERB,True,True,False,False


In [14]:
###########################################
## Below we apply lightweight medspacy NLP pipeline to simple housing/homelessness texts defined above
## Two components of interest:
##    1) Identify housing/homelessness mentions in free text
##    2) Determine if the above housing/homelessness mention occurs in the presence of a contextual modifier
###########################################

In [15]:
nlp_medspacy = medspacy.load()

In [16]:
nlp_medspacy.pipe_names

['medspacy_pyrush', 'medspacy_target_matcher', 'medspacy_context']

In [17]:
###########################################
## Define target entities to match with pattern searchers
## 
## WARNING: Spacy/MedSpacy match on TOKENS and NOT string patterns per se (this has pros and cons depending on goals)
##          Read docs: https://spacy.io/usage/rule-based-matching
##          Test API: https://explosion.ai/demos/matcher
###########################################

In [18]:
##
## Good examples: See examples of how to write target matching, and context matching code at:
##
## https://github.com/abchapman93/VA_COVID-19_NLP_BSV/tree/master/cov_bsv/knowledge_base
## https://github.com/abchapman93/ReHouSED/tree/main/rehoused_nlp/resources
##

In [19]:
## Instantiatee target matcher
target_matcher = nlp_medspacy.get_pipe("medspacy_target_matcher")

In [20]:
## Add target rules to the pipeline, to extract entities from note above
target_rules_housing = [
    ## String literal searchers
    TargetRule(literal="homeless", category="HOMELESS")
]

In [21]:
## Add target rules defined above to the entity matcher
target_matcher.add(target_rules_housing)

In [22]:
#########################################
## Add context rules to medspacy NLP pipeline
#########################################

In [23]:
context_rules = [
    ConTextRule("not", "NEGATED_EXISTENCE"),
    ConTextRule("brother", "OTHER_EXPERIENCER"),
    ConTextRule("suspect", "HYPOTHETICAL MENTION", pattern=[{"LOWER": "suspect"}]),
    ConTextRule("used to be", "HISTORICAL_MENTION")    
]

In [24]:
## Clear off-the-shelf context rules from pipeline
context = ConTextComponent(nlp_medspacy, rules=None)

In [25]:
## Add out custom context rules defined above/below
context.add(context_rules)

In [26]:
## Once loaded we can print the rules as follows
context.rules

[ConTextRule(literal='not', category='NEGATED_EXISTENCE', pattern=None, direction='BIDIRECTIONAL'),
 ConTextRule(literal='brother', category='OTHER_EXPERIENCER', pattern=None, direction='BIDIRECTIONAL'),
 ConTextRule(literal='suspect', category='HYPOTHETICAL MENTION', pattern=[{'LOWER': 'suspect'}], direction='BIDIRECTIONAL'),
 ConTextRule(literal='used to be', category='HISTORICAL_MENTION', pattern=None, direction='BIDIRECTIONAL')]

In [27]:
#len(context.rules)

In [28]:
## And we can inspect the unique set of context modifier categories
context.categories

{'HISTORICAL_MENTION',
 'HYPOTHETICAL MENTION',
 'NEGATED_EXISTENCE',
 'OTHER_EXPERIENCER'}

In [29]:
##
## Run the medspacy NLP pipeline over the texts
##
docs = list(nlp_medspacy.pipe(texts))

  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


In [30]:
##
## Now for each processed document, further run our custom contextual analyzer over the documents
##
for doc in docs:
    context(doc)

In [31]:
##
## Visualize (potential) dependencies between tagged entities ("homeless" string) and contextual modifiers
##
visualize_dep(docs[0])

  from IPython.core.display import display, HTML


In [32]:
visualize_dep(docs[1])

  from IPython.core.display import display, HTML


In [33]:
visualize_dep(docs[2])

  from IPython.core.display import display, HTML


In [34]:
visualize_dep(docs[3])

  from IPython.core.display import display, HTML


In [35]:
visualize_dep(docs[4])

  from IPython.core.display import display, HTML


In [36]:
#########################################
## Investigate "context graph" attributes associated with each medspacy processed document
##
## We can use this information to make "context informed" document level classifications about homelessness/housing
##
## If context_modifier=NONE and homeless_tag=TRUE:
##     then homeless
## Else
##     not homeless
##
#########################################

In [37]:
for doc in docs:
    print(doc)
    print(doc._.context_graph)
    print(doc._.context_graph.modifiers)
    print(doc._.context_graph.edges)
    print(doc.ents)
    print("\n")

the patient is homeless
<ConTextGraph> with 1 targets and 0 modifiers
[]
[]
(homeless,)


my brother is homeless
<ConTextGraph> with 1 targets and 1 modifiers
[<ConTextModifier> [brother, OTHER_EXPERIENCER]]
[(homeless, <ConTextModifier> [brother, OTHER_EXPERIENCER])]
(homeless,)


I used to be homeless
<ConTextGraph> with 1 targets and 1 modifiers
[<ConTextModifier> [used to be, HISTORICAL_MENTION]]
[(homeless, <ConTextModifier> [used to be, HISTORICAL_MENTION])]
(homeless,)


I suspect the patient is homeless
<ConTextGraph> with 1 targets and 1 modifiers
[<ConTextModifier> [suspect, HYPOTHETICAL MENTION]]
[(homeless, <ConTextModifier> [suspect, HYPOTHETICAL MENTION])]
(homeless,)


the patient is not homeless
<ConTextGraph> with 1 targets and 1 modifiers
[<ConTextModifier> [not, NEGATED_EXISTENCE]]
[(homeless, <ConTextModifier> [not, NEGATED_EXISTENCE])]
(homeless,)




In [38]:
for doc in docs:
    print(doc)
    for ent in doc.ents:
        print("\t"+str(ent))
        for mod in ent._.modifiers:
            print("\t\t"+str(mod))

the patient is homeless
	homeless
my brother is homeless
	homeless
		<ConTextModifier> [brother, FAMILY]
		<ConTextModifier> [brother, OTHER_EXPERIENCER]
I used to be homeless
	homeless
		<ConTextModifier> [used to be, HISTORICAL_MENTION]
I suspect the patient is homeless
	homeless
		<ConTextModifier> [suspect, HYPOTHETICAL MENTION]
the patient is not homeless
	homeless
		<ConTextModifier> [patient is not, NEGATED_EXISTENCE]
		<ConTextModifier> [not, NEGATED_EXISTENCE]


In [39]:
#####################
## Document level classification function
#####################

In [40]:
##
## This is a bit of a hack...likely need to consider better logic around context modifiers, but this seems to work for now.
##
def doc_level_homeless(docs):
    out = []
    for doc in docs:
        if "homeless" not in str(doc.ents):
            out.append(False)
        elif "homeless" in str(doc.ents) and doc._.context_graph.edges != []:
            out.append(False)
        elif "homeless" in str(doc.ents) and doc._.context_graph.edges == []:
            out.append(True)
    return(out)


In [41]:
## Apply UDF over medspacy processed document collection
doc_level_homeless(docs=docs)

[True, False, False, False, False]

In [42]:
texts

['the patient is homeless',
 'my brother is homeless',
 'I used to be homeless',
 'I suspect the patient is homeless',
 'the patient is not homeless']