### automated gold-standard answer extraction example

In [1]:
import json
import re
from itertools import chain

# Load the IDU keywords and flatten them into one list for matching.
# The file is opened in a `with` block so the handle is closed as soon as the
# JSON has been parsed instead of being left for the garbage collector.
# NOTE(review): presumably the JSON maps a category name to a list of keyword
# strings (each value is iterated by chain.from_iterable) -- confirm against the file.
with open("IDUkeywordsList.json") as keywords_file:
    keywords_dict = json.load(keywords_file)
keywords_list = list(chain.from_iterable(keywords_dict.values()))

In [14]:
# Stand-in inputs for this walkthrough. In real use they would be replaced by:
#   note    = <CLEANED CLINICAL NOTE>
#   note_id = <NOTE ID TO KEEP TRACK>
dummy_note_id = "xxxxx"
dummy_note = (
    "<more info> pt injects iv heroin daily. "
    "has a remote history of iv cocaine, last used 10 years ago. "
    "<more info>"
)

import re

def get_sentences_with_kw(note, note_id, kw_list):
    """Return the '.'-delimited pieces of ``note`` that contain a keyword.

    The note is split on every '.' character; a piece is kept when
    ``re.search`` finds at least one entry of ``kw_list`` in it. Keywords are
    therefore interpreted as regular-expression patterns and matched
    case-sensitively. Kept pieces are returned verbatim (leading whitespace
    included) in their original order.

    Args:
        note: Text of the note to scan.
        note_id: Identifier of the note; accepted but not used in the body.
        kw_list: Iterable of keyword patterns (re-iterated for every piece).

    Returns:
        A list of the matching pieces, or ``None`` when no piece matches.
    """
    matching_pieces = [
        piece
        for piece in note.split('.')
        if any(re.search(kw, piece) for kw in kw_list)
    ]

    # An empty result is reported as None rather than as an empty list.
    return matching_pieces if matching_pieces else None
    

In [15]:
# Map the note id to the keyword-bearing sentences extracted from that note
# (the value is None when no sentence contains a keyword).
keyword_ans_dict = {
    dummy_note_id: get_sentences_with_kw(
        note=dummy_note,
        note_id=dummy_note_id,
        kw_list=keywords_list,
    )
}
keyword_ans_dict

{'xxxxx': ['<more info> pt injects iv heroin daily',
  ' has a remote history of iv cocaine, last used 10 years ago']}

### example of parsing rules to clean the answers

In [16]:
# Negation cues; searched for by the negation parsing rule further down.
negation_patterns = 'denying denies denied no never'.split()

# Temporal / frequency cues. The first 33 entries are every
# "<qualifier> <history term>" phrase (qualifier-major order); the bare
# single-word cues follow, so the multi-word phrases come first in the list.
# Not referenced by the cells shown in this example.
temporal_patterns = [
    qualifier + ' ' + history_term
    for qualifier in ('past medical', 'remote', 'distant', 'prior',
                      'previous', 'former', 'active', 'current',
                      'recent', 'last', 'long')
    for history_term in ('history', 'hx', 'h/o')
] + [
    'daily', 'occasional', 'regularly',
    'often', 'sometimes', 'frequently',
    'past', 'remote', 'distant', 'prior',
    'previous', 'former', 'active', 'current',
    'recent', 'last', 'long', 'intermittent',
    'hpi', 'history', 'hx', 'h/o', 'pmh',
]

# "<unit> ago" phrases; searched for by the additional temporal parsing rule.
add_temp_patterns = [
    # years
    "year ago", "years ago", "yr ago", "yrs ago",
    # months
    "month ago", "months ago", "mnth ago", "mnths ago", "mos ago",
    # days
    "day ago", "days ago", "d ago",
    # weeks
    'wk ago', 'wks ago',
]

# Single-word track-mark descriptors.
# Not referenced by the cells shown in this example.
trackmark_patterns = (
    'arm arms abnormal multiple many '
    'several healing healed old diffuse '
    'localized visible red iv fresh dark '
    'needle notable'
).split()

# Substance / opioid use disorder terms.
# Not referenced by the cells shown in this example.
sud_oud_patterns = [
    'substance disorder',
    'polysubstance use disorder',
    'substance abuse disorder',
    'polysubstance abuse disorder',
    'sud',
    'psud',
    'oud',
    'polysubstance',
    'opioid use disorder',
    'opioid',
    'opiate',
]

In [17]:
# dummy answer list: three made-up answer strings used to exercise the
# parsing rules in the cells that follow
dummy_ans_ls = [
    (
        "65y/o m w cardiac procedures, or recent surgical procedures, "
        "admits to drinking alcohol daily for the past 10 years, "
        "denied ivdu, smokes cannabis"
    ),
    "pt smokes cannabis, never ivdu",
    "last ivdu was 10 days ago, snorts cocaine occasionally",
]

In [18]:
# Negation rule: for every answer without a cleaned version yet, look for a
# negation cue. When the text from the cue to the end of the answer still
# mentions an IDU keyword (case-insensitive substring check), that trailing
# span becomes the cleaned answer, keyed by the original answer string.
ans_dict = {}
for a in dummy_ans_ls:
    if a not in ans_dict:
        for findw in negation_patterns:
            # Match the cue as a whole word only. A bare substring search lets
            # 'no' fire inside words such as "snorts", "diagnosis" or
            # "abnormal" and yields spans like "nosis of ivdu".
            match = re.search(r'\b' + re.escape(findw) + r'\b', a)
            if match is not None:
                stind = match.start()
                finalAns = a[stind:]
                # Lower-case the span once rather than once per keyword.
                finalAns_lower = finalAns.lower()
                if any(k.lower() in finalAns_lower for k in keywords_list):
                    # Cues are tried in list order without stopping, so when
                    # several cues qualify the one listed last is kept.
                    ans_dict[a] = finalAns

ans_dict

{'65y/o m w cardiac procedures, or recent surgical procedures, admits to drinking alcohol daily for the past 10 years, denied ivdu, smokes cannabis': 'denied ivdu, smokes cannabis',
 'pt smokes cannabis, never ivdu': 'never ivdu'}

In [19]:
# Additional temporal rule: for every answer still missing from ans_dict
# (the dict built by the negation rule), look for a "<unit> ago" phrase. When
# the text from the start of the answer through the end of that phrase
# mentions an IDU keyword (case-insensitive substring check), that leading
# span becomes the cleaned answer.
for a in dummy_ans_ls:
    if a not in ans_dict:
        for findw in add_temp_patterns:
            # Match the phrase on word boundaries only. A bare substring
            # search lets 'd ago' fire inside text such as
            # "prescribed agonist" and cuts the answer mid-word.
            match = re.search(r'\b' + re.escape(findw) + r'\b', a)
            if match is not None:
                # End of the matched phrase; the answer keeps everything up to it.
                enind = match.end()
                finalAns = a[:enind]
                # Lower-case the span once rather than once per keyword.
                finalAns_lower = finalAns.lower()
                if any(k.lower() in finalAns_lower for k in keywords_list):
                    # Phrases are tried in list order without stopping, so
                    # when several qualify the one listed last is kept.
                    ans_dict[a] = finalAns
ans_dict

{'65y/o m w cardiac procedures, or recent surgical procedures, admits to drinking alcohol daily for the past 10 years, denied ivdu, smokes cannabis': 'denied ivdu, smokes cannabis',
 'pt smokes cannabis, never ivdu': 'never ivdu',
 'last ivdu was 10 days ago, snorts cocaine occasionally': 'last ivdu was 10 days ago'}