In [13]:
import psycopg2
import pandas as pd
import sys
import spacy
import re
import stanfordnlp
import time
from spacy.pipeline.dep_parser import Language
import scispacy
from tqdm import tqdm
from heuristic_tokenize import sent_tokenize_rules 

In [29]:
# update these constants to run this script
OUTPUT_DIR = '~/Downloads/Repos_0/NLP/data/MIMIC/' #this path will contain tokenized notes. This dir will be the input dir for create_pretrain_data.sh
MIMIC_NOTES_FILE = '~/Downloads/Repos_0/NLP/data/MIMIC/NOTEEVENTS.csv.gz' #this is the path to mimic data if you're reading from a csv. Else uncomment the code to read from database below

In [132]:
start = time.time()
tqdm.pandas()

print('Begin reading notes')


# Uncomment this to use postgres to query mimic instead of reading from a file
# con = psycopg2.connect(dbname='mimic', host="/var/run/postgresql")
# notes_query = "(select * from mimiciii.noteevents);"
# notes = pd.read_sql_query(notes_query, con)
notes = pd.read_csv(MIMIC_NOTES_FILE, index_col = 0, nrows=1000)
print(set(notes['CATEGORY'])) # all categories

# notes = notes[notes['CATEGORY'] == category]
print('Number of notes: %d' %len(notes.index))
notes['ind'] = list(range(len(notes.index)))
notes.columns = ['subject_id', 'hadm_id', 'chartdate', 'charttime', 'storetime',
       'category', 'description', 'cgid', 'iserror', 'text', 'ind']
notes

Begin reading notes
{'Discharge summary'}
Number of notes: 1000


Unnamed: 0_level_0,subject_id,hadm_id,chartdate,charttime,storetime,category,description,cgid,iserror,text,ind
ROW_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
174,22532,167853,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...,0
175,13702,107527,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...,1
176,13702,167118,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...,2
177,13702,196489,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...,3
178,26880,135453,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...,4
...,...,...,...,...,...,...,...,...,...,...,...
1158,15198,137838,2184-10-21,,,Discharge summary,Report,,,Admission Date: [**2184-10-14**] Discha...,995
1159,2712,198266,2101-02-15,,,Discharge summary,Report,,,Admission Date: [**2101-2-10**] ...,996
1160,2712,157537,2103-07-18,,,Discharge summary,Report,,,Admission Date: [**2103-7-17**] ...,997
1161,18511,123903,2191-03-03,,,Discharge summary,Report,,,Admission Date: [**2191-2-23**] ...,998


In [133]:
notes.text.iloc[0]

'Admission Date:  [**2151-7-16**]       Discharge Date:  [**2151-8-4**]\n\n\nService:\nADDENDUM:\n\nRADIOLOGIC STUDIES:  Radiologic studies also included a chest\nCT, which confirmed cavitary lesions in the left lung apex\nconsistent with infectious process/tuberculosis.  This also\nmoderate-sized left pleural effusion.\n\nHEAD CT:  Head CT showed no intracranial hemorrhage or mass\neffect, but old infarction consistent with past medical\nhistory.\n\nABDOMINAL CT:  Abdominal CT showed lesions of\nT10 and sacrum most likely secondary to osteoporosis. These can\nbe followed by repeat imaging as an outpatient.\n\n\n\n                            [**First Name8 (NamePattern2) **] [**First Name4 (NamePattern1) 1775**] [**Last Name (NamePattern1) **], M.D.  [**MD Number(1) 1776**]\n\nDictated By:[**Hospital 1807**]\nMEDQUIST36\n\nD:  [**2151-8-5**]  12:11\nT:  [**2151-8-5**]  12:21\nJOB#:  [**Job Number 1808**]\n'

In [134]:
#setting sentence boundaries
@Language.component('sbd_component')
def sbd_component(doc):
    # print('priting docccccccccccccc: ', doc) 
    for i, token in enumerate(doc[:-2]):
        # print(i, token)
        # define sentence start if period + titlecase token
        if token.text == '.' and doc[i+1].is_title:
            doc[i+1].sent_start = True
        if token.text == '-' and doc[i+1].text != '-':
            doc[i+1].sent_start = True
    # print(doc)
    return doc

#convert de-identification text into one token
def fix_deid_tokens(text, processed_text):
    deid_regex  = r"\[\*\*.{0,15}.*?\*\*\]" 
    if text:
        indexes = [m.span() for m in re.finditer(deid_regex,text,flags=re.IGNORECASE)]
    else:
        indexes = []
    for start,end in indexes:
        print('processed text: ', processed_text)
        # processed_text.merge(start_idx=start,end_idx=end)
    return processed_text
    

def process_section(section, note, processed_sections):
    # perform spacy processing on section
    processed_section = nlp(section['sections'])
    # processed_section = fix_deid_tokens(section['sections'], processed_section)
    processed_sections.append(processed_section)

def process_note_helper(note):
    # print(f'note in note_helperrrrrrrrrrr: {note}')
    # split note into sections
    note_sections = sent_tokenize_rules(note)
    processed_sections = []
    section_frame = pd.DataFrame({'sections':note_sections})
    # print(f'secton_frameeeeeeeee: {section_frame}')
    section_frame.apply(process_section, args=(note,processed_sections,), axis=1)
    return(processed_sections)

def process_text(sent, note):
    sent_text = sent['sents'].text
    if len(sent_text) > 0 and sent_text.strip() != '\n':
        if '\n' in sent_text:
            sent_text = sent_text.replace('\n', ' ')
        note['text'] += sent_text + '\n'  

def get_sentences(processed_section, note):
    # get sentences from spacy processing
    sent_frame = pd.DataFrame({'sents': list(processed_section['sections'].sents)})
    sent_frame.apply(process_text, args=(note,), axis=1)

def process_note(note):
    try:
        # print(f'Printing first note............................................................... {note.shape}')
        note_text = note['text'] #unicode(note['text'])
        note['text'] = ''
        # print(f'NOte text--------- {note_text}')
        processed_sections = process_note_helper(note_text)
        # print(f'processed sections~~~~~~~~~~~~~~~~~~: {processed_sections}')
        ps = {'sections': processed_sections}
        ps = pd.DataFrame(ps)
        ps.apply(get_sentences, args=(note,), axis=1)
        # print(f'last {note}')
        return note 
    except Exception as e:
        
        # print ('error', e)
        pass

In [135]:
# NOTE: `disable=['tagger', 'ner'] was added after paper submission to make this process go faster
# our time estimate in the paper did not include the code to skip spacy's NER & tagger
nlp = spacy.load('en_core_sci_md', disable=['tagger','ner'])
nlp.add_pipe('sbd_component', before='parser')  


<function __main__.sbd_component(doc)>

In [136]:
formatted_notes = notes.progress_apply(process_note, axis=1)
formatted_notes

100%|██████████| 1000/1000 [11:19<00:00,  1.47it/s]


Unnamed: 0_level_0,subject_id,hadm_id,chartdate,charttime,storetime,category,description,cgid,iserror,text,ind
ROW_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
174,22532.0,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-\n7\n-\n16**]\nDischar...,0.0
175,13702.0,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-\n6\n-\n2**]\nDischarg...,1.0
176,13702.0,167118.0,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-\n5-\n4**]\nDischarge ...,2.0
177,13702.0,196489.0,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-\n7\n-\n21**]\nDischar...,3.0
178,26880.0,135453.0,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-\n3-\n3**]\nDischarge ...,4.0
...,...,...,...,...,...,...,...,...,...,...,...
1158,15198.0,137838.0,2184-10-21,,,Discharge summary,Report,,,Admission Date: [**2184-\n10-\n14**]\nDischarg...,995.0
1159,2712.0,198266.0,2101-02-15,,,Discharge summary,Report,,,Admission Date: [**2101-\n2-\n10**]\nDischarge...,996.0
1160,2712.0,157537.0,2103-07-18,,,Discharge summary,Report,,,Admission Date: [**2103-\n7\n-\n17**]\nDischar...,997.0
1161,18511.0,123903.0,2191-03-03,,,Discharge summary,Report,,,Admission Date: [**2191-\n2-\n23**]\nDischarge...,998.0


In [139]:
category = 'Discharge_summary'
with open('processed.txt','w') as f:
    for text in formatted_notes['text']:
        if text != None and len(text) != 0 :
            f.write(text)
            f.write('\n')

end = time.time()
print (end-start)
print ("Done formatting notes")

964.7206695079803
Done formatting notes


In [140]:
notes.text.iloc[0]

'Admission Date:  [**2151-7-16**]       Discharge Date:  [**2151-8-4**]\n\n\nService:\nADDENDUM:\n\nRADIOLOGIC STUDIES:  Radiologic studies also included a chest\nCT, which confirmed cavitary lesions in the left lung apex\nconsistent with infectious process/tuberculosis.  This also\nmoderate-sized left pleural effusion.\n\nHEAD CT:  Head CT showed no intracranial hemorrhage or mass\neffect, but old infarction consistent with past medical\nhistory.\n\nABDOMINAL CT:  Abdominal CT showed lesions of\nT10 and sacrum most likely secondary to osteoporosis. These can\nbe followed by repeat imaging as an outpatient.\n\n\n\n                            [**First Name8 (NamePattern2) **] [**First Name4 (NamePattern1) 1775**] [**Last Name (NamePattern1) **], M.D.  [**MD Number(1) 1776**]\n\nDictated By:[**Hospital 1807**]\nMEDQUIST36\n\nD:  [**2151-8-5**]  12:11\nT:  [**2151-8-5**]  12:21\nJOB#:  [**Job Number 1808**]\n'

In [65]:
notes

Unnamed: 0_level_0,subject_id,hadm_id,chartdate,charttime,storetime,category,description,cgid,iserror,text,ind
ROW_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
174,22532,167853,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...,0
175,13702,107527,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...,1
176,13702,167118,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...,2
177,13702,196489,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...,3
178,26880,135453,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...,4
...,...,...,...,...,...,...,...,...,...,...,...
1158,15198,137838,2184-10-21,,,Discharge summary,Report,,,Admission Date: [**2184-10-14**] Discha...,995
1159,2712,198266,2101-02-15,,,Discharge summary,Report,,,Admission Date: [**2101-2-10**] ...,996
1160,2712,157537,2103-07-18,,,Discharge summary,Report,,,Admission Date: [**2103-7-17**] ...,997
1161,18511,123903,2191-03-03,,,Discharge summary,Report,,,Admission Date: [**2191-2-23**] ...,998
