In [None]:
#this code finds anesthesia types from the clinical narratives
from __future__ import division
from __future__ import unicode_literals
from __future__ import print_function


import nltk, re, pprint
from nltk import word_tokenize, pos_tag, sent_tokenize, ToktokTokenizer
from nltk.corpus import wordnet as wn
import io

import urllib
from urllib import urlopen

import numpy as np#for math processing ,need
from numpy import arange #to handle ranges 
import networkx as nx # brings up node configs

import unicodedata
import random
from collections import defaultdict
import csv
import pandas
import string
import xlrd


After importing all relevant modules, the next few cells import the dataset from the database and drop any duplicate notes in the report (i.e., rows that share a Note ID). To analyze as much data as possible, patients with multiple reports are now included for classification at this stage of the analysis (a change from how the first 130 sample post-op notes were classified with the unmodified classifier).


In [None]:
# Sanity check on the loaded frame: print its (rows, columns) shape and the
# dtype of the NOTE column. NOTE(review): df must already exist at this point;
# the cell that loads it is not part of this file as shown.
print(df.shape)
print(df["NOTE"].dtype)

In [None]:
# Keep a single row per note ID, then renumber the rows from 0.
# drop=True discards the old index outright instead of first turning it into
# an "index" column that then has to be deleted; it also avoids the KeyError
# the delete would raise if the old index carried a name of its own.
df = df.drop_duplicates(subset='NOTE_DEID').reset_index(drop=True)


In [None]:
# Display the dtype of every column (shown as the notebook cell output).
df.dtypes # see what type NOTE and NOTE_CODE are in currently

Here, we add columns in which matches for each anesthesia type can be recorded as a way to tabulate results, and attempt to convert the note text to the string type.

In [None]:
# Placeholder rename: replace the first "NOTE" with the name of the source
# column that holds the narrative text (add a similar entry for NOTE_CODE if
# that column is used as well). index=str converts the row labels to strings.
df = df.rename(index=str, columns={"NOTE": "NOTE"})

# Force the narrative column to text so the string processing can run on it.
df['NOTE'] = df['NOTE'].astype('str')

# One flag column per anesthesia category, initialised to integer 0 so that
# the 1s written during classification share the same type. (A blanket
# fillna('0') would store the *string* '0' here and would also overwrite
# missing values in every unrelated column.)
df["General"] = 0
df["Regional"] = 0
df["Local"] = 0
df.head(2)

In [None]:
# Display the column dtypes again to check the NOTE column and the new
# General/Regional/Local columns.
df.dtypes 

Before any type of string processing, we will add the lexicon of anesthesia types and relevant terms.

In [None]:
# Lexicon of trigger terms, one list per anesthesia category. Every term is
# lowercase because the notes are lowercased before they are searched.
General = [
    "general",
]

Regional = [
    "regional",
    "epidural",
    "nerve",
    "block",
    "nerve block",
    "nerveblock",
    "infraclavicular",
    "axillary",
    "sciatic",
    "femoral",
    "popliteal",
    "spinal",
]

Local = [
    "local",
    "infiltration",
    "mac",  # presumably "monitored anesthesia care" - TODO confirm
]

# Unused regex draft of the regional terms, kept for reference. Its leading
# "|" is an empty alternative, so as written the pattern would match anything.
#regexp = re.compile(r'|nerve|epidural|infraclavicular|axillary|sciatic nerve|femoral|popliteal|( )?\bblock')

To organize the classifier, I have moved most of the logic out of the for loop and into separately defined functions, as shown below:

exclude removes all punctuation from a string, including periods; it is run AFTER sentences are tokenized to ensure that the sentence boundaries are not lost.

get_context pulls all information we think is relevant to anesthesia based on the format of the note. We are looking at a number of notes that have different formats and will thus have different locations for where the anesthesia information is found, according to some main rules:
1) if the note has headers, including one for anesthesia, the classifier will pull the information between the anesthesia header and the beginning of the next header.
2) if the note has the "anesthesia:" header but no other header, one of two options will happen
    A) if there are sentences in the note, the classifier will extract the first sentence that includes the "anesthesia:" header.
    B) if there are no sentences in the note, the classifier will extract the remainder of the report after the "anesthesia:" header.
3) If there is no "anesthesia" header, but anesthesia is mentioned in the note, the classifier will extract all sentences that have anesthesia in them.
4) if "anesthesia" is not mentioned at all in the note, then the classifier will extract the entire note to be processed, which is typically used when the note is an "Anesthesia Procedure Note".


clean combines the above two functions into one and adds some preprocessing steps: lowercasing, and removing digits and punctuation (to ensure sent_tokenize works correctly in the get_context function, the digits are removed before it is called and the punctuation is removed after).

classify then searches the trimmed text for each term in the general/local/regional lexicons, and if a term is found anywhere in the text, it records a 1 in the relevant column set up earlier.


In [None]:
def exclude(word):
    """Return `word` with every character found in string.punctuation removed.

    All other characters (letters, digits, whitespace) are kept in their
    original order. Requires the module-level ``import string``.
    """
    # A set gives constant-time membership tests for each character.
    punctuation_marks = frozenset(string.punctuation)
    kept_chars = [char for char in word if char not in punctuation_marks]
    return ''.join(kept_chars)


def get_context(processed_note):
    """Return the part of a note most likely to describe the anesthesia used.

    The markers are matched as lowercase literals, so the note should already
    be lowercased when it is passed in.

    Rules, checked in order:

    1. The note contains the header "anesthesia:". Only the text between the
       first and the second occurrence of the header (or the end of the note
       when there is only one) is considered:
         a. if that text contains another ":", return everything before it;
         b. otherwise, if it contains a ".", return its first sentence as
            split by sent_tokenize;
         c. otherwise return that text unchanged.
    2. There is no header but the word "anesthesia" appears: return every
       sentence that contains it, joined with single spaces.
    3. "anesthesia" does not appear at all: return the whole note.

    Returns a single string.
    """
    header = 'anesthesia:'

    if header in processed_note:
        # Drops everything before the header; see rule 1 for what is kept.
        after = processed_note.split(header)[1]
        if ":" in after:
            # NOTE(review): the slice ends at the next colon, so the label of
            # the following header (the words just before that colon) is
            # included in the returned text.
            return after[:after.index(":")]
        if "." in after:
            return sent_tokenize(after)[0]
        return after

    if "anesthesia" in processed_note:
        return " ".join(
            sentence
            for sentence in sent_tokenize(processed_note)
            if 'anesthesia' in sentence
        )

    return processed_note


def clean(text):
    """Normalise a raw note and reduce it to its anesthesia-related text.

    Steps, in order: lowercase the note, delete the digits 0-9, keep only the
    context selected by get_context, then strip punctuation with exclude.
    Digits are removed before get_context runs and punctuation after it,
    because get_context relies on punctuation to find headers and sentences.

    Returns the cleaned string.
    """
    digits = frozenset('0123456789')
    lowered = text.lower()
    without_digits = ''.join(char for char in lowered if char not in digits)
    relevant = get_context(without_digits)
    return exclude(relevant)


def classify(key): #for when input is the full string, could be slow
    """Flag the current row of df for each anesthesia category found in `key`.

    `key` is the cleaned note text. For every category whose lexicon has at
    least one term contained in `key`, the matching column of df is set to 1.

    This function returns nothing and works through module-level names: the
    lexicons Regional/General/Local, the frame `df`, and `index`, which must
    hold the label of the row being processed (the driver loop's variable).

    NOTE(review): `term in key` is a substring test when key is a string, so
    a short term such as "mac" also matches inside longer words.
    """
    categories = (
        ("Regional", Regional),
        ("General", General),
        ("Local", Local),
    )
    for column, terms in categories:
        if any(term in key for term in terms):
            df.loc[index, column] = 1


This is the actual classification of the anesthesia.
From the dataframe set up earlier, it will take the Note from each row, run it through the preprocessing function(s) defined earlier to extract the relevant anesthesia information, and then classify and record the results in the general/regional/local columns according to what is found in the extracted text, as a way to tabulate the results.



In [None]:
# Run the classifier over every note. The loop variable has to be called
# `index`: classify() reads it as a global to know which row of df to update.
for index, row in df.iterrows():
    note = row[str("NOTE")]  # narrative text of this row
    note = clean(note)  # lowercase, drop digits, keep anesthesia context, drop punctuation
    classify(note)  # sets the General/Regional/Local flags of this row in df

# Replace any value that is still missing anywhere in the frame with 0.
df = df.fillna(0)

In [None]:
# Quick check that the values have been recorded correctly: display the first
# two rows of the frame, including the General/Regional/Local columns.
df.head(2)

This cell writes selected columns of the dataframe to an Excel output file.
format1 ensures that NOTE_DEID is preserved in its integer form instead of being written in exponential/scientific notation.

In [None]:
import xlsxwriter  # engine requested from pandas.ExcelWriter below

# Keep only the identifier columns and the three classification flags.
df = df[["PAT_DEID","NOTE_DEID", "NOTE_CODE", "General","Regional","Local"]]

# The with-block saves and closes the workbook on exit, including when an
# error is raised part-way through, and does not depend on writer.save().
# NOTE(review): the xlsxwriter engine produces xlsx content; the ".xls" name
# is kept unchanged here so existing consumers of the file still find it -
# consider renaming to ".xlsx".
with pandas.ExcelWriter('anes_results_full.xls', engine="xlsxwriter") as writer:
    df.to_excel(writer, sheet_name='Sheet1')
    workbook = writer.book
    worksheet = writer.sheets["Sheet1"]
    # '0' is a whole-number format with no decimals or exponent.
    format1 = workbook.add_format({'num_format': '0'})
    # Sheet column A is the frame index and B is PAT_DEID, so C is NOTE_DEID.
    worksheet.set_column('C:C', None, format1)