In [125]:
#%matplotlib inline

import sys
import re
import csv
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from nltk.stem.snowball import SnowballStemmer

import nltk
from nltk import word_tokenize
from nltk import FreqDist, Text
from nltk import bigrams, trigrams
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

#from tqdm import tqdm 
#from glob import glob

# Data extraction

In [126]:
# Input / output locations, all relative to the current working directory.
# NOTE(review): `id` shadows the Python builtin of the same name and is not
# referenced by any other cell visible here -- confirm whether it is still needed.
id = 0
input_path = "./input/"
output_path = "./output/"
data_path = input_path + "resident_data/"

In [127]:
# Instantiate variables
# Notebook-level accumulator: extract_impression() appends the text that
# follows the word "addendum" in a report's impression to this list, and the
# export cell later writes one CSV row per entry.
list_addendums = []

Create pandas DataFrames containing the ABR study-guide items and the resident report data.

In [128]:
# Import ABR data taken from the 2018 core exam study guide.
# Only the first two CSV columns are read; they are renamed to section / item.
# (read_csv already returns a new DataFrame, so no empty frame is created first.)
df_abr = pd.read_csv(input_path + "abr_output_items_edited.csv",usecols=[0,1], dtype=object)
df_abr.columns = ['section','item']

# Import the CSV data containing report text.
# Only CSV columns 4 and 8 are read; they are renamed to modality / report_text.
df_reports = pd.read_csv(input_path + "montage_bgg_anon.csv", usecols=[4,8], dtype=object)
df_reports.columns = ['modality','report_text']

# remove duplicate reports (the first occurrence of each report text is kept)
df_reports = df_reports.drop_duplicates(subset='report_text', keep='first')

In [129]:
# Function for cleaning each report - IMPORTANT !
#     first: convert everything to lowercase

    # tag/replace exceptional reports:
    # * reports lacking "FINDINGS" (i.e. chest section CXR)
    # * consult to reference
    # * association of exams
    # * event report
    # -- will ignore for now


def extract_impression(text):
    """Split the IMPRESSION section of one report into a list of items.

    The report is lowercased first, so every marker below is matched (and
    every returned string is) in lowercase.

    Parameters
    ----------
    text : str
        Full text of one report. A non-string value (for example the NaN
        that pandas produces for an empty CSV cell) yields an empty list.

    Returns
    -------
    list of str
        * Report without a "findings:" marker: a one-element list holding a
          placeholder tag ('a "consult to reference" report',
          'association of exams', 'an event report' or
          'chest section x-ray report').
        * Report with "findings:" but without "impression:": an empty list.
        * Otherwise: the impression items, split on "1." / "1)" style
          numbering when the impression starts with it, else on a period
          followed by one or more newlines. Empty items are dropped and
          commas are replaced by spaces.

    Side effects
    ------------
    When the impression contains "addendum", the text after its first
    occurrence is appended to the module-level ``list_addendums``.
    """
    # Missing report text (NaN / None): nothing to extract.
    if not isinstance(text, str):
        return []

    text = text.lower()

    if 'findings:' not in text:
        # The text is lowercase at this point, so the markers must be too.
        if 'this study was initially nominated' in text:
            return ['a \"consult to reference\" report']
        # Accept the correct spelling as well as the historical misspelling.
        elif 'association of exams' in text or 'assocation of exams' in text:
            return ['association of exams']
        elif 'event report' in text:
            return ['an event report']
        else:
            return ["chest section x-ray report"]

    # Part 1: carve out the impression chunk (text between the first and
    # the second "impression:" marker, or up to the end of the report).
    sections = text.split('impression:')
    if len(sections) < 2:
        # Findings are present but there is no impression section.
        return []
    text = sections[1].strip()

    # Remove everything following "PLAN:" or "Dictated by:" or consult
    # verbiage. Only the first marker found, in this priority order, is used.
    trailer_markers = (
        'plan:',
        'dictated by:',
        'electronically signed',
        'the findings, conclusions and recommendations',
    )
    for marker in trailer_markers:
        if marker in text:
            text = text.split(marker)[0].rstrip()
            break

    # Handle addendums: keep what precedes the first "addendum" and store the
    # remainder in the shared list.
    if 'addendum' in text:
        text, addendum = text.split('addendum', maxsplit=1)
        text = text.rstrip()
        list_addendums.append(addendum.rstrip())

    # remove commas
    text = text.replace(',', ' ')

    # detect if report was dictated using enumeration
    if ('1.' in text[0:3]) or ('1) ' in text[0:3]):
        # split on "<number>." / "<number>)" followed by whitespace
        items = re.split(r'\s*\n*[0-9]{1,2}[\.\)]\s', text)
    else:
        # split on a period followed by at least one newline
        items = re.split(r'\.\n+', text)

    # drop empty strings (blank lines / the empty piece before "1.")
    return [item for item in items if item]

# function for removing custom stop word list -- currently hard-coded

def remove_stop_words(stringlist, stop_words=None):
    """Remove stop words from every string in a list.

    Each string is split on whitespace; a word is dropped when its lowercase
    form is in the stop-word collection. The remaining words are re-joined
    with single spaces.

    Parameters
    ----------
    stringlist : iterable of str
        Strings to filter.
    stop_words : collection of str, optional
        Lowercase words to remove. When omitted, the built-in hard-coded
        list below is used.

    Returns
    -------
    list of str
        One filtered string per input string, in the same order.
    """
    if stop_words is None:
        # Built once per call (it used to be rebuilt for every string).
        stop_words = {
            "**","::","patient","patient's","successful","placement",
            "findings","mild","moderate","severe","trace","small","large","well",
            "improving","healing","healed","interval","progressive","right","left",
            "french","research","examination","stable","unchanged",
            "redemonstration","redemonstrated","new","aud",
            "i","me","my","myself","we","our","ours","ourselves",
            "you","you're","you've","you'll","you'd","your","yours","yourself","yourselves",
            "he","him","his","himself","she","she's","her","hers","herself",
            "it","it's","its","itself","they","them","their","theirs","themselves",
            "what","which","who","whom","this","that","that'll","these","those",
            "am","is","are","was","were","be","been","being",
            "have","has","had","having","do","does","did","doing",
            "a","an","the","and","but","if","or","because","as","until","while",
            "of","at","by","for","with","about","against","between","into","through",
            "during","before","after","above","below","to","from","up","down",
            "in","out","on","off","over","under","again","further","then","once",
            "here","there","when","where","why","how",
            "all","any","both","each","few","more","most","other","some","such",
            "only","own","same","so","than","too","very",
            "s","t","can","will","just","don","don't","should","should've","now",
            "d","ll","m","o","re","ve","y",
            "ain","aren","aren't","couldn","couldn't","didn","didn't",
            "doesn","doesn't","hadn","hadn't","hasn","hasn't","haven","haven't",
            "isn","isn't","ma","mightn","mightn't","mustn","mustn't",
            "needn","needn't","shan","shan't","shouldn","shouldn't",
            "wasn","wasn't","weren","weren't","won","won't","wouldn","wouldn't",
        }

    return [
        ' '.join(word for word in s.split() if word.lower() not in stop_words)
        for s in stringlist
    ]

def unravel_list(target):
    """Yield every string of ``target`` with surrounding whitespace removed.

    Parameters
    ----------
    target : iterable of str
        Strings to clean.

    Yields
    ------
    str
        Each input string after ``str.strip()``, which also removes leading
        and trailing carriage returns and newlines.
    """
    # Iterate over the argument itself; the previous version looped over a
    # global named `list_imp` and ignored `target`.
    for l in target:
        yield l.strip()

In [130]:
# Clean report texts: every raw report string is replaced by the list that
# extract_impression returns for it.
# NOTE(review): 'report_text' is overwritten in place, so this cell is not
# idempotent -- running it a second time would hand lists (not strings) to
# extract_impression.
df_reports['report_text'] = df_reports['report_text'].apply(extract_impression)

At this point, our resident data gets granular to the impression-item level.

Will take this opportunity to remove additional things:
* stray newlines
* "findings communication" statements
* periods

In [131]:
# Flatten the per-report item lists into a single list of cleaned statements.
list_impression_statements = []

for report_items in df_reports['report_text'].tolist():
    for statement in report_items:
        # Keep only what precedes a "findings communication" remark.
        # str.split returns the whole string when the marker is absent.
        statement = statement.split('discussed')[0].split('communicate')[0]
        # Drop periods, turn newlines into spaces and trim both ends.
        list_impression_statements.append(
            statement.replace('.', '').replace('\n', ' ').strip()
        )

In [132]:
# saving to CSV file

# Make sure the output directory exists before anything is written into it.
os.makedirs(output_path, exist_ok=True)

# report impression block --> list of impression-item strings
df_reports.to_csv(output_path+"initial_impressions.csv",encoding="utf-8")

# addendum items --> individual rows
# (the `with` statement closes the file, so no explicit close() is needed)
with open(output_path+'addendums.csv', 'w',encoding='utf-8',newline='') as f:
    csv.writer(f).writerows([item] for item in list_addendums)

# impression item string --> individual rows
with open(output_path+'split_impressions.csv', 'w',encoding='utf-8',newline='') as f:
    csv.writer(f).writerows([item] for item in list_impression_statements)

# NLTK eval

Will manually create set of "stop" words to help out.

In [144]:
# Stop-word set used by the frequency / collocation cells of this notebook.
# Tokens are lowercased before the membership test, so all entries are lowercase.
# NOTE(review): this largely repeats the set hard-coded inside
# remove_stop_words(), with extra entries such as "(", ")", ":", "bilateral",
# "likely" and "techniques" -- consider keeping a single shared definition.
set_stop_words = {"(",")","**","::",":","patient","patient's","successful","placement", "finding","findings","mild","moderate","severe","trace","gross","small","large","well", "improving","healing", "healed","interval", "progressive","right","left","bilateral","lesion","represent","consistent","may","including","french","research", "examination","stable","unchanged", "redemonstration", "redemonstrated","likely","new","aud","i","me","my","myself","we","our","ours","ourselves","you","you're","you've","you'll","you'd","your","yours","yourself","yourselves","he","him","his","himself","she","she's","her","hers","herself","it","it's","its","itself","they","them","their","theirs","themselves","what","which","who","whom","this","that","that'll","these","those","am","is","are","was","were","be","been","being","have","has","had","having","do","does","did","doing","a","an","the","and","but","if","or","because","as","until","while","of","at","by","for","with","about","against","between","into","through","during","before","after","above","below","to","from","up","down","in","out","on","off","over","under","again","further","then","once","here","there","when","where","why","how","all","any","both","each","few","more","most","other","some","such","only","own","same","so","than","too","very","s","t","can","will","just","don","don't","should","should've","now","d","ll","m","o","re","ve","y","ain","aren","aren't","couldn","couldn't","didn","didn't","doesn","doesn't","hadn","hadn't","hasn","hasn't","haven","haven't","isn","isn't","ma","mightn","mightn't","mustn","mustn't","needn","needn't","shan","shan't","shouldn","shouldn't","wasn","wasn't","weren","weren't","won","won't","wouldn","wouldn't","techniques"}

Let's look at the ABR exam guide data
* much here that isn't useful
* removed sections
    * physics
    * RISE
    * nuc med 

In [134]:
# Strip surrounding whitespace from every ABR item, then collect them in a list.
df_abr['item'] = df_abr['item'].str.strip()
list_abr_items = df_abr['item'].tolist()

# convert list into a single string
# NOTE(review): ' '.join() raises TypeError on non-string entries; if the CSV
# can contain empty item cells they would presumably arrive as NaN -- confirm.
string_abr_items = ' '.join(list_abr_items)

# tokenize words
words = word_tokenize(string_abr_items)

# frequency distribution over the raw ABR tokens (stop words still included)
fdist1 = FreqDist(words)

In [135]:
# Ten most frequent ABR tokens, before stop-word removal
fdist1.most_common(n=10)

[('and', 377),
 ('(', 200),
 (')', 198),
 ('of', 146),
 ('disease', 115),
 ('Normal', 79),
 ('syndrome', 78),
 ('including', 76),
 ('tumors', 67),
 ('Congenital', 64)]

In [136]:
# Take the existing tokens and remove the "stop" words
# Keep only the tokens whose lowercase form is not in the stop-word set.
list_abr_item_no_stop = [
    token for token in words
    if token.lower() not in set_stop_words
]

In [137]:
# Frequency distribution of the ABR tokens that survived stop-word removal
fdist2 = FreqDist(list_abr_item_no_stop)

In [138]:
# Twenty most frequent ABR tokens after stop-word removal
fdist2.most_common(n=20)

[('disease', 115),
 ('Normal', 79),
 ('syndrome', 78),
 ('tumors', 67),
 ('Congenital', 64),
 ('cell', 45),
 ('cyst', 41),
 ('masses', 40),
 ('Benign', 38),
 ('Trauma', 38),
 ('anomalies', 35),
 ('tumor', 33),
 ('sign', 33),
 ('Lymphoma', 32),
 ('Malignant', 29),
 ('diseases', 28),
 ('venous', 27),
 ('Vascular', 27),
 ('cysts', 27),
 ('carcinoma', 26)]

Let's look at the resident data. First some basics...

In [145]:
# convert list-impression-items into a single string
string_reports = ' '.join(list_impression_statements)
# tokenize words
# NOTE: `words` is reused here -- from this cell on it holds resident-report
# tokens instead of the ABR tokens assigned earlier.
words = word_tokenize(string_reports)

# frequency distribution over the raw report tokens (stop words still included)
fdist3 = FreqDist(words)

In [146]:
# Ten most frequent report tokens, before stop-word removal
fdist3.most_common(n=10)

[('of', 15812),
 ('the', 12935),
 ('no', 7369),
 ('and', 6950),
 ('with', 5701),
 ('right', 5608),
 ('in', 5408),
 ('left', 5359),
 ('to', 3854),
 ('a', 3608)]

Ok. Not terribly informative - a lot of meaningless words here.
Will try some cleanup:
* remove stop words
* remove previously-tagged "difficult" impressions

In [147]:
# Remove "stop" words and skip the previously tagged "difficult" impressions
def is_difficult_item(i):
    """Return True when ``i`` contains one of the placeholder tags.

    The tags are the fragments of the placeholder strings that
    extract_impression returns for reports it does not parse.
    """
    placeholder_tags = (
        'chest section x-ray',
        'association of exams',
        'consult to reference',
        'an event report',
    )
    return any(tag in i for tag in placeholder_tags)
        
# One entry per impression statement that is not a placeholder tag; each
# statement is split on whitespace and words whose lowercase form is a stop
# word are dropped before re-joining with single spaces.
list_impression_statements_no_stop = [
    ' '.join(
        token for token in statement.split()
        if token.lower() not in set_stop_words
    )
    for statement in list_impression_statements
    if not is_difficult_item(statement)
]

In [148]:
# convert list-impression-items into a single string
string_reports_no_stop = ' '.join( list_impression_statements_no_stop )
# tokenize words
# NOTE: `words` is overwritten again; the collocation cells below operate on
# these stop-word-filtered report tokens.
words = word_tokenize(string_reports_no_stop)

# frequency distribution over the filtered report tokens
fdist4 = FreqDist(words)

In [149]:
# Twenty most frequent report tokens after stop-word removal
fdist4.most_common(n=20)

[('no', 7369),
 ('fracture', 2682),
 ('evidence', 2337),
 ('acute', 2253),
 ('change', 1735),
 ('normal', 1533),
 ('disease', 1530),
 ('pulmonary', 1392),
 ('lobe', 1017),
 ('bowel', 954),
 ('tube', 945),
 ('lower', 945),
 ('distal', 931),
 ('spine', 918),
 (':', 869),
 ('recommend', 863),
 ('cm', 854),
 ('upper', 811),
 ('chest', 795),
 ('within', 793)]

In [153]:
# Build a bigram collocation finder over the current `words` token list.
finder = BigramCollocationFinder.from_words(words)

# Frequency filter with threshold 10 (per the NLTK docs this discards bigrams
# that occur fewer than 10 times).
finder.apply_freq_filter(10)

# Print the 30 best bigrams ranked by the PMI association measure.
print(finder.nbest(bigram_measures.pmi,30))

[('ca', "n't"), ('james', 'berger'), ('architectural', 'distortion'), ('gamma', 'probe'), ('ejection', 'fraction'), ('the', 'american'), ('referring', 'clinician'), ('subacromial', 'spur'), ('society', 'recommends'), ('qualitative', 'lugano'), ('paracolic', 'gutter'), ('pes', 'planus'), ('standard', 'deviations'), ('dental', 'caries'), ('laryngeal', 'penetration'), ('mucus', 'plugging'), ('likelihood', 'ratio'), ('****', 'acr'), ('submitted', 'hematopathology'), ('appendicular', 'skeleton'), ('extraprostatic', 'spread'), ('tree', 'bud'), ('utilizing', 'digital'), ('tibiofibular', 'syndesmosis'), ('hallux', 'valgus'), ('attending', 'observations'), ('parathyroid', 'adenoma'), ('er', 'physician'), ('necrotizing', 'enterocolitis'), ('kink', 'discontinuity')]


In [155]:
# Build a trigram collocation finder over the current `words` token list.
finder2 = TrigramCollocationFinder.from_words(words)

# Frequency filter with threshold 20 (per the NLTK docs this discards trigrams
# that occur fewer than 20 times).
finder2.apply_freq_filter(20)

# Print the 30 best trigrams ranked by the PMI association measure.
print(finder2.nbest(trigram_measures.pmi,30))

[('nontunneled', 'dual', 'lumen'), ('dual', 'lumen', 'hohn'), ('cardiac', 'blood', 'pool'), ('differential', 'diagnosis', 'includes'), ('data', 'set', 'thus'), ('set', 'thus', 'contain'), ('source', 'data', 'set'), ('native', 'source', 'data'), ('meet', 'ata', 'guidelines'), ('rotator', 'cuff', 'tear'), ('us', '12-24', 'months'), ('replace', 'initial', 'conclusions'), ('leftward', 'midline', 'shift'), ('accuracy', 'second-opinion', 'interpretation'), ('reduced', 'internally', 'fixated'), ('compartment', 'predominant', 'tricompartmental'), ('recommendations', 'made', 'facility'), ('provided', 'teleradiologist', 'dr'), ('12-24', 'months', 'low'), ('performed', 'based', 'upon'), ('suspicion', '>', '=1'), ('correlation', 'point', 'tenderness'), ('swallowing', 'mechanism', 'abnormal'), ('condition', 'time', 'comparison'), ('please', 'see', 'separate'), ('necessary', 'provided', 'images'), ('superior', 'vena', 'cava'), ('speech', 'pathology', 'note'), ('history', 'necessary', 'provided'), ('

In [106]:
# Build one regex pattern per term from the stems of the term's words.
# NOTE(review): `list_articles` and `clean_articles` are not defined in any
# cell visible in this notebook, and the recorded output of this cell is
# "NameError: name 'list_articles' is not defined". Presumably the terms were
# meant to come from the ABR items (e.g. `list_abr_items`) -- TODO confirm and
# define both names before this cell.
stemmer = SnowballStemmer('english')

# for each term, split it into words (could be just one word) and stem each word
# (nested generators: they can be consumed only once, by the comprehension below)
stemmed_terms = ( (stemmer.stem(word) for word in str(s).split() ) for s in list_articles)

# add 'match anything after it' expression to each of the stemmed words
# join result into a pattern string
regex_patterns = [''.join( clean_articles(stem) + '.*' for stem in term) for term in stemmed_terms]

print("regex patterns acquired")


NameError: name 'list_articles' is not defined

In [None]:
# For every sentence, search each pattern in regex_patterns (case-insensitive)
# and collect the matched text of the patterns that hit.
# NOTE(review): `list_impressions` is not defined in any cell visible here
# (presumably `list_impression_statements` was intended -- TODO confirm), and
# `matches` is rebuilt on every iteration without being stored or displayed.
for sentence in list_impressions:
    match_obs = ( re.search(pattern, str(sentence), flags=re.IGNORECASE) for pattern in regex_patterns)
    matches = [m.group(0) for m in match_obs if m]
    #print(matches)

In [None]:
# Print every (term, sentence) pair for which the term's pattern is found in
# the sentence (case-insensitive search).
# NOTE(review): `list_impressions` and `list_articles` are not defined in any
# cell visible here -- TODO confirm the intended source lists.
for sentence in list_impressions:
    # regex_patterns was built with one pattern per term, in the same order,
    # so zip() pairs each term with its own pattern
    for term, pattern in zip(list_articles, regex_patterns):
        if re.search(pattern, str(sentence), flags=re.IGNORECASE):
            # process term (put it in the db)
            print('TERM: {0} FOUND IN: {1}'.format(term, sentence))