In [8]:
#import relevant libraries
import sqlite3
import re
import csv

# ACR Core Exam Guide Parsing
I need to compile a list of diagnoses.

I will scrape these from the ACR Core Exam study guide. The guide is organized as follows:
* Breast Imaging (p 5)
* Cardiac Imaging (p 7)
* Gastrointestinal Imaging (p 12)
* Interventional Radiology (p 18)
* Musculoskeletal Imaging  (p 20)
* Neuroradiology  (p 30)
* Nuclear Radiology  (p 45)
* Pediatric Radiology  (p 57)
* Physics (p 77)
* Radioisotope Safety Examination (RISE) (p 94)
* Reproductive/Endocrine Imaging and Therapy  (p 98)
* Noninterpretive Skills (p 100)
* Thoracic Imaging (p 102)
* Ultrasound  (p 110)
* Urinary Imaging (p 132)
* Vascular Imaging (p 135)

In [25]:
# The ACR guide PDF was converted to plain text beforehand (using pdfminer);
# the text file named below is what gets parsed.

# --- file locations ---
data_path = "./input/"
out_path = "./output/"
acr_file_name = "CORE_Exam_Study_Guide_2018.txt"

huge_list = list()

# Section numbering used for the guide:
#      0 - Breast Imaging
#      1 - Cardiac Imaging
#      2 - Gastrointestinal Imaging
#      3 - Interventional Radiology
#      4 - Musculoskeletal Imaging
#      5 - Neuroradiology
#      6 - Nuclear Radiology
#      7 - Pediatric Radiology
#      8 - Physics
#      9 - Radioisotope Safety Examination (RISE)
#     10 - Reproductive/Endocrine Imaging and Therapy
#     11 - Noninterpretive Skills
#     12 - Thoracic Imaging
#     13 - Ultrasound
#     14 - Urinary Imaging
#     15 - Vascular Imaging

# --- parser state: pending line, guide section, and sub-header tracking ---
temp_line, current_h1 = "", ""
just_broken = False
section = is_h1 = is_sub1_h = 0
# list depth: 0 = section, 1 = sub, 2 = sub1, 3 = sub2, 4 = sub3, 5 = sub4
current_level = 0

# "Stop" patterns: a line matching any of these alternatives is filtered out.
pattern = "|".join((
    'C o r e   E x a m i n a t i o n   S t u d y   G u i d e',
    'C o r e  E x a m i n a t i o n  S t u d y  G u i d e',
    'Page [0-9]{1,3}',
    '-{3,}',
))


def screen(t, stop_pattern=None):
    """Return True when the line ``t`` should be skipped.

    A line is skipped when it is empty, is a bare newline, or matches one of
    the "stop" patterns.

    Parameters
    ----------
    t : str
        The line to test.
    stop_pattern : str, optional
        Regular expression of stop patterns. When omitted, the module-level
        ``pattern`` is used.

    Returns
    -------
    bool
    """
    # Test the argument itself; the previous version read the global `line`
    # and therefore ignored whatever was passed in.
    if not t or t == "\n":
        return True
    if stop_pattern is None:
        stop_pattern = pattern
    return re.search(stop_pattern, t) is not None

def check_h1(t):
    """Return True when ``t`` is exactly one of the guide's section titles, else False."""
    section_titles = (
        "Breast Imaging",
        "Cardiac Imaging",
        "Gastrointestinal Imaging",
        "Interventional Radiology",
        "Musculoskeletal Imaging",
        "Neuroradiology",
        "Nuclear Radiology",
        "Pediatric Radiology",
        "Physics",
        "Radioisotope Safety Examination (RISE)",
        "Reproductive/Endocrine Imaging and Therapy",
        "Noninterpretive Skills",
        "Thoracic Imaging",
        "Ultrasound",
        "Urinary Imaging",
        "Vascular Imaging",
    )
    return t in section_titles

def check_broken_line(t):
    """Return True when ``t`` is a continuation ("broken") line.

    A line is NOT broken when it is a section title (see ``check_h1``) or when
    it starts with a bullet marker: optional "(" characters, one or more
    digits/lowercase letters, one or more ")" characters, then whitespace.
    Every other line is reported as broken.
    """
    starts_new_entry = check_h1(t) or re.match(r'\(*[0-9a-z]+\)+\s+', t)
    return not starts_new_entry

def level_difference(new_level, prior_level=None):
    """Build the closing tags needed to move up from one list level to another.

    Parameters
    ----------
    new_level : int
        The level being moved to.
    prior_level : int, optional
        The level being left. When omitted, the global ``current_level`` is
        used, which keeps existing single-argument calls working.

    Returns
    -------
    str
        One ``'</div>'`` per level ascended; an empty string when
        ``new_level`` is at or below ``prior_level`` (ascending only).
    """
    if prior_level is None:
        prior_level = current_level
    return '</div>' * max(prior_level - new_level, 0)


def adding_html_tags(t, prior_level, new_level):
    """Prefix the line ``t`` with the markup needed to place it at ``new_level``.

    The line is always wrapped in an opening ``<div class="item">`` (the
    caller appends the matching ``</div>`` when the item is written out).

    * Moving deeper opens one container ``<div>`` per level descended, so
      that a later ascent, which closes one container per level, stays balanced.
    * Moving up emits one ``</div>`` per level ascended, computed from
      ``prior_level`` rather than from global state.
    * Staying on the same level adds nothing besides the item wrapper.

    Parameters
    ----------
    t : str
        Text of the list entry.
    prior_level : int
        Level of the previous entry (0 = section).
    new_level : int
        Level of this entry (1-5 when descending).

    Returns
    -------
    str
        ``t`` with the opening/closing tags prepended.
    """
    lookup_level = {1: "sub", 2: "sub1", 3: "sub2", 4: "sub3", 5: "sub4"}

    item = '<div class="item">' + t

    if prior_level < new_level:
        # open every container between the old and the new depth
        containers = "".join(
            '<div class="' + lookup_level[depth] + '">'
            for depth in range(prior_level + 1, new_level + 1)
        )
        return containers + item

    if prior_level > new_level:
        return '</div>' * (prior_level - new_level) + item

    return item

def check_for_level_errors(prior_level, new_level):
    """Placeholder for level-transition validation.

    Currently performs no checks: both arguments are ignored and None is
    returned.
    """
    return None

def categorize(t, level, prior_was_h):
    """Classify the line ``t`` by its bullet marker and tag it accordingly.

    Marker -> level mapping (tested in this order):
        "(a) "        -> 5 (sub4)
        "(1) "        -> 4 (sub3)
        "i)", "iv)"…  -> 3 (sub2), or 2 (sub1) when ``prior_was_h`` is truthy
        "a) "         -> 2 (sub1)
        "1) "         -> 1 (sub)

    The ``prior_was_h`` exception exists because a sub1 entry "i) " looks like
    a roman-numeral sub2 marker; when the previous sub1 entry was "h) " it is
    treated as sub1 instead.

    Parameters
    ----------
    t : str
        The line to classify.
    level : int
        Level of the previous entry.
    prior_was_h : bool
        Whether the previous classified entry was the sub1 entry "h) ".

    Returns
    -------
    tuple
        ``(text, new_level, is_h_level)``: the tagged text (via
        ``adding_html_tags``), the level of this line, and whether this line
        is a sub1 entry starting with "h". Lines with no recognised marker
        are returned unchanged with the incoming ``level`` and False.
    """
    h_flag = False

    if re.match(r'\([a-z]\)\s+', t):
        target = 5
    elif re.match(r'\([0-9]+\)\s+', t):
        target = 4
    elif re.match(r'[ivx]{1,4}\)', t):
        # roman numeral, unless it directly follows a sub1 "h)" entry
        target = 2 if prior_was_h else 3
    elif re.match(r'[a-z]\)\s+', t):
        target = 2
        # remember a sub1 "h)" so a following "i)" can be disambiguated
        h_flag = t[0] == "h"
    elif re.match(r'^[0-9]+\)\s+', t):
        target = 1
    else:
        # no marker: leave both the text and the level untouched
        return t, level, h_flag

    return adding_html_tags(t, level, target), target, h_flag

# Convert the text guide into nested <div> markup and write it to output.html.
#
# Output structure:
#   <div class="section ..."> <h1>..</h1>
#       <div class="sub"> <div class="item">..</div> ... </div>
#   </div>
# Each list entry is buffered in `temp_line` (so wrapped continuation lines can
# be appended to it) and written, with its closing </div>, when the next entry
# or section starts. Text that does not belong to any entry (current_level == 0)
# is not written.
#
# Both files are opened in `with` blocks so they are closed even on error.
with open(out_path + "output.html", "w", encoding="utf8") as html_file:

    # HTML preamble
    print('<html>\n<head>\n\t<link rel="stylesheet" type="text/css" href="index.css">\n</head>\n<body>', file=html_file)

    # True once the first <div class="section"> has been emitted; prevents a
    # stray </div> before the first section.
    section_open = False

    with open(data_path + acr_file_name, "r", encoding="utf8") as f:

        for cnt, line in enumerate(f):

            # the first line of the file is never processed
            if not cnt:
                continue

            line = line.strip()

            # skip lines matching the "stop" keywords (see 'pattern' above)
            if screen(line):
                continue

            if check_h1(line):

                # write out the entry that was still being buffered, then
                # clear the buffer so it cannot be written a second time
                if current_level != 0 and temp_line:
                    print(temp_line + '</div>', file=html_file)
                temp_line = ""

                # css-friendly section name; also used as the element id
                # because an id must not contain whitespace
                s = line.replace(" ", "_").lower()

                # close the open list containers and, if present, the
                # previous section
                closing = level_difference(0)
                if section_open:
                    closing = closing + '</div>'

                # keep the level trace as an HTML comment instead of page text
                print('<!-- transitioning from level {} -->'.format(current_level), file=html_file)
                print(closing + '<div class="section ' + s + '">\n\t<h1 id="' + s + '">' + line + '</h1>', file=html_file)

                section_open = True
                just_broken = False
                current_level = 0

            elif check_broken_line(line):
                # continuation of the previous entry: append to the buffer
                temp_line = temp_line + " " + line
                just_broken = True

            else:
                # a new list entry: flush the previous one first
                if current_level != 0:
                    print(temp_line + '</div>', file=html_file)

                just_broken = False

                line, current_level, is_sub1_h = categorize(line, current_level, is_sub1_h)

                temp_line = line

    # end of input: write the last buffered entry and close everything that
    # is still open before ending the document
    if current_level != 0 and temp_line:
        print(temp_line + '</div>', file=html_file)

    closing = level_difference(0)
    if section_open:
        closing = closing + '</div>'
    if closing:
        print(closing, file=html_file)

    print("</body></html>", file=html_file)

In [None]:
# Open the SQLite connection with connect(), which takes either a database
# file or ':memory:' for an in-memory database.
# File-backed alternative: conn = sqlite3.connect('dictations.db')

# For testing, the database lives in memory rather than in a file.
conn = sqlite3.connect(":memory:")

# Cursor used to execute SQL commands.
c = conn.cursor()



In [None]:
# Create the `studies` table.
# IF NOT EXISTS makes the cell safe to re-run against a connection that already
# holds the table; `description` is declared as text like its sibling columns.
c.execute("""CREATE TABLE IF NOT EXISTS studies (
            accession integer,
            modality text,
            exam text,
            description text,
            cpt text,
            report text
            )""")

# commit the current transaction
conn.commit()

In [None]:
# Load the CSV export and insert one row per study into `studies`.
with open(data_path+'montage_bgg.csv','r',encoding='utf8') as csvfile:

    # reader yields one dict per CSV row, keyed by the header line
    reader = csv.DictReader(csvfile)

    # rows to insert, as tuples in table-column order
    to_db = []

    for row in reader:

        # Extract the impression (from the "IMPRESSION:" marker to the end)
        # from the full dictated report. find() returns -1 when the marker is
        # missing; slicing with -1 would keep only the report's last
        # character, so store an empty string in that case instead.
        report_text = row['Report Text'] or ""
        marker_at = report_text.find("IMPRESSION:")
        imp = report_text[marker_at:] if marker_at != -1 else ""

        to_db.append((
            row['Accession Number'],
            row['Modality'],
            row['Exam Code'],
            row['Exam Description'],
            row['CPT Code'],
            imp,
        ))

sql = "INSERT INTO studies (accession, modality, exam, description, cpt, report) VALUES (?,?,?,?,?,?)"
c.executemany(sql, to_db)
conn.commit()

In [None]:
def clean_impression(text):
    """Split a report's impression into a list of impression points.

    Each returned point is a candidate for classification as a diagnosis.

    Steps:
      1. Keep the text following the first "IMPRESSION:" marker (up to a
         second marker, if any).
      2. Drop everything from "PLAN:", "Dictated by:" or
         "Electronically signed" onwards.
      3. Split into lines, strip each line, and remove a leading enumeration
         such as "1. " from it.
      4. Drop empty lines.

    Parameters
    ----------
    text : str
        Report text containing an "IMPRESSION:" marker.

    Returns
    -------
    list of str
        One entry per non-empty impression line; an empty list when ``text``
        is empty or has no "IMPRESSION:" marker.
    """
    if not text:
        return []

    sections = text.split('IMPRESSION:')
    if len(sections) < 2:
        # no marker: nothing to extract (previously an IndexError)
        return []
    text = sections[1]

    # cut off trailing report sections that are not part of the impression
    for terminator in ('PLAN:', 'Dictated by:', 'Electronically signed'):
        text = text.split(terminator)[0].rstrip()

    points = []
    for raw_line in text.split('\n'):
        # Remove the enumeration per line so the line break between points
        # is kept; substituting "\n1. " with "" fused all points together.
        # The whitespace after the dot is required, so values such as
        # "1.5 cm" at the start of a line are left alone.
        point = re.sub(r'^[0-9]{1,2}\.(?:\s+|$)', '', raw_line.strip())
        if point:
            points.append(point)

    return points

In [None]:
# Extract the impression points for a sample of the loaded rows.
# imp_db receives one list of impression points per row; each list is also
# printed as it is produced.
imp_db = []

# NOTE: 410:540 is a hard-coded sample window into to_db.
for row in to_db[410:540]:
    report = str(row[5])
    # rows whose report has no "IMPRESSION:" marker yield no points
    points = clean_impression(report) if "IMPRESSION:" in report else []
    print(points)
    imp_db.append(points)


In [None]:
# Print the current contents of imp_db.
print(imp_db)