In [34]:
import logging
import sys

# Root-logger setup so that progress messages from this notebook (and from
# gensim, which logs through the standard logging module) are visible.
#
# Why the recipe from the gensim docs failed when it was tried here:
#   logger = logging.getLogger(__name__)
#   logger.basicConfig(...)
# basicConfig() is a function of the logging *module*, not a method of Logger
# objects, so the second line raises AttributeError. In addition,
# logging.basicConfig() is a no-op when the root logger already has handlers,
# so it cannot be relied on inside a host that pre-configures logging.
#
# Instead, attach an explicit handler bound to sys.stdout, which inside a
# Jupyter kernel is the stream captured into the cell output. The handler is
# tagged with a name so that re-running this cell does not add a duplicate
# (which would print every record twice).
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
_NOTEBOOK_HANDLER_NAME = 'notebook-stdout'

logger = logging.getLogger()
logger.setLevel(logging.INFO)

if not any(h.get_name() == _NOTEBOOK_HANDLER_NAME for h in logger.handlers):
    _notebook_handler = logging.StreamHandler(sys.stdout)
    _notebook_handler.set_name(_NOTEBOOK_HANDLER_NAME)
    _notebook_handler.setFormatter(logging.Formatter(LOG_FORMAT))
    logger.addHandler(_notebook_handler)

In [35]:
from gensim import corpora, models, similarities
import pprint
# Shared pretty-printer (4-space indent) for eyeballing nested structures while
# debugging, e.g. the tokenized training sentences built later in the notebook.
pp = pprint.PrettyPrinter(indent=4)

In [36]:
import sqlite3
import os
from contextlib import closing

# Load the text of every Radiology note that mentions "pulmonary" into
# `documents` (a list of str, one entry per note).
#
# Alternative source database used earlier in the project:
#sqlitedb = os.path.join(os.path.expanduser('~'),'Box Sync', 'GradSchoolStuff', 'MastersProject', 'ctpa.sqlite')
sqlitedb = os.path.join(os.path.expanduser('~'),'Box Sync', 'GradSchoolStuff', 'MastersProject', 'mimic3', 'mimic3.sqlite')

# Check up front: sqlite3.connect() would otherwise silently create an empty
# database file at this path and the query below would fail with a confusing
# "no such table" error.
if not os.path.exists(sqlitedb):
    # Raise rather than sys.exit(): `sys` was never imported in this notebook
    # (so the old call died with NameError), and an exception carrying the
    # offending path is the clearer way to stop a notebook cell.
    raise FileNotFoundError("Specified database does not exist: " + sqlitedb)

# `with connection:` on a sqlite3 connection only scopes a transaction; it does
# not close the connection. closing() guarantees the file handle is released
# even if the query raises.
with closing(sqlite3.connect(sqlitedb)) as connection:
    cur = connection.cursor()
    cur.execute("select text from noteevents where category = 'Radiology' and text like '%pulmonary%'")
    # Only one column is selected, so each row is a 1-tuple holding the note
    # text. Iterating the cursor avoids materialising the full result set a
    # second time via fetchall().
    documents = [row[0] for row in cur]

print('Read', len(documents), 'documents.')

Read 139834 documents.


In [37]:
import nltk.data
from nltk.tokenize import word_tokenize
import re

# Build the Word2Vec training corpus: every document is split into sentences,
# and every sentence into a list of lower-cased word tokens.
#
# The punkt model is pre-trained on general English. Splitting radiology
# reports perfectly would need a custom tokenizer that understands report
# sections, but this is considered good enough for now.
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Runs of three or more underscores are turned into a period so that NLTK
# treats them as a sentence boundary.
underscore_run = re.compile('___+')

training_sentences = []
counter = 0   # stays 0 when there are no documents at all

# TODO: parallelize - documents are independent, yet they are processed one at a time.
for counter, document in enumerate(documents, start=1):
    report_text = underscore_run.sub('.', document)

    for sentence in sent_tokenizer.tokenize(report_text):
        # word_tokenize copes with punctuation better than str.split(), at the
        # cost of speed and of splitting contractions ("we'll" -> "we", "'ll").
        # Much faster alternative: sentence.lower().split()
        training_sentences.append(word_tokenize(sentence.lower()))

    if counter % 1000 == 0:
        logger.info('Processed ' + str(counter) + ' documents.')

#pp.pprint(training_sentences)
print('Total documents:', counter, '(should agree with previous number of documents.)')
print('Total sentences:', len(training_sentences))

Total documents: 139834 (should agree with previous number of documents.)
Total sentences: 2393790


In [38]:
from gensim.models import word2vec, Phrases
# Set to False to reuse the model saved by an earlier run instead of training again.
retrain = True

# Single location used for both saving and loading the trained model.
model_path = os.path.join(os.path.expanduser('~'),'Box Sync', 'GradSchoolStuff', 'MastersProject', 'mimic3', 'word2vec_mimic3.model')

if not retrain:
    model = word2vec.Word2Vec.load(model_path)
else:
    # Hyper-parameters; starting point taken from
    # https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors
    num_features = 300    # dimensionality of each word vector
    min_word_count = 5    # ignore words seen fewer times than this (5 is also the default)
    num_workers = 4       # training threads
    context = 20          # context window; deliberately wide because some report
                          #    sections are shorthand notation rather than prose
    # `sample` (down-sampling of frequent words) is left at its default of 1e-3.

    # Phrase detection, per the gensim documentation: the phrases module finds
    # multi-word expressions automatically, so the "words" the model learns can
    # be things like new_york_times or financial_crisis. The second pass runs
    # over the output of the first, letting already-joined pairs combine again
    # into longer phrases.
    bigram_transformer = Phrases(training_sentences)
    trigram_transformer = Phrases(bigram_transformer[training_sentences])
    phrased_sentences = trigram_transformer[bigram_transformer[training_sentences]]

    # negative=0 together with hs=1 selects hierarchical softmax instead of
    # negative sampling.
    model = word2vec.Word2Vec(
        phrased_sentences,
        workers=num_workers,
        size=num_features,
        min_count=min_word_count,
        window=context,
        negative=0,
        hs=1,
    )
    model.save(model_path)

print('Model ready for use.')

Model ready for use.


In [42]:
# Sanity check of the trained vectors: list the 5 vocabulary entries the model
# ranks as most similar to "hypertension", each with its similarity score.
# Entries containing "_" are multi-word phrases produced by the Phrases
# transformers applied before training.
# Alternative probe:
#model.most_similar("pulmonary_embolism", topn=5)
model.most_similar("hypertension", topn=5)

[('arterial_hypertension', 0.6557725667953491),
 ('diabetes', 0.5016984343528748),
 ('copd', 0.45498859882354736),
 ('end-_stage_renal', 0.4476727247238159),
 ('artery', 0.44748634099960327)]

In [30]:
# Top-5 neighbours of the phrase token "pulmonary_embolism" using gensim's
# most_similar_cosmul scoring variant.
# NOTE(review): scores from this method are presumably not on the same scale
# as those from most_similar - confirm against the gensim docs before comparing.
model.most_similar_cosmul("pulmonary_embolism", topn=5)

[('pulmonary_embolus', 0.8827810883522034),
 ('recurrence', 0.8298359513282776),
 ('cervical_spine_fracture', 0.8218733668327332),
 ('intraarterial_filling_defects', 0.8035216927528381),
 ('aortic_dissection', 0.7832710146903992)]

In [31]:
# Odd-one-out probe: ask the model which of the four terms fits least with the rest.
model.doesnt_match("embolism fracture pulmonary lung".split())

'fracture'

In [32]:
# Similarity score between the learned vectors for 'heart' and 'pulmonary'.
# NOTE(review): the recorded result is slightly negative, which is surprising
# for two anatomically related terms - worth investigating.
model.similarity('heart', 'pulmonary')

-0.056489989317015918

In [24]:
# Inspect the raw learned vector for 'pulmonary' (num_features = 300 components
# when the model was trained by this notebook).
model['pulmonary']

array([ 0.12226088, -0.09986808,  0.15960208,  0.15728202, -0.08715954,
       -0.07322888,  0.10454237,  0.02746133, -0.214715  ,  0.25200212,
        0.28033361,  0.07777113,  0.14724384,  0.0305615 ,  0.20448543,
        0.23157008,  0.05041288, -0.06596602,  0.07892799,  0.05021708,
        0.16758342,  0.07257286, -0.01204597, -0.01905949, -0.08105014,
       -0.29252869, -0.39353162,  0.00816245,  0.03349591,  0.13277838,
       -0.08022843, -0.00757302,  0.19938199,  0.08181747,  0.06211767,
        0.07793283, -0.25264481,  0.0972663 ,  0.01344704, -0.12573668,
       -0.20251122,  0.00797517, -0.04851846,  0.02214212, -0.21393654,
       -0.00628168,  0.05975638,  0.1591824 ,  0.18776827,  0.0649076 ,
        0.07142299,  0.01134679,  0.1108847 ,  0.18774967, -0.02545563,
       -0.17305388,  0.26548243, -0.24695414,  0.10063036,  0.14536436,
       -0.42197403, -0.04089982,  0.12709691, -0.00258544,  0.10918406,
        0.07942567, -0.06314755,  0.11329138, -0.10015193,  0.13