In [1]:
import logging

# gensim docs recommend setting this up, but the following doesn't work in Jupyter
# NOTE(review): basicConfig is a function of the logging module, not a method of
# Logger objects, so the logger.basicConfig(...) call below fails wherever it is run;
# the recommended form is presumably logging.basicConfig(...) - confirm against the gensim docs.
##logger = logging.getLogger(__name__)
##logger.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# found that this works - though output is to console, not Jupyter output
# getLogger() with no name returns the root logger; its threshold is lowered to INFO
# so INFO-level progress messages are not filtered out by level.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [2]:
from gensim import corpora, models, similarities
import pprint
# Pretty-printer with a 4-space indent, kept around for inspecting nested structures.
pp = pprint.PrettyPrinter(indent=4)

In [3]:
import os
import sqlite3
import sys

# Load the text of every radiology note from the local MIMIC-III SQLite database
# into `documents` (a list with one string per note).
#
# NOTE(review): earlier runs presumably used 'ctpa.sqlite' in the 'MastersProject'
# folder (table `reports`, report text in column index 4) - confirm before reviving that path.
sqlitedb = os.path.join(os.path.expanduser('~'),'Box Sync', 'GradSchoolStuff', 'MastersProject', 'mimic3', 'mimic3.sqlite')

# Check up front: sqlite3.connect() would silently create an empty database file
# at a missing path, and the query below would then fail with a confusing error.
if not os.path.exists(sqlitedb):
    print("Specified database does not exist")
    # `sys` must be imported for this to stop the cell; exit non-zero to signal failure.
    sys.exit(1)

connection = sqlite3.connect(sqlitedb)
try:
    cur = connection.cursor()
    cur.execute("select text from noteevents where category = 'Radiology'")
    # Iterate the cursor directly rather than materialising an intermediate row list;
    # each row is a 1-tuple holding the note text.
    documents = [row[0] for row in cur]
finally:
    # `with connection:` only commits/rolls back a transaction - it never closes the
    # connection - so release the database handle explicitly once the read is done.
    connection.close()

print('Read', len(documents), 'documents.')

Read 522279 documents.


In [4]:
import nltk.data
from nltk.tokenize import word_tokenize
import re

# Turn every report into lower-cased, word-tokenized sentences for word2vec training.
counter = 0
training_sentences = []

# Punkt sentence splitter pre-trained on English text. A custom tokenizer that
# understands radiology report sections would split better, but this is judged
# good enough for now.
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Runs of three or more underscores are rewritten to a period so that they
# trigger a sentence boundary in the NLTK splitter.
underscore_run = re.compile('___+')

# TODO: parallelize this - documents are currently processed one at a time.
for counter, document in enumerate(documents, 1):
    cleaned = underscore_run.sub('.', document)

    # word_tokenize handles punctuation better than a plain split, at the cost of
    # speed and of breaking contractions apart ("we'll" -> "we" + "'ll").
    # Much faster alternative: [s.lower().split() for s in ...]
    tokenized = [
        word_tokenize(sentence.lower())
        for sentence in sent_tokenizer.tokenize(cleaned)
    ]

    # Progress message every 10,000 documents.
    if counter % 10000 == 0:
        logger.info('Processed ' + str(counter) + ' documents.')

    training_sentences.extend(tokenized)

#pp.pprint(training_sentences)
print('Total documents:', counter, '(should agree with previous number of documents.)')
print('Total sentences:', len(training_sentences))

Total documents: 522279 (should agree with previous number of documents.)
Total sentences: 8721909


In [5]:
from gensim.models import word2vec, Phrases
# Train a phrase-aware word2vec model on the tokenized sentences, or reload the
# copy saved by a previous run when `retrain` is False.
retrain = True

# The trained model is written to / read from this single location.
model_path = os.path.join(os.path.expanduser('~'),'Box Sync', 'GradSchoolStuff', 'MastersProject', 'mimic3', 'word2vec_full_radiology.model')

if not retrain:
    model = word2vec.Word2Vec.load(model_path)
else:
    # Parameter starting points taken from
    # https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors
    num_features = 300    # word vector dimensionality
    min_word_count = 10   # minimum word count (gensim default is 5)
    num_workers = 4       # number of threads to run in parallel
    context = 20          # context window size - deliberately large because some
                          #    report sections are shorthand notation, not prose
    # `sample` (downsampling of frequent words) is left at its default of 1e-3.

    # gensim's Phrases detects multiword expressions so that "words" in the model
    # can be phrases such as new_york_times or financial_crisis. It is applied
    # twice: once on the raw sentences, then again on the bigram output so that
    # longer expressions can form.
    bigram_transformer = Phrases(training_sentences)
    bigram_sentences = bigram_transformer[training_sentences]
    trigram_transformer = Phrases(bigram_sentences)

    model = word2vec.Word2Vec(
        trigram_transformer[bigram_sentences],
        workers=num_workers,
        size=num_features,
        min_count=min_word_count,
        window=context,
    )
    model.save(model_path)

print('Model ready for use.')

Model ready for use.


In [7]:
# Sanity check: ask for the 5 vocabulary entries closest to the detected phrase
# "pulmonary_embolism"; the result is a list of (token, similarity score) pairs.
# NOTE(review): newer gensim releases expose this query on model.wv - confirm
# against the installed version before upgrading.
model.most_similar("pulmonary_embolism", topn=5)
#model.most_similar("hypertension", topn=5)

[('pulmonary_embolus', 0.8787556886672974),
 ('pulmonary_emboli', 0.7133843898773193),
 ('subsegmental', 0.6346239447593689),
 ('subsegmental_branches', 0.600606381893158),
 ('pe', 0.5958702564239502)]

In [8]:
# Same query scored with gensim's multiplicative-combination variant; in the
# recorded output the ranking matches most_similar and only the scores differ.
model.most_similar_cosmul("pulmonary_embolism", topn=5)

[('pulmonary_embolus', 0.939376950263977),
 ('pulmonary_emboli', 0.8566913604736328),
 ('subsegmental', 0.8173112273216248),
 ('subsegmental_branches', 0.8003024458885193),
 ('pe', 0.797934353351593)]

In [9]:
# Odd-one-out check: which of the four words fits least well with the others?
model.doesnt_match("embolism fracture pulmonary lung".split())

'fracture'

In [11]:
# Similarity score between two words expected to be unrelated (a bone vs. the lungs);
# a value near zero is the expected outcome.
model.similarity('metatarsal', 'pulmonary')

-0.018186284898593707

In [None]:
# Look up the learned vector for 'pulmonary' (this cell has no recorded execution).
# NOTE(review): presumably a 300-dimensional array given num_features = 300 - confirm;
# newer gensim releases expect model.wv['pulmonary'] instead.
model['pulmonary']