In [1]:
#############################################
## Fit word2vec embeddings (Gensim) to MIMIC-III clinical note data --- goal is to learn and inspect word embeddings from discharge notes
##
## Author: Chris Meaney 
## Date: December 2023
#############################################

In [2]:
###################
## Package dependencies
###################

## Standard library: environment access and timing processes
import os
from time import time

## Numerics
import numpy as np

## Data wrangling
import pandas as pd

## Gensim word2vec models and utilities
from gensim.test.utils import datapath
from gensim import utils
import gensim.models

## Environment characteristics
import session_info

In [3]:
###################
## Jupyter display options
###################
## Allow up to 500 rows and 500 columns before pandas truncates a rendered frame
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [4]:
###################
## Import Data
###################
## Data directory: set the VECTOR_SSL_DATADIR environment variable to point the
## notebook at another location. When it is unset, the original author's local
## path is used, so existing runs behave exactly as before.
datadir = os.environ.get("VECTOR_SSL_DATADIR",
                         "C:/Users/ChristopherMeaney/Desktop/tmp/Vector_SSL/Data/")

## os.path.join tolerates a directory given with or without a trailing separator
X_pkl_path = os.path.join(datadir, "X.pkl")

## NOTE: unpickling can execute arbitrary code -- only load X.pkl from a trusted source
X = pd.read_pickle(X_pkl_path)

## (rows, columns) of the loaded DataFrame
X.shape

(21177, 21)

In [5]:
## Preview the first five rows of the loaded DataFrame
X.head(n=5)

Unnamed: 0,SUBJECT_ID,HADM_ID,DOB_DATE,ADMIT_DATE,DISCH_DATE,DOD_DATE,DOD_FLAG,GENDER,ADMIT_AGE,NUM_ADMIT,ICD9_CODE,ICD9_CODE_LIST,NUM_ICD9_CODE,DISCH_2_DOD_,DISCH_2_DOD_30D,DISCH_2_DOD_30D_SURVTIME,TEXT,TEXT_LEN,SHORT_TITLE,LONG_TITLE,DOD_FLAG_
0,682,188382,2036-12-26,2118-08-19,2118-09-03,2119-08-03,False,F,81.69863,1,4472,"[4472, 41071, 42830, 5990, 4280, 5849, 40391, ...",9,334.0,False,30,Admission Date: [**2118-8-19**] ...,10041,Rupture of artery,Rupture of artery,True
1,786,117381,2033-05-14,2115-12-29,2116-01-10,2116-08-31,False,M,82.679452,1,4472,"[4472, 4538, 00845, 49121, 42731, 2875, 2851, ...",9,234.0,False,30,Admission Date: [**2115-12-29**] Discharg...,14642,Rupture of artery,Rupture of artery,True
2,16181,190902,2116-07-18,2196-02-14,2196-02-17,2202-11-22,False,M,79.630137,1,4472,"[4472, 496, 4019, 4422, 53081, 3051, V173, V12...",9,2469.0,False,30,Admission Date: [**2196-2-14**] ...,10935,Rupture of artery,Rupture of artery,True
3,60809,131743,2048-08-27,2123-04-08,2123-05-01,NaT,True,F,74.660274,1,4472,"[4472, 5849, 59382, 591, 5934, 2851, E8792, 59...",16,,False,30,Admission Date: [**2123-4-8**] D...,9302,Rupture of artery,Rupture of artery,False
4,695,177128,2093-05-14,2178-08-05,2178-08-13,2178-09-16,False,F,85.282192,2,5771,"[5771, 48241, 42731, 2765, 2762, 4240, 3970, 4...",9,34.0,False,30,Admission Date: [**2178-8-5**] Discharg...,9829,Chronic pancreatitis,Chronic pancreatitis,True


In [6]:
######################
## Estimate word2vec embeddings using Gensim
######################

In [7]:
##
## Class for feeding in clinical notes from corpus
##
class MyCorpus:
    """Re-iterable corpus that yields one token list (list of str) per clinical note.

    Every call to ``__iter__`` starts a fresh pass over the notes, so the same
    object can be traversed more than once (an iterable, not a one-shot iterator).

    Parameters
    ----------
    texts : iterable of str, optional
        Notes to tokenize. It must itself support repeated iteration (e.g. a list
        or a pandas Series) if the corpus is to be traversed more than once.
        When omitted, the notebook-level ``X.TEXT`` column is looked up each time
        iteration starts, which is the original behaviour of ``MyCorpus()``.
    """

    def __init__(self, texts=None):
        self.texts = texts

    def __iter__(self):
        # Resolve the source lazily so MyCorpus() keeps following the global X,
        # exactly as before, while MyCorpus(texts) is independent of notebook state.
        texts = X.TEXT if self.texts is None else self.texts
        for note in texts:
            # Each element is one whole note; gensim's simple_preprocess turns it
            # into a list of tokens, which is yielded as a single "sentence".
            yield utils.simple_preprocess(note)

In [8]:
##
## Train word2vec model
##
## Fit a word2vec model on the token lists yielded by MyCorpus (one list per note)
## and report the elapsed wall-clock time, in seconds, as the cell output.
sentences = MyCorpus()

t0 = time()

## NOTE(review): per the gensim docs, `seed` alone does not give a bit-for-bit reproducible
## fit; that also needs workers=1 and a fixed PYTHONHASHSEED -- confirm if exact
## reproducibility of the outputs below is required.
## NOTE(review): `negative` is left at its default, so negative sampling may be active
## alongside hierarchical softmax -- confirm this is intended.
model = gensim.models.Word2Vec(sentences=sentences,    ## corpus iterable (traversed more than once, so it must be re-iterable)
                              min_count=25,            ## Drop words from dict/vocab if occurrence frequency less than threshold
                              vector_size=300,         ## Size/dimension of vector embedding
                              window=5,                ## Max distance between the current and predicted word (here a "sentence" is a whole note)
                              sg=1,                    ## sg=1 implies word2vec skip-gram model; otherwise (sg=0) word2vec cbow model
                              hs=1,                    ## hs=1 implies use of hierarchical softmax training
                              seed=123456              ## random seed
                              )
t1 = time()

## Elapsed training time in seconds
runtime = t1-t0
runtime

1753.4088599681854

In [9]:
######################
##
## Inspect properties of the learned word embeddings
##
######################

In [10]:
######################
## Word/token similarity tasks
######################

In [11]:
## Five nearest neighbours of 'norvasc' in the embedding space, one (token, score) tuple per line
neighbours = model.wv.most_similar(positive=['norvasc'], topn=5)
for token, score in neighbours:
    print((token, score))

('lipitor', 0.8466351628303528)
('cozaar', 0.8359431028366089)
('diovan', 0.8308151960372925)
('zocor', 0.8248066902160645)
('atenolol', 0.8202351927757263)


In [12]:
## Five nearest neighbours of 'cancer' in the embedding space, one (token, score) tuple per line
neighbours = model.wv.most_similar(positive=['cancer'], topn=5)
for token, score in neighbours:
    print((token, score))

('ca', 0.7153682708740234)
('breast', 0.6985347270965576)
('melanoma', 0.6790291666984558)
('cancers', 0.6641904711723328)
('aunt', 0.6073940992355347)


In [13]:
## Five nearest neighbours of 'oncology' in the embedding space, one (token, score) tuple per line
neighbours = model.wv.most_similar(positive=['oncology'], topn=5)
for token, score in neighbours:
    print((token, score))

('bmt', 0.6313751339912415)
('onc', 0.6132813692092896)
('hematology', 0.61019366979599)
('endocrinology', 0.55660480260849)
('dermatology', 0.5399655103683472)


In [14]:
#######################
## Word analogy tasks
#######################

In [15]:
## Top five tokens for the query with 'hypertension' and 'diabetes' as positive terms
## and 'norvasc' as the negative term, one (token, score) tuple per line.
## NOTE(review): this arrangement corresponds to hypertension - norvasc + diabetes.
## If the intended analogy is "norvasc is to hypertension as ? is to diabetes",
## the query would presumably be positive=['norvasc', 'diabetes'],
## negative=['hypertension'] -- confirm the intent before changing.
analogy_hits = model.wv.most_similar(
    positive=['hypertension', 'diabetes'],
    negative=['norvasc'],
    topn=5,
)
for token, score in analogy_hits:
    print((token, score))

('mellitus', 0.6961442232131958)
('dyslipidemia', 0.5625420808792114)
('dm', 0.5499495267868042)
('hyperlipidemia', 0.5374734401702881)
('noninsulin', 0.532331109046936)


In [16]:
######################
## Notebook Environment
######################

In [17]:
## Record the environment for reproducibility
## (presumably reports Python and imported package versions -- verify against session_info docs)
session_info.show()