#### I used the following PySpark code to generate the two CSV files below and save them to disk

# Select only the free-text `text` column from noteevents2.
select_noteevents2 = spark.sql("""
SELECT text
FROM noteevents2
""")

# Write the result as CSV with a header row. coalesce(1) is applied before the write;
# the pandas cell further down reads a single part-00000-*.csv file from this directory.
select_noteevents2.coalesce(1).write.csv("/Volumes/EXTERNAL1/MIMICIII/select_noteevents2-v3", header=True)

# Select the identifier columns (subject_id, hadm_id) from the same noteevents2 source.
select_noteevents3 = spark.sql("""
SELECT subject_id, hadm_id
FROM noteevents2
""")

# Written to a separate directory (…-v4) from the text export (…-v3), again with
# coalesce(1) and a header row.
select_noteevents3.coalesce(1).write.csv("/Volumes/EXTERNAL1/MIMICIII/select_noteevents2-v4", header=True)

#### Then I opened a new notebook to run pandas and gensim

In [4]:
import pandas as pd
# Load the note text exported by the first Spark query (the ...-v3 directory).
# NOTE(review): sep='delimiter' passes the literal string "delimiter" as the column
# separator. Presumably it was chosen because it never occurs in the notes, so that
# each line stays in a single `text` column instead of being split on the commas
# inside the note text -- TODO confirm this is intended and that multi-line notes
# are not broken into several rows.
df = pd.read_csv('/Volumes/EXTERNAL1/MIMICIII/select_noteevents2-v3/part-00000-ac3edd68-1571-4e16-a2c6-ada909da0eea.csv',sep='delimiter')
df.head(5)



Unnamed: 0,text
0,"""Admission Date: [**2183-9-25**] Discha..."
1,"""Admission Date: [**2184-1-16**] Discha..."
2,"""Admission Date: [**2103-4-11**] ..."
3,"""Admission Date: [**2103-10-7**] Discha..."
4,"""\""Admission Date: [**2131-4-2**] ..."


In [11]:
# Load the subject_id / hadm_id columns exported by the second Spark query
# (the ...-v4 directory), using pandas' default comma separator.
df_ids = pd.read_csv('/Volumes/EXTERNAL1/MIMICIII/select_noteevents2-v4/part-00000-9b6dc461-8bf9-4ae4-b626-2156a12031ab.csv')
df_ids.head(5)

Unnamed: 0,subject_id,hadm_id
0,20007,188442
1,20007,193793
2,59883,118446
3,17043,157985
4,7019,189488


In [16]:
# Cleaning the data
import re
def preprocessor(text):
    """Normalize one note for whitespace tokenization.

    Removes anything that looks like a ``<...>`` markup tag, lowercases the
    text, and replaces every run of non-word characters with a single space.

    Parameters
    ----------
    text : str
        Raw note text.

    Returns
    -------
    str
        Lowercased text made of word characters separated by single spaces.
        A leading/trailing space remains when the input starts/ends with
        non-word characters.
    """
    # Raw string literals are used so that regex escapes such as \W are passed to
    # `re` untouched instead of being parsed as (invalid) Python string escapes.
    without_tags = re.sub(r'<[^>]*>', '', text)
    return re.sub(r'\W+', ' ', without_tags.lower())

# Store the cleaned text in a new 'text2' column; the raw 'text' column is kept as is.
df['text2'] = df['text'].apply(preprocessor)

### Try TF-IDF

In [25]:
# Turn the cleaned notes into a TF-IDF document-term matrix
#from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer settings: unigrams only, English stop words removed, and
# minimum doc frequency = 10 to delete very unusual words that show up in
# fewer than 10 notes (out of the 59k notes available); max_df=0.8 is the
# upper document-frequency cut-off.
tfidf = TfidfVectorizer(
    input='content',
    ngram_range=(1, 1),
    stop_words='english',
    min_df=10,
    max_df=0.8,
)

# fit_transform returns a sparse matrix; it is converted to a regular (dense) array.
# NOTE: with the shape reported below this is ~1.9e9 entries (about 15 GB if the
# values are float64).
dtm = tfidf.fit_transform(list(df['text2'])).toarray()
vocab = np.array(tfidf.get_feature_names())
dtm.shape

# Sensitivity analysis:
# dtm dimensions  # parameters

#(59652, 200879) # no constraints
#(59652, 200570) # remove stopwords
#(59652, 61913) # remove stopwords and #min_df=3
#(59652, 31482) # remove stopwords and #min_df=10
#(59652, 31471) # remove stopwords and #min_df=10 and max_df=0.8
#(59652, 22787) # remove stopwords and #min_df=20 and max_df=0.8

(59652, 31471)

In [27]:
# Top 10 notes most similar to note 0 (the first note in the dataset), ranked by
# cosine similarity of their TF-IDF vectors (to compare results to Doc2vec).

import scipy
from scipy import spatial

# Cosine similarity (1 - cosine distance) between note 0 and every note,
# rounded to 3 decimals.
query_vec = dtm[0]
cosine_results = [
    float("{0:.3f}".format(1 - spatial.distance.cosine(query_vec, row)))
    for row in dtm
]

# Keep 11 entries: note 0 itself ranks first, followed by its 10 nearest neighbours.
list_index = sorted(
    range(len(cosine_results)),
    key=lambda idx: cosine_results[idx],
    reverse=True,
)[:11]
for idx in list_index:
    # A formatted string prints the same "index score" text under Python 2 and
    # Python 3 (the bare `print a, b` statement is a syntax error in Python 3).
    print("%d %s" % (idx, cosine_results[idx]))


0 1.0
59628 0.404
59627 0.326
26810 0.31
59651 0.303
58420 0.299
55365 0.298
47001 0.288
22 0.279
5130 0.274
21519 0.272


### Apply doc2vec

In [28]:
# Split every cleaned note into a list of whitespace-separated tokens
token_review = [note.split() for note in df['text2']]

len(token_review)

59652

In [29]:
import gensim
LabeledSentence = gensim.models.doc2vec.LabeledSentence

def labelizeReviews(reviews, label_type):
    """Wrap each token list in a LabeledSentence with a positional label.

    reviews    -- iterable of token lists, one per document
    label_type -- prefix of the generated labels

    Returns a list, in input order, where the document at position ``i``
    carries the single label ``'<label_type>_<i>'``.
    """
    return [
        LabeledSentence(tokens, ['%s_%s' % (label_type, position)])
        for position, tokens in enumerate(reviews)
    ]


In [30]:
# Label every tokenized note as 'note_<i>', where i is its position in token_review.
sentence=labelizeReviews(token_review, "note")
len(sentence)

59652

In [31]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from gensim import utils
from time import time

# Settings used below: window=5 (5 words left and right), min_count=10, 4 workers
# for a quad-core machine, size = dimensionality of the learned vectors, and
# negative=5 (negative sampling, which makes doc2vec faster to train).
# NOTE(review): the original note described min_count as dropping words that do not
# occur in more than 10 docs; presumably gensim's min_count is a threshold on total
# word frequency rather than on document count -- TODO confirm in the gensim docs.
#model = Doc2Vec(sentence, size=100, window=5, workers=4, min_count=5)


import random

size = 100

# Instantiate the model stored as model_dm (no `dm` argument is passed, so gensim's
# default mode applies); the DBOW variant (dm=0) is left commented out.
model_dm = Doc2Vec(min_count=10, window=5, size=size, sample=1e-3, negative=5, workers=4)
#model_dbow = Doc2Vec(min_count=10, window=5, size=size, sample=1e-3, negative=5, dm=0, workers=4)

# Build the vocabulary over all labelled notes
model_dm.build_vocab(sentence)
#model_dbow.build_vocab(sentence)

# We pass through the data set 5 times, shuffling the training notes before each
# pass to improve accuracy. Idx holds the positions that get shuffled.
# NOTE(review): no random.seed(...) call is visible in this notebook, so the shuffle
# order differs from run to run.
Idx=list(range(len(sentence)))

# Wall-clock timing of the whole training loop
t0 = time()
for epoch in range(5):
     random.shuffle(Idx)
     perm_sentences = [sentence[i] for i in Idx]
     model_dm.train(perm_sentences)
     print(epoch)
    
elapsed=time() - t0
print("Time taken for Doc2vec training: ", elapsed, "seconds.")

0
1
2
3
4
('Time taken for Doc2vec training: ', 940.8183898925781, 'seconds.')


In [1]:
# Save the doc2vec model to be used later (commented out once the file exists).
# The execution counter restarts at In [1] here, so the cells from this point on
# rely on the model file on disk rather than on the training cell.
#model_dm.save('./model_doc2vec')

# Open the saved doc2vec model
import gensim
model_dm=gensim.models.Doc2Vec.load('./model_doc2vec')

In [2]:
# Find the 10 words most similar to 'melanoma'; the result is a list of
# (word, similarity score) pairs.
model_dm.most_similar('melanoma')

[('sarcoma', 0.6225447058677673),
 ('adenocarcinoma', 0.5894445776939392),
 ('tumor', 0.5750119686126709),
 ('carcinoma', 0.5743094682693481),
 ('rcc', 0.5659700632095337),
 ('nsclc', 0.5529523491859436),
 ('hemangioma', 0.5420299172401428),
 ('tumors', 0.5324901342391968),
 ('cancer', 0.5308196544647217),
 ('fibrosarcoma', 0.5262499451637268)]

In [3]:
# Find the 10 notes most similar to 'note_0' (the first note), as (label, score) pairs.
# For comparison with the TF-IDF ranking of note 0: note 59628 ranks second in both lists.
model_dm.docvecs.most_similar('note_0')

[('note_44813', 0.684360682964325),
 ('note_59628', 0.684259831905365),
 ('note_51699', 0.682969331741333),
 ('note_50856', 0.6806939840316772),
 ('note_55515', 0.667751133441925),
 ('note_1130', 0.6670060157775879),
 ('note_19485', 0.6600985527038574),
 ('note_5858', 0.6547180414199829),
 ('note_59266', 0.6545067429542542),
 ('note_58203', 0.654473602771759)]

In [7]:
import numpy as np
# Copy the learned document vectors into train / test feature matrices
# for use with a Machine Learning model
n_break = 50000   # number of docs in the training set
size = 100        # vector size, as defined when running doc2vec
n_final = 59652   # total number of docs
n_test = n_final - n_break

X_train_d2v = np.zeros((n_break, size))
X_test_d2v = np.zeros((n_test, size))

# The first n_break document vectors form the training set ...
for row in range(n_break):
    X_train_d2v[row] = model_dm.docvecs[row]

# ... and the remaining ones form the test set.
for row in range(n_test):
    X_test_d2v[row] = model_dm.docvecs[n_break + row]

print(X_train_d2v.shape ,X_test_d2v.shape )

((50000, 100), (9652, 100))


In [8]:
X_train_d2v

array([[ 0.15607035,  0.22161631, -0.97751373, ..., -1.8549335 ,
        -1.06803298, -0.79157013],
       [ 0.97807539,  0.81817716,  0.79490817, ..., -1.49674761,
        -0.249111  , -0.21598285],
       [-1.35086608,  0.66754889, -0.87537473, ...,  0.65770853,
        -0.23809673, -0.38519153],
       ..., 
       [-0.27501121, -0.56780416,  0.76147199, ...,  0.62645966,
         0.7029075 ,  0.05790911],
       [-0.93016946, -0.95534432, -0.65720767, ...,  0.42433032,
         0.35649091,  0.15860732],
       [-1.3442843 ,  0.20643425, -0.85994786, ...,  0.64831275,
        -0.24515513, -2.02207661]])

In [9]:
# Infer a vector for new text => the input is a list of tokens.
# NOTE(review): the training text went through preprocessor(), which replaces
# non-word characters with spaces, so the '.' token presumably never appears in the
# model's vocabulary -- TODO confirm how infer_vector treats unknown tokens.
model_dm.infer_vector(['the', 'patient', 'is', 'dead', '.'])

array([-0.13242075, -0.15917873, -0.11624487,  0.09957788,  0.09828894,
        0.08276337, -0.08396963,  0.00464335, -0.08425372,  0.16465855,
       -0.19570418, -0.11671926, -0.02857404,  0.05302802,  0.04403155,
        0.03592065, -0.14035761, -0.06172403,  0.10690388, -0.02732521,
        0.27344587,  0.16867182,  0.20207992,  0.22063385, -0.20469961,
        0.1277287 ,  0.06082411, -0.02681976, -0.40795892,  0.13775024,
        0.00182548, -0.09156283, -0.07843876,  0.07043812, -0.0451562 ,
       -0.04705708, -0.06768093,  0.05321928,  0.03103751,  0.02017249,
       -0.05691034,  0.19986957, -0.17726623, -0.0629278 ,  0.04843728,
        0.17205636, -0.10879422,  0.26907292,  0.02327775, -0.18602982,
        0.10729398,  0.15359627,  0.10261659,  0.08089652,  0.08164821,
       -0.08778276,  0.14002904, -0.16427836,  0.11308956,  0.04284659,
        0.28113976, -0.12418219,  0.00551157,  0.06951306, -0.06061291,
        0.03658565,  0.18940149,  0.22684059,  0.03707127,  0.04

In [10]:
# Obtain the learned vector for a single word by indexing the model with that word
model_dm['melanoma']

array([ 1.66657722, -1.36615849, -1.77005553,  0.63936967, -1.68549609,
        1.31013119,  1.57847095,  0.44661847,  1.59284735, -2.63885164,
        0.41030309,  0.97780991, -0.51760244,  1.45934939, -0.24906209,
       -0.99309033, -2.71909118,  0.85972589, -1.36879766, -0.24762669,
       -0.47404635,  0.62389529,  1.08959174, -0.44385374, -1.02291858,
       -1.37496078,  1.70679653,  3.16282368,  2.00227499,  1.00436532,
        0.09923023,  0.29878667,  0.13073418,  0.72482306, -0.46207255,
       -1.59302163, -1.15559685,  2.35874009,  1.10161841,  1.47252214,
       -1.90717649,  2.69771338, -1.37225842,  2.21341968, -1.83982718,
       -0.58266681,  0.21975961,  1.34890938, -1.00040221, -2.70551729,
        0.70345265,  0.20246744, -1.1772722 ,  1.2291683 , -1.31987274,
       -1.32114053,  0.97387874, -2.09598994,  0.17405802, -1.39674258,
       -0.47993308,  1.1498214 , -1.60952401, -2.95089245, -1.29772079,
       -1.26173258,  0.53056854, -3.08738375, -1.38280952, -0.23