In [1]:
import locale
import glob
import requests
import tarfile
import sys
import codecs
import gensim
import os
import collections
import smart_open
import random
import csv
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple
from nltk import word_tokenize



In [2]:
os.chdir('H:/GEMINI/Results/WATSON/')
f = open('nlp.data.csv', newline = "")
full = csv.reader(f)
full = list(full)

SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')

alldocs = []
for line_no, line in enumerate(full):
    tokens = gensim.utils.to_unicode(line[1].lower()).split()
    words = tokens[1:]
    tags = [line_no]
    split = ['train', 'test'][line_no//200]
    contrast = line[2]
    alldocs.append(SentimentDocument(words, tags, split, contrast))

train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']
doc_list = alldocs[:]  # for reshuffling per pass

print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))

299 docs: 200 train-sentiment, 99 test-sentiment


In [3]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

In [43]:
cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

simple_models = [
    # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DBOW 
    Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DM w/average
    Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores),
]

# speed setup by sharing results of 1st model's vocabulary scan
simple_models[0].build_vocab(alldocs)  # PV-DM/concat requires one special NULL word so it serves as template
print(simple_models[0])
for model in simple_models[1:]:
    model.reset_from(simple_models[0])
    print(model)

models_by_name = OrderedDict((str(model), model) for model in simple_models)

Doc2Vec(dm/c,d100,n5,w5,mc2,s0.001,t4)
Doc2Vec(dbow,d100,n5,mc2,s0.001,t4)
Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4)


In [45]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])

In [4]:
model = gensim.models.Doc2Vec(train_docs, alpha = 0.1, size = 20, min_alpha = 0.025)

In [5]:
for epoch in range(200):
    if epoch % 20 == 0:
        print ('Now training epoch %s'%epoch)
    model.train
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  

Now training epoch 0
Now training epoch 20
Now training epoch 40
Now training epoch 60
Now training epoch 80
Now training epoch 100
Now training epoch 120
Now training epoch 140
Now training epoch 160
Now training epoch 180


In [78]:
model.most_similar("enhanced")

[('scanning', 0.9731926918029785),
 ('administration', 0.9674476981163025),
 ('following', 0.9669854640960693),
 ('collimation', 0.9592838287353516),
 ('intravenously', 0.9511013031005859),
 ('thorax', 0.9480822086334229),
 ('5-mm', 0.9443165063858032),
 ('volumemetric', 0.9365969896316528),
 ('non-contrast', 0.9362979531288147),
 ('contrast.comparison', 0.9300181865692139)]

In [11]:
model.infer_vector(test_docs[0][0])

array([ 0.14848386,  0.26648   ,  0.11515955, -0.55529165,  0.38165605,
        0.04689731, -1.03322279,  0.94672775, -0.45723036, -0.12654819,
       -0.21645531,  0.44267449,  0.2207628 , -0.37069741, -0.0468344 ,
       -0.53290778,  0.19843969, -0.48372751,  0.34176606,  0.73595148], dtype=float32)

In [10]:
test_docs[0][0]

['noncontrast',
 'ct',
 'thoraxindication',
 'aml',
 'with',
 'neutropenic',
 'fever.no',
 'previous',
 'for',
 'comparisonfindings:',
 'right',
 'tunneled',
 'central',
 'line',
 'in',
 'situ',
 'with',
 'distal',
 'tip',
 'in',
 'the',
 'svc.',
 'bilateral',
 'axillary',
 'lymphadenopathy',
 'measures',
 'up',
 'to',
 '1',
 'cm',
 '.',
 'no',
 'mediastinal',
 'lymphadenopathy',
 'identified',
 'within',
 'the',
 'limits',
 'of',
 'this',
 'noncontrast',
 'study.',
 'there',
 'is',
 'a',
 'small',
 'pericardial',
 'effusion.',
 'no',
 'pleural',
 'effusions.',
 'the',
 'heart',
 'is',
 'mildly',
 'enlarged',
 'for',
 'age.limited',
 'unenhanced',
 'images',
 'of',
 'the',
 'upper',
 'abdomen',
 'reveal',
 'hepatosplenomegaly.',
 'no',
 'airspace',
 'consolidation',
 'or',
 'pulmonary',
 'nodules',
 'identified.',
 'there',
 'is',
 'linear',
 'band',
 'atelectasis',
 'within',
 'the',
 'apical',
 'segment',
 'of',
 'the',
 'right',
 'lower',
 'lobe',
 'medially',
 'and',
 'at',
 'the',