In [1]:
from pathlib import Path
import re, string
from gensim.models.word2vec import Word2Vec

In [2]:
# Compiled regexes matching a run of one or more ASCII punctuation characters
# anchored at the start (punc_begin) or at the end (punc_end) of a string.
# Punctuation in the middle of a string is not matched by either pattern.
punc_begin = re.compile('^[' + re.escape(string.punctuation) + ']+')
punc_end = re.compile('[' + re.escape(string.punctuation) + ']+$')

In [3]:
def sentences(data_dir='data/merck-topics-2017-11-23/', encoding='utf-8'):
    """Tokenize every file under ``data_dir`` into lists of cleaned tokens.

    Each line of each regular file found recursively under ``data_dir``
    becomes one "sentence": the line is lower-cased, split on whitespace,
    and every token has its leading and trailing punctuation removed using
    the module-level ``punc_begin`` / ``punc_end`` patterns. Tokens that are
    empty after stripping are dropped. Punctuation inside a token (e.g. the
    hyphen in ``5-ht``) is left untouched.

    Args:
        data_dir: Root directory of the corpus (``str`` or ``Path``).
            Defaults to the corpus directory used by this notebook.
        encoding: Text encoding used to read the files. Set explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        list[list[str]]: One token list per input line, ordered by sorted
        file path and then by line order within each file. A line with no
        surviving tokens (e.g. a blank line) yields an empty list.
    """
    corpus = []
    # ``**/*`` also yields sub-directories, which cannot be opened as text,
    # so keep regular files only. Sorting makes the sentence order
    # independent of the filesystem's listing order.
    paths = sorted(p for p in Path(data_dir).glob('**/*') if p.is_file())
    for path in paths:
        with path.open(encoding=encoding) as filein:
            # Iterate lazily instead of loading the whole file with readlines().
            for line in filein:
                # str.split() already discards surrounding whitespace, so no
                # per-token strip() is needed before removing punctuation.
                stripped = (
                    punc_end.sub('', punc_begin.sub('', token))
                    for token in line.lower().split()
                )
                corpus.append([token for token in stripped if token])
    return corpus

In [4]:
# Train a Word2Vec model on the tokenized corpus; no hyperparameters are
# overridden, so gensim's defaults apply.
model = Word2Vec(sentences=sentences())

In [5]:
# Ten vocabulary terms the model ranks as most similar to 'cancer'.
model.wv.most_similar(positive=['cancer'], topn=10)

[('cancers', 0.7418906688690186),
 ('carcinoma', 0.6450203657150269),
 ('adenocarcinoma', 0.6149548292160034),
 ('tumor', 0.5961090326309204),
 ('melanoma', 0.57428377866745),
 ('prostate', 0.5718021392822266),
 ('metastases', 0.52112877368927),
 ('lymphoma', 0.5182576775550842),
 ('tumors', 0.517723560333252),
 ('sarcoma', 0.4998699128627777)]

In [6]:
# Ten vocabulary terms the model ranks as most similar to 'malignant'.
model.wv.most_similar(positive=['malignant'], topn=10)

[('transformation', 0.773547887802124),
 ('benign', 0.7065020799636841),
 ('adenomas', 0.7040392160415649),
 ('cancerous', 0.6933955550193787),
 ('melanoma', 0.6685755252838135),
 ('adenocarcinomas', 0.6517176628112793),
 ('carcinomas', 0.6425170302391052),
 ('high-grade', 0.6397116184234619),
 ('nonadenomatous', 0.6377661228179932),
 ('lymphomas', 0.6366579532623291)]

In [7]:
# Ten vocabulary terms the model ranks as most similar to 'sugar'.
model.wv.most_similar(positive=['sugar'], topn=10)

[('glucose', 0.7360876202583313),
 ('carbohydrates', 0.6713556051254272),
 ('lipids', 0.6708924770355225),
 ('galactose', 0.6640836596488953),
 ('cholesterol', 0.6597653031349182),
 ('sucrose', 0.6494026780128479),
 ('fats', 0.6487394571304321),
 ('ammonia', 0.6243336200714111),
 ('triglycerides', 0.6217348575592041),
 ('a1c', 0.6080040335655212)]

In [8]:
# Ten vocabulary terms the model ranks as most similar to 'diabetes'.
model.wv.most_similar(positive=['diabetes'], topn=10)

[('dm', 0.6624680757522583),
 ('mellitus', 0.6483187675476074),
 ('insipidus', 0.6136149168014526),
 ('obesity', 0.5705541372299194),
 ('diabetic', 0.5631070137023926),
 ('hypertension', 0.5522923469543457),
 ('nephrogenic', 0.549591600894928),
 ('alcoholism', 0.5180000066757202),
 ('dyslipidemia', 0.5172838568687439),
 ('atherosclerosis', 0.5154365301132202)]

In [9]:
# Ten vocabulary terms the model ranks as most similar to 'psoriasis'.
model.wv.most_similar(positive=['psoriasis'], topn=10)

[('scaling', 0.8512666821479797),
 ('planus', 0.8204973936080933),
 ('lichen', 0.8050974607467651),
 ('eczema', 0.8031511306762695),
 ('rosacea', 0.8011435270309448),
 ('blistering', 0.7950164675712585),
 ('vulgaris', 0.7882848978042603),
 ('dermatitis', 0.7845450639724731),
 ('pustular', 0.7834339141845703),
 ('sclerosus', 0.772196888923645)]

In [10]:
# Ten vocabulary terms the model ranks as most similar to the numeric token '100'.
model.wv.most_similar(positive=['100'], topn=10)

[('300', 0.873751699924469),
 ('200', 0.8681505918502808),
 ('150', 0.8614224195480347),
 ('250', 0.8501552939414978),
 ('120', 0.8404273986816406),
 ('0.3', 0.8343183994293213),
 ('400', 0.8233770132064819),
 ('1000', 0.8209776878356934),
 ('1.0', 0.814278781414032),
 ('0.8', 0.8128980398178101)]

In [11]:
# Ten vocabulary terms the model ranks as most similar to 'papilloma'.
model.wv.most_similar(positive=['papilloma'], topn=10)

[('intraductal', 0.8791331052780151),
 ('glenn', 0.8191472291946411),
 ('lipoma', 0.8154205083847046),
 ('pkd', 0.8106021285057068),
 ('gk', 0.8105160593986511),
 ('cystadenocarcinoma', 0.810095489025116),
 ('skene', 0.8083108067512512),
 ('intracorneal', 0.8072136640548706),
 ('fc', 0.805770754814148),
 ('sln', 0.8032029271125793)]

In [12]:
# Ten vocabulary terms the model ranks as most similar to the abbreviation 'hpv'.
model.wv.most_similar(positive=['hpv'], topn=10)

[('papillomavirus', 0.8030089139938354),
 ('human', 0.7550371885299683),
 ('poliovirus', 0.6889576315879822),
 ('herpesviruses', 0.6879425048828125),
 ('vzv', 0.6727120280265808),
 ('herpesvirus', 0.6724347472190857),
 ('varicella-zoster', 0.6710348725318909),
 ('gondii', 0.6703339219093323),
 ('meningitidis', 0.6673886775970459),
 ('papillomaviruses', 0.6496900916099548)]

In [13]:
# Ten vocabulary terms the model ranks as most similar to the abbreviation 'nmda'.
model.wv.most_similar(positive=['nmda'], topn=10)

[('n-methyl-d-aspartate', 0.9082049131393433),
 ('memantine', 0.9052895307540894),
 ('nilotinib', 0.8685386180877686),
 ('entacapone', 0.8617355823516846),
 ('asparaginase', 0.8521819710731506),
 ('il-17', 0.8509945273399353),
 ('apremilast', 0.847725510597229),
 ('catechol', 0.847434401512146),
 ('5-alpha-reductase', 0.8469164371490479),
 ('integrin', 0.8448348641395569)]

In [14]:
# Ten vocabulary terms the model ranks as most similar to '5-ht'; the hyphen
# survives tokenization because only leading/trailing punctuation is stripped.
model.wv.most_similar(positive=['5-ht'], topn=10)

[('norepinephrine', 0.8518733978271484),
 ('dopamine', 0.8506501913070679),
 ('adrenergic', 0.8490609526634216),
 ('5-hydroxytryptamine', 0.8467927575111389),
 ('reuptake', 0.825350284576416),
 ('antagonists', 0.8016381859779358),
 ('alpha-adrenergic', 0.7977383136749268),
 ('bradykinin', 0.7960352301597595),
 ('cholinergic', 0.7957416772842407),
 ('5-ht2', 0.7938324809074402)]