### Introduction

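This notebook trains Doc2Vec models on LOINC long common names only (rows whose `source` is `LONG_COMMON_NAME`) and checks how well free-text test names map back to LOINC codes.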

In [21]:
from functools import reduce
from collections import Counter
import json
import pandas as pd
from gensim.models.doc2vec import Doc2Vec
from gs import tokenize
from gs import tagdocs
from gs import train
from gs import evaluate

In [2]:
# Read the labeled LOINC names and keep only the LONG_COMMON_NAME rows,
# then peek at two random rows.
df = (
    pd.read_csv('data/loinc-labeled-text-names.csv')
    .loc[lambda frame: frame['source'] == 'LONG_COMMON_NAME']
)
df.sample(2)

Unnamed: 0,label,text,source,text_normed
140956,28505-6,Personal Care Knowledge Family [OMAHA],LONG_COMMON_NAME,personal care knowledge family [omaha]
23345,4269-7,Glucose.PO [Mass] of Dose,LONG_COMMON_NAME,glucose.po [mass] of dose


### Tag the documents

In [3]:
# Turn the filtered rows into tagged documents via the local gs.tagdocs helper.
# Judging by the sample shown below, each item is a TaggedDocument whose words
# are the tokenised name and whose single tag is the row's label.
docs = tagdocs(df)
# Show the first two tagged documents as a sanity check.
docs[:2]

[TaggedDocument(words=['needed', 'help', 'doing', 'my', 'usual', 'activities', 'because', 'of', 'my', 'fatigue', 'in', 'the', 'past', 'days', 'neuroqol'], tags=['67925-8']),
 TaggedDocument(words=['coxsackievirus', 'ab', 'units', 'volume', 'in', 'serum'], tags=['7837-8'])]

### Build the models

In [4]:
# Hyper-parameters shared by both models; only vector size and window differ.
# NOTE(review): dm=0 with dbow_words left at its default reportedly trains
# document vectors only in gensim, so word vectors may stay untrained — confirm.
shared_params = dict(dm=0, min_count=1, hs=1, sample=0.01, negative=3, workers=4, iter=100)
model50 = Doc2Vec(size=50, window=5, **shared_params)
model300 = Doc2Vec(size=300, window=7, **shared_params)
# Collected in one list so both can be handed to train() together.
models = [model300, model50]

In [5]:
# Pass the tagged documents and both models to gs.train; %time reports the
# CPU and wall-clock time of this single call.
%time train(docs, models)

CPU times: user 11min 22s, sys: 1min 16s, total: 12min 39s
Wall time: 7min 6s


### Evaluate the models

In [28]:
# Top 2 hits for model50.
# Prints two counts: the number of evaluated entries and the total of `hits`.
# NOTE(review): assumes `hits` holds one 0/1 (or bool) flag per entry — confirm in gs.evaluate.
labels, hits = evaluate(df, model50)
# sum() replaces the hand-rolled reduce(); it also returns 0 for an empty list
# instead of raising TypeError.
print(len(hits), sum(hits))

83377 67299


In [29]:
# Top 2 hits for model300.
# Prints two counts: the number of evaluated entries and the total of `hits`.
# NOTE(review): assumes `hits` holds one 0/1 (or bool) flag per entry — confirm in gs.evaluate.
labels, hits = evaluate(df, model300)
# sum() replaces the hand-rolled reduce(); it also returns 0 for an empty list
# instead of raising TypeError.
print(len(hits), sum(hits))

83377 72733


### Run examples

In [35]:
# Mayo Clinic Test ID: EBVE 56104
# Expected mapping: LOINC 50969-5
words = tokenize('Epstein-Barr Virus (EBV), IgG Antibody to Early Antigen, Serum')
# Infer a vector for the tokenised test name, then list the closest trained documents.
query_vector = model300.infer_vector(words)
model300.docvecs.most_similar([query_vector])

[('56687-7', 0.5024705529212952),
 ('23995-4', 0.48149374127388),
 ('486-1', 0.4810114800930023),
 ('34652-8', 0.47832024097442627),
 ('50969-5', 0.4708244502544403),
 ('5265-4', 0.4679912030696869),
 ('24007-7', 0.4629364609718323),
 ('25535-6', 0.4603499472141266),
 ('82740-2', 0.45872822403907776),
 ('45171-6', 0.4584764242172241)]

In [38]:
# Look up the names behind the first eight labels returned by the query above.
ebves = [
    '56687-7', '23995-4', '486-1', '34652-8',
    '50969-5', '5265-4', '24007-7', '25535-6',
]
is_listed = df['label'].isin(ebves)
df[is_listed]

Unnamed: 0,label,text,source,text_normed
25774,34652-8,Pancreatic islet cell Ab [Moles/volume] in Serum,LONG_COMMON_NAME,pancreatic islet cell ab [moles/volume] in serum
42028,56687-7,Pancreatic islet cell IgG Ab [Titer] in Serum ...,LONG_COMMON_NAME,pancreatic islet cell igg ab [titer] in serum ...
67161,5265-4,Pancreatic islet cell Ab [Presence] in Serum b...,LONG_COMMON_NAME,pancreatic islet cell ab [presence] in serum b...
101061,486-1,Teicoplanin [Susceptibility] by Serum bacteric...,LONG_COMMON_NAME,teicoplanin [susceptibility] by serum bacteric...
127043,25535-6,Teicoplanin [Moles/volume] in Serum or Plasma ...,LONG_COMMON_NAME,teicoplanin [moles/volume] in serum or plasma ...
142355,23995-4,Epstein Barr virus early IgG Ab [Titer] in Ser...,LONG_COMMON_NAME,epstein barr virus early igg ab [titer] in ser...
144769,24007-7,Epstein Barr virus early IgG Ab [Units/volume]...,LONG_COMMON_NAME,epstein barr virus early igg ab [units/volume]...
147554,50969-5,Epstein Barr virus early diffuse IgG Ab [Units...,LONG_COMMON_NAME,epstein barr virus early diffuse igg ab [units...


In [41]:
# infer_vector() is stochastic, so a single query gives unstable neighbours,
# especially if the model is not well trained.
# Repeat the query 100 times and accumulate each label's similarity score;
# most_common() then ranks labels by that summed score, not by raw frequency.
words = tokenize('Epstein-Barr Virus (EBV), IgG Antibody to Early Antigen, Serum')
counter = Counter()
for _run in range(100):
    # Use a dedicated name: rebinding `docs` here would overwrite the tagged
    # training documents created earlier in the notebook.
    neighbours = model300.docvecs.most_similar([model300.infer_vector(words)])
    for label, score in neighbours:
        # Counter returns 0 for unseen keys, so no membership test is needed.
        counter[label] += score
print(counter.most_common(10))

[('486-1', 44.144061386585236), ('23995-4', 40.99312576651573), ('25535-6', 34.597068160772324), ('58-8', 32.19485482573509), ('25309-6', 31.43113601207733), ('289-9', 27.521335512399673), ('40511-8', 27.26491215825081), ('14083-0', 23.657666563987732), ('23971-5', 19.69461390376091), ('3337-3', 17.277297645807266)]


In [42]:
# Look up the names behind the ten top-scoring labels from the model300 tally.
ebves = [
    '486-1', '23995-4', '25535-6', '58-8', '25309-6',
    '289-9', '40511-8', '14083-0', '23971-5', '3337-3',
]
is_listed = df['label'].isin(ebves)
df[is_listed]

Unnamed: 0,label,text,source,text_normed
22050,25309-6,Amobarbital [Moles/volume] in Serum or Plasma,LONG_COMMON_NAME,amobarbital [moles/volume] in serum or plasma
29131,289-9,Isoniazid [Susceptibility] by Serum bactericid...,LONG_COMMON_NAME,isoniazid [susceptibility] by serum bactericid...
47262,23971-5,Epstein Barr virus early IgG Ab [Titer] in Serum,LONG_COMMON_NAME,epstein barr virus early igg ab [titer] in serum
81001,40511-8,Snowshoe hare virus Ab [Titer] in Serum by Hem...,LONG_COMMON_NAME,snowshoe hare virus ab [titer] in serum by hem...
101061,486-1,Teicoplanin [Susceptibility] by Serum bacteric...,LONG_COMMON_NAME,teicoplanin [susceptibility] by serum bacteric...
127043,25535-6,Teicoplanin [Moles/volume] in Serum or Plasma ...,LONG_COMMON_NAME,teicoplanin [moles/volume] in serum or plasma ...
127326,14083-0,Epstein Barr virus early Ab [Titer] in Serum b...,LONG_COMMON_NAME,epstein barr virus early ab [titer] in serum b...
139950,3337-3,Amobarbital [Presence] in Serum or Plasma,LONG_COMMON_NAME,amobarbital [presence] in serum or plasma
142355,23995-4,Epstein Barr virus early IgG Ab [Titer] in Ser...,LONG_COMMON_NAME,epstein barr virus early igg ab [titer] in ser...
150309,58-8,Capreomycin [Susceptibility] by Serum bacteric...,LONG_COMMON_NAME,capreomycin [susceptibility] by serum bacteric...


In [44]:
# Same experiment as for model300, now with model50.
# infer_vector() is stochastic, so a single query gives unstable neighbours,
# especially if the model is not well trained.
# Repeat the query 100 times and accumulate each label's similarity score;
# most_common() then ranks labels by that summed score, not by raw frequency.
words = tokenize('Epstein-Barr Virus (EBV), IgG Antibody to Early Antigen, Serum')
counter = Counter()
for _run in range(100):
    # Use a dedicated name: rebinding `docs` here would overwrite the tagged
    # training documents created earlier in the notebook.
    neighbours = model50.docvecs.most_similar([model50.infer_vector(words)])
    for label, score in neighbours:
        # Counter returns 0 for unseen keys, so no membership test is needed.
        counter[label] += score
print(counter.most_common(10))

[('50969-5', 45.77522426843643), ('33563-8', 41.53895753622055), ('8086-1', 35.698059648275375), ('13927-9', 30.962683022022247), ('45171-6', 30.94540160894394), ('45225-0', 28.883053302764893), ('13236-5', 27.647278010845184), ('34652-8', 24.83622545003891), ('56687-7', 22.42901784181595), ('22293-5', 18.480848729610443)]


In [45]:
# Look up the names behind the ten top-scoring labels from the model50 tally.
ebves = [
    '50969-5', '33563-8', '8086-1', '13927-9', '45171-6',
    '45225-0', '13236-5', '34652-8', '56687-7', '22293-5',
]
is_listed = df['label'].isin(ebves)
df[is_listed]

Unnamed: 0,label,text,source,text_normed
3185,22293-5,Epstein Barr virus early Ab [Units/volume] in ...,LONG_COMMON_NAME,epstein barr virus early ab [units/volume] in ...
5466,45225-0,Pancreatic islet cell complement fixing Ab [Ti...,LONG_COMMON_NAME,pancreatic islet cell complement fixing ab [ti...
10914,13236-5,Epstein Barr virus early diffuse Ab [Presence]...,LONG_COMMON_NAME,epstein barr virus early diffuse ab [presence]...
25774,34652-8,Pancreatic islet cell Ab [Moles/volume] in Serum,LONG_COMMON_NAME,pancreatic islet cell ab [moles/volume] in serum
42028,56687-7,Pancreatic islet cell IgG Ab [Titer] in Serum ...,LONG_COMMON_NAME,pancreatic islet cell igg ab [titer] in serum ...
45717,33563-8,Pancreatic islet cell IgG Ab [Units/volume] in...,LONG_COMMON_NAME,pancreatic islet cell igg ab [units/volume] in...
66431,45171-6,Pancreatic islet cell Ab [Units/volume] in Ser...,LONG_COMMON_NAME,pancreatic islet cell ab [units/volume] in ser...
121636,8086-1,Pancreatic islet cell Ab [Units/volume] in Serum,LONG_COMMON_NAME,pancreatic islet cell ab [units/volume] in serum
144261,13927-9,Pancreatic islet cell Ab [Titer] in Serum,LONG_COMMON_NAME,pancreatic islet cell ab [titer] in serum
147554,50969-5,Epstein Barr virus early diffuse IgG Ab [Units...,LONG_COMMON_NAME,epstein barr virus early diffuse igg ab [units...


In [46]:
# Re-seeding the model's random state before each call makes infer_vector()
# return the same vector every time; the difference printed below is all zeros.
tokens = ['epstein', 'barr', 'virus', 'early', 'igg', 'ab']
model50.random.seed(0)
x = model50.infer_vector(tokens)
model50.random.seed(0)
y = model50.infer_vector(tokens)
delta = x - y
print(delta)

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [47]:
# Word-level neighbours are unimpressive: the similarities are low and the terms unrelated.
# NOTE(review): both models are built with dm=0 and dbow_words is left unset; in gensim
# that configuration reportedly trains document vectors only, leaving word vectors at
# their random initialisation — confirm against the Doc2Vec docs before judging word quality.
# Unlike infer_vector(), most_similar() is deterministic.
model300.most_similar('volume')

[('menarche', 0.2297549545764923),
 ('stifle', 0.22399133443832397),
 ('graffiti', 0.19848231971263885),
 ('foot', 0.18725237250328064),
 ('true', 0.18657299876213074),
 ('newberyite', 0.18505138158798218),
 ('realtor', 0.1836176961660385),
 ('creaky', 0.18329203128814697),
 ('pin', 0.17858730256557465),
 ('clinician', 0.1780276894569397)]

In [48]:
# Query model300's word vectors for the terms closest to 'moles'.
model300.most_similar('moles')

[('maleic', 0.2153959572315216),
 ('isospora', 0.2122819423675537),
 ('suggestions', 0.20984108746051788),
 ('spontaneous', 0.19759142398834229),
 ('dimethylacetals', 0.19339193403720856),
 ('reviewing', 0.1924281269311905),
 ('splenic', 0.18962043523788452),
 ('thirsty', 0.18697085976600647),
 ('gentisate', 0.18520410358905792),
 ('trimethyllysine', 0.18495729565620422)]

In [49]:
# Query model300's word vectors for the terms closest to 'serum'.
model300.most_similar('serum')

[('erbb', 0.2318321168422699),
 ('relation', 0.22112730145454407),
 ('axilla', 0.20663727819919586),
 ('canalicular', 0.20466859638690948),
 ('rat', 0.20236684381961823),
 ('decadienediate', 0.19803588092327118),
 ('aniline', 0.19758117198944092),
 ('herbarum', 0.19648440182209015),
 ('else', 0.1938151866197586),
 ('ffd', 0.193358913064003)]

In [50]:
# Query model300's word vectors for the terms closest to 'plasma'.
model300.most_similar('plasma')

[('barrier', 0.2187296748161316),
 ('metanephrine', 0.21110078692436218),
 ('deal', 0.20796166360378265),
 ('reovirus', 0.19694575667381287),
 ('gnas', 0.19392485916614532),
 ('vibrations', 0.19261938333511353),
 ('figure', 0.19201980531215668),
 ('loquacious', 0.1853402853012085),
 ('noroxycodone', 0.1805088222026825),
 ('backpack', 0.1805063784122467)]

In [51]:
# Query model300's word vectors for the terms closest to 'blood'.
model300.most_similar('blood')

[('channel', 0.2230667769908905),
 ('cotinine', 0.20671391487121582),
 ('axial', 0.20658428966999054),
 ('talampicillin', 0.20425158739089966),
 ('pmi', 0.19924785196781158),
 ('anticonvulsant', 0.19661115109920502),
 ('hemolyzed', 0.18989506363868713),
 ('leukocytary', 0.1867826133966446),
 ('prescribe', 0.18497586250305176),
 ('mullerian', 0.1799345314502716)]

In [52]:
# Query model300's word vectors for the terms closest to 'streptococcus'.
model300.most_similar('streptococcus')

[('phase', 0.20257756114006042),
 ('tuf', 0.19893112778663635),
 ('diethanolamine', 0.19269448518753052),
 ('pens', 0.19198058545589447),
 ('sdhb', 0.1905735433101654),
 ('dementia', 0.1904500126838684),
 ('nadolol', 0.18980106711387634),
 ('whelp', 0.1869278848171234),
 ('promotion', 0.1864202618598938),
 ('receptor', 0.18598870933055878)]

In [53]:
# Query model300's word vectors for the terms closest to 'glucose'.
model300.most_similar('glucose')

[('trichophyton', 0.21472123265266418),
 ('snow', 0.21429543197155),
 ('desipramine', 0.20430803298950195),
 ('submission', 0.20406967401504517),
 ('manually', 0.19155070185661316),
 ('successful', 0.1887001395225525),
 ('listener', 0.1883692890405655),
 ('rent', 0.177993506193161),
 ('speak', 0.17647776007652283),
 ('premorbid', 0.17348122596740723)]

In [54]:
# Query model300's word vectors for the terms closest to 'acetaminophen'.
model300.most_similar('acetaminophen')

[('restaurant', 0.21371184289455414),
 ('limb', 0.20785477757453918),
 ('debauchery', 0.20261988043785095),
 ('him', 0.19850939512252808),
 ('rufinamide', 0.19388362765312195),
 ('intramural', 0.19091060757637024),
 ('beverage', 0.1870700865983963),
 ('sunburns', 0.18620546162128448),
 ('sustaining', 0.1859477311372757),
 ('dme', 0.1849375218153)]

In [55]:
# Compare word-level and document-level lookups for the same term.
# Word vectors first: note that the neighbours returned here are not meaningful.
model300.most_similar('diabetes')

[('advises', 0.2140040248632431),
 ('miss', 0.21182572841644287),
 ('hydroxycotinine', 0.20032662153244019),
 ('fundal', 0.19427016377449036),
 ('epithelial', 0.19332091510295868),
 ('statistical', 0.19302773475646973),
 ('number', 0.19147227704524994),
 ('app', 0.19068971276283264),
 ('semilente', 0.1903798133134842),
 ('prosthesis', 0.1883821189403534)]

In [56]:
# Treat the single word as a one-token document: infer its vector, then list
# the closest trained documents.
diabetes_vector = model300.infer_vector(['diabetes'])
model300.docvecs.most_similar([diabetes_vector])

[('66678-4', 0.7416167855262756),
 ('62787-7', 0.6676766276359558),
 ('62791-9', 0.6426296830177307),
 ('65546-4', 0.6195601224899292),
 ('62401-5', 0.6022331118583679),
 ('62790-1', 0.5927032232284546),
 ('33248-6', 0.5924503803253174),
 ('54795-0', 0.5894614458084106),
 ('62287-8', 0.5866808295249939),
 ('66047-2', 0.5819916725158691)]

In [57]:
# Document-level results look much better: show the names behind the first
# eight labels returned by the query above.
similar = [
    '66678-4', '62787-7', '62791-9', '65546-4',
    '62401-5', '62790-1', '33248-6', '54795-0',
]
is_similar = df['label'].isin(similar)
df[is_similar]

Unnamed: 0,label,text,source,text_normed
6842,33248-6,Diabetes status [Identifier],LONG_COMMON_NAME,diabetes status [identifier]
40760,62790-1,Deprecated PhenX measure - family history of d...,LONG_COMMON_NAME,deprecated phenx measure - family history of d...
101467,65546-4,Problem related to diabetes [PhenX],LONG_COMMON_NAME,problem related to diabetes [phenx]
102098,54795-0,Diabetes mellitus in last 7 days [MDSv3],LONG_COMMON_NAME,diabetes mellitus in last 7 days [mdsv3]
104649,62787-7,PhenX domain - Diabetes,LONG_COMMON_NAME,phenx domain - diabetes
142478,62791-9,PhenX - family history of diabetes protocol,LONG_COMMON_NAME,phenx - family history of diabetes protocol
145814,62401-5,PhenX - deep venous thrombosis protocol,LONG_COMMON_NAME,phenx - deep venous thrombosis protocol
151721,66678-4,Diabetes [PhenX],LONG_COMMON_NAME,diabetes [phenx]


In [58]:
# Compare word-level and document-level lookups for the same term.
# Word vectors first.
model300.most_similar('stroke')

[('major', 0.23478995263576508),
 ('banana', 0.22888852655887604),
 ('lanes', 0.21150898933410645),
 ('predicted', 0.2113720178604126),
 ('balance', 0.20252446830272675),
 ('bruton', 0.19255386292934418),
 ('sell', 0.187323659658432),
 ('candidus', 0.1852608323097229),
 ('specify', 0.18446452915668488),
 ('towne', 0.18408231437206268)]

In [59]:
# Treat the single word as a one-token document: infer its vector, then list
# the closest trained documents.
stroke_vector = model300.infer_vector(['stroke'])
model300.docvecs.most_similar([stroke_vector])

[('8862-5', 0.6665160059928894),
 ('20562-5', 0.6552237868309021),
 ('8775-9', 0.6382864117622375),
 ('8782-5', 0.6360149383544922),
 ('76297-1', 0.6291056871414185),
 ('18178-4', 0.626961886882782),
 ('8863-3', 0.6259074211120605),
 ('8776-7', 0.62371426820755),
 ('20323-2', 0.6229154467582703),
 ('8864-1', 0.6183098554611206)]

In [60]:
# A single word can be ambiguous: show the names behind the ten labels
# returned by the query above.
similar = [
    '8862-5', '20562-5', '8775-9', '8782-5', '76297-1',
    '18178-4', '8863-3', '8776-7', '20323-2', '8864-1',
]
is_similar = df['label'].isin(similar)
df[is_similar]

Unnamed: 0,label,text,source,text_normed
1049,76297-1,Left ventricular Stroke volume index,LONG_COMMON_NAME,left ventricular stroke volume index
15641,8862-5,Right ventricular Stroke work,LONG_COMMON_NAME,right ventricular stroke work
19105,8782-5,Right ventricular Stroke volume by MR,LONG_COMMON_NAME,right ventricular stroke volume by mr
102943,20562-5,Left ventricular Stroke volume,LONG_COMMON_NAME,left ventricular stroke volume
110383,18178-4,Left ventricular Wall motion index by US,LONG_COMMON_NAME,left ventricular wall motion index by us
113016,20323-2,Left ventricular Stroke volume by US.2D+Calcul...,LONG_COMMON_NAME,left ventricular stroke volume by us.2d+calcul...
124246,8776-7,Right ventricular Stroke volume by Angiography...,LONG_COMMON_NAME,right ventricular stroke volume by angiography...
135785,8863-3,Left ventricular stroke work index,LONG_COMMON_NAME,left ventricular stroke work index
138790,8864-1,Right ventricular Stroke work index,LONG_COMMON_NAME,right ventricular stroke work index
145772,8775-9,Right ventricular Stroke volume by 2D echo,LONG_COMMON_NAME,right ventricular stroke volume by 2d echo
