### Introduction

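This notebook trains Doc2Vec models on LOINC short names and long common names, then checks how well the models map free-text lab test names to LOINC codes.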

In [1]:
from functools import reduce
from collections import Counter
import json
import pandas as pd
from gensim.models.doc2vec import Doc2Vec
from gs import tokenize
from gs import tagdocs
from gs import train
from gs import evaluate

In [2]:
# Load the labelled LOINC names (SHORTNAME and LONG_COMMON_NAME rows)
# and peek at two random rows.
names_csv = 'data/loinc-labeled-text-names.csv'
df = pd.read_csv(names_csv)
df.sample(2)

Unnamed: 0,label,text,source,text_normed
153534,66567-9,Image name Spine PhenX,SHORTNAME,image name spine phenx
14962,74670-1,Cocaine+Metabolites [Presence] in Saliva (oral...,LONG_COMMON_NAME,cocaine+metabolites [presence] in saliva (oral...


### Tag the documents

In [3]:
# Turn the data frame into tagged documents with the project helper
# gs.tagdocs; each document carries its LOINC code as the tag.
docs = tagdocs(df)
# Show the first two tagged documents as a sanity check.
docs[:2]

[TaggedDocument(words=['needed', 'help', 'doing', 'my', 'usual', 'activities', 'because', 'of', 'my', 'fatigue', 'in', 'the', 'past', 'days', 'neuroqol'], tags=['67925-8']),
 TaggedDocument(words=['coxsackievirus', 'ab', 'units', 'volume', 'in', 'serum'], tags=['7837-8'])]

### Build the models

In [4]:
# Hyperparameters shared by both models; only the vector size and the
# window differ between them.
# NOTE(review): with dm=0 and no dbow_words=1, gensim presumably trains only
# the document vectors, so word-level similarity queries on these models may
# return noise -- confirm against the Doc2Vec documentation.
shared_params = dict(dm=0, min_count=1, hs=1, sample=0.01, negative=3,
                     workers=4, iter=100)
model50 = Doc2Vec(size=50, window=5, **shared_params)
model300 = Doc2Vec(size=300, window=7, **shared_params)
models = [model300, model50]

In [5]:
# Train both models on the tagged documents with the project helper gs.train;
# %time reports how long this long-running cell takes.
%time train(docs, [model50, model300])

CPU times: user 20min 14s, sys: 2min 27s, total: 22min 42s
Wall time: 13min 42s


### Evaluate the models

In [6]:
# Top 2 hit rate for the 50-dimensional model.
# Prints the number of entries in `hits` followed by their total.
labels, hits = evaluate(df, model50)
# sum() replaces the hand-rolled reduce() fold and, unlike reduce() without
# an initial value, does not raise when `hits` is empty.
print(len(hits), sum(hits))

157577 95819


In [7]:
# Top 2 hit rate for the 300-dimensional model.
# Prints the number of entries in `hits` followed by their total.
labels, hits = evaluate(df, model300)
# sum() replaces the hand-rolled reduce() fold and, unlike reduce() without
# an initial value, does not raise when `hits` is empty.
print(len(hits), sum(hits))

157577 125257


### Run some examples

In [8]:
# Example query -- Mayo Clinic Test ID: EBVE 56104.
# Expected LOINC code: 50969-5.
words = tokenize('Epstein-Barr Virus (EBV), IgG Antibody to Early Antigen, Serum')
# Infer a vector for the tokenised name, then list the nearest document tags.
ebv_vector = model300.infer_vector(words)
model300.docvecs.most_similar([ebv_vector])

[('47436-1', 0.4600881338119507),
 ('24316-2', 0.4451015591621399),
 ('83095-0', 0.44373759627342224),
 ('35398-7', 0.4420682489871979),
 ('33479-7', 0.42958277463912964),
 ('23971-5', 0.42857277393341064),
 ('5158-1', 0.4173392653465271),
 ('47437-9', 0.4164949953556061),
 ('13237-3', 0.4150683879852295),
 ('48638-1', 0.4141130745410919)]

In [10]:
# LOINC codes returned by the single similarity query above, in rank order.
similar = [
    '47436-1', '24316-2', '83095-0', '35398-7', '33479-7',
    '23971-5', '5158-1', '47437-9', '13237-3', '48638-1',
]
# Show the name rows that belong to those codes.
is_similar = df['label'].isin(similar)
df[is_similar]

Unnamed: 0,label,text,source,text_normed
2330,24316-2,EBV VCA IgG+IgM Pnl Ser,SHORTNAME,ebv vca igg+igm pnl ser
21696,5158-1,Epstein Barr virus capsid IgG Ab [Titer] in Se...,LONG_COMMON_NAME,epstein barr virus capsid igg ab [titer] in se...
34027,23971-5,EBV EA IgG Titr Ser,SHORTNAME,ebv ea igg titr ser
35159,83095-0,EBV VCA+EA IgG Ser Ql IA,SHORTNAME,ebv vca+ea igg ser ql ia
36306,83095-0,Epstein Barr virus capsid+early IgG Ab [Presen...,LONG_COMMON_NAME,epstein barr virus capsid+early igg ab [presen...
40114,35398-7,EBV DNA Mouth Ql PCR,SHORTNAME,ebv dna mouth ql pcr
47262,23971-5,Epstein Barr virus early IgG Ab [Titer] in Serum,LONG_COMMON_NAME,epstein barr virus early igg ab [titer] in serum
53594,24316-2,Epstein Barr virus capsid IgG and IgM panel - ...,LONG_COMMON_NAME,epstein barr virus capsid igg and igm panel - ...
59659,48638-1,Cryoglobulin type [Identifier] in Serum by Imm...,LONG_COMMON_NAME,cryoglobulin type [identifier] in serum by imm...
68192,48638-1,Cryoglob Typ Ser IFE,SHORTNAME,cryoglob typ ser ife


In [11]:
# infer_vector() is randomised, so a single query is not deterministic,
# especially if the model is not well trained.
# Repeat the query 100 times and accumulate the similarity score of every
# returned LOINC code; the codes with the highest totals are the most stable.
words = tokenize('Epstein-Barr Virus (EBV), IgG Antibody to Early Antigen, Serum')
counter = Counter()
for trial in range(100):
    # Use a dedicated name here: rebinding `docs` would clobber the tagged
    # training documents created earlier and break a re-run of the training cell.
    neighbors = model300.docvecs.most_similar([model300.infer_vector(words)])
    for label, score in neighbors:
        # A Counter treats missing keys as 0, so no membership check is needed.
        counter[label] += score
counter.most_common(10)

[('24316-2', 48.64946645498276),
 ('47436-1', 48.33875101804733),
 ('23971-5', 42.6216177046299),
 ('83095-0', 40.47811743617058),
 ('5158-1', 35.447747230529785),
 ('47066-6', 35.27552109956741),
 ('7886-5', 32.2407329082489),
 ('35398-7', 31.34234368801117),
 ('7885-7', 29.453456193208694),
 ('20491-7', 24.863735914230347)]

In [12]:
# These hits look better than those from models trained on long names
# only: the top 10 are all about EBV.
top_hits = counter.most_common(10)
similar = [label for label, _total in top_hits]
df[df['label'].isin(similar)]

Unnamed: 0,label,text,source,text_normed
2330,24316-2,EBV VCA IgG+IgM Pnl Ser,SHORTNAME,ebv vca igg+igm pnl ser
13838,47066-6,Epstein Barr virus capsid IgM Ab [Titer] in Bo...,LONG_COMMON_NAME,epstein barr virus capsid igm ab [titer] in bo...
21696,5158-1,Epstein Barr virus capsid IgG Ab [Titer] in Se...,LONG_COMMON_NAME,epstein barr virus capsid igg ab [titer] in se...
34027,23971-5,EBV EA IgG Titr Ser,SHORTNAME,ebv ea igg titr ser
35159,83095-0,EBV VCA+EA IgG Ser Ql IA,SHORTNAME,ebv vca+ea igg ser ql ia
36306,83095-0,Epstein Barr virus capsid+early IgG Ab [Presen...,LONG_COMMON_NAME,epstein barr virus capsid+early igg ab [presen...
40114,35398-7,EBV DNA Mouth Ql PCR,SHORTNAME,ebv dna mouth ql pcr
47262,23971-5,Epstein Barr virus early IgG Ab [Titer] in Serum,LONG_COMMON_NAME,epstein barr virus early igg ab [titer] in serum
53594,24316-2,Epstein Barr virus capsid IgG and IgM panel - ...,LONG_COMMON_NAME,epstein barr virus capsid igg and igm panel - ...
60759,7886-5,Epstein Barr virus capsid IgM Ab [Units/volume...,LONG_COMMON_NAME,epstein barr virus capsid igm ab [units/volume...


In [13]:
# Model50: repeat the randomised query 100 times with the 50-dimensional
# model and accumulate the similarity score of every returned LOINC code.
words = tokenize('Epstein-Barr Virus (EBV), IgG Antibody to Early Antigen, Serum')
counter = Counter()
for trial in range(100):
    # Use a dedicated name here: rebinding `docs` would clobber the tagged
    # training documents created earlier and break a re-run of the training cell.
    neighbors = model50.docvecs.most_similar([model50.infer_vector(words)])
    for label, score in neighbors:
        # A Counter treats missing keys as 0, so no membership check is needed.
        counter[label] += score
counter.most_common(10)

[('23972-3', 34.919836938381195),
 ('24316-2', 33.34568375349045),
 ('23971-5', 27.544857621192932),
 ('47434-6', 26.79735666513443),
 ('7886-5', 25.865068554878235),
 ('42497-8', 21.475822925567627),
 ('7936-8', 18.781601190567017),
 ('24007-7', 18.045230448246002),
 ('22295-0', 15.935540854930878),
 ('31371-8', 15.49030590057373)]

In [14]:
# Take the ten codes with the highest accumulated score and show their names.
top_hits = counter.most_common(10)
similar = [label for label, _total in top_hits]
df[df['label'].isin(similar)]

Unnamed: 0,label,text,source,text_normed
1429,7936-8,JEV RNA Ser Ql PCR,SHORTNAME,jev rna ser ql pcr
2330,24316-2,EBV VCA IgG+IgM Pnl Ser,SHORTNAME,ebv vca igg+igm pnl ser
13770,47434-6,EBV EA-D Ab Ser Ql,SHORTNAME,ebv ea-d ab ser ql
20568,24007-7,EBV EA IgG Ser-aCnc,SHORTNAME,ebv ea igg ser-acnc
21951,23972-3,EBV EA IgG Titr CSF,SHORTNAME,ebv ea igg titr csf
25453,31371-8,EBV EA IgG CSF-aCnc,SHORTNAME,ebv ea igg csf-acnc
34027,23971-5,EBV EA IgG Titr Ser,SHORTNAME,ebv ea igg titr ser
40550,47434-6,Epstein Barr virus early diffuse Ab [Presence]...,LONG_COMMON_NAME,epstein barr virus early diffuse ab [presence]...
43576,31371-8,Epstein Barr virus early IgG Ab [Units/volume]...,LONG_COMMON_NAME,epstein barr virus early igg ab [units/volume]...
47262,23971-5,Epstein Barr virus early IgG Ab [Titer] in Serum,LONG_COMMON_NAME,epstein barr virus early igg ab [titer] in serum


In [15]:
# Word-level neighbours of 'volume'. Query through the model's KeyedVectors
# (`wv`): calling most_similar() on the model itself is deprecated in gensim
# and removed in 4.0.
# NOTE(review): the neighbours look unrelated; the models are built with dm=0
# and without dbow_words=1, so the word vectors are presumably untrained --
# confirm before relying on word-level similarity.
model300.wv.most_similar('volume')

[('clumps', 0.23260585963726044),
 ('expectation', 0.2140417993068695),
 ('radiotherapy', 0.21192626655101776),
 ('sc', 0.2101733386516571),
 ('phage', 0.20290859043598175),
 ('basis', 0.19850529730319977),
 ('tropheryma', 0.19565574824810028),
 ('eta', 0.19346044957637787),
 ('huntington', 0.18593721091747284),
 ('triprolidine', 0.18499647080898285)]

In [16]:
# Word-level neighbours of 'moles', queried through `wv` because
# most_similar() on the model itself is deprecated in gensim.
model300.wv.most_similar('moles')

[('stemphylium', 0.2266412377357483),
 ('corresponds', 0.22417479753494263),
 ('dcb', 0.2197464555501938),
 ('colds', 0.216793030500412),
 ('hypertension', 0.21315120160579681),
 ('schmorl', 0.21014277637004852),
 ('ghost', 0.21007652580738068),
 ('vaccines', 0.20926731824874878),
 ('botg', 0.20909249782562256),
 ('pa', 0.20865176618099213)]

In [17]:
# Word-level neighbours of 'serum', queried through `wv` because
# most_similar() on the model itself is deprecated in gensim.
model300.wv.most_similar('serum')

[('loudly', 0.24894078075885773),
 ('bedfast', 0.24159587919712067),
 ('mycobacteria', 0.21509405970573425),
 ('neopterin', 0.21297617256641388),
 ('itis', 0.2102903425693512),
 ('psych', 0.2046247124671936),
 ('softball', 0.19912400841712952),
 ('ird', 0.19900789856910706),
 ('inspiration', 0.1957864761352539),
 ('tmstprange', 0.18929478526115417)]

In [18]:
# Word-level neighbours of 'plasma', queried through `wv` because
# most_similar() on the model itself is deprecated in gensim.
model300.wv.most_similar('plasma')

[('cucurbitaceous', 0.2404169887304306),
 ('prussian', 0.2152346521615982),
 ('drain', 0.2114323377609253),
 ('pathological', 0.2026025503873825),
 ('fearing', 0.20049935579299927),
 ('reflecting', 0.20041228830814362),
 ('selected', 0.19779998064041138),
 ('discharge', 0.19205184280872345),
 ('longbeachae', 0.19103780388832092),
 ('tetrofosmin', 0.1906396597623825)]

In [19]:
# Word-level neighbours of 'blood', queried through `wv` because
# most_similar() on the model itself is deprecated in gensim.
model300.wv.most_similar('blood')

[('hgd', 0.23734012246131897),
 ('absorbed', 0.2357843816280365),
 ('hum', 0.22418248653411865),
 ('semliki', 0.21614745259284973),
 ('men', 0.21577033400535583),
 ('alcohols', 0.21456559002399445),
 ('trps', 0.20623928308486938),
 ('sentences', 0.20320552587509155),
 ('shots', 0.1983593851327896),
 ('molar', 0.1969108283519745)]

In [20]:
# Word-level neighbours of 'streptococcus', queried through `wv` because
# most_similar() on the model itself is deprecated in gensim.
model300.wv.most_similar('streptococcus')

[('protocol', 0.23490993678569794),
 ('seek', 0.21646831929683685),
 ('but', 0.20729440450668335),
 ('terrified', 0.20406435430049896),
 ('identify', 0.19988307356834412),
 ('psitt', 0.19947919249534607),
 ('happens', 0.19912078976631165),
 ('syrup', 0.1978151500225067),
 ('butabarbital', 0.19764350354671478),
 ('solani', 0.19713544845581055)]

In [21]:
# Word-level neighbours of 'glucose', queried through `wv` because
# most_similar() on the model itself is deprecated in gensim.
model300.wv.most_similar('glucose')

[('defensive', 0.21428321301937103),
 ('portosystemic', 0.21238891780376434),
 ('wes', 0.2107398509979248),
 ('iadls', 0.2062404453754425),
 ('padv', 0.20236900448799133),
 ('aap', 0.1995161473751068),
 ('ipss', 0.19533191621303558),
 ('lidocain', 0.1941021978855133),
 ('piv', 0.191688671708107),
 ('estimation', 0.19055801630020142)]

In [22]:
# Word-level neighbours of 'acetaminophen', queried through `wv` because
# most_similar() on the model itself is deprecated in gensim.
model300.wv.most_similar('acetaminophen')

[('auth', 0.23866820335388184),
 ('congolensis', 0.21769201755523682),
 ('told', 0.21700794994831085),
 ('bang', 0.20489785075187683),
 ('acidified', 0.19865907728672028),
 ('style', 0.19300800561904907),
 ('uea', 0.19146479666233063),
 ('now', 0.19122052192687988),
 ('euroglyphus', 0.186101034283638),
 ('liter', 0.18312163650989532)]

In [23]:
# Word-level neighbours of 'diabetes', queried through `wv` because
# most_similar() on the model itself is deprecated in gensim.
model300.wv.most_similar('diabetes')

[('mcd', 0.23703092336654663),
 ('usu', 0.21397759020328522),
 ('danger', 0.20548413693904877),
 ('reacted', 0.20282073318958282),
 ('dilemma', 0.1979306936264038),
 ('remind', 0.19624178111553192),
 ('mozzarella', 0.1951010674238205),
 ('stores', 0.1924324482679367),
 ('unexpected', 0.19134823977947235),
 ('unusually', 0.1912660300731659)]

In [24]:
# Infer a vector for the single token 'diabetes' and list the closest
# LOINC document tags.
diabetes_vector = model300.infer_vector(['diabetes'])
model300.docvecs.most_similar([diabetes_vector])

[('16451-7', 0.6121147871017456),
 ('74150-4', 0.6081527471542358),
 ('66678-4', 0.6045306921005249),
 ('55399-0', 0.5851263403892517),
 ('56853-5', 0.5850776433944702),
 ('76654-3', 0.584358811378479),
 ('74151-2', 0.572638750076294),
 ('45636-8', 0.5639801621437073),
 ('5594-7', 0.5522032380104065),
 ('62787-7', 0.5516591668128967)]

In [25]:
# Look up the names behind the three top-ranked codes from the query above.
similar = ['16451-7', '74150-4', '66678-4']
is_similar = df['label'].isin(similar)
df[is_similar]

Unnamed: 0,label,text,source,text_normed
10707,16451-7,Beryllium [Presence] in Urine,LONG_COMMON_NAME,beryllium [presence] in urine
45409,74150-4,Diabetes Type I action plan,LONG_COMMON_NAME,diabetes type i action plan
47365,66678-4,Diabetes PhenX,SHORTNAME,diabetes phenx
105601,16451-7,Beryllium Ur Ql,SHORTNAME,beryllium ur ql
123600,74150-4,Diabetes type I action plan,SHORTNAME,diabetes type i action plan
151721,66678-4,Diabetes [PhenX],LONG_COMMON_NAME,diabetes [phenx]


In [26]:
# Infer a vector for the single token 'stroke' and list the closest
# LOINC document tags.
stroke_vector = model300.infer_vector(['stroke'])
model300.docvecs.most_similar([stroke_vector])

[('70182-1', 0.687208354473114),
 ('8862-5', 0.6406380534172058),
 ('8863-3', 0.6285802721977234),
 ('8864-1', 0.627647340297699),
 ('70195-3', 0.6224223971366882),
 ('76297-1', 0.621720552444458),
 ('67521-5', 0.6214814782142639),
 ('70188-8', 0.6202735900878906),
 ('69523-9', 0.6122249960899353),
 ('76563-6', 0.6086527705192566)]

In [27]:
# Look up the names behind the three top-ranked codes from the query above.
similar = ['70182-1', '8862-5', '8863-3']
is_similar = df['label'].isin(similar)
df[is_similar]

Unnamed: 0,label,text,source,text_normed
15641,8862-5,Right ventricular Stroke work,LONG_COMMON_NAME,right ventricular stroke work
56582,8863-3,LV Stroke work index,SHORTNAME,lv stroke work index
83969,70182-1,NIH Stroke Scale,LONG_COMMON_NAME,nih stroke scale
135785,8863-3,Left ventricular stroke work index,LONG_COMMON_NAME,left ventricular stroke work index
139920,8862-5,RV Stroke work,SHORTNAME,rv stroke work


### Save the model

In [28]:
# Persist the 300-dimensional model to disk; model50 is not saved by this notebook.
model300.save('gensim/loinc-model-names-300.d2v')