### Introduction

Experiment with a small random sample of 2,000 LOINC long common names: exercise our own `gs` module (custom helper functions for gensim) and tune the Doc2Vec model's hyperparameters.

In [3]:
from collections import Counter
import json
import pandas as pd
from pandas import DataFrame
from gensim.models.doc2vec import Doc2Vec
from gs import tokenize
from gs import tagdocs
from gs import listparams
from gs import listparams_sampling
from gs import train
from gs import evaluate

In [4]:
# Load the labeled LOINC names, keep only the long common names, and draw the
# 2000-row working sample used by every later cell.
# The fixed seed makes the drawn sample the same on every Restart & Run All;
# without it each kernel restart trains and evaluates on different rows.
SAMPLE_SEED = 42
df = pd.read_csv('data/loinc-labeled-text-names.csv')
df = df[df['source'] == 'LONG_COMMON_NAME']
df = df.sample(2000, random_state=SAMPLE_SEED)
# Quick look at two random rows of the sample (display only).
df.sample(2)

Unnamed: 0,label,text,source,text_normed
115054,47497-3,Streptococcus pneumoniae 43 IgG Ab [Mass/volum...,LONG_COMMON_NAME,streptococcus pneumoniae 43 igg ab [mass/volum...
149850,33277-5,Acetaminophen+Phenacetin [Presence] in Urine b...,LONG_COMMON_NAME,acetaminophen+phenacetin [presence] in urine b...


### Tokenize

In [5]:
# Smoke-test the gs tokenizer on the first name in the sample: show the raw
# text, then the token list produced for it.
text = df['text'].iloc[0]
print(text)
tokens = tokenize(text)
print(tokens)

Epstein Barr virus early diffuse IgG Ab [Units/volume] in Serum
['epstein', 'barr', 'virus', 'early', 'diffuse', 'igg', 'ab', 'units', 'volume', 'in', 'serum']


### Tag the documents

In [6]:
# Convert the sampled frame into tagged documents with the gs helper; the
# recorded output shows each one carrying the tokenized name as `words` and
# the LOINC code as its single tag.
docs = tagdocs(df)
# Preview the first two tagged documents.
docs[0:2]

[TaggedDocument(words=['epstein', 'barr', 'virus', 'early', 'diffuse', 'igg', 'ab', 'units', 'volume', 'in', 'serum'], tags=['50969-5']),
 TaggedDocument(words=['lutropin', 'moles', 'volume', 'in', 'serum', 'or', 'plasma', 'th', 'specimen', 'post', 'xxx', 'challenge'], tags=['12680-5'])]

### Build the models

In [7]:
# First grid: only the core settings are varied; no `negative`, `sample` or
# `hs` value is passed for these configurations.
# NOTE(review): omitting those keys probably does not switch sampling off --
# gensim documents non-zero defaults (negative=5, sample=1e-3) -- so confirm
# what these models are actually trained with.
dms = [0, 1]                   # candidate values for `dm`
sizes = [50, 100, 200, 300]    # candidate values for `size`
windows = [3, 5, 7, 10]        # candidate values for `window`
mincounts = [1, 2]             # candidate values for `min_count`
params = listparams(
    dms=dms,
    sizes=sizes,
    windows=windows,
    mincounts=mincounts,
    workers=4,
    epochs=100,
)

In [8]:
# Second grid: the same core settings crossed with explicit `sample`,
# `negative` and `hs` values, appended to the existing `params` list.
# Gotcha: `extend` mutates `params` in place, so running this cell twice
# appends the grid twice -- re-create `params` first when re-running.
dms = [0, 1]
sizes = [50, 100, 200, 300]
windows = [3, 5, 7, 10]
mincounts = [1, 2]
samples = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6]
negatives = [1, 3, 5, 7]
hses = [0, 1]
sampling_params = listparams_sampling(
    dms=dms,
    sizes=sizes,
    windows=windows,
    mincounts=mincounts,
    samples=samples,
    negatives=negatives,
    hses=hses,
    workers=4,
    epochs=100,
)
params.extend(sampling_params)

In [9]:
# Sanity-check the size of the combined grid before spending hours training.
# Expected: 2*4*4*2 = 64 configurations from the first grid plus
# 64 * 6*4*2 = 3072 from the sampling grid = 3136 (assuming each helper yields
# the full Cartesian product, which matches the recorded output).
# The assertion fails loudly if the `params.extend(...)` cell was run twice.
expected_count = 3136
assert len(params) == expected_count, (
    'expected %d parameter sets, got %d' % (expected_count, len(params)))
len(params)

3136

In [10]:
# Instantiate one (still untrained) Doc2Vec model per parameter dict, in the
# same order as `params` so the two lists can be zipped later.
models = []
for kwargs in params:
    models.append(Doc2Vec(**kwargs))

In [11]:
# Fit the models on the tagged documents via the gs helper (presumably one
# training run per model -- confirm in gs.train). %time reports the cost: the
# recorded run over all 3136 models took roughly 8 hours of wall-clock time.
%time train(docs, models)

CPU times: user 5h 33min 34s, sys: 1h 34min 34s, total: 7h 8min 9s
Wall time: 8h 8min 39s


### Evaluate the models

In [12]:
# Score every trained model. Each result is keyed by the JSON-serialised
# parameter dict (dicts are not hashable) and stores the number of truthy
# entries in `hits` -- presumably one flag per document that the model
# matched back to its own label; confirm in gs.evaluate.
results = {}
for paramdict, model in zip(params, models):
    labels, hits = evaluate(df, model)
    results[json.dumps(paramdict)] = sum(1 for hit in hits if hit)

In [25]:
# Rank all configurations by hit count and print the ten best, one
# (params-json, hits) tuple per line.
counter = Counter(results)
top_ten = counter.most_common(10)
for entry in top_ten:
    print(entry)

('{"window": 5, "min_count": 1, "size": 50, "sample": 0.01, "dm": 0, "workers": 4, "hs": 1, "iter": 100, "negative": 3}', 1983)
('{"window": 3, "min_count": 1, "size": 100, "sample": 0.01, "dm": 0, "workers": 4, "hs": 1, "iter": 100, "negative": 1}', 1982)
('{"window": 10, "min_count": 1, "size": 200, "sample": 0.1, "dm": 0, "workers": 4, "hs": 1, "iter": 100, "negative": 3}', 1982)
('{"window": 7, "min_count": 1, "size": 300, "sample": 0.01, "dm": 0, "workers": 4, "hs": 1, "iter": 100, "negative": 3}', 1982)
('{"window": 5, "min_count": 1, "size": 200, "sample": 0.1, "dm": 0, "workers": 4, "hs": 1, "iter": 100, "negative": 3}', 1982)
('{"window": 7, "min_count": 1, "size": 50, "sample": 0.01, "dm": 0, "workers": 4, "hs": 1, "iter": 100, "negative": 1}', 1980)
('{"window": 5, "min_count": 1, "size": 100, "sample": 0.01, "dm": 0, "workers": 4, "hs": 1, "iter": 100, "negative": 1}', 1980)
('{"window": 3, "min_count": 1, "size": 50, "sample": 0.01, "dm": 0, "workers": 4, "hs": 1, "iter": 

In [27]:
# Leaderboard restricted to configurations that explicitly set hs=0.
# `.get` returns None for configurations without an `hs` key, so those are
# excluded exactly as before.
plist = [p for p in results if json.loads(p).get('hs') == 0]
counter = Counter({p: results[p] for p in plist})
for entry in counter.most_common(10):
    print(entry)

('{"window": 3, "min_count": 1, "size": 50, "sample": 0.1, "dm": 0, "workers": 4, "hs": 0, "iter": 100, "negative": 7}', 1310)
('{"window": 10, "min_count": 1, "size": 50, "sample": 0.1, "dm": 0, "workers": 4, "hs": 0, "iter": 100, "negative": 7}', 1306)
('{"window": 7, "min_count": 1, "size": 50, "sample": 0.1, "dm": 0, "workers": 4, "hs": 0, "iter": 100, "negative": 7}', 1297)
('{"window": 5, "min_count": 1, "size": 50, "sample": 0.1, "dm": 0, "workers": 4, "hs": 0, "iter": 100, "negative": 7}', 1290)
('{"window": 3, "min_count": 1, "size": 50, "sample": 0.01, "dm": 0, "workers": 4, "hs": 0, "iter": 100, "negative": 7}', 1289)
('{"window": 5, "min_count": 1, "size": 50, "sample": 0.01, "dm": 0, "workers": 4, "hs": 0, "iter": 100, "negative": 7}', 1288)
('{"window": 10, "min_count": 1, "size": 50, "sample": 0.01, "dm": 0, "workers": 4, "hs": 0, "iter": 100, "negative": 7}', 1278)
('{"window": 7, "min_count": 1, "size": 50, "sample": 0.01, "dm": 0, "workers": 4, "hs": 0, "iter": 100, "

In [28]:
# Leaderboard restricted to configurations with dm=1.
plist = [p for p in results if json.loads(p).get('dm') == 1]
counter = Counter({p: results[p] for p in plist})
for entry in counter.most_common(10):
    print(entry)

('{"window": 3, "min_count": 1, "size": 50, "sample": 0.01, "dm": 1, "workers": 4, "hs": 1, "iter": 100, "negative": 1}', 1750)
('{"window": 3, "min_count": 1, "size": 100, "sample": 0.01, "dm": 1, "workers": 4, "hs": 1, "iter": 100, "negative": 1}', 1749)
('{"window": 3, "min_count": 1, "size": 200, "sample": 0.01, "dm": 1, "workers": 4, "hs": 1, "iter": 100, "negative": 1}', 1734)
('{"window": 3, "min_count": 1, "size": 300, "sample": 0.01, "dm": 1, "workers": 4, "hs": 1, "iter": 100, "negative": 1}', 1727)
('{"window": 5, "min_count": 1, "size": 200, "sample": 0.01, "dm": 1, "workers": 4, "hs": 1, "iter": 100, "negative": 1}', 1724)
('{"window": 3, "min_count": 1, "size": 100, "sample": 0.01, "dm": 1, "workers": 4, "hs": 1, "iter": 100, "negative": 3}', 1721)
('{"window": 3, "min_count": 1, "size": 300, "sample": 0.01, "dm": 1, "workers": 4, "hs": 1, "iter": 100, "negative": 3}', 1720)
('{"window": 3, "min_count": 1, "size": 200, "sample": 0.01, "dm": 1, "workers": 4, "hs": 1, "iter

In [29]:
# Leaderboard restricted to the first grid, i.e. configurations whose
# parameter dict has no `negative` key at all.
# NOTE(review): a missing key is not the same as negative=0 / sample=0 --
# gensim documents non-zero defaults (negative=5, sample=1e-3), so these
# models were probably trained with library-default sampling; confirm.
plist = [p for p in results if 'negative' not in json.loads(p)]
counter = Counter({p: results[p] for p in plist})
for entry in counter.most_common(10):
    print(entry)

('{"window": 7, "min_count": 1, "dm": 0, "workers": 4, "iter": 100, "size": 50}', 914)
('{"window": 10, "min_count": 1, "dm": 0, "workers": 4, "iter": 100, "size": 50}', 914)
('{"window": 5, "min_count": 1, "dm": 0, "workers": 4, "iter": 100, "size": 50}', 913)
('{"window": 3, "min_count": 1, "dm": 0, "workers": 4, "iter": 100, "size": 50}', 905)
('{"window": 5, "min_count": 1, "dm": 0, "workers": 4, "iter": 100, "size": 100}', 883)
('{"window": 7, "min_count": 1, "dm": 0, "workers": 4, "iter": 100, "size": 100}', 862)
('{"window": 3, "min_count": 1, "dm": 0, "workers": 4, "iter": 100, "size": 100}', 856)
('{"window": 10, "min_count": 1, "dm": 0, "workers": 4, "iter": 100, "size": 100}', 854)
('{"window": 10, "min_count": 2, "dm": 0, "workers": 4, "iter": 100, "size": 50}', 852)
('{"window": 3, "min_count": 2, "dm": 0, "workers": 4, "iter": 100, "size": 50}', 850)


In [30]:
# Persist the complete leaderboard: one row per configuration (params JSON,
# hit count), best first, written without index or header row.
# The ranking is built from `results` directly rather than from the `counter`
# variable, which at this point holds only whichever filtered subset was
# displayed last and would silently drop most of the evaluated models.
df_models = DataFrame(Counter(results).most_common())
df_models.to_csv('data/loinc-long-names-2000-models.csv', index=False, header=False)