### Introduction

Doc2Vec parameter tuning on LOINC: all text columns are used (concatenated into one text field), but only a random sample of 2000 rows is kept for the parameter search.

In [16]:
from functools import reduce
from collections import Counter
import json
import pandas as pd
from pandas import DataFrame
from gensim.models.doc2vec import Doc2Vec
from gs import tokenize
from gs import tagdocs
from gs import listparams
from gs import listparams_sampling
from gs import train
from gs import evaluate

In [2]:
# Load the concatenated LOINC text/label table and keep a 2000-row random subset
# for the parameter search.
SEED = 42  # fixed so that Restart & Run All tunes on the same rows every time
df = pd.read_csv('data/loinc-labeled-text-all-concatenated.csv')
df = df.sample(2000, random_state=SEED)
# Peek at two rows of the subset (seeded as well so the displayed output is stable).
df.sample(2, random_state=SEED)

Unnamed: 0,text,label
62136,clot strength heparinase bld teg clot strength...,66752-7
44670,cd15 cells act/nor bld cd15 cells actual/norma...,50783-0


### Tag the documents

In [3]:
# Turn the sampled frame into tagged documents with the project's `tagdocs` helper,
# then show the first two to check the words/tags structure.
docs = tagdocs(df)
docs[0:2]

[TaggedDocument(words=['polys', 'leuk', 'nfr', 'fld', 'cells', 'leukocytes', 'in', 'body', 'fluid', 'wbc', 'cellularity', 'pmn', 'random', 'body', 'fluid', 'unsp', 'white', 'blood', 'cell', 'quan', 'leukocyte', 'hematology', 'cell', 'counts', 'poly', 'flu', 'point', 'in', 'time', 'leuc', 'wbc', 'body', 'fluid', 'bf', 'fld', 'quant', 'polys', 'wbcs', 'fl', 'bod', 'bodies', 'cells', 'leukocytes', 'cell', 'white', 'blood', 'cells', 'qnt', 'fluid', 'number', 'fraction', 'quantitative', 'percent', 'lkcs', 'segmented', 'wbc', 'leuk', 'body', 'fld', 'hem', 'bc'], tags=['26518-1']),
 TaggedDocument(words=['ur', 'mcnc', 'mass', 'volume', 'in', 'urine', 'drugs', 'ur', 'qnt', 'quantitative', 'nbu', 'point', 'in', 'time', 'drug', 'tox', 'quan', 'random', 'quant', 'level', 'mass', 'concentration', 'ua', 'buprenorphine', 'metabolite', 'drug', 'toxicology', 'urine', 'urn', 'norbup'], tags=['49753-7'])]

### Build the models

In [4]:
# First grid: configurations without negative-sampling / down-sampling settings.
dms = [0, 1]
sizes = [50, 100, 200, 300]
windows = [3, 5, 7, 10]
mincounts = [1, 2]
params = listparams(
    dms=dms,
    sizes=sizes,
    windows=windows,
    mincounts=mincounts,
    workers=4,
    epochs=100,
)

In [5]:
# Second grid: the same base lists, now crossed with down-sampling thresholds,
# negative-sample counts and the hs flag. The result is appended to `params`.
dms = [0, 1]
sizes = [50, 100, 200, 300]
windows = [3, 5, 7, 10]
mincounts = [1, 2]
samples = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6]
negatives = [1, 3, 5, 7]
hses = [0, 1]
params.extend(
    listparams_sampling(
        dms=dms,
        sizes=sizes,
        windows=windows,
        mincounts=mincounts,
        samples=samples,
        negatives=negatives,
        hses=hses,
        workers=4,
        epochs=100,
    )
)

In [6]:
# Sanity-check the grid size. Assuming both helpers build a full Cartesian product,
# the plain grid has 2*4*4*2 = 64 entries and the sampling grid 64*6*4*2 = 3072,
# i.e. 3136 in total. Assert it instead of eyeballing it: re-running the
# `params.extend(...)` cell appends the sampling grid a second time.
assert len(params) == 3136, 'unexpected number of parameter sets: %d' % len(params)
len(params)

3136

In [7]:
# One untrained Doc2Vec instance per parameter dict, in the same order as `params`.
models = [Doc2Vec(**kwargs) for kwargs in params]

In [8]:
# Train every model in `models` on the tagged documents using the project's `train`
# helper. %time reports the CPU and wall-clock cost; the recorded run below took
# roughly one day of wall time, so re-run this cell deliberately.
%time train(docs, models)

CPU times: user 18h 5min 59s, sys: 1h 42min 40s, total: 19h 48min 39s
Wall time: 1d 30min 16s


### Evaluate the models

In [9]:
# Score every trained model. `evaluate` returns a (labels, hits) pair; the score kept
# for a configuration is the total of its hits, stored under the JSON-serialised
# parameter dict so the configuration can be recovered later with json.loads.
results = {}
for p, model in zip(params, models):
    # Labels are not needed for the score. Named `_labels` rather than `_` so that
    # IPython's last-output variable is not shadowed.
    _labels, hits = evaluate(df, model)
    # Built-in sum replaces the reduce/lambda fold and yields 0 for an empty `hits`
    # instead of raising TypeError.
    results[json.dumps(p)] = sum(hits)

In [10]:
# Ten best-scoring configurations over the whole grid, printed as (config, score).
counter = Counter(results)
for entry in counter.most_common(10):
    print(entry)

('{"negative": 1, "window": 10, "hs": 1, "min_count": 1, "iter": 100, "sample": 0.01, "size": 100, "workers": 4, "dm": 1}', 2000)
('{"negative": 7, "window": 10, "hs": 1, "min_count": 1, "iter": 100, "sample": 0.01, "size": 50, "workers": 4, "dm": 1}', 2000)
('{"negative": 7, "window": 3, "hs": 1, "min_count": 1, "iter": 100, "sample": 0.001, "size": 200, "workers": 4, "dm": 1}', 2000)
('{"negative": 5, "window": 10, "hs": 1, "min_count": 1, "iter": 100, "sample": 0.01, "size": 100, "workers": 4, "dm": 1}', 2000)
('{"negative": 7, "window": 3, "hs": 1, "min_count": 1, "iter": 100, "sample": 0.1, "size": 300, "workers": 4, "dm": 1}', 2000)
('{"negative": 1, "window": 5, "hs": 1, "min_count": 2, "iter": 100, "sample": 0.001, "size": 50, "workers": 4, "dm": 1}', 2000)
('{"negative": 1, "window": 7, "hs": 1, "min_count": 2, "iter": 100, "sample": 0.001, "size": 300, "workers": 4, "dm": 1}', 2000)
('{"negative": 5, "window": 7, "hs": 1, "min_count": 1, "iter": 100, "sample": 0.1, "size": 30

In [11]:
# hs=0: restrict the ranking to configurations whose parameter dict has "hs" set to 0
# (configurations without an "hs" key are excluded), then print the ten best.
plist = [key for key in results if json.loads(key).get('hs') == 0]
counter = Counter({key: results[key] for key in plist})
for entry in counter.most_common(10):
    print(entry)

('{"negative": 7, "window": 3, "hs": 0, "min_count": 1, "iter": 100, "sample": 0.1, "size": 100, "workers": 4, "dm": 0}', 1935)
('{"negative": 7, "window": 5, "hs": 0, "min_count": 1, "iter": 100, "sample": 0.0001, "size": 200, "workers": 4, "dm": 0}', 1931)
('{"negative": 7, "window": 5, "hs": 0, "min_count": 1, "iter": 100, "sample": 0.1, "size": 50, "workers": 4, "dm": 0}', 1930)
('{"negative": 7, "window": 10, "hs": 0, "min_count": 1, "iter": 100, "sample": 0.0001, "size": 300, "workers": 4, "dm": 0}', 1929)
('{"negative": 7, "window": 5, "hs": 0, "min_count": 1, "iter": 100, "sample": 0.001, "size": 50, "workers": 4, "dm": 0}', 1927)
('{"negative": 7, "window": 5, "hs": 0, "min_count": 1, "iter": 100, "sample": 0.0001, "size": 100, "workers": 4, "dm": 0}', 1927)
('{"negative": 7, "window": 7, "hs": 0, "min_count": 1, "iter": 100, "sample": 0.1, "size": 50, "workers": 4, "dm": 0}', 1926)
('{"negative": 7, "window": 3, "hs": 0, "min_count": 1, "iter": 100, "sample": 0.1, "size": 50,

In [12]:
# dm=1: restrict the ranking to configurations whose parameter dict has "dm" set to 1,
# then print the ten best.
plist = [key for key in results if json.loads(key).get('dm') == 1]
counter = Counter({key: results[key] for key in plist})
for entry in counter.most_common(10):
    print(entry)

('{"negative": 7, "window": 5, "hs": 1, "min_count": 1, "iter": 100, "sample": 0.1, "size": 100, "workers": 4, "dm": 1}', 2000)
('{"negative": 1, "window": 10, "hs": 1, "min_count": 1, "iter": 100, "sample": 0.01, "size": 100, "workers": 4, "dm": 1}', 2000)
('{"negative": 5, "window": 3, "hs": 1, "min_count": 1, "iter": 100, "sample": 0.001, "size": 50, "workers": 4, "dm": 1}', 2000)
('{"negative": 7, "window": 3, "hs": 1, "min_count": 2, "iter": 100, "sample": 0.001, "size": 100, "workers": 4, "dm": 1}', 2000)
('{"negative": 3, "window": 7, "hs": 1, "min_count": 2, "iter": 100, "sample": 0.01, "size": 200, "workers": 4, "dm": 1}', 2000)
('{"negative": 1, "window": 5, "hs": 1, "min_count": 2, "iter": 100, "sample": 0.1, "size": 300, "workers": 4, "dm": 1}', 2000)
('{"negative": 7, "window": 10, "hs": 1, "min_count": 1, "iter": 100, "sample": 0.01, "size": 50, "workers": 4, "dm": 1}', 2000)
('{"negative": 1, "window": 3, "hs": 1, "min_count": 2, "iter": 100, "sample": 0.1, "size": 100, 

In [13]:
# Plain grid only: keep the configurations whose parameter dict has no "negative" key,
# then print the ten best.
# NOTE(review): the original comment called this "negative=0 and sample=0"; that holds
# only if the installed gensim defaults both to 0 -- confirm against the gensim version.
plist = [key for key in results if 'negative' not in json.loads(key)]
counter = Counter({key: results[key] for key in plist})
for entry in counter.most_common(10):
    print(entry)

('{"window": 3, "min_count": 1, "iter": 100, "size": 300, "workers": 4, "dm": 0}', 1915)
('{"window": 10, "min_count": 1, "iter": 100, "size": 100, "workers": 4, "dm": 0}', 1910)
('{"window": 3, "min_count": 1, "iter": 100, "size": 50, "workers": 4, "dm": 0}', 1909)
('{"window": 5, "min_count": 1, "iter": 100, "size": 200, "workers": 4, "dm": 0}', 1908)
('{"window": 7, "min_count": 1, "iter": 100, "size": 50, "workers": 4, "dm": 0}', 1907)
('{"window": 10, "min_count": 1, "iter": 100, "size": 300, "workers": 4, "dm": 0}', 1907)
('{"window": 10, "min_count": 1, "iter": 100, "size": 200, "workers": 4, "dm": 0}', 1906)
('{"window": 5, "min_count": 1, "iter": 100, "size": 100, "workers": 4, "dm": 0}', 1905)
('{"window": 3, "min_count": 1, "iter": 100, "size": 100, "workers": 4, "dm": 0}', 1905)
('{"window": 10, "min_count": 1, "iter": 100, "size": 50, "workers": 4, "dm": 0}', 1903)


In [17]:
# Persist the complete ranking (every configuration, best score first) as a two-column
# CSV of JSON parameter string and score, written without header or index.
counter = Counter(results)
df_models = pd.DataFrame(counter.most_common())
df_models.to_csv(
    'data/loinc-all-2000-models.csv',
    header=False,
    index=False,
)