### Introduction

Play with a small sample of 300 LOINC long names. Test our own `gs` module (custom functions for gensim). Tune the model's hyperparameters.

In [1]:
from collections import Counter
import json
import pandas as pd
from pandas import DataFrame
from gensim.models.doc2vec import Doc2Vec
from gs import tokenize
from gs import tagdocs
from gs import listparams
from gs import train
from gs import evaluate

In [2]:
# Load the labeled LOINC names and keep only the long common names.
# Both sampling calls are seeded so that a fresh "Restart & Run All" draws
# the same 300 rows and previews the same 2, which keeps the tokenizer and
# tagging previews further down stable between runs.
SEED = 42
df = pd.read_csv('data/loinc-labeled-text-names.csv')
df = df[df['source'] == 'LONG_COMMON_NAME']
df = df.sample(300, random_state=SEED)
df.sample(2, random_state=SEED)

Unnamed: 0,label,text,source,text_normed
113385,84390-4,Occupational medicine Outpatient Note,LONG_COMMON_NAME,occupational medicine outpatient note
80523,52867-9,State catchment area population [Estimated],LONG_COMMON_NAME,state catchment area population [estimated]


### Tokenize

In [3]:
# Sanity-check the tokenizer on the first sampled name: print the raw string,
# then the token list that gs.tokenize produces for it.
text = df['text'].iloc[0]
print(text)
print(tokenize(text))

Boxelder IgE Ab [Ratio] in Serum
['boxelder', 'ige', 'ab', 'ratio', 'in', 'serum']


### Tag documents

In [4]:
# Turn the sampled frame into the training corpus with gs.tagdocs.
# Judging by the recorded output, each item is a TaggedDocument whose `words`
# are the tokenized name and whose single tag appears to be the row's LOINC
# label -- confirm against gs.tagdocs.
docs = tagdocs(df)
# Preview the first two tagged documents.
docs[:2]

[TaggedDocument(words=['boxelder', 'ige', 'ab', 'ratio', 'in', 'serum'], tags=['61206-9']),
 TaggedDocument(words=['ciprofloxacin', 'ige', 'ab', 'units', 'volume', 'in', 'serum'], tags=['56702-4'])]

### Build models

In [5]:
# Hyperparameter search space. Each list holds the candidate values for one
# setting; gs.listparams turns them into `params`, a list of keyword dicts
# that the next cell unpacks into Doc2Vec(**paramdict).
# NOTE(review): if listparams takes the full Cartesian product, this is
# 2 * 4 * 4 * 2 * 6 * 4 = 1536 configurations -- confirm against gs.listparams.
dms = [0, 1]  # Doc2Vec `dm` flag (per gensim docs: 1 = PV-DM, 0 = PV-DBOW)
sizes = [50, 100, 200, 300]  # vector dimensionality (reported as "size" in the results)
windows = [3, 5, 7, 10]  # context window widths
mincounts = [1, 2]  # reported as "min_count" in the results
samples = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6]  # reported as "sample" in the results
negatives = [1, 3, 5, 7]  # reported as "negative" in the results
# workers and epochs are fixed for every configuration; the recorded results
# show epochs surfacing under the key "iter".
params = listparams(dms=dms, sizes=sizes, windows=windows,
                    mincounts=mincounts, samples=samples,
                    negatives=negatives, workers=4, epochs=100)

In [6]:
# Instantiate one Doc2Vec model per parameter dict (training happens in the
# next cell). Order matches `params`, so the two lists can be zipped later.
models = [Doc2Vec(**kwargs) for kwargs in params]

In [7]:
# Train every model in `models` on the tagged corpus. The return value is
# discarded and `models` is reused afterwards, so gs.train presumably trains
# in place -- confirm. %time reports the cost (about 19 minutes of wall time
# in the recorded run).
%time train(docs, models)

CPU times: user 19min 9s, sys: 5min 47s, total: 24min 56s
Wall time: 18min 58s


In [10]:
# Score each trained model: gs.evaluate returns (labels, hits), and the score
# stored for a configuration is the number of truthy entries in `hits`.
# The parameter dict is serialized with json.dumps because a dict itself is
# not hashable and so cannot be used as a key.
results = {}
for p, model in zip(params, models):
    labels, hits = evaluate(df, model)
    results[json.dumps(p)] = sum(1 for hit in hits if hit)

In [11]:
# Rank configurations by hit count and print the 20 best as
# (parameter-json, hits) tuples, highest score first.
counter = Counter(results)
for params_json, n_hits in counter.most_common(20):
    print((params_json, n_hits))

('{"dm": 0, "min_count": 1, "size": 300, "window": 7, "iter": 100, "sample": 0.001, "negative": 1, "workers": 4}', 226)
('{"dm": 0, "min_count": 1, "size": 300, "window": 5, "iter": 100, "sample": 0.001, "negative": 1, "workers": 4}', 225)
('{"dm": 0, "min_count": 1, "size": 300, "window": 3, "iter": 100, "sample": 0.001, "negative": 1, "workers": 4}', 219)
('{"dm": 0, "min_count": 1, "size": 300, "window": 10, "iter": 100, "sample": 0.001, "negative": 1, "workers": 4}', 218)
('{"dm": 0, "min_count": 1, "size": 200, "window": 3, "iter": 100, "sample": 0.001, "negative": 1, "workers": 4}', 212)
('{"dm": 0, "min_count": 1, "size": 200, "window": 7, "iter": 100, "sample": 0.001, "negative": 1, "workers": 4}', 212)
('{"dm": 0, "min_count": 1, "size": 200, "window": 10, "iter": 100, "sample": 0.001, "negative": 1, "workers": 4}', 212)
('{"dm": 0, "min_count": 1, "size": 200, "window": 5, "iter": 100, "sample": 0.001, "negative": 1, "workers": 4}', 209)
('{"dm": 0, "min_count": 1, "size": 50