### Introduction

Experiment with a small random sample of 300 LOINC long common names. Test our own `gs` module (custom helper functions for gensim) and tune the Doc2Vec model's hyperparameters.

In [1]:
from collections import Counter
import json
import pandas as pd
from pandas import DataFrame
from gensim.models.doc2vec import Doc2Vec
from gs import tokenize
from gs import tag_docs
from gs import test_model

In [2]:
# Fixed seed so the same 300 rows are drawn on every Restart & Run All;
# without it the sample (and therefore every score below) changes per run.
SAMPLE_SEED = 42

# Load the labeled names, keep only the LONG_COMMON_NAME rows, and draw the
# 300-row working sample in one chain so re-running the cell is idempotent.
df = (
    pd.read_csv('data/loinc-labeled-text-names.csv')
    .loc[lambda frame: frame['source'] == 'LONG_COMMON_NAME']
    .sample(300, random_state=SAMPLE_SEED)
)

# Preview five of the sampled rows (seeded as well, so the preview is stable).
df.sample(5, random_state=SAMPLE_SEED)

Unnamed: 0,label,text,source,text_normed
63042,55938-5,Magnesium [Moles/volume] in Total parental nut...,LONG_COMMON_NAME,magnesium [moles/volume] in total parental nut...
66723,57855-9,Atenolol [Moles/volume] in Unspecified specimen,LONG_COMMON_NAME,atenolol [moles/volume] in unspecified specimen
37250,83557-9,Nurse practitioner Telephone encounter Note,LONG_COMMON_NAME,nurse practitioner telephone encounter note
129872,39368-6,Ankle - right X-ray AP and lateral W standing,LONG_COMMON_NAME,ankle - right x-ray ap and lateral w standing
71855,76117-1,Nitric oxide [VFr/PPres] Airway adaptor --duri...,LONG_COMMON_NAME,nitric oxide [vfr/ppres] airway adaptor --duri...


### Tokenize

In [3]:
# Show the first sampled name next to the token list produced by gs.tokenize.
text = df['text'].iloc[0]
print(text, tokenize(text), sep='\n')

Fetal delivery information Document [US Standard Report of Fetal Death]
['fetal', 'delivery', 'information', 'document', 'us', 'standard', 'report', 'of', 'fetal', 'death']


### Tag documents

In [4]:
# Convert the sampled rows into tagged documents with the project-local
# gs.tag_docs helper. Judging by the preview below, each entry is a
# TaggedDocument pairing the tokenized text with a one-element tag list
# (apparently the row's LOINC label -- confirm in gs).
docs_tagged = tag_docs(df)
# Peek at the first two entries as a sanity check.
docs_tagged[:2]

[TaggedDocument(words=['fetal', 'delivery', 'information', 'document', 'us', 'standard', 'report', 'of', 'fetal', 'death'], tags=['76400-1']),
 TaggedDocument(words=['desmin', 'ag', 'presence', 'in', 'tissue', 'by', 'immune', 'stain'], tags=['10476-0'])]

### Build models

In [5]:
# Candidate values for each hyperparameter in the grid search.
min_counts = [1, 2]
sizes = [50, 100, 200, 300]
windows = [3, 5, 10]
samples = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6]
negatives = [1, 3, 5, 7]
dms = [0, 1]

# Full Cartesian product of the value lists, one keyword dict per combination.
# The `for` clauses run outermost-to-innermost, so `dm` varies fastest and
# `min_count` slowest.
params = [
    {
        'min_count': min_count,
        'size': size,
        'window': window,
        'sample': sample,
        'negative': negative,
        'dm': dm,
    }
    for min_count in min_counts
    for size in sizes
    for window in windows
    for sample in samples
    for negative in negatives
    for dm in dms
]

In [6]:
# One untrained Doc2Vec instance per grid entry, in the same order as `params`.
# Every model shares workers=4 and iter=100; only the six grid values vary.
models = [
    Doc2Vec(
        min_count=cfg['min_count'],
        size=cfg['size'],
        window=cfg['window'],
        sample=cfg['sample'],
        negative=cfg['negative'],
        dm=cfg['dm'],
        workers=4,
        iter=100,
    )
    for cfg in params
]

In [7]:
def train(df, docs_tagged, params, models):
    """Fit each model on the tagged documents and record its score.

    Args:
        df: Frame handed unchanged to ``test_model`` for evaluation.
        docs_tagged: Tagged documents used both to build each model's
            vocabulary and to train it.
        params: Sequence of parameter dicts, positionally aligned with
            ``models``; each is serialized with ``json.dumps`` to form the
            result key.
        models: Iterable of untrained models exposing ``build_vocab`` and
            ``train``.

    Returns:
        dict mapping the JSON-serialized parameter dict to the second value
        returned by ``test_model`` (named ``hits`` here). The first value is
        discarded.
    """
    def fit_and_score(model):
        # Build the vocabulary, train on the same documents, then evaluate.
        model.build_vocab(docs_tagged)
        model.train(docs_tagged)
        _unused_total, hit_count = test_model(df, model)
        return hit_count

    hits_by_config = {}
    for position, candidate in enumerate(models):
        hit_count = fit_and_score(candidate)
        # Key by the serialized parameters so each score is self-describing.
        hits_by_config[json.dumps(params[position])] = hit_count
    return hits_by_config

In [8]:
# Train and score every model in the grid; `results` maps each serialized
# parameter dict to its hit count. %time reports the cost: the recorded run
# took about 17.5 minutes of wall-clock time, so re-run this cell sparingly.
%time results = train(df, docs_tagged, params, models)

CPU times: user 32min 4s, sys: 6min 17s, total: 38min 21s
Wall time: 17min 29s


In [10]:
# Rank the configurations by hit count and list the 100 best, highest first.
# Observation: the score seems to be most sensitive to `sample` and `negative`.
counter = Counter(results)
top_configs = counter.most_common(100)
for result in top_configs:
    print(result)

('{"negative": 1, "min_count": 1, "sample": 0.001, "size": 300, "dm": 0, "window": 3}', 209)
('{"negative": 1, "min_count": 1, "sample": 0.001, "size": 300, "dm": 0, "window": 5}', 209)
('{"negative": 1, "min_count": 1, "sample": 0.001, "size": 300, "dm": 0, "window": 10}', 205)
('{"negative": 1, "min_count": 1, "sample": 0.001, "size": 200, "dm": 0, "window": 10}', 188)
('{"negative": 1, "min_count": 1, "sample": 0.001, "size": 200, "dm": 0, "window": 3}', 188)
('{"negative": 1, "min_count": 1, "sample": 0.001, "size": 200, "dm": 0, "window": 5}', 186)
('{"negative": 7, "min_count": 1, "sample": 0.1, "size": 50, "dm": 0, "window": 3}', 182)
('{"negative": 7, "min_count": 1, "sample": 0.1, "size": 50, "dm": 0, "window": 5}', 181)
('{"negative": 7, "min_count": 1, "sample": 0.1, "size": 100, "dm": 0, "window": 3}', 173)
('{"negative": 7, "min_count": 1, "sample": 0.1, "size": 100, "dm": 0, "window": 10}', 173)
('{"negative": 7, "min_count": 1, "sample": 0.1, "size": 50, "dm": 0, "window