In [141]:
from multiprocessing import Pool, cpu_count
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
from gensim.models import doc2vec
from pprint import pprint
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import numpy as np
import random
import smart_open
from collections import defaultdict
from tqdm import tqdm
from joblib import Parallel, delayed


def train_vectors(path, local_context=150, name=''):
    """Train a Doc2Vec model on a corpus file and save it to disk.

    Args:
        path: Path of the corpus file, handed to gensim as ``corpus_file``.
        local_context: Context window size (gensim's ``window``); also used
            to build the default output file name.
        name: Output file name. When empty, the model is saved as
            ``'trained_model<local_context>'``.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    # The corpus is deliberately NOT passed to the constructor: gensim trains
    # right away when it receives one, so the explicit train() call below
    # would run a second full training pass over the same data.
    # NOTE(review): vector_size=2 is unusually small - confirm it is intended.
    model = doc2vec.Doc2Vec(vector_size=2, negative=5, sample=1e-5, dbow_words=1,
                            min_count=10, window=local_context, workers=4, epochs=30)
    model.build_vocab(corpus_file=path)
    # corpus_count is the number of documents and corpus_total_words the
    # number of tokens; training from a corpus file needs both.
    model.train(corpus_file=path, total_examples=model.corpus_count,
                total_words=model.corpus_total_words, epochs=model.epochs)
    model.save(name if name else 'trained_model{}'.format(local_context))
    return model

Load generated models

In [12]:
# Directory holding the trained Doc2Vec models. Kept in one place so the
# location only has to be changed once when the notebook runs elsewhere.
MODEL_DIR = '/Users/michalkukielka/Desktop/licencjat/results/doc2vec_data'

modeldm = doc2vec.Doc2Vec.load('{}/model150_trained'.format(MODEL_DIR))
modelcbow = doc2vec.Doc2Vec.load('{}/model150_trained_cbow'.format(MODEL_DIR))
# Wrapper combining both loaded models.
concat_model = ConcatenatedDoc2Vec(models=[modeldm, modelcbow])
# The window-50 variants ('model50_trained', 'model50_trained_cbow') sit in
# the same directory and can be loaded and concatenated the same way.

2019-03-23 12:57:26,493 : INFO : loading Doc2Vec object from /Users/michalkukielka/Desktop/licencjat/results/doc2vec_data/model150_trained
2019-03-23 12:57:34,923 : INFO : loading vocabulary recursively from /Users/michalkukielka/Desktop/licencjat/results/doc2vec_data/model150_trained.vocabulary.* with mmap=None
2019-03-23 12:57:34,924 : INFO : loading docvecs recursively from /Users/michalkukielka/Desktop/licencjat/results/doc2vec_data/model150_trained.docvecs.* with mmap=None
2019-03-23 12:57:34,925 : INFO : loading vectors_docs from /Users/michalkukielka/Desktop/licencjat/results/doc2vec_data/model150_trained.docvecs.vectors_docs.npy with mmap=None
2019-03-23 12:57:35,657 : INFO : loading wv recursively from /Users/michalkukielka/Desktop/licencjat/results/doc2vec_data/model150_trained.wv.* with mmap=None
2019-03-23 12:57:35,659 : INFO : loading trainables recursively from /Users/michalkukielka/Desktop/licencjat/results/doc2vec_data/model150_trained.trainables.* with mmap=None
2019-0

ICD10 codes

In [44]:
def get_icd10_codes(path='../results/icd10cm_codes_2018.txt'):
    """Load ICD-10 codes and their titles from a text file.

    Every non-blank line is split on whitespace: the first field is the code,
    the remaining fields form the title. Both are lower-cased and the title
    words are re-joined with single spaces.

    Args:
        path: Location of the codes file.

    Returns:
        dict mapping lower-cased code -> lower-cased title.
    """
    icd10 = {}
    # Read-only access is enough; 'r+' would also demand write permission.
    with open(path, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line: there is no code to index.
                continue
            icd10[fields[0].lower()] = ' '.join(fields[1:]).lower()
    return icd10

Get most similar words for code

In [81]:
def most_similar_for_icd10(model, topn, icd10):
    """Find the ``topn`` most similar vocabulary words for every ICD-10 code.

    Args:
        model: Model whose word vectors are queried via
            ``model.wv.most_similar``. A wrapper that has no ``wv`` but
            exposes a ``models`` sequence is answered from the word vectors
            of its last sub-model.
        topn: Number of neighbours requested per code.
        icd10: Mapping (or other iterable) whose items are the codes to look up.

    Returns:
        dict mapping each code found in the vocabulary to the result of
        ``most_similar`` for it. Codes missing from the vocabulary are omitted.

    Raises:
        TypeError: If ``model`` exposes neither ``wv`` nor ``models``.
    """
    word_vectors = getattr(model, 'wv', None)
    if word_vectors is None:
        sub_models = getattr(model, 'models', None)
        if not sub_models:
            raise TypeError(
                'model exposes neither `wv` nor `models`: {!r}'.format(model))
        # For a wrapper built as [modeldm, modelcbow] this selects modelcbow,
        # the model this function used to query unconditionally.
        word_vectors = sub_models[-1].wv

    results = {}
    for code in icd10:
        try:
            results[code] = word_vectors.most_similar(code, topn=topn)
        except KeyError:
            # The code is not in the model's vocabulary; skip it on purpose.
            continue
    return results


def validate_relations(similarity_results, icd10=None):
    """Keep only the similar words that are themselves valid ICD-10 codes.

    Args:
        similarity_results: Mapping of reference code -> iterable of
            ``(word, score)`` pairs.
        icd10: Container of valid lower-cased codes (e.g. the dict returned by
            ``get_icd10_codes``). Loaded with ``get_icd10_codes()`` when
            omitted, so the function no longer relies on a global ``icd10``.

    Returns:
        defaultdict(list) mapping reference code -> lower-cased similar words
        that are valid codes, in their original order. References without any
        valid neighbour do not appear.
    """
    if icd10 is None:
        icd10 = get_icd10_codes()
    found_relations = defaultdict(list)
    for reference, codes in tqdm(similarity_results.items()):
        for code, _score in codes:
            candidate = code.lower()
            if candidate in icd10:
                found_relations[reference].append(candidate)
    return found_relations


def describe_relations(found_relations, icd10=None):
    """Annotate every relation with the titles of the two codes it links.

    Args:
        found_relations: Mapping of reference code -> iterable of related codes.
        icd10: Mapping of code -> title. Loaded with ``get_icd10_codes()`` when
            omitted, so the function no longer relies on a global ``icd10``.

    Returns:
        defaultdict(list) keyed by ``(reference, relation)``; each value is a
        list holding one ``(reference title, relation title)`` tuple per
        occurrence of that pair.

    Raises:
        KeyError: If a reference or related code has no entry in ``icd10``.
    """
    if icd10 is None:
        icd10 = get_icd10_codes()
    described_relations = defaultdict(list)
    for reference, relations in found_relations.items():
        for relation in relations:
            described_relations[(reference, relation)].append(
                (icd10[reference], icd10[relation]))
    return described_relations


def perform_analysis(model, topn=50):
    """Find relations between ICD-10 codes using the given model.

    Loads the ICD-10 codes, looks up the ``topn`` most similar words for each
    code, keeps the neighbours that are valid codes and annotates the
    resulting pairs with the code titles.

    Args:
        model: Model forwarded to ``most_similar_for_icd10``.
        topn: Number of similar words requested per code.

    Returns:
        The output of ``describe_relations`` for the validated relations.
    """
    icd10 = get_icd10_codes()
    # Forward the caller's model; the global `concat_model` used to be passed
    # here, so the `model` argument had no effect.
    similarity_results = most_similar_for_icd10(model=model, topn=topn, icd10=icd10)
    validated_relations = validate_relations(similarity_results)
    return describe_relations(validated_relations)

In [92]:
# Run the same relation analysis for three neighbourhood sizes, in order.
results100, results500, results1000 = (
    perform_analysis(concat_model, topn=neighbour_count)
    for neighbour_count in (100, 500, 1000)
)

100%|██████████| 1770/1770 [00:00<00:00, 11972.40it/s]
100%|██████████| 1770/1770 [00:00<00:00, 2514.47it/s]
100%|██████████| 1770/1770 [00:01<00:00, 1329.68it/s]


The results do not differ between models; in this case only the choice of `topn` (how many similar words are retrieved per code) matters.

In [94]:
# Number of described (reference, relation) pairs found for each topn setting.
len(results100), len(results500), len(results1000)

(262, 1320, 2705)

Saving found relations

In [109]:
# Dump the topn=500 relations: one "<code> <code>:" header per pair, followed
# by an indented "title || title" line for each annotation of that pair.
with open('found_relations500', 'w+') as relations_file:
    for code_pair, title_pairs in results500.items():
        header = ' '.join(code_pair)
        relations_file.write('{}:\n'.format(header))
        relations_file.writelines(
            '    {}\n'.format(' || '.join(title_pair)) for title_pair in title_pairs
        )

Generating graphs for found relations with networkx

In [110]:
import networkx as nx

In [119]:
# Start from an empty directed graph; edges are added in the next cell.
graph = nx.DiGraph()

In [120]:
# Every key of results1000 is a (reference, relation) pair of codes, so each
# key becomes one directed edge reference -> relation.
graph.add_edges_from(results1000.keys())

In [121]:
# Persist the graph to the file 'graph1000' using networkx's edge-list writer.
nx.write_edgelist(graph, 'graph1000')

Retrieve most similar documents with titles of icd10 codes

In [144]:
def get_docs_for_ids(path, ids):
    """Retrieve the corpus lines whose zero-based line number is in ``ids``.

    Args:
        path: Corpus file that is read line by line.
        ids: Iterable of (hashable) line numbers to fetch; duplicates are
            ignored.

    Returns:
        List of ``'<index>: <line>'`` strings in file order (not in the order
        of ``ids``). Each line is kept exactly as read from the file.
    """
    # A set gives constant-time membership tests; the previous list lookup
    # cost O(len(ids)) for every line of the corpus.
    wanted = set(ids)
    docs = []
    if not wanted:
        return docs
    # Read-only access is enough; 'r+' would also demand write permission.
    with smart_open.smart_open(path, 'r') as f:
        for index, tokens in enumerate(f):
            if index in wanted:
                docs.append('{}: {}'.format(index, tokens))
                if len(docs) == len(wanted):
                    # Everything requested was found; skip the rest of the file.
                    break
    return docs


def most_similar_docs(path, model, doc_id, topn):
    """Fetch the corpus documents most similar to ``doc_id``.

    ``doc_id`` is handed to ``model.docvecs.most_similar`` as its first
    argument; the first element of every returned hit is taken as a document
    id and the matching documents are read from ``path``.
    """
    neighbours = model.docvecs.most_similar(doc_id, topn=topn)
    neighbour_ids = [hit[0] for hit in neighbours]
    return get_docs_for_ids(path, neighbour_ids)


def get_docs_for_icd10_code(path, model, key, title, topn):
    """Find the documents most similar to the title of one ICD-10 code.

    The lower-cased, whitespace-split title is turned into a vector with
    ``model.infer_vector`` and that vector is used to query the same model's
    document vectors.

    Args:
        path: Corpus file the matching documents are read from.
        model: Model used both to infer the title vector and to search for
            similar documents.
        key: ICD-10 code; only used in the key of the result.
        title: Title of the code.
        topn: Number of similar documents requested.

    Returns:
        Single-entry dict ``{(key, title): documents}``.
    """
    # Infer with the model that is searched afterwards; the global `modelcbow`
    # used to be hard-wired here regardless of the `model` argument.
    vector = model.infer_vector(title.lower().split())
    return {(key, title): most_similar_docs(path=path, model=model, doc_id=[vector], topn=topn)}


def get_docs_for_icd10_codes(model, path='../results/corpus.txt', topn=50):
    """Look up the most similar documents for the title of every ICD-10 code.

    One ``get_docs_for_icd10_code`` task is scheduled per code and the tasks
    are executed through joblib's ``Parallel`` with the threading backend.

    Returns:
        Whatever the ``Parallel`` call returns for the scheduled tasks.
    """
    code_titles = get_icd10_codes()
    runner = Parallel(n_jobs=-1, backend='threading', verbose=50)
    tasks = (
        delayed(get_docs_for_icd10_code)(path, model, code, code_title, topn)
        for code, code_title in code_titles.items()
    )
    return runner(tasks)

In [None]:
# Retrieve similar documents for every ICD-10 title, using modelcbow and the
# function's default corpus path and topn.
inferred_titles = get_docs_for_icd10_codes(model=modelcbow)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
