In [1]:
from pprint import pprint
import numpy as np
import random
from collections import defaultdict
from tqdm import tqdm
from joblib import Parallel, delayed
import networkx as nx

In [2]:
def get_icd10_codes():
    icd10 = defaultdict()
    with open('../results/icd10cm_codes_2018.txt', 'r+') as f:
        for line in f.readlines():
            line = line.split()
            icd10[line[0].lower()] = ' '.join(line[1:]).lower()
        return icd10

def retrieve_titles_for_subgraph(graph, icd10, min_nodes=3):
    subgraphs = list(nx.connected_component_subgraphs(graph))
    results = defaultdict(list)
    for subgraph in subgraphs:
        nodes = subgraph.nodes()
        if len(nodes) >= min_nodes:
            results[', '.join(nodes.keys())] = [icd10[node] for node in nodes]
    return results

GloVe

In [4]:
glove50 = nx.read_edgelist('glove_analysis/50/graph_glove50_1000')
glove150 = nx.read_edgelist('glove_analysis/150/graph_glove_1000')

In [5]:
described_glove50 = retrieve_titles_for_subgraph(glove50, get_icd10_codes())
described_glove150 = retrieve_titles_for_subgraph(glove150, get_icd10_codes())

In [7]:
len(described_glove50), len(described_glove150)

(241, 238)

In [8]:
len(set(described_glove50.keys()).intersection(described_glove150.keys()))

0

dbow

In [8]:
doc2vec50 = nx.read_edgelist('doc2vec_analysis/50/graph_1000')
doc2vec = nx.read_edgelist('doc2vec_analysis/150/graph_1000')

In [9]:
described_doc2vec50 = retrieve_titles_for_subgraph(doc2vec50, get_icd10_codes())
described_doc2vec150 = retrieve_titles_for_subgraph(doc2vec, get_icd10_codes())

In [10]:
len(described_doc2vec50), len(described_doc2vec150)

(228, 228)

In [17]:
len(set(described_doc2vec50).intersection(set(described_doc2vec150)))

211

dm

In [12]:
doc2vec50_dm = nx.read_edgelist('doc2vec_analysis/dm/50/graph_1000')
doc2vec_dm = nx.read_edgelist('doc2vec_analysis/dm/150/graph_1000')

In [13]:
described_doc2vec50_dm = retrieve_titles_for_subgraph(doc2vec50_dm, get_icd10_codes())
described_doc2vec150_dm = retrieve_titles_for_subgraph(doc2vec_dm, get_icd10_codes())

In [24]:
len(described_doc2vec50_dm.keys()), len(described_doc2vec150_dm.keys())

(234, 249)

Overlap between models

In [71]:
def get_overlap(a, b):
    results = defaultdict(list)
    for key1 in a.keys():
        codes = set(key1.split(', '))
        codes_intersection = [codes.intersection(key2.split(', ')) for key2 in b.keys() if len(codes.intersection(key2.split(', '))) > 2]
        if codes_intersection:
            results[key1] = codes_intersection
    return results

In [72]:
glove_overlap = get_overlap(described_glove50, described_glove150)
glove_overlap

defaultdict(list,
            {'a209, n08, k282, a240, p540, d179, l138, b182, g518, g219, p139, p371, r42, r401, r290, q263, a233, e311': [{'b182',
               'g219',
               'r42'}],
             'k77, a054, e275, l80, e321, b150, r450, c169, f340, c189, b03, e214, c300': [{'b150',
               'c169',
               'c189'}],
             'm130, b873, y753, d271, k810, h6000': [{'b873', 'd271', 'y753'}],
             'n420, n913, d471, n398, p0700, b169, p0736': [{'d471',
               'n420',
               'p0736'}],
             'p220, p190, k30, c140, r17': [{'c140', 'k30', 'p190'}]})

In [73]:
doc2vec_overlap = get_overlap(described_doc2vec50_dm, described_doc2vec150_dm)
doc2vec_overlap

defaultdict(list,
            {'c165, q140, p550, d023, n393, g729, c384, r239': [{'d023',
               'g729',
               'q140'}],
             'f341, n806, n412, f79, n171, n760, n649, r067, p100, n318': [{'f341',
               'n412',
               'n806'}],
             'n359, a238, e800, p260, n403, n926, d71, l605, h5332, n158, n184, n365, n182, n804, d010': [{'e800',
               'l605',
               'n182'}]})

In [74]:
overlap = {**doc2vec_overlap, **glove_overlap}

In [75]:
overlap

{'a209, n08, k282, a240, p540, d179, l138, b182, g518, g219, p139, p371, r42, r401, r290, q263, a233, e311': [{'b182',
   'g219',
   'r42'}],
 'c165, q140, p550, d023, n393, g729, c384, r239': [{'d023', 'g729', 'q140'}],
 'f341, n806, n412, f79, n171, n760, n649, r067, p100, n318': [{'f341',
   'n412',
   'n806'}],
 'k77, a054, e275, l80, e321, b150, r450, c169, f340, c189, b03, e214, c300': [{'b150',
   'c169',
   'c189'}],
 'm130, b873, y753, d271, k810, h6000': [{'b873', 'd271', 'y753'}],
 'n359, a238, e800, p260, n403, n926, d71, l605, h5332, n158, n184, n365, n182, n804, d010': [{'e800',
   'l605',
   'n182'}],
 'n420, n913, d471, n398, p0700, b169, p0736': [{'d471', 'n420', 'p0736'}],
 'p220, p190, k30, c140, r17': [{'c140', 'k30', 'p190'}]}

In [80]:
icd10 = get_icd10_codes()
with open('overlap', 'w+') as output_file:  
    for _, codes in overlap.items():
        output_file.write('{}:\n'.format(', '.join(codes[0])))
        output_file.write('{}\n'.format('\n'.join([icd10[code] for code in codes[0]])))
        output_file.write('\n')