In [5]:
import numpy as np
import random
from collections import defaultdict, Counter
from tqdm import tqdm
from joblib import Parallel, delayed
import networkx as nx
import pandas as pd

In [2]:
def get_icd10_codes(path='../results/icd10cm_codes_2018.txt'):
    """Read the ICD-10 code list and return a lower-cased code -> title mapping.

    Each non-blank line is split on whitespace: the first token is the code,
    the remaining tokens (re-joined with single spaces) form the title.

    Parameters
    ----------
    path : str, optional
        Location of the code list. Defaults to the file this notebook has
        always read, so existing calls without arguments are unaffected.

    Returns
    -------
    collections.defaultdict
        Mapping of lower-cased code to lower-cased title. It is created
        without a default factory, so a missing code raises ``KeyError``
        exactly like a plain dict.
    """
    icd10 = defaultdict()
    # Read-only mode: 'r+' would require write permission on a file that is only read.
    with open(path, 'r') as f:
        # Iterate lazily instead of materialising every line with readlines().
        for line in f:
            fields = line.split()
            if not fields:
                # Blank / whitespace-only line: nothing to index (used to raise IndexError).
                continue
            icd10[fields[0].lower()] = ' '.join(fields[1:]).lower()
    return icd10

# Load the ICD-10 code -> title lookup once; later cells pass it to
# retrieve_titles_for_subgraph.
icd10_codes = get_icd10_codes()
    
def retrieve_titles_for_subgraph(graph, icd10, min_nodes=2):
    """Describe every connected component of `graph` with ICD-10 titles.

    Parameters
    ----------
    graph : networkx graph
        Undirected graph whose nodes are ICD-10 code strings.
    icd10 : mapping
        Code -> title lookup, indexed with each node.
    min_nodes : int, optional
        Components with fewer nodes than this are dropped.

    Returns
    -------
    collections.defaultdict
        Maps the component's codes joined with ``', '`` to the list of their
        titles, in the same order as the codes in the key. Codes are sorted
        so that keys are reproducible between runs.

    Raises
    ------
    KeyError
        If a node of a retained component is not present in `icd10`.
    """
    results = defaultdict(list)
    # nx.connected_component_subgraphs was removed in networkx 2.4.
    # connected_components yields the node sets directly, so no subgraph
    # copies are needed just to read the node names.
    for component in nx.connected_components(graph):
        if len(component) < min_nodes:
            continue
        nodes = sorted(component)
        results[', '.join(nodes)] = [icd10[node] for node in nodes]
    return results

In [54]:
def get_overlap(graph1, graph2):
    """Count code groups that two relation collections have in common.

    Both arguments are iterated for their keys, each key being codes joined
    with ``', '``. Every (key of graph1, key of graph2) pair sharing at least
    two codes adds one to the counter stored under the sorted tuple of the
    shared codes.

    Returns a ``defaultdict(int)`` mapping shared-code tuples to pair counts.
    """
    shared_counts = defaultdict(int)
    for left_key in graph1:
        left_codes = set(left_key.split(', '))
        for right_key in graph2:
            common = left_codes & set(right_key.split(', '))
            if len(common) < 2:
                # A single shared code is not considered an overlap.
                continue
            shared_counts[tuple(sorted(common))] += 1
    return shared_counts

GloVe

In [6]:
# Load the four GloVe edge lists; the variable suffixes follow the
# "<second path component>_<dimension>" parts of each file path.
glove50_2, glove50_10, glove150_2, glove150_10 = (
    nx.read_edgelist(edgelist_path)
    for edgelist_path in (
        'glove_analysis/dimension_2/50/graph_glove_100',
        'glove_analysis/dimension_10/50/graph_glove_100',
        'glove_analysis/dimension_2/150/graph_glove_100',
        'glove_analysis/dimension_10/150/graph_glove_100',
    )
)

In [7]:
# Attach ICD-10 titles to the connected groups of every GloVe graph,
# reusing the lookup that was loaded once into icd10_codes.
(described_glove50_2,
 described_glove50_10,
 described_glove150_2,
 described_glove150_10) = (
    retrieve_titles_for_subgraph(glove_graph, icd10_codes)
    for glove_graph in (glove50_2, glove50_10, glove150_2, glove150_10)
)

In [None]:
# Share of relations whose codes all start with the same letter, printed per
# model as "<same-letter relations> / <all relations> = <ratio>".
# NOTE: `graphs` is defined in a later cell; run that cell first.
for i in graphs:
    # Count once instead of evaluating the same expression twice per graph.
    same_letter = sum(
        len(set(k[0] for k in codes.split(', '))) == 1 for codes in i
    )
    total = len(i)
    # An empty collection has no meaningful ratio; avoid ZeroDivisionError.
    ratio = same_letter / total if total else float('nan')
    print('{} / {} = {}'.format(same_letter, total, ratio))

In [None]:
# described_glove50 / described_doc2vec50 are not defined anywhere in the
# visible notebook (only the *_2 and *_10 variants exist), so compare the
# *_2 pair. NOTE(review): swap in the *_10 variants if those were intended.
# This cell needs the Doc2vec cells further down to have been run first.
get_overlap(described_glove50_2, described_doc2vec50_2)

Doc2vec

In [8]:
# Load the four Doc2vec edge lists.
# NOTE(review): the *_2 graphs come from the 'cbow' directory while the *_10
# graphs come from 'dimension10' -- confirm 'cbow' really holds the
# dimension-2 runs. The result tables further down show identical statistics
# and a full 143/143 overlap for Doc2vec 50x2 and 150x2, so it is worth
# checking that the two cbow files differ.
doc2vec50_2, doc2vec150_2, doc2vec50_10, doc2vec150_10 = (
    nx.read_edgelist(edgelist_path)
    for edgelist_path in (
        'doc2vec_analysis/cbow/50/graph_100',
        'doc2vec_analysis/cbow/150/graph_100',
        'doc2vec_analysis/dimension10/50/graph_100',
        'doc2vec_analysis/dimension10/150/graph_100',
    )
)

In [9]:
# Attach ICD-10 titles to the connected groups of every Doc2vec graph.
# Reuse the lookup already loaded into icd10_codes (as the GloVe cell does)
# instead of re-reading and re-parsing the code file once per graph.
described_doc2vec50_2 = retrieve_titles_for_subgraph(doc2vec50_2, icd10_codes)
described_doc2vec150_2 = retrieve_titles_for_subgraph(doc2vec150_2, icd10_codes)
described_doc2vec50_10 = retrieve_titles_for_subgraph(doc2vec50_10, icd10_codes)
described_doc2vec150_10 = retrieve_titles_for_subgraph(doc2vec150_10, icd10_codes)

In [10]:
# Each display label is written next to the description dict it belongs to,
# then the pairs are transposed into the two parallel lists used by the
# cells below (names[i] labels graphs[i]).
names, graphs = map(list, zip(
    ('GloVe: 50x2', described_glove50_2),
    ('GloVe: 50x10', described_glove50_10),
    ('GloVe: 150x2', described_glove150_2),
    ('GloVe: 150x10', described_glove150_10),
    ('Doc2vec: 50x2', described_doc2vec50_2),
    ('Doc2vec: 50x10', described_doc2vec50_10),
    ('Doc2vec: 150x2', described_doc2vec150_2),
    ('Doc2vec: 150x10', described_doc2vec150_10),
))

In [34]:
def statistics(graph, name, min_same_category=3):
    """Summarise one model's relations as a row for the results table.

    Parameters
    ----------
    graph : mapping or iterable of str
        Relation keys, each being codes joined with ``', '``.
    name : str
        Label placed in the first column of the row.
    min_same_category : int, optional
        A relation counts as a hit when at least this many of its codes share
        the same first character. The default of 3 reproduces the previously
        hard-coded ``> 2`` test.
        NOTE(review): with the default, two-code relations can never be a
        hit even when both codes share a letter -- confirm that is intended
        for the 'same category' column.

    Returns
    -------
    list
        ``[name, number of relations, average relation length,
        median relation length, hits]``.
    """
    subgraphs_length = [len(relation.split(', ')) for relation in graph]
    hits = sum(
        max(Counter(code[0] for code in relation.split(', ')).values()) >= min_same_category
        for relation in graph
    )
    return [name, len(graph), np.average(subgraphs_length), np.median(subgraphs_length), hits]

In [35]:
# One summary row per model: statistics() is applied to each description
# dict together with the label at the same position in `names`.
results = pd.DataFrame(
    list(map(statistics, graphs, names)),
    columns=['model', 'no relations', 'average length', 'median length', 'same category'],
)

In [36]:
# Show the per-model summary table built from statistics().
results

Unnamed: 0,model,no relations,average length,median length,same category
0,GloVe: 50x2,151,2.13245,2.0,0
1,GloVe: 50x10,200,3.735,2.0,22
2,GloVe: 150x2,150,2.16,2.0,1
3,GloVe: 150x10,185,3.697297,2.0,23
4,Doc2vec: 50x2,143,2.111888,2.0,0
5,Doc2vec: 50x10,179,3.826816,2.0,9
6,Doc2vec: 150x2,143,2.111888,2.0,0
7,Doc2vec: 150x10,185,3.67027,2.0,7


In [55]:
# Pairwise overlap between models: cell (row, column) holds the number of
# relation pairs from the two models that share at least two codes.
# The matrix is sized from `graphs`, so adding or removing a model does not
# require editing a hard-coded shape.
overlap = np.zeros([len(graphs), len(graphs)])
for index, graph in enumerate(graphs):
    overlap[index] = [sum(get_overlap(graph, graph1).values()) for graph1 in graphs]

In [56]:
# Label both axes of the overlap matrix with the model names for display.
pd.DataFrame(data=overlap, index=names, columns=names)

Unnamed: 0,GloVe: 50x2,GloVe: 50x10,GloVe: 150x2,GloVe: 150x10,Doc2vec: 50x2,Doc2vec: 50x10,Doc2vec: 150x2,Doc2vec: 150x10
GloVe: 50x2,151.0,7.0,1.0,8.0,0.0,2.0,0.0,6.0
GloVe: 50x10,7.0,200.0,7.0,97.0,2.0,16.0,2.0,20.0
GloVe: 150x2,1.0,7.0,150.0,7.0,0.0,3.0,0.0,2.0
GloVe: 150x10,8.0,97.0,7.0,185.0,1.0,17.0,1.0,21.0
Doc2vec: 50x2,0.0,2.0,0.0,1.0,143.0,0.0,143.0,1.0
Doc2vec: 50x10,2.0,16.0,3.0,17.0,0.0,179.0,0.0,57.0
Doc2vec: 150x2,0.0,2.0,0.0,1.0,143.0,0.0,143.0,1.0
Doc2vec: 150x10,6.0,20.0,2.0,21.0,1.0,57.0,1.0,185.0


Prepare file with filtered, described relations for all models

In [81]:
# Merge the relations of all models into one mapping keyed by the sorted
# tuple of codes, so the same code group found by several models is stored
# once (a later model overwrites an earlier entry with the same key).
# A filter keeping only relations with 2 < len(codes) < 15 that span more
# than one first letter used to sit before the assignment; it is disabled.
found_relations = defaultdict(list)
for described in graphs:
    for joined_codes, code_titles in described.items():
        # Pair each code with its title, then order the pairs by code.
        pairs = sorted(zip(joined_codes.split(', '), code_titles), key=lambda pair: pair[0])
        ordered_codes = tuple(code for code, _ in pairs)
        found_relations[ordered_codes] = tuple(title for _, title in pairs)

In [82]:
# Number of distinct relations (unique sorted code tuples) across all models.
len(found_relations)

1166

In [77]:
# Dump every relation as a block: the comma-separated codes followed by a
# colon, one title per line, then an empty separator line.
with open('described_relations', 'w+') as output_file:
    for relation_codes, relation_titles in found_relations.items():
        output_file.writelines((
            '{}:\n'.format(', '.join(relation_codes)),
            '{}\n'.format('\n'.join(relation_titles)),
            '\n',
        ))

In [78]:
# Total number of relations over all models, counting a code group once per
# model that contains it (compare with len(found_relations)).
sum(map(len, graphs))

1336