In [None]:
from pprint import pprint
import numpy as np
import random
import smart_open
from collections import defaultdict
from tqdm import tqdm
from joblib import Parallel, delayed
import networkx as nx

GloVe evaluation (refactored)

In [None]:
def generate_base(model_path, vocab_path):
    """Load a vocabulary and its word vectors into a row-normalised matrix.

    Parameters
    ----------
    model_path : str
        Text file with one ``word v1 v2 ...`` entry per line, separated by
        single spaces.
    vocab_path : str
        Text file whose first space-separated token on each line is a word.
        The line order defines the row index assigned to each word.

    Returns
    -------
    tuple
        ``(W_norm, vocab, ivocab)``: ``W_norm`` is a
        ``(len(vocab), vector_dim)`` array whose rows are scaled to unit
        Euclidean length, ``vocab`` maps word -> row index and ``ivocab``
        maps row index -> word. Vocabulary words without a vector (and
        ``'<unk>'``) keep an all-zero row.

    Raises
    ------
    ValueError
        If the model file contains no vectors at all.
    """
    # Read-only mode is enough; 'r+' would needlessly require write permission.
    with open(vocab_path, 'r') as f:
        words = [line.rstrip().split(' ')[0] for line in f if line.strip()]

    vectors = {}
    with open(model_path, 'r') as f:
        for line in f:
            vals = line.rstrip().split(' ')
            if len(vals) < 2:
                # Blank line or a token without any components: nothing to load.
                continue
            vectors[vals[0]] = [float(x) for x in vals[1:]]

    if not vectors:
        raise ValueError('No word vectors found in {!r}'.format(model_path))

    vocab = {w: idx for idx, w in enumerate(words)}
    ivocab = {idx: w for idx, w in enumerate(words)}

    # Take the dimensionality from the model itself rather than from the first
    # vocabulary word, which is not guaranteed to have a vector.
    vector_dim = len(next(iter(vectors.values())))
    W = np.zeros((len(words), vector_dim))
    for word, v in vectors.items():
        # Skip the unknown-word placeholder and any model word that is not
        # part of the vocabulary (it has no row to be written to).
        if word == '<unk>' or word not in vocab:
            continue
        W[vocab[word], :] = v

    # Scale each word vector to unit Euclidean length. Rows that are entirely
    # zero are left untouched instead of being turned into NaN by 0 / 0.
    norms = np.sum(W ** 2, 1) ** (0.5)
    norms[norms == 0] = 1.0
    W_norm = (W.T / norms).T
    return (W_norm, vocab, ivocab)


def distance(base, input_term, topn):
    """Return the ``topn`` vocabulary words closest to ``input_term``.

    Parameters
    ----------
    base : tuple
        ``(W, vocab, ivocab)`` as returned by ``generate_base``: the word
        matrix, the word -> row index mapping and its inverse.
    input_term : str
        One or more words separated by single spaces. The vectors of all
        words are summed to form the query vector.
    topn : int
        Maximum number of neighbours to return.

    Returns
    -------
    list or None
        ``[word, score]`` pairs ordered by descending dot product between the
        rows of ``W`` and the length-normalised query vector. ``None`` is
        returned when any query word is missing from ``vocab`` or when the
        query vector has no usable length (all zeros or non-finite values).
    """
    W, vocab, ivocab = base
    terms = input_term.split(' ')
    if any(term not in vocab for term in terms):
        return None  # Word: Out of dictionary!

    indices = [vocab[term] for term in terms]
    vec_result = np.copy(W[indices[0], :])
    for index in indices[1:]:
        vec_result += W[index, :]

    norm = np.sum(vec_result ** 2) ** (0.5)
    if not np.isfinite(norm) or norm == 0:
        # Dividing by such a norm would make every score NaN and the ranking
        # below meaningless, so report "no result" like for unknown words.
        return None
    vec_norm = vec_result / norm

    dist = np.dot(W, vec_norm.T)

    # Push the query words themselves to the very end of the ranking.
    # (np.Inf was removed in NumPy 2.0; np.inf works on every version.)
    for index in indices:
        dist[index] = -np.inf

    a = np.argsort(-dist)[:topn]
    return [[ivocab[x], dist[x]] for x in a]

Get the most similar words for each ICD-10 code

In [None]:
def get_icd10_codes(path='../results/icd10cm_codes_2018.txt'):
    """Read the ICD-10 code list and return a ``code -> title`` mapping.

    Parameters
    ----------
    path : str, optional
        Whitespace-separated text file whose first token on each line is the
        code and whose remaining tokens form the title. Defaults to the file
        this notebook has always used.

    Returns
    -------
    dict
        Lower-cased code mapped to its lower-cased title, with the title's
        words joined by single spaces. Looking up an unknown code raises
        ``KeyError``.
    """
    icd10 = {}
    # Read-only mode is enough; 'r+' would needlessly require write permission.
    with open(path, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line: there is no code to index.
                continue
            icd10[fields[0].lower()] = ' '.join(fields[1:]).lower()
    return icd10
    
    
def validate_relations(similarity_results, icd10):
    """Keep only those similar words that are themselves valid ICD-10 codes.

    ``similarity_results`` maps a reference code to an iterable of
    ``(word, score)`` pairs. The result maps each reference code that has at
    least one matching word to the list of matching words, lower-cased and in
    their original order. Scores are discarded.
    """
    found_relations = defaultdict(list)
    for reference, candidates in tqdm(similarity_results.items()):
        matches = [
            candidate.lower()
            for candidate, _score in candidates
            if candidate.lower() in icd10.keys()
        ]
        # Only touch the defaultdict when there is something to store, so
        # references without any valid code never appear as keys.
        if matches:
            found_relations[reference].extend(matches)
    return found_relations


def describe_relations(found_relations, icd10):
    """Attach the titles of both codes to every found relation.

    ``found_relations`` maps a reference code to a list of related codes.
    The result is keyed by ``(reference, relation)`` tuples; each value is a
    list of ``(reference title, relation title)`` tuples looked up in
    ``icd10``.
    """
    described_relations = defaultdict(list)
    for reference, relations in found_relations.items():
        for relation in relations:
            edge = (reference, relation)
            bucket = described_relations[edge]
            # Titles are looked up per relation, so a reference with no
            # relations is never looked up in icd10.
            bucket.append((icd10[reference], icd10[relation]))
    return described_relations


def perform_analysis(model_path, vocab_path, topn=50):
    """Find relations between ICD-10 codes via their nearest neighbours.

    Loads the ICD-10 code list and the word vectors, queries the ``topn``
    most similar words for every code in parallel, keeps the neighbours that
    are valid ICD-10 codes and returns them annotated with code titles, as
    produced by ``describe_relations``.
    """
    icd10 = get_icd10_codes()

    print('Loading vocabulary and vectors...')
    base = generate_base(model_path, vocab_path)

    print('Retrieving similar words...')
    codes = list(icd10.keys())
    run_parallel = Parallel(n_jobs=-1, backend='threading', verbose=10)
    similarity = run_parallel(
        delayed(distance)(base, code, topn) for code in codes)

    # Codes for which distance() returned nothing (None or an empty list)
    # are left out of the results.
    similarity_results = defaultdict(list)
    for code, relations in zip(codes, similarity):
        if not relations:
            continue
        similarity_results[code] = relations

    print('Filtering similarities...')
    validated_relations = validate_relations(similarity_results, icd10=icd10)

    print('Describing relations...')
    return describe_relations(validated_relations, icd10=icd10)

In [None]:
# Relate the ICD-10 codes using the 50 nearest neighbours of each code.
results = perform_analysis(
    '../results/glove_data/symmetric_vectors/glove_vectors150.txt',
    '../results/filtered_vocab_10.txt',
    topn=50,
)

Create network for found relations

In [None]:
# Start from an empty undirected graph; the found relations are added as edges next.
graph = nx.Graph()

In [None]:
# Every key of `results` is a (reference code, related code) tuple, i.e. one edge.
graph.add_edges_from(results.keys())

In [None]:
# Persist the relation graph as an edge list so it can be reloaded later.
# NOTE(review): the target directory presumably has to exist already — confirm.
nx.write_edgelist(graph, 'glove_analysis/dimension_2/150/graph_glove_50')

Load graph

In [None]:
# Reload the relation graph from the edge list file; this replaces any `graph` built earlier.
graph = nx.read_edgelist('glove_analysis/dimension_2/150/graph_glove_50')

Analysing found relations

In [None]:
# Code -> title lookup used to describe the nodes of the graph.
icd10 = get_icd10_codes()

In [None]:
def retrieve_titles_for_subgraph(graph, icd10, min_nodes=2):
    """Collect the ICD-10 titles of every connected component of ``graph``.

    Parameters
    ----------
    graph : networkx.Graph
        Undirected graph whose nodes are ICD-10 codes (strings).
    icd10 : dict
        Mapping of code -> title; every node of a reported component must be
        a key, otherwise ``KeyError`` is raised.
    min_nodes : int, optional
        Components with fewer nodes than this are skipped.

    Returns
    -------
    collections.defaultdict
        Maps the component's codes joined by ``', '`` to the list of their
        titles, in the same order. Codes are sorted within each component so
        the output is reproducible between runs.
    """
    results = defaultdict(list)
    # nx.connected_component_subgraphs() was removed in networkx 2.4;
    # connected_components() yields the node collection of each component.
    for component in nx.connected_components(graph):
        if len(component) < min_nodes:
            continue
        nodes = sorted(component)
        results[', '.join(nodes)] = [icd10[node] for node in nodes]
    return results

In [None]:
# Titles of all codes in each connected group of related codes (default: at least 2 nodes).
results_titles = retrieve_titles_for_subgraph(graph, icd10)

In [None]:
# Write one paragraph per group: the joined codes followed by a colon, then
# one title per line, then an empty separator line.
with open('glove_analysis/dimension_2/150/described_relations_glove_50', 'w+') as output_file:
    for keys, titles in results_titles.items():
        output_file.writelines([
            '{}:\n'.format(keys),
            '{}\n'.format('\n'.join(titles)),
            '\n',
        ])