In [1]:
import gensim
import logging
import collections
import numpy as np
import os
import sys
sys.path.append('..')

from utilities import health_data
from utilities import configuration

config = configuration.get_config()

logging.basicConfig(filename=config['gensim_log'], 
                    format='%(asctime)s : %(levelname)s : %(message)s', 
                    level=logging.INFO)




In [2]:
train, testing = health_data.Admission.get_training_testing_data()

In [4]:
df = health_data.Admission.diagnosis_embeddings(train+testing, model_name='diag_conf_4')

Computing diagnosis embeddings ...


In [25]:
model = gensim.models.doc2vec.Doc2Vec.load(os.path.join(config['gensim_model_folder'], 'diag_conf_12'))
model

<gensim.models.doc2vec.Doc2Vec at 0x3806f05b0>

In [26]:
import numpy as np

train_test = train+testing

loaded_model = gensim.models.doc2vec.Doc2Vec.load(os.path.join(config['gensim_model_folder'], 'diag_conf_9'))
rng = np.random.default_rng(seed=7164770206861631272)

example_ix = rng.choice(range(len(train_test)), size=1)[0]
example_ix = rng.choice(range(len(train_test)), size=1)[0]
example_ix = rng.choice(range(len(train_test)), size=1)[0]
example_ix = rng.choice(range(len(train_test)), size=1)[0]


print(f'Selected instance: "{example_ix:,}"')
print(f'Vector representation for document: {str(loaded_model.infer_vector(train_test[example_ix].diagnosis.codes)[:5])[:-1]} ... ]')

diagnosis_mapping = health_data.Admission.get_diagnoses_mapping()
for code in train_test[example_ix].diagnosis.codes:
    print(f'{code:15}: {str(diagnosis_mapping[code])[:80]}')
print()

print('Document similar to:')
inferred_vector = loaded_model.infer_vector(train_test[example_ix].diagnosis.codes)
for doc_id, score in loaded_model.dv.most_similar([inferred_vector], topn=5):
    print(f'{doc_id:10}: {score:4.3f}')
print()


for doc_id, _ in loaded_model.dv.most_similar([inferred_vector], topn=20):
    print(f'Document {doc_id}:')
    for code in train_test[doc_id].diagnosis.codes:
        print(f'{code:15}: {str(diagnosis_mapping[code])[:80]}')
    print()

print(f'Common codes: {set(train_test[doc_id].diagnosis.codes).intersection(set(train_test[example_ix].diagnosis.codes))}')

Selected instance: "15,378"
Vector representation for document: [ 0.09563132 -0.24646254 -0.9265186   0.3144989  -0.29238382 ... ]
T8453          : {'Other complications of internal orthopaedic prosthetic devices, implants and g
U821           : {'Resistance to methicillin'}
Y831           : {'Other surgical procedures as the cause of abnormal reaction or later complicat
Z720           : {'Personal history of tobacco abuse', 'Drug use', 'Tobacco use'}

Document similar to:
     15378: 0.863
    194613: 0.720
    433199: 0.708
    169709: 0.692
     42806: 0.688

Document 15378:
T8453          : {'Other complications of internal orthopaedic prosthetic devices, implants and g
U821           : {'Resistance to methicillin'}
Y831           : {'Other surgical procedures as the cause of abnormal reaction or later complicat
Z720           : {'Personal history of tobacco abuse', 'Drug use', 'Tobacco use'}

Document 194613:
T8453          : {'Other complications of internal orthopaedic prostheti

In [29]:
model.wv['T8453']

array([-0.36471707, -0.21521375,  0.0995309 , -0.25640723, -0.7617536 ,
        0.37681812, -0.9136413 ,  0.39022243, -0.6875718 ,  0.10444845],
      dtype=float32)

In [47]:
model.wv.index2word

AttributeError: The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4

In [87]:
model = gensim.models.doc2vec.Doc2Vec.load(os.path.join(config['gensim_model_folder'], 'diag_conf_9'))


In [99]:

ix = np.random.choice(range(len(model.wv)), size=1)[0]


target_code = model.wv.index_to_key[ix]
print(f'target_code={target_code} ({diagnosis_mapping[target_code]})')
for code, score in model.wv.similar_by_key(target_code):
    print(f'{code:17}: {diagnosis_mapping[code]}')

target_code=M759 ({'Shoulder lesion, unspecified site'})
M6652            : {'Spontaneous rupture of unspecified tendon, upper arm'}
S330             : {'Traumatic rupture of lumbar intervertebral disc'}
M6591            : {'Synovitis and tenosynovitis, unspecified, shoulder region'}
Y535             : {'Digestants causing adverse effect in therapeutic use'}
N500             : {'Atrophy of testis'}
N4500            : {'Epididymitis with abscess'}
M7985            : {'Other specified soft tissue disorders, pelvic region and thigh'}
O89404           : {'Spinal and epidural anaesthesia-induced headache during the puerperium, postpartum condition or comp'}
K614             : {'Intrasphincteric abscess'}
M2411            : {'Other articular cartilage disorders, shoulder region'}


In [8]:
train[0].diagnosis

Diagnosis(codes=['K8010', 'Z720'], texts=['Calculus of gallbladder with other cholecystitis without mention of obstruction', 'Tobacco use'], types=['M', '3'])

In [16]:
import pandas as pd
pd.DataFrame(np.hstack([admit_ids.reshape(-1,1),matrix]))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,0.0,-0.063161,-0.248457,-0.130698,0.085834,0.028198,-0.201195,-0.281639,0.405608,-0.065389,...,0.015082,0.227208,-0.099438,-0.069197,-0.256359,0.066998,0.028146,-0.151062,-0.162603,-0.266917
1,1.0,0.413227,-0.911897,0.254760,0.742982,-0.316985,0.220133,-0.503356,-0.264264,0.092758,...,0.274872,0.451071,-0.037354,0.841485,-0.960242,-0.056485,-0.326613,0.095859,0.324437,-0.461155
2,2.0,-1.658434,-1.653236,1.028106,-0.871340,-0.447459,0.241639,-0.240287,0.016931,0.228959,...,-1.770697,1.732234,1.983951,-0.828193,0.410257,1.319523,-1.029812,1.725796,2.287525,1.430981
3,3.0,0.218941,-0.080762,-0.129898,-0.065770,0.270290,0.133499,-0.007811,-0.115128,-0.134931,...,0.188024,0.398111,0.324744,-0.273500,-0.096209,-0.011448,-0.071246,0.187279,-0.162962,-0.054224
4,4.0,-0.025808,-0.328370,-0.678773,-0.222535,0.211497,-1.260752,0.312203,0.693779,0.453954,...,1.055395,-0.014712,0.869090,-0.669505,0.202122,1.025176,-0.338622,0.032626,0.254966,-0.145903
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,245.0,-0.637477,-1.025891,1.191061,0.660186,-0.762446,0.650027,0.801506,0.520393,-0.138521,...,0.928185,0.369692,-0.194334,-0.194816,-0.692744,-0.203267,0.200329,0.763800,0.414888,-1.053449
246,246.0,-0.150698,-0.418549,-0.012652,-0.237934,0.174071,-0.165665,-0.330649,0.436257,-0.054562,...,-0.221336,0.083653,-0.151271,0.012909,0.031156,-0.216929,-0.180516,-0.357873,-0.204578,-0.138453
247,247.0,-0.281151,-0.577415,0.092399,-0.395507,0.248444,0.285106,-0.456880,-0.135471,0.053767,...,-0.113131,0.552096,-0.017792,-0.311864,-0.388670,0.140746,-0.073323,0.052377,-0.315365,-0.499371
248,248.0,0.074712,-0.153854,-0.027818,0.029288,0.021287,-0.035748,-0.057293,0.022977,0.018381,...,0.043534,0.119104,0.072128,-0.065186,-0.152789,0.122465,-0.015940,0.005998,0.013325,-0.209880


In [11]:
def get_diagnosis_generator(tokens_only=False):
    if tokens_only:
        return (admission.diagnosis.codes for admission in train)
    else:
        return (gensim.models.doc2vec.TaggedDocument(admission.diagnosis.codes, [ix]) 
                                                     for ix,admission in enumerate(train))
dianosis_training = list(get_diagnosis_generator())

In [4]:
from gensim.models.callbacks import CallbackAny2Vec

class EpochLogger(CallbackAny2Vec):
    '''Callback to log information about training'''
    
    def __init__(self):
        self.epoch = 1

    def on_epoch_begin(self, model):
        pass
        # print("Epoch #{} start".format(self.epoch))

    def on_epoch_end(self, model):
        print(f"Epoch #{self.epoch}/{model.epochs}")
        self.epoch+=1



In [5]:
epoch_number=5
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, 
                                      min_count=2, 
                                      epochs=epoch_number, 
                                      )
model.build_vocab(dianosis_training)
model.wv['K8010']


array([ 0.00522794, -0.01538932,  0.01137994,  0.0151439 , -0.00510857,
       -0.00802645, -0.01921924,  0.00983716,  0.0058222 ,  0.01749465,
        0.0066006 , -0.00580401,  0.00711611,  0.01725426,  0.01846576,
        0.015289  , -0.0171082 , -0.01513529,  0.01452215, -0.01488672,
        0.00650162,  0.01604915, -0.00641403,  0.01351849, -0.01630941,
        0.0129769 ,  0.01118381,  0.00667422, -0.01624993,  0.01832152,
        0.01645675, -0.00312845,  0.00821485,  0.01274883, -0.00347945,
        0.010975  , -0.00806845, -0.00255003, -0.00241554, -0.00832321,
       -0.00141643, -0.01540412,  0.00728663,  0.018116  ,  0.00956351,
        0.01276748, -0.00947212,  0.00972408, -0.01336554, -0.00994097],
      dtype=float32)

In [6]:
model.train(dianosis_training, 
            total_examples=model.corpus_count, 
            epochs=epoch_number,
            callbacks=[EpochLogger()]
            )

Epoch #1/5
Epoch #2/5
Epoch #3/5
Epoch #4/5
Epoch #5/5


In [52]:

model_name = 'diag_conf_1'
model = gensim.models.doc2vec.Doc2Vec.load(os.path.join(config['gensim_model_folder'], model_name))

random_sample_size=10000
rng = np.random.default_rng(seed=1299137873036141205)

random_sample_ix = rng.choice(range(len(dianosis_training)), size=random_sample_size, replace=False)
random_sample = [dianosis_training[ix] for ix in random_sample_ix]

considered_ranks=10

ranks = []
for doc_id, words in [(item.tags[0],item.words) for item in random_sample]:
    inferred_vector = model.infer_vector(words)
    sims = model.dv.most_similar([inferred_vector], topn=considered_ranks)
    if doc_id in [docid for docid, sim in sims]:
        rank = [docid for docid, sim in sims].index(doc_id)+1
    else:
        rank = -1
    ranks.append(rank)

freq = collections.Counter(ranks)
print(f'Rank;{model_name}')
for rank in range(-1,considered_ranks+1):
    if rank==0:
        continue
    if rank not in freq:
        freq[rank]=0
    print(f'{rank};{freq[rank]}')
                          

Rank;diag_conf_1
-1;8595
1;785
2;175
3;113
4;78
5;62
6;51
7;43
8;45
9;21
10;32


In [39]:

sorted(collections.Counter(ranks).items(), key=lambda x:int(x[0]))

[(-1, 406),
 (1, 490),
 (2, 20),
 (3, 20),
 (4, 10),
 (5, 9),
 (6, 5),
 (7, 10),
 (8, 5),
 (9, 3),
 (10, 5),
 (11, 2),
 (13, 1),
 (14, 3),
 (15, 3),
 (16, 1),
 (17, 3),
 (18, 2),
 (19, 1),
 (20, 1)]

258361

In [15]:
import collections

In [14]:
inferred_vector = model.infer_vector(random_sample[doc_id].words)
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
rank = [docid for docid, sim in sims].index(doc_id)
ranks.append(rank)

In [17]:
print(doc_id)
sims[:20]

999


[(214549, 0.8077465295791626),
 (401862, 0.7366076707839966),
 (252168, 0.7119101881980896),
 (234414, 0.6901050209999084),
 (26843, 0.6899646520614624),
 (245430, 0.6748766899108887),
 (414156, 0.668571949005127),
 (66104, 0.6504617929458618),
 (254604, 0.6502533555030823),
 (149395, 0.6461442708969116),
 (123838, 0.6273596286773682),
 (154612, 0.6259123682975769),
 (34189, 0.6242959499359131),
 (139811, 0.6226596832275391),
 (356020, 0.618524968624115),
 (242409, 0.6158811450004578),
 (293933, 0.6156178116798401),
 (41133, 0.6154081225395203),
 (404, 0.6115264296531677),
 (281208, 0.6103981733322144)]

Counter({263174: 2, 46218: 2, 5640: 2, 61908: 1, 314075: 1, 63603: 1, 383707: 1, 339352: 1, 157694: 1, 133279: 1, 317545: 1, 217185: 1, 129301: 1, 44240: 1, 112924: 1, 406838: 1, 337967: 1, 273260: 1, 263575: 1, 86933: 1, 417067: 1, 247804: 1, 252781: 1, 299967: 1, 145708: 1, 134183: 1, 103664: 1, 83077: 1, 363585: 1, 400814: 1, 98421: 1, 75211: 1, 289604: 1, 266249: 1, 218048: 1, 375966: 1, 242416: 1, 138693: 1, 409726: 1, 35448: 1, 32194: 1, 60746: 1, 380642: 1, 61389: 1, 3647: 1, 98684: 1, 7767: 1, 14671: 1, 310117: 1, 385498: 1, 249268: 1, 6919: 1, 397782: 1, 75726: 1, 404284: 1, 389058: 1, 181556: 1, 232448: 1, 20415: 1, 140725: 1, 196257: 1, 418353: 1, 252314: 1, 213101: 1, 389719: 1, 62952: 1, 236151: 1, 73236: 1, 1470: 1, 184077: 1, 59417: 1, 381047: 1, 68203: 1, 350403: 1, 210610: 1, 335621: 1, 18183: 1, 325336: 1, 142296: 1, 318869: 1, 22628: 1, 309307: 1, 304284: 1, 205621: 1, 60027: 1, 287050: 1, 86239: 1, 99414: 1, 358461: 1, 76702: 1, 377432: 1, 379908: 1, 249321: 1, 3802

In [None]:
# model.compute_loss=True
# for epoch in range(1,1+epoch_number):
#     print(model.compute_loss)
#     model.train(dianosis_training, 
#                 total_examples=model.corpus_count, 
#                 epochs=1,
#                 )
#     print(f'Epoch {epoch:3}/{epoch_number} - loss: {model.get_latest_training_loss():.12f}')
#     print(model.compute_loss)
#     print()
# # model.save(config['daignosis_model'])


In [None]:
model.compute_loss

False

In [None]:
model.compute_loss

False

In [13]:
import numpy as np

loaded_model = gensim.models.doc2vec.Doc2Vec.load(os.path.join(config['gensim_model_folder'], 'diag_conf_4'))
rng = np.random.default_rng(seed=7164770206861631271)

example_ix = rng.choice(range(len(train)), size=1)[0]
example_ix = rng.choice(range(len(train)), size=1)[0]
example_ix

print(f'Selected instance: "{example_ix:,}"')
print(f'Vector representation for document: {str(loaded_model.infer_vector(dianosis_training[example_ix].words)[:5])[:-1]} ... ]')

diagnosis_mapping = health_data.Admission.get_diagnoses_mapping()
for code in dianosis_training[example_ix].words:
    print(f'{code:15}: {str(diagnosis_mapping[code])[:80]}')
print()

print('Document similar to:')
inferred_vector = loaded_model.infer_vector(dianosis_training[example_ix].words)
for doc_id, score in loaded_model.dv.most_similar([inferred_vector], topn=5):
    print(f'{doc_id:10}: {score:4.3f}')
print()


doc_id, score = loaded_model.dv.most_similar([inferred_vector], topn=5)[0]
print(f'Document {doc_id}:')
for code in dianosis_training[doc_id].words:
    print(f'{code:15}: {str(diagnosis_mapping[code])[:80]}')
print()

 
print(f'Common codes: {set(dianosis_training[doc_id].words).intersection(set(dianosis_training[example_ix].words))}')

Selected instance: "37,636"
Vector representation for document: [ 0.01109267 -0.33016816 -0.15295595  0.6982465  -0.86605644 ... ]
E840           : {'Cystic fibrosis, unspecified', 'Cystic fibrosis with intestinal manifestations
E1364          : {'Type 2 diabetes mellitus with poor control, so described', 'Other specified di
B370           : {'Candidal stomatitis'}
B956           : {'Pneumonia, unspecified', 'Staphylococcus aureus as the cause of diseases class
B965           : {'Other bacterial infections of unspecified site', 'Pseudomonas (aeruginosa) as 

Document similar to:
     37636: 0.776
     23930: 0.564
     78562: 0.559
    195571: 0.526
    314772: 0.514

Document 37636:
E840           : {'Cystic fibrosis, unspecified', 'Cystic fibrosis with intestinal manifestations
E1364          : {'Type 2 diabetes mellitus with poor control, so described', 'Other specified di
B370           : {'Candidal stomatitis'}
B956           : {'Pneumonia, unspecified', 'Staphylococcus aureus as 

In [None]:
model

In [None]:
import collections

counter = collections.Counter(ranks)
print(counter)

In [None]:
model.infer_vector(dianosis_training[example_ix].words).shape

(50,)

In [None]:
inferred_vector = model.infer_vector(dianosis_training[10].words)
sims = model.dv.most_similar([inferred_vector], topn=5)
sims

[(187664, 0.9398481249809265),
 (362666, 0.923190712928772),
 (340631, 0.9211763143539429),
 (306256, 0.9189252257347107),
 (183517, 0.9149316549301147)]

In [None]:
dianosis_training[10]

TaggedDocument(words=['C73', 'E063'], tags=[10])

In [None]:
dianosis_training[187664]

TaggedDocument(words=['O62001', 'O48001', 'O61001', 'Z37000'], tags=[187664])

In [None]:
for code in ['O62001', 'O48001', 'O61001', 'Z37000']:
    print(health_data.Admission.get_diagnoses_mapping()[code])

{'Primary inadequate contractions, delivered, with or without mention of antepartum condition'}
{'Prolonged pregnancy, delivered, with or without mention of antepartum condition'}
{'Failed medical induction of labour, delivered, with or without mention of antepartum condition'}
{'Single live birth, pregnancy resulting from both spontaneous ovulation and conception'}


In [None]:
for code in ['C73', 'E063']:
    print(health_data.Admission.get_diagnoses_mapping()[code])

{'Malignant neoplasm of thyroid gland'}
{'Autoimmune thyroiditis'}


In [None]:
from gensim import corpora
dictionary = corpora.Dictionary([admission.diagnosis.codes for admission in train])
dictionary



<gensim.corpora.dictionary.Dictionary at 0x2b9843790>

In [None]:
print(dictionary.token2id)

{'K8010': 0, 'Z720': 1, 'M2546': 2, 'T8404': 3, 'Y831': 4, 'E860': 5, 'I832': 6, 'M179': 7, 'R113': 8, 'R53': 9, 'Z501': 10, 'Z758': 11, 'Z950': 12, 'E1152': 13, 'I100': 14, 'I500': 15, 'J069': 16, 'R55': 17, 'Z8642': 18, 'I272': 19, 'J90': 20, 'K729': 21, 'K746': 22, 'R18': 23, 'E119': 24, 'F058': 25, 'M549': 26, 'Y450': 27, 'Y471': 28, 'Y521': 29, 'Y524': 30, 'Y530': 31, 'Z911': 32, 'J100': 33, 'J440': 34, 'I249': 35, 'J189': 36, 'N141': 37, 'N189': 38, 'Y544': 39, 'C73': 40, 'E063': 41, 'F104': 42, 'C241': 43, 'K831': 44, 'R112': 45, 'K571': 46, 'K650': 47, 'K8000': 48, 'K8050': 49, 'K838': 50, 'R33': 51, 'T8188': 52, 'Y836': 53, 'B962': 54, 'J449': 55, 'N390': 56, 'G20': 57, 'I460': 58, 'I4890': 59, 'J841': 60, 'R296': 61, 'Z515': 62, 'Z850': 63, 'C829': 64, 'F209': 65, 'J101': 66, 'T783': 67, 'U980': 68, 'X599': 69, 'B9681': 70, 'C259': 71, 'F059': 72, 'I64': 73, 'R630': 74, 'D649': 75, 'E1178': 76, 'N131': 77, 'N328': 78, 'N359': 79, 'R310': 80, 'E042': 81, 'T810': 82, 'J4590': 8

In [None]:
dictionary.doc2bow(train[100].diagnosis.codes)

[(7, 1), (226, 1)]

In [None]:
train[100].diagnosis.codes

['Z538', 'M179']

In [None]:
dictionary.token2id['M179']

7

In [None]:
model